repos / punycode.zig.git


punycode.zig.git / src
Evgenii Akentev  ·  2025-10-17

punycode.zig

  1const std = @import("std");
  2const unicode = @import("std").unicode;
  3const ArrayList = std.array_list.Managed;
  4const expectEqual = std.testing.expectEqual;
  5const expectEqualSlices = std.testing.expectEqualSlices;
  6
// Bootstring parameters. The values below are the ones RFC 3492 (section 5)
// lists for Punycode.

// Number of distinct digit values: 26 letters plus 10 decimal digits.
const base = 36;

// Lower and upper clamp for the per-position digit threshold `t`.
const tmin = 1;
const tmax = 26;

// Constants used by `adapt` when recomputing the bias.
const skew = 38;
const damp = 700;

// Starting values for the bias and for `n`. Code points below `initialN` are
// copied to the output as-is by `encode`.
const initialBias = 72;
const initialN = 128;
 17
 18fn adapt(delta: usize, numpoints: usize, firstTime: bool) usize {
 19    var newDelta: usize = if (firstTime) (delta / damp) else (delta >> 1);
 20    newDelta += newDelta / numpoints; 
 21
 22    var k: u32 = 0;
 23    const deltaCond: u32 = (base - tmin) * tmax / 2;
 24    while (newDelta > deltaCond) : (k += base) {
 25        newDelta = newDelta / (base - tmin);
 26    }
 27    return k + (((base - tmin + 1) * newDelta) / (newDelta + skew));
 28}
 29
 30fn encodeDigit(i: usize) u8 {
 31    if (i < 26) {
 32        return @truncate(i + 97); // ascii code of 'a'
 33    } else {
 34        return @truncate(i + 22); // ascii code of '0' - 26
 35    }
 36}
 37
 38fn decodeDigit(cp: u21) u21 {
 39    if (cp - 48 < 10) {
 40        return cp - 22;
 41    } else if (cp - 65 < 26) {
 42        return cp - 65;
 43    } else if (cp - 97 < 26) {
 44        return cp - 97;
 45    } else return base;
 46}
 47
 48test "decode digit" {
 49    try expectEqual(@as(u21, 0), decodeDigit('a'));
 50    try expectEqual(@as(u21, 0), decodeDigit('A'));
 51    try expectEqual(@as(u21, 25), decodeDigit('z'));
 52    try expectEqual(@as(u21, 26), decodeDigit('0'));
 53    try expectEqual(@as(u21, 35), decodeDigit('9'));
 54    try expectEqual(@as(u21, 0), decodeDigit(65));
 55}
 56
 57pub fn encode(alloc: std.mem.Allocator, input: []const u21) !ArrayList(u8) {
 58    var result = ArrayList(u8).init(alloc);
 59
 60    for (input) |c| {
 61        if (c < initialN) {
 62            try result.append(@truncate(c));
 63        }
 64    }
 65
 66    const numOfBasics = result.items.len;
 67
 68    if (numOfBasics > 0) {
 69        try result.append('-');
 70    }
 71
 72    var bias: usize = initialBias;
 73    var delta: usize = 0;
 74    var n: usize = initialN;
 75    var h: usize = numOfBasics; 
 76
 77    while (h < input.len) {
 78        var m: usize = std.math.maxInt(usize);
 79
 80        for (input) |c| {
 81            if (c >= n and c < m) {
 82                m = c; 
 83            }
 84        }
 85
 86        delta += (m - n) * (h + 1);
 87        n = m;
 88
 89        for (input) |c| {
 90            if (c < n) {
 91                delta += 1;
 92            }
 93
 94            if (c == n) {
 95                var k: usize = base;
 96                var q: usize = delta;
 97
 98                while (true) : (k += base) {
 99                    const t = if (k <= bias + tmin) tmin else (if (k >= bias + tmax) tmax else (k - bias)); 
100                    if (q < t) break;
101
102                    const newChar: usize = t + ((q - t) % (base - t)); 
103
104                    try result.append(encodeDigit(newChar));
105                    q = (q - t) / (base - t);
106                }
107
108                try result.append(encodeDigit(q));
109
110                bias = adapt(delta, h + 1, h == numOfBasics);
111                delta = 0;
112                h += 1;
113            } 
114        }
115
116        delta += 1;
117        n += 1;
118    }
119
120    return result;
121}
122
/// Error set with the single member `BadInput`; `error.BadInput` is what
/// `decode` returns for input it rejects.
const DecodeError = error{
    BadInput,
};
126
/// Decodes a Punycode string into a list of code points.
///
/// Everything before the last '-' is taken as literal basic code points; the
/// rest is read as a series of variable-length base-36 integers, each of
/// which inserts one code point into the output. A '-' at index 0 is not
/// treated as a delimiter.
///
/// Returns `error.BadInput` when:
///   * a byte in the literal section is >= `initialN`,
///   * a byte in the encoded section is not a valid digit,
///   * the input ends in the middle of a variable-length integer,
///   * the decoded state overflows `usize`, or a resulting code point does
///     not fit in `u21`.
/// Returns `error.OutOfMemory` when the output list cannot grow.
///
/// The returned list is allocated with `alloc` and owned by the caller, who
/// must call `deinit` on it. On error the partially built list is released
/// before returning.
pub fn decode(alloc: std.mem.Allocator, input: []const u8) error{BadInput, OutOfMemory}!ArrayList(u21) {
    var result = ArrayList(u21).init(alloc);
    // Release the partial output on every error return below.
    errdefer result.deinit();

    // Index of the last delimiter, or 0 when there is none.
    var basic_len: usize = 0;
    for (input, 0..) |c, idx| {
        if (c == '-') basic_len = idx;
    }

    // Copy the literal section, rejecting non-basic bytes.
    for (input[0..basic_len]) |c| {
        if (c >= initialN) return error.BadInput;
        try result.append(c);
    }

    var n: usize = initialN;
    var i: usize = 0;
    var bias: usize = initialBias;

    // Skip the delimiter itself when one was found.
    var pos: usize = if (basic_len > 0) basic_len + 1 else 0;
    while (pos < input.len) {
        const old_i = i;
        var w: usize = 1;
        var k: usize = base;

        // Read one variable-length integer and add it to `i`.
        while (true) : (k += base) {
            if (pos >= input.len) return error.BadInput;

            const digit = decodeDigit(input[pos]);
            pos += 1;
            if (digit >= base) return error.BadInput;

            // Untrusted input can drive these past usize; report it as bad
            // input instead of tripping an overflow panic.
            const step = std.math.mul(usize, digit, w) catch return error.BadInput;
            i = std.math.add(usize, i, step) catch return error.BadInput;

            const t: usize = if (k <= bias)
                tmin
            else if (k >= bias + tmax)
                tmax
            else
                k - bias;
            if (digit < t) break;

            w = std.math.mul(usize, w, base - t) catch return error.BadInput;
        }

        // `i` now encodes both how far to advance `n` and where to insert.
        const out_len = result.items.len + 1;
        bias = adapt(i - old_i, out_len, old_i == 0);
        n = std.math.add(usize, n, i / out_len) catch return error.BadInput;
        i %= out_len;

        // Reject values that do not fit the element type rather than
        // truncating them into an unrelated code point.
        const code_point = std.math.cast(u21, n) orelse return error.BadInput;
        try result.insert(i, code_point);
        i += 1;
    }

    return result;
}
188
189// tests
190
test {
    const alloc = std.testing.allocator;

    // Collect the code points of the UTF-8 literal so they can be passed to
    // `encode`, which takes a slice of u21.
    var input = ArrayList(u21).init(alloc);
    defer input.deinit();

    var utf8 = (try std.unicode.Utf8View.init("3年b組金八先生")).iterator();
    while (utf8.nextCodepoint()) |codepoint| {
        try input.append(codepoint);
    }

    const encoded = try encode(alloc, input.items);
    defer encoded.deinit();

    // Expected value goes first so a failure report labels the two sides
    // correctly.
    try expectEqualSlices(u8, "3b-ww4c5e180e575a65lsy2b", encoded.items);
}
207
test {
    // Code points the encoded string below is expected to decode to.
    const expected = [_]u21{ 0x0033, 0x5E74, 0x0062, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F };

    const decoded = try decode(std.testing.allocator, "3b-ww4c5e180e575a65lsy2b");
    defer decoded.deinit();

    try expectEqualSlices(u21, &expected, decoded.items);
}
216
// Round-trip test vectors. Each tuple pairs a code point sequence (field 0)
// with the Punycode string it is expected to encode to (field 1); the test
// that iterates this table also decodes the string back and compares it with
// field 0.
// NOTE(review): these look like the sample strings of RFC 3492 section 7.1;
// confirm against the RFC before editing any entry.
const testValues = [_]struct {[]const u21, []const u8}
    {   // Arabic (Egyptian):
        .{ &.{ 0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, 0x0644,
               0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, 0x061F } , "egbpdaj6bu4bxfgehfvwxn" },

        // Chinese (simplified):
        .{ &.{ 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587} , "ihqwcrb4cv8a8dqg056pqjye" },

        // Chinese (traditional):
        .{ &.{ 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587} , "ihqwctvzc91f659drss3x8bo0yb" },

        // Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
        .{ &.{ 0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, 0x0074,
                0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, 0x00ED, 0x010D,
                0x0065, 0x0073, 0x006B, 0x0079 } , "Proprostnemluvesky-uyb24dma41a" },

        // Hebrew:
        .{ &.{ 0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, 0x05D8,
                0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x05DD, 0x05E2,
                0x05D1, 0x05E8, 0x05D9, 0x05EA } , "4dbcagdahymbxekheh6e0a7fei0b" },

        // Hindi (Devanagari):
        .{ &.{ 0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, 0x094D,
                0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, 0x0928, 0x0939,
                0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, 0x0915, 0x0924, 0x0947,
                0x0939, 0x0948, 0x0902 } , "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" },

        // Japanese (kanji and hiragana):
        .{ &.{ 0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, 0x3092,
                0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, 0x306E, 0x304B } , "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" },

        // Korean (Hangul syllables):
        .{ &.{ 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774,
                0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74,
                0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C } , "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" },

        // Russian (Cyrillic):
        .{ &.{ 0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, 0x043E,
                0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, 0x043E, 0x0440,
                0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, 0x0441, 0x0441, 0x043A,
                0x0438 } , "b1abfaaepdrnnbgefbadotcwatmq2g4l" },

        // Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol:
        .{ &.{ 0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, 0x0070,
                0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, 0x006D, 0x0070,
                0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, 0x0065, 0x0068, 0x0061,
                0x0062, 0x006C, 0x0061, 0x0072, 0x0065, 0x006E, 0x0045, 0x0073, 0x0070,
                0x0061, 0x00F1, 0x006F, 0x006C } , "PorqunopuedensimplementehablarenEspaol-fmd56a" },

        // Vietnamese: T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch<ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t:
        .{ &.{ 0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, 0x006B,
                0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, 0x0063, 0x0068,
                0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, 0x1EBF, 0x006E, 0x0067,
                0x0056, 0x0069, 0x1EC7, 0x0074 } , "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" },

        // 3<nen>B<gumi><kinpachi><sensei>
        .{ &.{ 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F } , "3B-ww4c5e180e575a65lsy2b"},


        // <amuro><namie>-with-SUPER-MONKEYS
        .{ &.{ 0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, 0x0074,
                0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, 0x002D, 0x004D,
                0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053 } , "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"},

        // Hello-Another-Way-<sorezore><no><basho>
        .{ &.{ 0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, 0x006F,
                0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, 0x0079, 0x002D,
                0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, 0x6240 } , "Hello-Another-Way--fc4qua05auwb3674vfr0b"},

        // <hitotsu><yane><no><shita>2
        .{ &.{ 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032 } , "2-u9tlzr9756bt3uc0v"},

        // Maji<de>Koi<suru>5<byou><mae>
        .{ &.{ 0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, 0x3059,
                0x308B, 0x0035, 0x79D2, 0x524D } , "MajiKoi5-783gue6qz075azm5e"},

        // <pafii>de<runba>
        .{ &.{ 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0 } , "de-jg4avhby1noc0d" },

        // <sono><supiido><de>
        .{ &.{ 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067 } , "d9juau41awczczp" },

        // -> $1.00 <-
        .{ &.{ 0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0x002E, 0x0030, 0x0030, 0x0020, 0x003C, 0x002D } , "-> $1.00 <--" }

    };
303
test {
    const alloc = std.testing.allocator;

    // Every table entry must encode to its expected string and decode back
    // to the original code points.
    for (testValues) |value| {
        const code_points = value[0];
        const punycode = value[1];

        const encoded = try encode(alloc, code_points);
        defer encoded.deinit();
        try expectEqualSlices(u8, punycode, encoded.items);

        const decoded = try decode(alloc, encoded.items);
        defer decoded.deinit();
        try expectEqualSlices(u21, code_points, decoded.items);
    }
}
319