repos / punycode.zig.git


commit
e21b28b
parent
e21b28b
author
Evgenii Akentev
date
2025-10-17 19:04:36 +0400 +04
Create punycode library
6 files changed,  +490, -0
A .gitignore
+23, -0
 1@@ -0,0 +1,23 @@
 2+# This file is for zig-specific build artifacts.
 3+# If you have OS-specific or editor-specific files to ignore,
 4+# such as *.swp or .DS_Store, put those in your global
 5+# ~/.gitignore and put this in your ~/.gitconfig:
 6+#
 7+# [core]
 8+#     excludesfile = ~/.gitignore
 9+#
10+# Cheers!
11+# -andrewrk
12+
13+.zig-cache/
14+zig-out/
15+/release/
16+/debug/
17+/build/
18+/build-*/
19+/docgen_tmp/
20+
21+# Although this was renamed to .zig-cache, let's leave it here for a few
22+# releases to make it less annoying to work with multiple branches.
23+zig-cache/
24+
A LICENSE
+20, -0
 1@@ -0,0 +1,20 @@
 2+Copyright (c) 2025 Evgenii Akentev
 3+
 4+Permission is hereby granted, free of charge, to any person obtaining
 5+a copy of this software and associated documentation files (the
 6+"Software"), to deal in the Software without restriction, including
 7+without limitation the rights to use, copy, modify, merge, publish,
 8+distribute, sublicense, and/or sell copies of the Software, and to
 9+permit persons to whom the Software is furnished to do so, subject to
10+the following conditions:
11+
12+The above copyright notice and this permission notice shall be included
13+in all copies or substantial portions of the Software.
14+
15+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
A README.md
+1, -0
1@@ -0,0 +1 @@
2+Implementation of https://www.rfc-editor.org/rfc/rfc3492 in Zig
A build.zig
+46, -0
 1@@ -0,0 +1,46 @@
 2+const std = @import("std");
 3+
 4+pub fn build(b: *std.Build) void {
 5+    const target = b.standardTargetOptions(.{});
 6+    const optimize = b.standardOptimizeOption(.{});
 7+
 8+    const libpunycode = b.addLibrary(.{
 9+        .name = "punycode",
10+        .linkage = .static,
11+        .root_module = b.createModule(.{
12+            .root_source_file = b.path("src/punycode.zig"),
13+            .target = target,
14+            .optimize = optimize,
15+        }),
16+    });
17+
18+    b.installArtifact(libpunycode);
19+
20+    // Creates an executable that will run `test` blocks from the provided module.
21+    // Here `mod` needs to define a target, which is why earlier we made sure to
22+    // set the releative field.
23+    const mod_tests = b.addTest(.{
24+        .root_module = libpunycode.root_module,
25+    });
26+
27+    // A run step that will run the test executable.
28+    const run_mod_tests = b.addRunArtifact(mod_tests);
29+
30+    // A top level step for running all tests. dependOn can be called multiple
31+    // times and since the two run steps do not depend on one another, this will
32+    // make the two of them run in parallel.
33+    const test_step = b.step("test", "Run tests");
34+    test_step.dependOn(&run_mod_tests.step);
35+
36+    // Just like flags, top level steps are also listed in the `--help` menu.
37+    //
38+    // The Zig build system is entirely implemented in userland, which means
39+    // that it cannot hook into private compiler APIs. All compilation work
40+    // orchestrated by the build system will result in other Zig compiler
41+    // subcommands being invoked with the right flags defined. You can observe
42+    // these invocations when one fails (or you pass a flag to increase
43+    // verbosity) to validate assumptions and diagnose problems.
44+    //
45+    // Lastly, the Zig build system is relatively simple and self-contained,
46+    // and reading its source code will allow you to master it.
47+}
A build.zig.zon
+81, -0
 1@@ -0,0 +1,81 @@
 2+.{
 3+    // This is the default name used by packages depending on this one. For
 4+    // example, when a user runs `zig fetch --save <url>`, this field is used
 5+    // as the key in the `dependencies` table. Although the user can choose a
 6+    // different name, most users will stick with this provided value.
 7+    //
 8+    // It is redundant to include "zig" in this name because it is already
 9+    // within the Zig package namespace.
10+    .name = .punycode_zig,
11+    // This is a [Semantic Version](https://semver.org/).
12+    // In a future version of Zig it will be used for package deduplication.
13+    .version = "0.0.0",
14+    // Together with name, this represents a globally unique package
15+    // identifier. This field is generated by the Zig toolchain when the
16+    // package is first created, and then *never changes*. This allows
17+    // unambiguous detection of one package being an updated version of
18+    // another.
19+    //
20+    // When forking a Zig project, this id should be regenerated (delete the
21+    // field and run `zig build`) if the upstream project is still maintained.
22+    // Otherwise, the fork is *hostile*, attempting to take control over the
23+    // original project's identity. Thus it is recommended to leave the comment
24+    // on the following line intact, so that it shows up in code reviews that
25+    // modify the field.
26+    .fingerprint = 0xf0800252b14743b, // Changing this has security and trust implications.
27+    // Tracks the earliest Zig version that the package considers to be a
28+    // supported use case.
29+    .minimum_zig_version = "0.15.1",
30+    // This field is optional.
31+    // Each dependency must either provide a `url` and `hash`, or a `path`.
32+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
33+    // Once all dependencies are fetched, `zig build` no longer requires
34+    // internet connectivity.
35+    .dependencies = .{
36+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
37+        //.example = .{
38+        //    // When updating this field to a new URL, be sure to delete the corresponding
39+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
40+        //    // the new URL. If the contents of a URL change this will result in a hash mismatch
41+        //    // which will prevent zig from using it.
42+        //    .url = "https://example.com/foo.tar.gz",
43+        //
44+        //    // This is computed from the file contents of the directory of files that is
45+        //    // obtained after fetching `url` and applying the inclusion rules given by
46+        //    // `paths`.
47+        //    //
48+        //    // This field is the source of truth; packages do not come from a `url`; they
49+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
50+        //    // obtain a package matching this `hash`.
51+        //    //
52+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
53+        //    .hash = "...",
54+        //
55+        //    // When this is provided, the package is found in a directory relative to the
56+        //    // build root. In this case the package's hash is irrelevant and therefore not
57+        //    // computed. This field and `url` are mutually exclusive.
58+        //    .path = "foo",
59+        //
60+        //    // When this is set to `true`, a package is declared to be lazily
61+        //    // fetched. This makes the dependency only get fetched if it is
62+        //    // actually used.
63+        //    .lazy = false,
64+        //},
65+    },
66+    // Specifies the set of files and directories that are included in this package.
67+    // Only files and directories listed here are included in the `hash` that
68+    // is computed for this package. Only files listed here will remain on disk
69+    // when using the zig package manager. As a rule of thumb, one should list
70+    // files required for compilation plus any license(s).
71+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
72+    // the build root itself.
73+    // A directory listed here means that all files within, recursively, are included.
74+    .paths = .{
75+        "build.zig",
76+        "build.zig.zon",
77+        "src",
78+        // For example...
79+        //"LICENSE",
80+        //"README.md",
81+    },
82+}
A src/punycode.zig
+319, -0
  1@@ -0,0 +1,319 @@
  2+const std = @import("std");
  3+const unicode = @import("std").unicode;
  4+const ArrayList = std.array_list.Managed;
  5+const expectEqual = std.testing.expectEqual;
  6+const expectEqualSlices = std.testing.expectEqualSlices;
  7+
  8+const base = 36;
  9+
 10+const tmin = 1;
 11+const tmax = 26;
 12+
 13+const skew = 38;
 14+const damp = 700;
 15+
 16+const initialBias = 72;
 17+const initialN = 128;
 18+
 19+fn adapt(delta: usize, numpoints: usize, firstTime: bool) usize {
 20+    var newDelta: usize = if (firstTime) (delta / damp) else (delta >> 1);
 21+    newDelta += newDelta / numpoints; 
 22+
 23+    var k: u32 = 0;
 24+    const deltaCond: u32 = (base - tmin) * tmax / 2;
 25+    while (newDelta > deltaCond) : (k += base) {
 26+        newDelta = newDelta / (base - tmin);
 27+    }
 28+    return k + (((base - tmin + 1) * newDelta) / (newDelta + skew));
 29+}
 30+
 31+fn encodeDigit(i: usize) u8 {
 32+    if (i < 26) {
 33+        return @truncate(i + 97); // ascii code of 'a'
 34+    } else {
 35+        return @truncate(i + 22); // ascii code of '0' - 26
 36+    }
 37+}
 38+
 39+fn decodeDigit(cp: u21) u21 {
 40+    if (cp - 48 < 10) {
 41+        return cp - 22;
 42+    } else if (cp - 65 < 26) {
 43+        return cp - 65;
 44+    } else if (cp - 97 < 26) {
 45+        return cp - 97;
 46+    } else return base;
 47+}
 48+
 49+test "decode digit" {
 50+    try expectEqual(@as(u21, 0), decodeDigit('a'));
 51+    try expectEqual(@as(u21, 0), decodeDigit('A'));
 52+    try expectEqual(@as(u21, 25), decodeDigit('z'));
 53+    try expectEqual(@as(u21, 26), decodeDigit('0'));
 54+    try expectEqual(@as(u21, 35), decodeDigit('9'));
 55+    try expectEqual(@as(u21, 0), decodeDigit(65));
 56+}
 57+
 58+pub fn encode(alloc: std.mem.Allocator, input: []const u21) !ArrayList(u8) {
 59+    var result = ArrayList(u8).init(alloc);
 60+
 61+    for (input) |c| {
 62+        if (c < initialN) {
 63+            try result.append(@truncate(c));
 64+        }
 65+    }
 66+
 67+    const numOfBasics = result.items.len;
 68+
 69+    if (numOfBasics > 0) {
 70+        try result.append('-');
 71+    }
 72+
 73+    var bias: usize = initialBias;
 74+    var delta: usize = 0;
 75+    var n: usize = initialN;
 76+    var h: usize = numOfBasics; 
 77+
 78+    while (h < input.len) {
 79+        var m: usize = std.math.maxInt(usize);
 80+
 81+        for (input) |c| {
 82+            if (c >= n and c < m) {
 83+                m = c; 
 84+            }
 85+        }
 86+
 87+        delta += (m - n) * (h + 1);
 88+        n = m;
 89+
 90+        for (input) |c| {
 91+            if (c < n) {
 92+                delta += 1;
 93+            }
 94+
 95+            if (c == n) {
 96+                var k: usize = base;
 97+                var q: usize = delta;
 98+
 99+                while (true) : (k += base) {
100+                    const t = if (k <= bias + tmin) tmin else (if (k >= bias + tmax) tmax else (k - bias)); 
101+                    if (q < t) break;
102+
103+                    const newChar: usize = t + ((q - t) % (base - t)); 
104+
105+                    try result.append(encodeDigit(newChar));
106+                    q = (q - t) / (base - t);
107+                }
108+
109+                try result.append(encodeDigit(q));
110+
111+                bias = adapt(delta, h + 1, h == numOfBasics);
112+                delta = 0;
113+                h += 1;
114+            } 
115+        }
116+
117+        delta += 1;
118+        n += 1;
119+    }
120+
121+    return result;
122+}
123+
/// Error reported by `decode` for malformed input.
const DecodeError = error{
    /// The input is not a valid Punycode string: a non-ASCII byte before the
    /// delimiter, a character that is not a base-36 digit, a truncated
    /// variable-length integer, or a value that overflows.
    BadInput,
};

/// Decodes a Punycode string into a sequence of code points.
///
/// Everything before the last '-' is copied verbatim as basic code points
/// (each must be below `initialN`); the rest is read as a series of
/// variable-length base-36 integers, each of which inserts one code point
/// into the output.
///
/// The returned list is allocated with `alloc`; the caller owns it and must
/// call `deinit()`. On any error the partially built output is freed.
///
/// Returns `error.BadInput` for malformed input, including inputs whose
/// decoded values overflow `usize` or do not fit in a `u21`, and
/// `error.OutOfMemory` if allocation fails.
pub fn decode(alloc: std.mem.Allocator, input: []const u8) error{ BadInput, OutOfMemory }!ArrayList(u21) {
    var result = ArrayList(u21).init(alloc);
    // Every error return below would otherwise leak the buffer.
    errdefer result.deinit();

    // Number of bytes before the last delimiter, or 0 if there is none
    // (a delimiter at index 0 also counts as "none").
    const b = std.mem.lastIndexOfScalar(u8, input, '-') orelse 0;

    for (input[0..b]) |c| {
        if (c >= initialN) return DecodeError.BadInput;
        try result.append(c);
    }

    var n: usize = initialN;
    var i: usize = 0;
    var bias: usize = initialBias;

    // Skip the delimiter only if basic code points were actually consumed.
    var pos: usize = if (b > 0) b + 1 else 0;
    while (pos < input.len) {
        const old_i = i;
        var w: usize = 1;
        var k: usize = base;

        // Decode one generalized variable-length integer into `i`.
        while (true) : (k += base) {
            if (pos >= input.len) return DecodeError.BadInput;

            const digit: usize = decodeDigit(input[pos]);
            pos += 1;
            if (digit >= base) return DecodeError.BadInput;

            // Checked arithmetic: crafted input must yield an error, not an
            // integer-overflow trap.
            const step = std.math.mul(usize, digit, w) catch return DecodeError.BadInput;
            i = std.math.add(usize, i, step) catch return DecodeError.BadInput;

            const t: usize = if (k <= bias)
                tmin
            else if (k >= bias + tmax)
                tmax
            else
                k - bias;
            if (digit < t) break;

            w = std.math.mul(usize, w, base - t) catch return DecodeError.BadInput;
        }

        const out_len = result.items.len + 1;
        bias = adapt(i - old_i, out_len, old_i == 0);

        // `i` wraps around the output length; each wrap increments `n`.
        n = std.math.add(usize, n, i / out_len) catch return DecodeError.BadInput;
        i %= out_len;

        // Reject values that do not fit in a code point instead of silently
        // truncating them.
        const code_point = std.math.cast(u21, n) orelse return DecodeError.BadInput;
        try result.insert(i, code_point);
        i += 1;
    }

    return result;
}
189+
190+// tests
191+
// Encoding a string that mixes ASCII and kanji: the ASCII code points come
// first, then the delimiter, then the encoded non-ASCII code points.
test "encode mixed ASCII and kanji" {
    const alloc = std.testing.allocator;

    var input = ArrayList(u21).init(alloc);
    defer input.deinit();

    const view = try std.unicode.Utf8View.init("3年b組金八先生");
    var code_points = view.iterator();
    while (code_points.nextCodepoint()) |code_point| {
        try input.append(code_point);
    }

    const encoded = try encode(alloc, input.items);
    defer encoded.deinit();

    // expectEqualSlices takes (expected, actual); keeping that order makes
    // the failure output label the two sides correctly.
    try expectEqualSlices(u8, "3b-ww4c5e180e575a65lsy2b", encoded.items);
}
208+
test {
    // Code points of "3年b組金八先生", the string whose Punycode form is
    // decoded below.
    const expected = [_]u21{ 0x0033, 0x5E74, 0x0062, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F };

    const decoded = try decode(std.testing.allocator, "3b-ww4c5e180e575a65lsy2b");
    defer decoded.deinit();

    try expectEqualSlices(u21, &expected, decoded.items);
}
217+
// Table of (code points, expected Punycode) pairs used by the round-trip
// test. The entries look like the sample strings from RFC 3492 section 7.1.
// Element 0 of each tuple is the decoded form, element 1 the encoded form.
const testValues = [_]struct { []const u21, []const u8 }{
    // Arabic (Egyptian):
    .{
        &.{
            0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, 0x0644,
            0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, 0x061F,
        },
        "egbpdaj6bu4bxfgehfvwxn",
    },

    // Chinese (simplified):
    .{
        &.{ 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587 },
        "ihqwcrb4cv8a8dqg056pqjye",
    },

    // Chinese (traditional):
    .{
        &.{ 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587 },
        "ihqwctvzc91f659drss3x8bo0yb",
    },

    // Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    .{
        &.{
            0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, 0x0074,
            0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, 0x00ED, 0x010D,
            0x0065, 0x0073, 0x006B, 0x0079,
        },
        "Proprostnemluvesky-uyb24dma41a",
    },

    // Hebrew:
    .{
        &.{
            0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, 0x05D8,
            0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x05DD, 0x05E2,
            0x05D1, 0x05E8, 0x05D9, 0x05EA,
        },
        "4dbcagdahymbxekheh6e0a7fei0b",
    },

    // Hindi (Devanagari):
    .{
        &.{
            0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, 0x094D,
            0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, 0x0928, 0x0939,
            0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, 0x0915, 0x0924, 0x0947,
            0x0939, 0x0948, 0x0902,
        },
        "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd",
    },

    // Japanese (kanji and hiragana):
    .{
        &.{
            0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, 0x3092,
            0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, 0x306E, 0x304B,
        },
        "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa",
    },

    // Korean (Hangul syllables):
    .{
        &.{
            0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774,
            0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74,
            0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C,
        },
        "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c",
    },

    // Russian (Cyrillic):
    .{
        &.{
            0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, 0x043E,
            0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, 0x043E, 0x0440,
            0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, 0x0441, 0x0441, 0x043A,
            0x0438,
        },
        "b1abfaaepdrnnbgefbadotcwatmq2g4l",
    },

    // Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol:
    .{
        &.{
            0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, 0x0070,
            0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, 0x006D, 0x0070,
            0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, 0x0065, 0x0068, 0x0061,
            0x0062, 0x006C, 0x0061, 0x0072, 0x0065, 0x006E, 0x0045, 0x0073, 0x0070,
            0x0061, 0x00F1, 0x006F, 0x006C,
        },
        "PorqunopuedensimplementehablarenEspaol-fmd56a",
    },

    // Vietnamese: T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch<ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t:
    .{
        &.{
            0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, 0x006B,
            0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, 0x0063, 0x0068,
            0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, 0x1EBF, 0x006E, 0x0067,
            0x0056, 0x0069, 0x1EC7, 0x0074,
        },
        "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g",
    },

    // 3<nen>B<gumi><kinpachi><sensei>
    .{
        &.{ 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F },
        "3B-ww4c5e180e575a65lsy2b",
    },

    // <amuro><namie>-with-SUPER-MONKEYS
    .{
        &.{
            0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, 0x0074,
            0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, 0x002D, 0x004D,
            0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053,
        },
        "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n",
    },

    // Hello-Another-Way-<sorezore><no><basho>
    .{
        &.{
            0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, 0x006F,
            0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, 0x0079, 0x002D,
            0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, 0x6240,
        },
        "Hello-Another-Way--fc4qua05auwb3674vfr0b",
    },

    // <hitotsu><yane><no><shita>2
    .{
        &.{ 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032 },
        "2-u9tlzr9756bt3uc0v",
    },

    // Maji<de>Koi<suru>5<byou><mae>
    .{
        &.{
            0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, 0x3059,
            0x308B, 0x0035, 0x79D2, 0x524D,
        },
        "MajiKoi5-783gue6qz075azm5e",
    },

    // <pafii>de<runba>
    .{
        &.{ 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0 },
        "de-jg4avhby1noc0d",
    },

    // <sono><supiido><de>
    .{
        &.{ 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067 },
        "d9juau41awczczp",
    },

    // -> $1.00 <-
    .{
        &.{ 0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0x002E, 0x0030, 0x0030, 0x0020, 0x003C, 0x002D },
        "-> $1.00 <--",
    },
};
304+
test {
    // Round trip over the whole table: each entry must encode to the
    // expected string, and that output must decode back to the original
    // code points.
    const alloc = std.testing.allocator;

    for (testValues) |value| {
        const code_points: []const u21 = value[0];
        const punycode: []const u8 = value[1];

        const encoded = try encode(alloc, code_points);
        defer encoded.deinit();
        try expectEqualSlices(u8, punycode, encoded.items);

        const decoded = try decode(alloc, encoded.items);
        defer decoded.deinit();
        try expectEqualSlices(u21, code_points, decoded.items);
    }
}
320+