From: Evgenii Akentev Date: Fri, 17 Oct 2025 15:04:36 +0000 (+0400) Subject: Create punycode library X-Git-Url: https://git.xn--bdkaa.com/?a=commitdiff_plain;ds=inline;p=punycode.zig.git Create punycode library --- e21b28b654111f0a88c184bf74cd884b6477a57b diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bc0b345 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# This file is for zig-specific build artifacts. +# If you have OS-specific or editor-specific files to ignore, +# such as *.swp or .DS_Store, put those in your global +# ~/.gitignore and put this in your ~/.gitconfig: +# +# [core] +# excludesfile = ~/.gitignore +# +# Cheers! +# -andrewrk + +.zig-cache/ +zig-out/ +/release/ +/debug/ +/build/ +/build-*/ +/docgen_tmp/ + +# Although this was renamed to .zig-cache, let's leave it here for a few +# releases to make it less annoying to work with multiple branches. +zig-cache/ + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..755feff --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2025 Evgenii Akentev + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..8f0d674 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +Implementation of https://www.rfc-editor.org/rfc/rfc3492 in Zig diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..f81b341 --- /dev/null +++ b/build.zig @@ -0,0 +1,46 @@ +const std = @import("std"); + +/// Declares the build graph: a static `punycode` library built from +/// `src/punycode.zig` and installed by default, plus a `test` step that runs +/// the `test` blocks of that same module. +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const libpunycode = b.addLibrary(.{ + .name = "punycode", + .linkage = .static, + .root_module = b.createModule(.{ + .root_source_file = b.path("src/punycode.zig"), + .target = target, + .optimize = optimize, + }), + }); + + b.installArtifact(libpunycode); + + // Creates an executable that will run `test` blocks from the provided module. + // The module needs to define a target, which is why `target` was passed to + // `b.createModule` above. + const mod_tests = b.addTest(.{ + .root_module = libpunycode.root_module, + }); + + // A run step that will run the test executable. + const run_mod_tests = b.addRunArtifact(mod_tests); + + // A top level step for running all tests. dependOn can be called multiple + // times to attach further run steps; only the module test run step is + // attached here. + const test_step = b.step("test", "Run tests"); + test_step.dependOn(&run_mod_tests.step); + + // Just like flags, top level steps are also listed in the `--help` menu. + // + // The Zig build system is entirely implemented in userland, which means + // that it cannot hook into private compiler APIs.
All compilation work + // orchestrated by the build system will result in other Zig compiler + // subcommands being invoked with the right flags defined. You can observe + // these invocations when one fails (or you pass a flag to increase + // verbosity) to validate assumptions and diagnose problems. + // + // Lastly, the Zig build system is relatively simple and self-contained, + // and reading its source code will allow you to master it. +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..1588516 --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,81 @@ +.{ + // This is the default name used by packages depending on this one. For + // example, when a user runs `zig fetch --save <url>`, this field is used + // as the key in the `dependencies` table. Although the user can choose a + // different name, most users will stick with this provided value. + // + // It is redundant to include "zig" in this name because it is already + // within the Zig package namespace. + .name = .punycode_zig, + // This is a [Semantic Version](https://semver.org/). + // In a future version of Zig it will be used for package deduplication. + .version = "0.0.0", + // Together with name, this represents a globally unique package + // identifier. This field is generated by the Zig toolchain when the + // package is first created, and then *never changes*. This allows + // unambiguous detection of one package being an updated version of + // another. + // + // When forking a Zig project, this id should be regenerated (delete the + // field and run `zig build`) if the upstream project is still maintained. + // Otherwise, the fork is *hostile*, attempting to take control over the + // original project's identity. Thus it is recommended to leave the comment + // on the following line intact, so that it shows up in code reviews that + // modify the field. + .fingerprint = 0xf0800252b14743b, // Changing this has security and trust implications.
+ // Tracks the earliest Zig version that the package considers to be a + // supported use case. + .minimum_zig_version = "0.15.1", + // This field is optional. + // Each dependency must either provide a `url` and `hash`, or a `path`. + // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. + // Once all dependencies are fetched, `zig build` no longer requires + // internet connectivity. + .dependencies = .{ + // See `zig fetch --save <url>` for a command-line interface for adding dependencies. + //.example = .{ + // // When updating this field to a new URL, be sure to delete the corresponding + // // `hash`, otherwise you are communicating that you expect to find the old hash at + // // the new URL. If the contents of a URL change this will result in a hash mismatch + // // which will prevent zig from using it. + // .url = "https://example.com/foo.tar.gz", + // + // // This is computed from the file contents of the directory of files that is + // // obtained after fetching `url` and applying the inclusion rules given by + // // `paths`. + // // + // // This field is the source of truth; packages do not come from a `url`; they + // // come from a `hash`. `url` is just one of many possible mirrors for how to + // // obtain a package matching this `hash`. + // // + // // Uses the [multihash](https://multiformats.io/multihash/) format. + // .hash = "...", + // + // // When this is provided, the package is found in a directory relative to the + // // build root. In this case the package's hash is irrelevant and therefore not + // // computed. This field and `url` are mutually exclusive. + // .path = "foo", + // + // // When this is set to `true`, a package is declared to be lazily + // // fetched. This makes the dependency only get fetched if it is + // // actually used. + // .lazy = false, + //}, + }, + // Specifies the set of files and directories that are included in this package.
+ // Only files and directories listed here are included in the `hash` that + // is computed for this package. Only files listed here will remain on disk + // when using the zig package manager. As a rule of thumb, one should list + // files required for compilation plus any license(s). + // Paths are relative to the build root. Use the empty string (`""`) to refer to + // the build root itself. + // A directory listed here means that all files within, recursively, are included. + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + // For example... + //"LICENSE", + //"README.md", + }, +} diff --git a/src/punycode.zig b/src/punycode.zig new file mode 100644 index 0000000..71cc24f --- /dev/null +++ b/src/punycode.zig @@ -0,0 +1,319 @@
// Punycode (RFC 3492) encoder and decoder.
//
// `encode` turns a sequence of code points into the ASCII Punycode form and
// `decode` performs the reverse transformation. Both return a managed list
// that the caller owns and must `deinit()`.

const std = @import("std");
const unicode = @import("std").unicode;
const ArrayList = std.array_list.Managed;
const expectEqual = std.testing.expectEqual;
const expectEqualSlices = std.testing.expectEqualSlices;
const expectError = std.testing.expectError;

// Bootstring parameter values used by Punycode (RFC 3492, section 5).
const base = 36;

const tmin = 1;
const tmax = 26;

const skew = 38;
const damp = 700;

const initialBias = 72;
const initialN = 128;

/// Bias adaptation function (RFC 3492, section 6.1).
///
/// `delta` is the delta that was just encoded/decoded, `numpoints` the number
/// of code points handled so far (must be non-zero, it is used as a divisor)
/// and `firstTime` selects the stronger damping used for the very first delta.
fn adapt(delta: usize, numpoints: usize, firstTime: bool) usize {
    var newDelta: usize = if (firstTime) (delta / damp) else (delta >> 1);
    newDelta += newDelta / numpoints;

    var k: usize = 0;
    const deltaCond: usize = ((base - tmin) * tmax) / 2;
    while (newDelta > deltaCond) : (k += base) {
        newDelta /= base - tmin;
    }
    return k + (((base - tmin + 1) * newDelta) / (newDelta + skew));
}

/// Threshold `t` for digit position `k` under the current `bias`:
/// `tmin` if `k <= bias`, `tmax` if `k >= bias + tmax`, otherwise `k - bias`.
fn threshold(k: usize, bias: usize) usize {
    if (k <= bias) return tmin;
    if (k >= bias + tmax) return tmax;
    return k - bias;
}

/// Maps a digit value in `0..base` to its lowercase ASCII form:
/// 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
fn encodeDigit(i: usize) u8 {
    std.debug.assert(i < base);
    const d: u8 = @intCast(i);
    return if (d < 26) 'a' + d else '0' + (d - 26);
}

/// Maps an ASCII code point to its digit value: letters (either case) give
/// 0..25 and '0'..'9' give 26..35. Any other code point yields `base`, which
/// callers treat as "not a digit".
///
/// A range switch is used instead of `cp - 48 < 10` style arithmetic: on an
/// unsigned `u21` those subtractions underflow for code points below the
/// range start (for example '-' or ':'), which is a panic in safe builds.
fn decodeDigit(cp: u21) u21 {
    return switch (cp) {
        '0'...'9' => cp - 22, // '0' (48) maps to 26
        'A'...'Z' => cp - 'A',
        'a'...'z' => cp - 'a',
        else => base,
    };
}

test "decode digit" {
    try expectEqual(@as(u21, 0), decodeDigit('a'));
    try expectEqual(@as(u21, 0), decodeDigit('A'));
    try expectEqual(@as(u21, 25), decodeDigit('z'));
    try expectEqual(@as(u21, 26), decodeDigit('0'));
    try expectEqual(@as(u21, 35), decodeDigit('9'));
    try expectEqual(@as(u21, 0), decodeDigit(65));
}

test "decode digit rejects non-alphanumeric code points" {
    // Below '0', between '9' and 'A', between 'Z' and 'a', above 'z'.
    try expectEqual(@as(u21, base), decodeDigit('-'));
    try expectEqual(@as(u21, base), decodeDigit('!'));
    try expectEqual(@as(u21, base), decodeDigit(':'));
    try expectEqual(@as(u21, base), decodeDigit('['));
    try expectEqual(@as(u21, base), decodeDigit('{'));
    try expectEqual(@as(u21, base), decodeDigit(0));
}

test "encode digit" {
    try expectEqual(@as(u8, 'a'), encodeDigit(0));
    try expectEqual(@as(u8, 'z'), encodeDigit(25));
    try expectEqual(@as(u8, '0'), encodeDigit(26));
    try expectEqual(@as(u8, '9'), encodeDigit(35));
}

/// Encodes `input` (a sequence of code points) as Punycode.
///
/// Code points below 128 are copied to the output unchanged, followed by a
/// '-' delimiter when there is at least one of them; the remaining code
/// points are appended as variable-length base-36 deltas.
///
/// The returned list is allocated with `alloc`; the caller owns it and must
/// call `deinit()`. The only possible error is an allocation failure, in
/// which case nothing is leaked.
pub fn encode(alloc: std.mem.Allocator, input: []const u21) !ArrayList(u8) {
    var result = ArrayList(u8).init(alloc);
    errdefer result.deinit();

    for (input) |c| {
        if (c < initialN) {
            try result.append(@intCast(c));
        }
    }

    const numOfBasics = result.items.len;

    if (numOfBasics > 0) {
        try result.append('-');
    }

    var bias: usize = initialBias;
    var delta: usize = 0;
    var n: usize = initialN;
    var h: usize = numOfBasics; // number of code points handled so far

    while (h < input.len) {
        // Smallest code point that has not been handled yet. One always
        // exists while h < input.len.
        var m: usize = std.math.maxInt(usize);
        for (input) |c| {
            if (c >= n and c < m) {
                m = c;
            }
        }

        delta += (m - n) * (h + 1);
        n = m;

        for (input) |c| {
            if (c < n) {
                delta += 1;
            }

            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                var q: usize = delta;
                var k: usize = base;

                while (true) : (k += base) {
                    const t = threshold(k, bias);
                    if (q < t) break;

                    try result.append(encodeDigit(t + (q - t) % (base - t)));
                    q = (q - t) / (base - t);
                }

                try result.append(encodeDigit(q));

                bias = adapt(delta, h + 1, h == numOfBasics);
                delta = 0;
                h += 1;
            }
        }

        delta += 1;
        n += 1;
    }

    return result;
}

/// Errors returned by `decode`.
pub const DecodeError = error{
    /// The input is not a valid Punycode string: it contains a non-ASCII
    /// byte before the delimiter, a byte that is not a base-36 digit after
    /// it, a truncated delta, or a delta that overflows / does not fit a
    /// `u21` code point.
    BadInput,
    OutOfMemory,
};

/// Decodes the Punycode string `input` into a sequence of code points.
///
/// Everything before the last '-' is taken as literal basic code points; a
/// '-' at index 0 (or no '-' at all) means there are none and the whole
/// input is read as digits.
///
/// The returned list is allocated with `alloc`; the caller owns it and must
/// call `deinit()`. On error nothing is leaked.
pub fn decode(alloc: std.mem.Allocator, input: []const u8) DecodeError!ArrayList(u21) {
    var result = ArrayList(u21).init(alloc);
    errdefer result.deinit();

    // Index of the last delimiter, or 0 when there is none.
    const b: usize = std.mem.lastIndexOfScalar(u8, input, '-') orelse 0;

    for (input[0..b]) |c| {
        if (c >= initialN) {
            return error.BadInput;
        }
        try result.append(c);
    }

    var n: usize = initialN;
    var i: usize = 0;
    var bias: usize = initialBias;

    var pos: usize = if (b > 0) b + 1 else 0;
    while (pos < input.len) {
        const oldI: usize = i;
        var w: usize = 1;
        var k: usize = base;

        // Decode one generalized variable-length integer into `i`.
        while (true) : (k += base) {
            if (pos >= input.len) {
                return error.BadInput;
            }

            const digit: usize = decodeDigit(input[pos]);
            pos += 1;

            if (digit >= base) {
                return error.BadInput;
            }

            // Untrusted input can make these grow without bound, so every
            // step is checked instead of relying on trapping arithmetic.
            const step = std.math.mul(usize, digit, w) catch return error.BadInput;
            i = std.math.add(usize, i, step) catch return error.BadInput;

            const t = threshold(k, bias);
            if (digit < t) break;

            w = std.math.mul(usize, w, base - t) catch return error.BadInput;
        }

        const outLen = result.items.len + 1;
        bias = adapt(i - oldI, outLen, oldI == 0);
        n = std.math.add(usize, n, i / outLen) catch return error.BadInput;
        i %= outLen;

        // Reject values that do not fit the output element type rather than
        // silently wrapping them to a different code point.
        const cp = std.math.cast(u21, n) orelse return error.BadInput;
        try result.insert(i, cp);
        i += 1;
    }

    return result;
}

// tests

test "encode mixed ASCII and kanji" {
    const alloc = std.testing.allocator;

    var input = ArrayList(u21).init(alloc);
    defer input.deinit();

    var utf8 = (try unicode.Utf8View.init("3年b組金八先生")).iterator();
    while (utf8.nextCodepoint()) |codepoint| {
        try input.append(codepoint);
    }

    const encoded = try encode(alloc, input.items);
    defer encoded.deinit();

    try expectEqualSlices(u8, "3b-ww4c5e180e575a65lsy2b", encoded.items);
}

test "decode mixed ASCII and kanji" {
    const alloc = std.testing.allocator;

    const decoded = try decode(alloc, "3b-ww4c5e180e575a65lsy2b");
    defer decoded.deinit();

    try expectEqualSlices(u21, &.{ 0x0033, 0x5E74, 0x0062, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F }, decoded.items);
}

test "empty input" {
    const alloc = std.testing.allocator;

    const encoded = try encode(alloc, &.{});
    defer encoded.deinit();
    try expectEqual(@as(usize, 0), encoded.items.len);

    const decoded = try decode(alloc, "");
    defer decoded.deinit();
    try expectEqual(@as(usize, 0), decoded.items.len);
}

test "decode rejects malformed input without leaking" {
    const alloc = std.testing.allocator;

    // Byte after the delimiter that is not a base-36 digit.
    try expectError(error.BadInput, decode(alloc, "a-!"));
    try expectError(error.BadInput, decode(alloc, "a-:"));
    // Delta that ends before its terminating digit.
    try expectError(error.BadInput, decode(alloc, "a-9"));
    // Non-ASCII byte among the basic code points.
    try expectError(error.BadInput, decode(alloc, "\x80-a"));
    // Delta that overflows the accumulator.
    try expectError(error.BadInput, decode(alloc, "99999999999999999999"));
}

// Pairs of (code points, expected Punycode). These appear to be the sample
// strings of RFC 3492, section 7.1; the labels are kept as found in the
// original source, where their non-ASCII parts are absent.
const testValues = [_]struct { []const u21, []const u8 }{
    // Arabic (Egyptian):
    .{ &.{
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, 0x0644,
        0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, 0x061F,
    }, "egbpdaj6bu4bxfgehfvwxn" },

    // Chinese (simplified):
    .{ &.{
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587,
    }, "ihqwcrb4cv8a8dqg056pqjye" },

    // Chinese (traditional):
    .{ &.{
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587,
    }, "ihqwctvzc91f659drss3x8bo0yb" },

    // Czech: Proprostnemluvesky
    .{ &.{
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, 0x0074,
        0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, 0x00ED, 0x010D,
        0x0065, 0x0073, 0x006B, 0x0079,
    }, "Proprostnemluvesky-uyb24dma41a" },

    // Hebrew:
    .{ &.{
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, 0x05D8,
        0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x05DD, 0x05E2,
        0x05D1, 0x05E8, 0x05D9, 0x05EA,
    }, "4dbcagdahymbxekheh6e0a7fei0b" },

    // Hindi (Devanagari):
    .{ &.{
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, 0x094D,
        0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, 0x0928, 0x0939,
        0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, 0x0915, 0x0924, 0x0947,
        0x0939, 0x0948, 0x0902,
    }, "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" },

    // Japanese (kanji and hiragana):
    .{ &.{
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, 0x3092,
        0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, 0x306E, 0x304B,
    }, "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" },

    // Korean (Hangul syllables):
    .{ &.{
        0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774,
        0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74,
        0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C,
    }, "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" },

    // Russian (Cyrillic):
    .{ &.{
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, 0x043E,
        0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, 0x043E, 0x0440,
        0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, 0x0441, 0x0441, 0x043A,
        0x0438,
    }, "b1abfaaepdrnnbgefbadotcwatmq2g4l" },

    // Spanish: PorqunopuedensimplementehablarenEspaol:
    .{ &.{
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, 0x0070,
        0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, 0x006D, 0x0070,
        0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, 0x0065, 0x0068, 0x0061,
        0x0062, 0x006C, 0x0061, 0x0072, 0x0065, 0x006E, 0x0045, 0x0073, 0x0070,
        0x0061, 0x00F1, 0x006F, 0x006C,
    }, "PorqunopuedensimplementehablarenEspaol-fmd56a" },

    // Vietnamese: TisaohkhngthchnitingVit:
    .{ &.{
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, 0x006B,
        0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, 0x0063, 0x0068,
        0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, 0x1EBF, 0x006E, 0x0067,
        0x0056, 0x0069, 0x1EC7, 0x0074,
    }, "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" },

    // 3B
    .{ &.{
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F,
    }, "3B-ww4c5e180e575a65lsy2b" },

    // -with-SUPER-MONKEYS
    .{ &.{
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, 0x0074,
        0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, 0x002D, 0x004D,
        0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053,
    }, "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" },

    // Hello-Another-Way-
    .{ &.{
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, 0x006F,
        0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, 0x0079, 0x002D,
        0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, 0x6240,
    }, "Hello-Another-Way--fc4qua05auwb3674vfr0b" },

    // 2
    .{ &.{
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032,
    }, "2-u9tlzr9756bt3uc0v" },

    // MajiKoi5
    .{ &.{
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, 0x3059,
        0x308B, 0x0035, 0x79D2, 0x524D,
    }, "MajiKoi5-783gue6qz075azm5e" },

    // de
    .{ &.{
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0,
    }, "de-jg4avhby1noc0d" },

    // (label has no ASCII characters)
    .{ &.{
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067,
    }, "d9juau41awczczp" },

    // -> $1.00 <-
    .{ &.{
        0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0x002E, 0x0030, 0x0030, 0x0020,
        0x003C, 0x002D,
    }, "-> $1.00 <--" },
};

test "sample strings round-trip" {
    const alloc = std.testing.allocator;

    for (testValues) |value| {
        const encoded = try encode(alloc, value.@"0");
        defer encoded.deinit();

        try expectEqualSlices(u8, value.@"1", encoded.items);

        const decoded = try decode(alloc, encoded.items);
        defer decoded.deinit();

        try expectEqualSlices(u21, value.@"0", decoded.items);
    }
}