first commit

2025-08-15 13:34:17 +02:00
commit 104b32694b
8 changed files with 11993 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+zig-out
+.zig-cache
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
+Generate random first names using a Markov chain.
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,116 @@
+const std = @import("std");
+
+/// Describes the build graph for this package: a static `markov` library, the
+/// `markov_prenoms` executable that imports it, and the `run` / `test` steps.
+/// Nothing is compiled here; the graph is only declared and is executed later
+/// by the external build runner.
+pub fn build(b: *std.Build) void {
+    // Target and optimization mode stay user-selectable from the
+    // `zig build` command line; no defaults are overridden.
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    // Module rooted at the library entry point.
+    const markov_module = b.createModule(.{
+        .root_source_file = b.path("src/lib.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // Module rooted at the executable entry point.
+    const app_module = b.createModule(.{
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // Lets src/main.zig write `@import("markov")`.
+    app_module.addImport("markov", markov_module);
+
+    // Static library artifact, installed by the default "install" step.
+    const static_lib = b.addLibrary(.{
+        .linkage = .static,
+        .name = "markov",
+        .root_module = markov_module,
+    });
+    b.installArtifact(static_lib);
+
+    // Executable artifact, installed alongside the library.
+    const app = b.addExecutable(.{
+        .name = "markov_prenoms",
+        .root_module = app_module,
+    });
+    b.installArtifact(app);
+
+    // `zig build run -- arg1 arg2`: runs the executable after the install
+    // step and forwards any trailing command-line arguments to it.
+    const run_app = b.addRunArtifact(app);
+    run_app.step.dependOn(b.getInstallStep());
+    if (b.args) |forwarded_args| run_app.addArgs(forwarded_args);
+    b.step("run", "Run the app").dependOn(&run_app.step);
+
+    // `zig build test`: builds and runs the unit tests of both modules.
+    const run_lib_tests = b.addRunArtifact(b.addTest(.{
+        .root_module = markov_module,
+    }));
+    const run_app_tests = b.addRunArtifact(b.addTest(.{
+        .root_module = app_module,
+    }));
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_lib_tests.step);
+    test_step.dependOn(&run_app_tests.step);
+}
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -0,0 +1,86 @@
+.{
+    // This is the default name used by packages depending on this one. For
+    // example, when a user runs `zig fetch --save <url>`, this field is used
+    // as the key in the `dependencies` table. Although the user can choose a
+    // different name, most users will stick with this provided value.
+    //
+    // It is redundant to include "zig" in this name because it is already
+    // within the Zig package namespace.
+    .name = .markov,
+
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.1.0",
+
+    // Together with name, this represents a globally unique package
+    // identifier. This field is generated by the Zig toolchain when the
+    // package is first created, and then *never changes*. This allows
+    // unambiguous detection of one package being an updated version of
+    // another.
+    //
+    // When forking a Zig project, this id should be regenerated (delete the
+    // field and run `zig build`) if the upstream project is still maintained.
+    // Otherwise, the fork is *hostile*, attempting to take control over the
+    // original project's identity. Thus it is recommended to leave the comment
+    // on the following line intact, so that it shows up in code reviews that
+    // modify the field.
+    .fingerprint = 0x7f4d0c6d23b24010, // Changing this has security and trust implications.
+
+    // Tracks the earliest Zig version that the package considers to be a
+    // supported use case.
+    .minimum_zig_version = "0.14.1",
+
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL. If the contents of a URL change this will result in a hash mismatch
+        //    // which will prevent zig from using it.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+        //
+        //    // When this is set to `true`, a package is declared to be lazily
+        //    // fetched. This makes the dependency only get fetched if it is
+        //    // actually used.
+        //    .lazy = false,
+        //},
+    },
+
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package. Only files listed here will remain on disk
+    // when using the zig package manager. As a rule of thumb, one should list
+    // files required for compilation plus any license(s).
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+        // For example...
+        //"LICENSE",
+        //"README.md",
+    },
+}
--- a/markov.bin
+++ b/markov.bin
--- a/prenoms.csv
+++ b/prenoms.csv
--- a/src/lib.zig
+++ b/src/lib.zig
@@ -0,0 +1,92 @@
+const std = @import("std");
+const rand = std.crypto.random;
+
+/// A candidate next byte paired with its probability.
+pub const DataPoint = struct {
+    /// The byte this entry stands for.
+    char: u8,
+    /// Probability attached to `char`.
+    prob: f32,
+
+    /// Comparator for descending order by `prob`: returns true when `a` has
+    /// a strictly greater probability than `b`. `context` is unused.
+    pub fn desc(context: void, a: DataPoint, b: DataPoint) bool {
+        _ = context;
+        return b.prob < a.prob;
+    }
+};
+
+/// First-order Markov chain over bytes, loaded from a binary transition table.
+///
+/// `map` associates a "previous" byte with the candidate bytes that may follow
+/// it. Key 0 is the state `generate` starts from.
+pub const MarkovChain = struct {
+    allocator: std.mem.Allocator,
+    map: std.AutoHashMap(u8, []DataPoint),
+
+    /// Loads the transition table stored at `path` (resolved against the
+    /// current working directory).
+    ///
+    /// Expected layout, repeated for each of the 256 possible previous bytes
+    /// in ascending order: one `u8` entry count, then that many records made
+    /// of one next byte followed by an `f32` probability stored as a
+    /// little-endian `u32` bit pattern.
+    ///
+    /// On success the caller must eventually call `deinit`. On failure
+    /// (open, read or allocation error) everything loaded so far is released.
+    pub fn init(path: []const u8, allocator: std.mem.Allocator) !MarkovChain {
+        var self = MarkovChain{
+            .allocator = allocator,
+            .map = std.AutoHashMap(u8, []DataPoint).init(allocator),
+        };
+        // Frees the map and every slice already stored in it on any error below.
+        errdefer self.deinit();
+
+        const file = try std.fs.cwd().openFile(path, .{ .mode = .read_only });
+        defer file.close();
+
+        // The table is consumed one small field at a time, so buffer the reads.
+        var buffered = std.io.bufferedReader(file.reader());
+        const reader = buffered.reader();
+
+        for (0..256) |prev_char| {
+            const cnt = try reader.readInt(u8, .little);
+            const next_chars = try self.allocator.alloc(DataPoint, cnt);
+            // Until `put` succeeds the slice is not owned by the map, so it
+            // has to be released separately on failure.
+            errdefer self.allocator.free(next_chars);
+
+            for (next_chars) |*point| {
+                const next_byte = try reader.readByte();
+                const prob: f32 = @bitCast(try reader.readInt(u32, .little));
+                point.* = .{ .char = next_byte, .prob = prob };
+            }
+            try self.map.put(@as(u8, @intCast(prev_char)), next_chars);
+        }
+
+        return self;
+    }
+
+    /// Frees every transition slice held by the map, then the map itself.
+    pub fn deinit(self: *MarkovChain) void {
+        var values = self.map.valueIterator();
+        while (values.next()) |choices| {
+            self.allocator.free(choices.*);
+        }
+        self.map.deinit();
+    }
+
+    /// Generates at most `size` bytes by walking the chain from state 0.
+    ///
+    /// The returned slice is allocated with `allocator`; the caller owns it
+    /// and must free it with the same allocator. It is shorter than `size`
+    /// when the walk reaches a byte that has no recorded successor, so the
+    /// result never contains uninitialized bytes.
+    pub fn generate(self: *MarkovChain, size: u8, allocator: std.mem.Allocator) ![]u8 {
+        const result = try allocator.alloc(u8, size);
+        errdefer allocator.free(result);
+
+        var previous: u8 = 0;
+        var len: usize = 0;
+        while (len < size) : (len += 1) {
+            // Dead end: unknown state or a state without successors.
+            const choices = self.map.get(previous) orelse break;
+            if (choices.len == 0) break;
+
+            const draw = rand.float(f32);
+            // Fall back to the last candidate: the stored f32 probabilities
+            // may sum to slightly less than 1, in which case `draw` can be
+            // larger than the full cumulative sum.
+            var picked = choices[choices.len - 1].char;
+            var cumul: f32 = 0;
+            for (choices) |choice| {
+                cumul += choice.prob;
+                if (draw < cumul) {
+                    picked = choice.char;
+                    break;
+                }
+            }
+
+            result[len] = picked;
+            previous = picked;
+        }
+
+        if (len == size) return result;
+        // Give back the unused tail so the caller can free the slice as-is.
+        return try allocator.realloc(result, len);
+    }
+};
+
+// Smoke test: loads the table and generates a batch of names.
+// Needs a `markov.bin` file in the current working directory.
+test "basic test" {
+    // The testing allocator fails the test when memory is leaked, instead of
+    // the leak report being silently discarded.
+    const allocator = std.testing.allocator;
+
+    var markov = try MarkovChain.init("markov.bin", allocator);
+    defer markov.deinit();
+
+    for (0..24) |_| {
+        const rand_name = try markov.generate(8, allocator);
+        defer allocator.free(rand_name);
+        // A generated name never exceeds the requested size.
+        try std.testing.expect(rand_name.len <= 8);
+        std.debug.print("generated : {s}\n", .{rand_name});
+    }
+}
--- a/src/main.zig
+++ b/src/main.zig
@@ -0,0 +1,69 @@
+const std = @import("std");
+const DataPoint = @import("markov").DataPoint;
+
+/// Builds the Markov transition table from `prenoms.csv` and writes it to
+/// `markov.bin`, both resolved against the current working directory.
+///
+/// For each input line only the bytes before the first ';' are used. Every
+/// pair (previous byte, byte) is counted, with 0 as the "previous" value at
+/// the start of a line. Counts are turned into per-row probabilities, sorted
+/// in descending order, and the non-zero entries of each of the 256 rows are
+/// written as: one `u8` entry count, then for each entry the byte followed by
+/// its `f32` probability as a little-endian `u32` bit pattern.
+pub fn main() !void {
+    const path = "prenoms.csv";
+    // markov_cnt[prev][next] = number of times `next` followed `prev`.
+    var markov_cnt: [256][256]u32 = undefined;
+    for (&markov_cnt) |*row| @memset(row, 0);
+    var computed: [256][256]DataPoint = undefined;
+
+    var file = try std.fs.cwd().openFile(path, .{});
+    defer file.close();
+
+    var buf_reader = std.io.bufferedReader(file.reader());
+    var in_stream = buf_reader.reader();
+
+    // stats
+    var buf: [1024]u8 = undefined;
+    while (try in_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
+        var previous: u8 = 0;
+        for (line) |char| {
+            if (char == ';') break;
+            markov_cnt[previous][char] += 1;
+            previous = char;
+        }
+    }
+
+    // Turn each row of counts into probabilities, most likely first.
+    for (0..256) |i| {
+        var acc: u32 = 0;
+        for (&markov_cnt[i]) |count| acc += count;
+
+        for (0..256) |j| {
+            // A row without observations gets probability 0 everywhere
+            // rather than the NaN that 0/0 would produce.
+            const ratio: f32 = if (acc == 0)
+                0
+            else
+                @as(f32, @floatFromInt(markov_cnt[i][j])) / @as(f32, @floatFromInt(acc));
+            computed[i][j] = .{ .char = @intCast(j), .prob = ratio };
+        }
+        std.mem.sort(DataPoint, &computed[i], {}, DataPoint.desc);
+    }
+
+    var out_file = try std.fs.cwd().createFile("markov.bin", .{ .truncate = true });
+    defer out_file.close();
+
+    // Many tiny writes follow; buffer them and flush once at the end.
+    var buffered_writer = std.io.bufferedWriter(out_file.writer());
+    const writer = buffered_writer.writer();
+
+    // The per-row entry count is stored in a single byte.
+    const max_entries: u8 = std.math.maxInt(u8);
+
+    for (0..256) |i| {
+        // Rows are sorted in descending order, so the non-zero entries form a
+        // prefix. Stop at the format limit instead of overflowing the u8
+        // counter when all 256 entries are non-zero; only the least likely
+        // entry is dropped in that case.
+        var cnt_nonzero: u8 = 0;
+        for (&computed[i]) |point| {
+            if (cnt_nonzero == max_entries or !(point.prob > 0)) break;
+            cnt_nonzero += 1;
+        }
+
+        try writer.writeInt(u8, cnt_nonzero, .little);
+        for (computed[i][0..cnt_nonzero]) |point| {
+            try writer.writeByte(point.char);
+            try writer.writeInt(u32, @bitCast(point.prob), .little);
+        }
+    }
+
+    try buffered_writer.flush();
+}
				`@@ -0,0 +1 @@`
				`Create random firstnames with markov chain generation.`