first commit
This commit is contained in:
commit
104b32694b
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
zig-out
|
||||
.zig-cache
|
||||
1
README.md
Normal file
1
README.md
Normal file
@ -0,0 +1 @@
|
||||
Generate random first names using a Markov chain.
|
||||
116
build.zig
Normal file
116
build.zig
Normal file
@ -0,0 +1,116 @@
|
||||
const std = @import("std");
|
||||
|
||||
// Although this function looks imperative, note that its job is to
|
||||
// declaratively construct a build graph that will be executed by an external
|
||||
// runner.
|
||||
pub fn build(b: *std.Build) void {
    // Target and optimization mode are taken from the `zig build` command
    // line; no defaults are overridden here.
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    // Module rooted at src/lib.zig.
    const markov_module = b.createModule(.{
        .root_source_file = b.path("src/lib.zig"),
        .target = target,
        .optimize = optimize,
    });

    // Module rooted at src/main.zig.
    const app_module = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
    });

    // Makes `@import("markov")` resolve to the library module from within
    // the executable module.
    app_module.addImport("markov", markov_module);

    // Static library built from the library module, installed by the
    // default "install" step.
    const static_lib = b.addLibrary(.{
        .linkage = .static,
        .name = "markov",
        .root_module = markov_module,
    });
    b.installArtifact(static_lib);

    // Executable built from the application module, also installed by the
    // default "install" step.
    const app = b.addExecutable(.{
        .name = "markov_prenoms",
        .root_module = app_module,
    });
    b.installArtifact(app);

    // `zig build run`: runs the executable after the install step has
    // completed, forwarding anything given after `--` as its arguments.
    const run_app = b.addRunArtifact(app);
    run_app.step.dependOn(b.getInstallStep());
    if (b.args) |forwarded_args| {
        run_app.addArgs(forwarded_args);
    }

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_app.step);

    // `zig build test`: compiles and runs the unit tests of both modules.
    const markov_tests = b.addTest(.{
        .root_module = markov_module,
    });
    const run_markov_tests = b.addRunArtifact(markov_tests);

    const app_tests = b.addTest(.{
        .root_module = app_module,
    });
    const run_app_tests = b.addRunArtifact(app_tests);

    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&run_markov_tests.step);
    test_step.dependOn(&run_app_tests.step);
}
|
||||
86
build.zig.zon
Normal file
86
build.zig.zon
Normal file
@ -0,0 +1,86 @@
|
||||
.{
|
||||
// This is the default name used by packages depending on this one. For
|
||||
// example, when a user runs `zig fetch --save <url>`, this field is used
|
||||
// as the key in the `dependencies` table. Although the user can choose a
|
||||
// different name, most users will stick with this provided value.
|
||||
//
|
||||
// It is redundant to include "zig" in this name because it is already
|
||||
// within the Zig package namespace.
|
||||
.name = .markov,
|
||||
|
||||
// This is a [Semantic Version](https://semver.org/).
|
||||
// In a future version of Zig it will be used for package deduplication.
|
||||
.version = "0.1.0",
|
||||
|
||||
// Together with name, this represents a globally unique package
|
||||
// identifier. This field is generated by the Zig toolchain when the
|
||||
// package is first created, and then *never changes*. This allows
|
||||
// unambiguous detection of one package being an updated version of
|
||||
// another.
|
||||
//
|
||||
// When forking a Zig project, this id should be regenerated (delete the
|
||||
// field and run `zig build`) if the upstream project is still maintained.
|
||||
// Otherwise, the fork is *hostile*, attempting to take control over the
|
||||
// original project's identity. Thus it is recommended to leave the comment
|
||||
// on the following line intact, so that it shows up in code reviews that
|
||||
// modify the field.
|
||||
.fingerprint = 0x7f4d0c6d23b24010, // Changing this has security and trust implications.
|
||||
|
||||
// Tracks the earliest Zig version that the package considers to be a
|
||||
// supported use case.
|
||||
.minimum_zig_version = "0.14.1",
|
||||
|
||||
// This field is optional.
|
||||
// Each dependency must either provide a `url` and `hash`, or a `path`.
|
||||
// `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
|
||||
// Once all dependencies are fetched, `zig build` no longer requires
|
||||
// internet connectivity.
|
||||
.dependencies = .{
|
||||
// See `zig fetch --save <url>` for a command-line interface for adding dependencies.
|
||||
//.example = .{
|
||||
// // When updating this field to a new URL, be sure to delete the corresponding
|
||||
// // `hash`, otherwise you are communicating that you expect to find the old hash at
|
||||
// // the new URL. If the contents of a URL change this will result in a hash mismatch
|
||||
// // which will prevent zig from using it.
|
||||
// .url = "https://example.com/foo.tar.gz",
|
||||
//
|
||||
// // This is computed from the file contents of the directory of files that is
|
||||
// // obtained after fetching `url` and applying the inclusion rules given by
|
||||
// // `paths`.
|
||||
// //
|
||||
// // This field is the source of truth; packages do not come from a `url`; they
|
||||
// // come from a `hash`. `url` is just one of many possible mirrors for how to
|
||||
// // obtain a package matching this `hash`.
|
||||
// //
|
||||
// // Uses the [multihash](https://multiformats.io/multihash/) format.
|
||||
// .hash = "...",
|
||||
//
|
||||
// // When this is provided, the package is found in a directory relative to the
|
||||
// // build root. In this case the package's hash is irrelevant and therefore not
|
||||
// // computed. This field and `url` are mutually exclusive.
|
||||
// .path = "foo",
|
||||
//
|
||||
// // When this is set to `true`, a package is declared to be lazily
|
||||
// // fetched. This makes the dependency only get fetched if it is
|
||||
// // actually used.
|
||||
// .lazy = false,
|
||||
//},
|
||||
},
|
||||
|
||||
// Specifies the set of files and directories that are included in this package.
|
||||
// Only files and directories listed here are included in the `hash` that
|
||||
// is computed for this package. Only files listed here will remain on disk
|
||||
// when using the zig package manager. As a rule of thumb, one should list
|
||||
// files required for compilation plus any license(s).
|
||||
// Paths are relative to the build root. Use the empty string (`""`) to refer to
|
||||
// the build root itself.
|
||||
// A directory listed here means that all files within, recursively, are included.
|
||||
.paths = .{
|
||||
"build.zig",
|
||||
"build.zig.zon",
|
||||
"src",
|
||||
// For example...
|
||||
//"LICENSE",
|
||||
//"README.md",
|
||||
},
|
||||
}
|
||||
BIN
markov.bin
Normal file
BIN
markov.bin
Normal file
Binary file not shown.
11627
prenoms.csv
Normal file
11627
prenoms.csv
Normal file
File diff suppressed because it is too large
Load Diff
92
src/lib.zig
Normal file
92
src/lib.zig
Normal file
@ -0,0 +1,92 @@
|
||||
const std = @import("std");
|
||||
const rand = std.crypto.random;
|
||||
|
||||
/// One candidate transition: a byte together with the probability of
/// choosing it.
pub const DataPoint = struct {
    /// Candidate byte.
    char: u8,
    /// Probability associated with `char`.
    prob: f32,

    /// Comparator for `std.mem.sort`: returns true when `a` must be placed
    /// before `b`, i.e. when `a` has the strictly larger probability. Sorting
    /// with it yields descending probability order.
    pub fn desc(_: void, a: DataPoint, b: DataPoint) bool {
        return b.prob < a.prob;
    }
};
|
||||
|
||||
/// First-order Markov chain over bytes, loaded from a binary model file.
///
/// Model file layout, as read by `init`: for each previous byte value 0..255,
/// in order, one count byte `n`, followed by `n` records of one candidate
/// byte plus a little-endian 32-bit float probability.
pub const MarkovChain = struct {
    /// Allocator that owns every slice stored in `map`.
    allocator: std.mem.Allocator,
    /// Previous byte -> candidate next bytes with their probabilities.
    map: std.AutoHashMap(u8, []DataPoint),

    /// Loads the model stored at `path` (resolved against the current working
    /// directory).
    ///
    /// Returns an error if the file cannot be opened, ends before all 256
    /// tables were read, or if an allocation fails. Nothing is leaked on
    /// failure. On success the caller must call `deinit`.
    pub fn init(path: []const u8, allocator: std.mem.Allocator) !MarkovChain {
        var self = MarkovChain{
            .allocator = allocator,
            .map = std.AutoHashMap(u8, []DataPoint).init(allocator),
        };
        // Releases the tables loaded so far if anything below fails.
        errdefer self.deinit();

        const model_file = try std.fs.cwd().openFile(path, .{ .mode = .read_only });
        defer model_file.close();

        // The format is read a few bytes at a time; buffer the file so each
        // field does not cost a separate read on the file.
        var buffered = std.io.bufferedReader(model_file.reader());
        const reader = buffered.reader();

        for (0..256) |prev_char| {
            const count = try reader.readByte();
            const next_chars = try allocator.alloc(DataPoint, count);
            // Not yet owned by the map: free it if a read or the insertion fails.
            errdefer allocator.free(next_chars);

            for (next_chars) |*point| {
                const next_byte = try reader.readByte();
                const prob: f32 = @bitCast(try reader.readInt(u32, .little));
                point.* = .{ .char = next_byte, .prob = prob };
            }
            try self.map.put(@as(u8, @intCast(prev_char)), next_chars);
        }

        return self;
    }

    /// Frees every transition table and the map itself.
    pub fn deinit(self: *MarkovChain) void {
        var tables = self.map.valueIterator();
        while (tables.next()) |table| {
            self.allocator.free(table.*);
        }
        self.map.deinit();
    }

    /// Generates up to `size` bytes by walking the chain from state 0.
    ///
    /// At each step a uniform random number in [0, 1) is compared against the
    /// running sum of the candidates' probabilities, in stored order. If the
    /// sum never exceeds the drawn number (floating-point rounding), the last
    /// candidate is used. If the current state has no candidates, generation
    /// stops and the result is shorter than `size`.
    ///
    /// The returned slice is allocated with `allocator`; the caller owns it
    /// and must free it with the same allocator.
    pub fn generate(self: *MarkovChain, size: u8, allocator: std.mem.Allocator) ![]u8 {
        const result = try allocator.alloc(u8, size);
        errdefer allocator.free(result);

        var previous: u8 = 0;
        var len: usize = 0;
        while (len < size) : (len += 1) {
            const choices = self.map.get(previous) orelse break;
            if (choices.len == 0) break; // dead end: nothing ever followed this byte

            const threshold = rand.float(f32);
            // Fallback for the case where rounding keeps the cumulative sum
            // at or below `threshold`.
            var picked = choices[choices.len - 1].char;
            var cumul: f32 = 0;
            for (choices) |choice| {
                cumul += choice.prob;
                if (threshold < cumul) {
                    picked = choice.char;
                    break;
                }
            }

            result[len] = picked;
            previous = picked;
        }

        if (len == size) return result;
        // Hand back only the bytes that were actually generated, so the
        // caller never sees uninitialized memory.
        return allocator.realloc(result, len);
    }
};
|
||||
|
||||
// Loads the model from "markov.bin" in the current working directory and
// generates a batch of names. `std.testing.allocator` makes the test fail if
// anything is leaked.
test "basic test" {
    const allocator = std.testing.allocator;

    var markov = try MarkovChain.init("markov.bin", allocator);
    defer markov.deinit();

    for (0..24) |_| {
        const rand_name = try markov.generate(8, allocator);
        defer allocator.free(rand_name);

        // A generated name must never be longer than the requested size.
        try std.testing.expect(rand_name.len <= 8);
        std.debug.print("generated : {s}\n", .{rand_name});
    }
}
|
||||
69
src/main.zig
Normal file
69
src/main.zig
Normal file
@ -0,0 +1,69 @@
|
||||
const std = @import("std");
|
||||
const DataPoint = @import("markov").DataPoint;
|
||||
|
||||
/// Builds the Markov model file "markov.bin" from "prenoms.csv".
///
/// For every line of the CSV, the bytes before the first ';' are treated as
/// one sample. Transitions between consecutive bytes are counted, starting
/// from state 0 at the beginning of each line. Each row of counts is turned
/// into probabilities, sorted by descending probability, and written as:
/// one count byte, then for each non-zero entry one byte plus a little-endian
/// 32-bit float.
///
/// Fails if either file cannot be opened, a line is longer than 1024 bytes,
/// a write fails, or a row has more non-zero entries than fit in the one-byte
/// count field.
pub fn main() !void {
    const path = "prenoms.csv";

    // counts[prev][next] = number of times `next` directly followed `prev`.
    var counts = std.mem.zeroes([256][256]u32);
    // Fully overwritten below before being read.
    var computed: [256][256]DataPoint = undefined;

    var file = try std.fs.cwd().openFile(path, .{});
    defer file.close();

    var buf_reader = std.io.bufferedReader(file.reader());
    const in_stream = buf_reader.reader();

    // Count transitions.
    var buf: [1024]u8 = undefined;
    while (try in_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        var previous: u8 = 0;
        for (line) |char| {
            if (char == ';') break; // only the first CSV field is used
            counts[previous][char] += 1;
            previous = char;
        }
    }

    // Normalize each row into probabilities and sort it, most likely first.
    for (0..256) |prev| {
        var total: u32 = 0;
        for (counts[prev]) |count| total += count;

        for (0..256) |next| {
            // A row with no observations gets probability 0 everywhere
            // instead of the NaN that 0/0 would produce.
            const prob: f32 = if (total == 0)
                0.0
            else
                @as(f32, @floatFromInt(counts[prev][next])) / @as(f32, @floatFromInt(total));
            computed[prev][next] = .{ .char = @intCast(next), .prob = prob };
        }

        std.mem.sort(DataPoint, &computed[prev], {}, DataPoint.desc);
    }

    var out_file = try std.fs.cwd().createFile("markov.bin", .{ .truncate = true });
    defer out_file.close();

    // Many tiny writes follow; buffer them and flush once at the end.
    var buffered_out = std.io.bufferedWriter(out_file.writer());
    const writer = buffered_out.writer();

    for (&computed) |*row| {
        // Rows are sorted in descending order, so the non-zero entries form
        // a prefix.
        var nonzero: usize = 0;
        while (nonzero < row.len and row[nonzero].prob > 0) nonzero += 1;

        // The file format stores the entry count in a single byte.
        const count = std.math.cast(u8, nonzero) orelse return error.TooManyTransitions;
        try writer.writeByte(count);

        for (row[0..nonzero]) |point| {
            try writer.writeByte(point.char);
            try writer.writeInt(u32, @bitCast(point.prob), .little);
        }
    }

    try buffered_out.flush();
}
|
||||
Loading…
x
Reference in New Issue
Block a user