11 files changed, 658 insertions, 0 deletions
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..1209f7b
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,5 @@
+root = true
+
+[src/**.zig]
+indent_style = tab
+indent_size = 2
diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..1d953f4
--- /dev/null
+++ b/.envrc
@@ -0,0 +1 @@
+use nix
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d8c8979
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.zig-cache
+zig-out
diff --git a/build.zig b/build.zig
new file mode 100644
index 0000000..7caf737
--- /dev/null
+++ b/build.zig
@@ -0,0 +1,71 @@
+const std = @import("std");
+
+// Although this function looks imperative, note that its job is to
+// declaratively construct a build graph that will be executed by an external
+// runner.
+pub fn build(b: *std.Build) void {
+    // Standard target options allows the person running `zig build` to choose
+    // what target to build for. Here we do not override the defaults, which
+    // means any target is allowed, and the default is native. Other options
+    // for restricting supported target set are available.
+    const target = b.standardTargetOptions(.{});
+
+    // Standard optimization options allow the person running `zig build` to select
+    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
+    // set a preferred release mode, allowing the user to decide how to optimize.
+    const optimize = b.standardOptimizeOption(.{});
+
+    // This declares intent for the library to be installed into the standard
+    // location when the user invokes the "install" step (the default step when
+    // running `zig build`).
+    const exe = b.addExecutable(.{
+        .name = "gll",
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // This declares intent for the executable to be installed into the
+    // standard location when the user invokes the "install" step (the default
+    // step when running `zig build`).
+    b.installArtifact(exe);
+
+    // This *creates* a Run step in the build graph, to be executed when another
+    // step is evaluated that depends on it. The next line below will establish
+    // such a dependency.
+    const run_cmd = b.addRunArtifact(exe);
+
+    // By making the run step depend on the install step, it will be run from the
+    // installation directory rather than directly from within the cache directory.
+    // This is not necessary, however, if the application depends on other installed
+    // files, this ensures they will be present and in the expected location.
+    run_cmd.step.dependOn(b.getInstallStep());
+
+    // This allows the user to pass arguments to the application in the build
+    // command itself, like this: `zig build run -- arg1 arg2 etc`
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    // This creates a build step. It will be visible in the `zig build --help` menu,
+    // and can be selected like this: `zig build run`
+    // This will evaluate the `run` step rather than the default, which is "install".
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+
+    // Creates a step for unit testing. This only builds the test executable
+    // but does not run it.
+    const exe_unit_tests = b.addTest(.{
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
+}
diff --git a/build.zig.zon b/build.zig.zon
new file mode 100644
index 0000000..7c468cb
--- /dev/null
+++ b/build.zig.zon
@@ -0,0 +1,72 @@
+.{
+    // This is the default name used by packages depending on this one. For
+    // example, when a user runs `zig fetch --save <url>`, this field is used
+    // as the key in the `dependencies` table. Although the user can choose a
+    // different name, most users will stick with this provided value.
+    //
+    // It is redundant to include "zig" in this name because it is already
+    // within the Zig package namespace.
+    .name = "gll",
+
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.0.0",
+
+    // This field is optional.
+    // This is currently advisory only; Zig does not yet do anything
+    // with this value.
+    //.minimum_zig_version = "0.11.0",
+
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+
+        //    // When this is set to `true`, a package is declared to be lazily
+        //    // fetched. This makes the dependency only get fetched if it is
+        //    // actually used.
+        //    .lazy = false,
+        //},
+    },
+
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package. Only files listed here will remain on disk
+    // when using the zig package manager. As a rule of thumb, one should list
+    // files required for compilation plus any license(s).
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+        // For example...
+        //"LICENSE",
+        //"README.md",
+    },
+}
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..0bac3a3
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,6 @@
+{ pkgs ? import <nixpkgs> { }, ... }:
+pkgs.mkShell {
+	packages = [
+		pkgs.zig
+	];
+}
diff --git a/src/char-set.zig b/src/char-set.zig
new file mode 100644
index 0000000..d244540
--- /dev/null
+++ b/src/char-set.zig
@@ -0,0 +1,90 @@
+const std = @import("std");
+
+const Character = @import("character.zig").Character;
+
+const Self = @This();
+
+charset: [256]bool = std.mem.zeroes([256]bool),
+
+pub inline fn is_set(self: *const Self, c: usize) bool {
+	return self.charset[c];
+}
+
+pub inline fn set(self: *Self, c: usize) void {
+	self.charset[c] = true;
+}
+
+pub inline fn reset(self: *Self, c: usize) void {
+	self.charset[c] = false;
+}
+
+pub inline fn set_if(self: *Self, c: usize, flag: bool) void {
+	self.charset[c] = self.charset[c] or flag;
+}
+
+pub fn format(
+	self: *const Self,
+	comptime fmt: []const u8,
+	options: std.fmt.FormatOptions,
+	writer: anytype,
+) !void {
+	_ = fmt;
+	_ = options;
+
+	try writer.print("{{ ", .{});
+
+	for (self.charset, 0..) |is_first, index| {
+		if (is_first) {
+			if (index == Character.EPSILON) {
+				try writer.print("{} ", .{Character { .epsilon = void{} }});
+			} else if (index == Character.END) {
+				try writer.print("$ ", .{});
+			} else {
+				try writer.print("'{c}' ", .{@as(u8, @truncate(index))});
+			}
+		}
+	}
+
+	try writer.print("}}", .{});
+}
+
+pub fn expect(self: *const Self, expected: []const u8) !void {
+	var matrix = std.mem.zeroes([255]bool);
+
+	for (expected) |c| {
+		matrix[c] = true;
+	}
+
+	for (matrix, 0..) |contained, index| {
+		if (self.is_set(index) != contained) {
+			std.debug.print("expected {{ ", .{});
+
+			for (expected) |c| {
+				if (c == Character.EPSILON) {
+					std.debug.print("{} ", .{Character { .epsilon = void{} }});
+				} else if (c == Character.END) {
+					std.debug.print("$ ", .{});
+				} else {
+					std.debug.print("'{c}' ", .{c});
+				}
+			}
+
+			std.debug.print("}} but got {} (", .{self});
+
+			if (index == Character.EPSILON) {
+				std.debug.print("{}", .{Character { .epsilon = void{} }});
+			} else if (index == Character.END) {
+				std.debug.print("$", .{});
+			} else {
+				std.debug.print("'{c}'", .{@as(u8, @truncate(index))});
+			}
+			if (contained) {
+				std.debug.print(" missing)\n", .{});
+			} else {
+				std.debug.print(" not expected)\n", .{});
+			}
+
+			return error.ExpectEqualError;
+		}
+	}
+}
diff --git a/src/character.zig b/src/character.zig
new file mode 100644
index 0000000..bd11ccd
--- /dev/null
+++ b/src/character.zig
@@ -0,0 +1,38 @@
+const std = @import("std");
+
+pub const Character = union(enum) {
+	const Self = @This();
+
+	pub const EPSILON: u8 = '_';
+	pub const END: u8 = 0;
+
+	epsilon: void,
+	terminal: u8,
+	non_terminal: u8,
+
+	pub fn from_u8(c: u8) Self {
+		if (c == '_') {
+			return Self { .epsilon = void{} };
+		} else if (std.ascii.isUpper(c)) {
+			return Self { .non_terminal = c };
+		} else {
+			return Self { .terminal = c };
+		}
+	}
+
+	pub fn format(
+		self: *const Self,
+		comptime fmt: []const u8,
+		options: std.fmt.FormatOptions,
+		writer: anytype,
+	) !void {
+		_ = fmt;
+		_ = options;
+
+		switch (self.*) {
+			.epsilon => _ = try writer.writeAll("ε"),
+			.terminal => |c| try writer.writeByte(c),
+			.non_terminal => |c| try writer.writeByte(c),
+		}
+	}
+};
diff --git a/src/grammar.zig b/src/grammar.zig
new file mode 100644
index 0000000..0f25095
--- /dev/null
+++ b/src/grammar.zig
@@ -0,0 +1,272 @@
+const std = @import("std");
+
+const Character = @import("character.zig").Character;
+const RuleList = @import("rule-list.zig");
+
+const Grammar = struct {
+	const Self = @This();
+
+	rules: [26]RuleList,
+
+	pub fn init(buffer: []const u8, allocator: std.mem.Allocator) !Self {
+		var lines = std.mem.tokenizeScalar(u8, buffer, '\n');
+
+		var self: Self = undefined;
+		errdefer self.deinit(allocator);
+
+		for (&self.rules, 'A'..) |*rule, name| {
+			rule.* = RuleList.init(
+				@truncate(name),
+				allocator
+			);
+		}
+
+		while (lines.next()) |line| {
+			const name, const rhs = try RuleList.parse_rule(line, allocator);
+			try self.rule_by_name(name).add_rule(rhs);
+		}
+
+		self.generate_first();
+		self.generate_follows();
+
+		return self;
+	}
+
+	pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+		for (&self.rules) |*rule| {
+			rule.deinit(allocator);
+		}
+
+		self.* = undefined;
+	}
+
+	pub inline fn rule_by_name(self: *Self, name: u8) *RuleList {
+		return &self.rules[name - 'A'];
+	}
+
+	pub inline fn entry_point(self: *Self) *RuleList {
+		return self.rule('S');
+	}
+
+	fn generate_first(self: *Self) void {
+		var modified = true;
+
+		while (modified) {
+			modified = false;
+
+			for (&self.rules) |*rule| {
+				for (rule.rhs.items) |rhs| {
+					switch (rhs[0]) {
+						.terminal => |t| {
+							modified = modified or !rule.first.is_set(t);
+							rule.first.set(t);
+						},
+
+						.non_terminal, .epsilon => {
+							if (rule.first.is_set(Character.EPSILON)) continue;
+							var insert_epsilon = true;
+
+							for (rhs) |*char| {
+								switch (char.*) {
+									.terminal => |t| {
+										modified = modified or !rule.first.is_set(t);
+										rule.first.set(t);
+										insert_epsilon = false;
+										break;
+									},
+									.non_terminal => |n| {
+										const first = self.rule_by_name(n).first;
+										for (first.charset, 0..) |child_first, index| {
+											if (index == Character.EPSILON) continue;
+
+											modified = modified or (!rule.first.is_set(index) and child_first);
+											rule.first.set_if(index, child_first);
+										}
+
+										if (!first.is_set(Character.EPSILON)) {
+											insert_epsilon = false;
+											break;
+										}
+									},
+									.epsilon => {},
+								}
+							}
+
+							if (insert_epsilon) {
+								modified = true;
+								rule.first.set(Character.EPSILON);
+							}
+						},
+					}
+				}
+			}
+		}
+	}
+
+	fn generate_follows(self: *Self) void {
+		self.rule_by_name('S').follows.set(Character.END);
+		var modified = true;
+
+		while (modified) {
+			modified = false;
+
+			for (&self.rules) |*parent_rules| {
+				for (parent_rules.rhs.items) |rule| {
+					for (0..rule.len) |character_index| {
+
+						switch (rule[character_index]) {
+							.non_terminal => |n| {
+								const current_rule = self.rule_by_name(n);
+								const ends_with_epsilon = brk: {
+									for (character_index + 1..rule.len) |peek_index| {
+										switch (rule[peek_index]) {
+											.terminal => |t| {
+												modified = modified or !current_rule.follows.is_set(t);
+												current_rule.follows.set(t);
+												break :brk false;
+											},
+											.non_terminal => |peek_n| {
+												const peek_rule = self.rule_by_name(peek_n);
+												for (peek_rule.first.charset, 0..) |is_set, index| {
+													if (Character.EPSILON == index) continue;
+
+													modified = modified or (!current_rule.follows.is_set(index) and is_set);
+													current_rule.follows.set_if(index, is_set);
+												}
+
+												if (!peek_rule.first.is_set(Character.EPSILON)) {
+													break :brk false;
+												}
+											},
+											.epsilon => {},
+										}
+									}
+									break :brk true;
+								};
+
+								if (ends_with_epsilon) {
+									for (parent_rules.follows.charset, 0..) |is_set, char_index| {
+										modified = modified or (!current_rule.follows.is_set(char_index) and is_set);
+										current_rule.follows.set_if(char_index, is_set);
+									}
+								}
+							},
+							else => {}
+						}
+
+					}
+				}
+			}
+		}
+	}
+};
+
+test "expr" {
+	const text =
+				\\S -> B A
+				\\A -> + B A
+				\\A -> _
+				\\B -> D C
+				\\C -> * D C
+				\\C -> _
+				\\D -> ( S )
+				\\D -> a
+				\\D -> b
+				;
+
+	const allocator = std.testing.allocator;
+	var grammar = try Grammar.init(text, allocator);
+	defer grammar.deinit(allocator);
+
+	try grammar.rule_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' });
+	try grammar.rule_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON });
+	try grammar.rule_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' });
+	try grammar.rule_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON });
+	try grammar.rule_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' });
+
+	try grammar.rule_by_name('S').follows.expect(&[_]u8 { ')', Character.END });
+	try grammar.rule_by_name('A').follows.expect(&[_]u8 { ')', Character.END });
+	try grammar.rule_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END });
+	try grammar.rule_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END });
+	try grammar.rule_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END });
+}
+
+test "sample 0" {
+	const text =
+				\\S -> A C B
+				\\S -> C b b
+				\\S -> B a
+				\\A -> d a
+				\\A -> B C
+				\\B -> g
+				\\B -> _
+				\\C -> h
+				\\C -> _
+				;
+
+	const allocator = std.testing.allocator;
+	var grammar = try Grammar.init(text, allocator);
+	defer grammar.deinit(allocator);
+
+	try grammar.rule_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON });
+	try grammar.rule_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON });
+	try grammar.rule_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON });
+	try grammar.rule_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON });
+
+	try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END });
+	try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END });
+	try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END });
+	try grammar.rule_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END });
+}
+
+test "sample 1" {
+	const text =
+		\\S -> a A B b
+		\\A -> c
+		\\A -> _
+		\\B -> d
+		\\B -> _
+		;
+
+	const allocator = std.testing.allocator;
+	var grammar = try Grammar.init(text, allocator);
+	defer grammar.deinit(allocator);
+
+	try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a' });
+	try grammar.rule_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON });
+	try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON });
+
+	try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END });
+	try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'b', 'd' });
+	try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'b' });
+}
+
+test "sample 2" {
+	const text =
+		\\S -> A B
+		\\S -> e D a
+		\\A -> a b
+		\\A -> c
+		\\B -> d C
+		\\C -> e C
+		\\C -> _
+		\\D -> f D
+		\\D -> _
+		;
+
+	const allocator = std.testing.allocator;
+	var grammar = try Grammar.init(text, allocator);
+	defer grammar.deinit(allocator);
+
+	try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' });
+	try grammar.rule_by_name('A').first.expect(&[_]u8 { 'a', 'c' });
+	try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd' });
+	try grammar.rule_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON });
+	try grammar.rule_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON });
+
+	try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END });
+	try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'd' });
+	try grammar.rule_by_name('B').follows.expect(&[_]u8 { Character.END });
+	try grammar.rule_by_name('C').follows.expect(&[_]u8 { Character.END });
+	try grammar.rule_by_name('D').follows.expect(&[_]u8 { 'a' });
+}
diff --git a/src/main.zig b/src/main.zig
new file mode 100644
index 0000000..62877cb
--- /dev/null
+++ b/src/main.zig
@@ -0,0 +1,10 @@
+const std = @import("std");
+
+pub const grammar = @import("grammar.zig");
+
+pub fn main() !void {
+}
+
+test {
+	std.testing.refAllDecls(@This());
+}
diff --git a/src/rule-list.zig b/src/rule-list.zig
new file mode 100644
index 0000000..a438750
--- /dev/null
+++ b/src/rule-list.zig
@@ -0,0 +1,91 @@
+const std = @import("std");
+const CharSet = @import("char-set.zig");
+const Character = @import("character.zig").Character;
+
+const Self = @This();
+const Rhs = []Character;
+
+name: u8,
+rhs: std.ArrayList(Rhs),
+first: CharSet,
+follows: CharSet,
+
+pub fn init(name: u8, allocator: std.mem.Allocator) Self {
+	return Self {
+		.name = name,
+		.rhs = std.ArrayList(Rhs).init(allocator),
+		.first = CharSet {},
+		.follows = CharSet {},
+	};
+}
+
+pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+	for (self.rhs.items) |item| {
+		allocator.free(item);
+	}
+
+	self.rhs.deinit();
+}
+
+pub fn add_rule(self: *Self, rhs: Rhs) !void {
+	try self.rhs.append(rhs);
+}
+
+pub fn parse_rule(
+	buffer: []const u8,
+	allocator: std.mem.Allocator
+) !struct { u8, Rhs } {
+
+	var rhs = std.ArrayList(Character).init(allocator);
+	errdefer rhs.deinit();
+
+	var tokens = std.mem.tokenizeAny(u8, buffer, &std.ascii.whitespace);
+
+	const lhs = tokens.next() orelse return error.MissingLhs;
+	if (lhs.len > 1) return error.InvalidLhs;
+
+	const arrow = tokens.next() orelse return error.MissingArrow;
+	if (!std.mem.eql(u8, arrow, "->")) return error.InvalidArrow;
+
+	while (tokens.next()) |token| {
+		if (token.len > 1) return error.InvalidCharacter;
+		try rhs.append(Character.from_u8(token[0]));
+	}
+
+	if (rhs.items.len == 0) {
+		return error.EmptyProduction;
+	}
+
+	return .{ lhs[0], try rhs.toOwnedSlice() };
+}
+
+pub inline fn is_empty(self: *const Self) bool {
+	return self.rhs.items.len == 0;
+}
+
+pub fn format(
+	self: *const Self,
+	comptime fmt: []const u8,
+	options: std.fmt.FormatOptions,
+	writer: anytype,
+) !void {
+	_ = fmt;
+	_ = options;
+
+	try writer.print("[{c} -> ", .{ self.name });
+
+	if (self.rhs.items.len > 0) {
+		for (self.rhs.items[0]) |c| {
+			try writer.print("{}", .{c});
+		}
+
+		for (self.rhs.items[1..]) |r| {
+			try writer.print(" | ", .{});
+			for (r) |c| {
+				try writer.print("{}", .{c});
+			}
+		}
+	}
+
+	try writer.writeByte(']');
+}