diff options
| -rw-r--r-- | .editorconfig | 5 | ||||
| -rw-r--r-- | .envrc | 1 | ||||
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | build.zig | 71 | ||||
| -rw-r--r-- | build.zig.zon | 72 | ||||
| -rw-r--r-- | default.nix | 6 | ||||
| -rw-r--r-- | src/char-set.zig | 90 | ||||
| -rw-r--r-- | src/character.zig | 38 | ||||
| -rw-r--r-- | src/grammar.zig | 272 | ||||
| -rw-r--r-- | src/main.zig | 10 | ||||
| -rw-r--r-- | src/rule-list.zig | 91 |
11 files changed, 658 insertions, 0 deletions
diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..1209f7b --- /dev/null +++ b/.editorconfig @@ -0,0 +1,5 @@ +root = true + +[src/**.zig] +indent_style = tab +indent_size = 2 @@ -0,0 +1 @@ +use nix diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d8c8979 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.zig-cache +zig-out diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..7caf737 --- /dev/null +++ b/build.zig @@ -0,0 +1,71 @@ +const std = @import("std"); + +// Although this function looks imperative, note that its job is to +// declaratively construct a build graph that will be executed by an external +// runner. +pub fn build(b: *std.Build) void { + // Standard target options allows the person running `zig build` to choose + // what target to build for. Here we do not override the defaults, which + // means any target is allowed, and the default is native. Other options + // for restricting supported target set are available. + const target = b.standardTargetOptions(.{}); + + // Standard optimization options allow the person running `zig build` to select + // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not + // set a preferred release mode, allowing the user to decide how to optimize. + const optimize = b.standardOptimizeOption(.{}); + + // This declares intent for the library to be installed into the standard + // location when the user invokes the "install" step (the default step when + // running `zig build`). + const exe = b.addExecutable(.{ + .name = "gll", + .root_source_file = b.path("src/main.zig"), + .target = target, + .optimize = optimize, + }); + + // This declares intent for the executable to be installed into the + // standard location when the user invokes the "install" step (the default + // step when running `zig build`). + b.installArtifact(exe); + + // This *creates* a Run step in the build graph, to be executed when another + // step is evaluated that depends on it. The next line below will establish + // such a dependency. + const run_cmd = b.addRunArtifact(exe); + + // By making the run step depend on the install step, it will be run from the + // installation directory rather than directly from within the cache directory. + // This is not necessary, however, if the application depends on other installed + // files, this ensures they will be present and in the expected location. + run_cmd.step.dependOn(b.getInstallStep()); + + // This allows the user to pass arguments to the application in the build + // command itself, like this: `zig build run -- arg1 arg2 etc` + if (b.args) |args| { + run_cmd.addArgs(args); + } + + // This creates a build step. It will be visible in the `zig build --help` menu, + // and can be selected like this: `zig build run` + // This will evaluate the `run` step rather than the default, which is "install". + const run_step = b.step("run", "Run the app"); + run_step.dependOn(&run_cmd.step); + + // Creates a step for unit testing. This only builds the test executable + // but does not run it. + const exe_unit_tests = b.addTest(.{ + .root_source_file = b.path("src/main.zig"), + .target = target, + .optimize = optimize, + }); + + const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); + + // Similar to creating the run step earlier, this exposes a `test` step to + // the `zig build --help` menu, providing a way for the user to request + // running the unit tests. + const test_step = b.step("test", "Run unit tests"); + test_step.dependOn(&run_exe_unit_tests.step); +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..7c468cb --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,72 @@ +.{ + // This is the default name used by packages depending on this one. For + // example, when a user runs `zig fetch --save <url>`, this field is used + // as the key in the `dependencies` table. Although the user can choose a + // different name, most users will stick with this provided value. + // + // It is redundant to include "zig" in this name because it is already + // within the Zig package namespace. + .name = "gll", + + // This is a [Semantic Version](https://semver.org/). + // In a future version of Zig it will be used for package deduplication. + .version = "0.0.0", + + // This field is optional. + // This is currently advisory only; Zig does not yet do anything + // with this value. + //.minimum_zig_version = "0.11.0", + + // This field is optional. + // Each dependency must either provide a `url` and `hash`, or a `path`. + // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. + // Once all dependencies are fetched, `zig build` no longer requires + // internet connectivity. + .dependencies = .{ + // See `zig fetch --save <url>` for a command-line interface for adding dependencies. + //.example = .{ + // // When updating this field to a new URL, be sure to delete the corresponding + // // `hash`, otherwise you are communicating that you expect to find the old hash at + // // the new URL. + // .url = "https://example.com/foo.tar.gz", + // + // // This is computed from the file contents of the directory of files that is + // // obtained after fetching `url` and applying the inclusion rules given by + // // `paths`. + // // + // // This field is the source of truth; packages do not come from a `url`; they + // // come from a `hash`. `url` is just one of many possible mirrors for how to + // // obtain a package matching this `hash`. + // // + // // Uses the [multihash](https://multiformats.io/multihash/) format. + // .hash = "...", + // + // // When this is provided, the package is found in a directory relative to the + // // build root. In this case the package's hash is irrelevant and therefore not + // // computed. This field and `url` are mutually exclusive. + // .path = "foo", + + // // When this is set to `true`, a package is declared to be lazily + // // fetched. This makes the dependency only get fetched if it is + // // actually used. + // .lazy = false, + //}, + }, + + // Specifies the set of files and directories that are included in this package. + // Only files and directories listed here are included in the `hash` that + // is computed for this package. Only files listed here will remain on disk + // when using the zig package manager. As a rule of thumb, one should list + // files required for compilation plus any license(s). + // Paths are relative to the build root. Use the empty string (`""`) to refer to + // the build root itself. + // A directory listed here means that all files within, recursively, are included. + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + // For example... + //"LICENSE", + //"README.md", + }, +} diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..0bac3a3 --- /dev/null +++ b/default.nix @@ -0,0 +1,6 @@ +{ pkgs ? import <nixpkgs> { }, ... }: +pkgs.mkShell { + packages = [ + pkgs.zig + ]; +} diff --git a/src/char-set.zig b/src/char-set.zig new file mode 100644 index 0000000..d244540 --- /dev/null +++ b/src/char-set.zig @@ -0,0 +1,90 @@ +const std = @import("std"); + +const Character = @import("character.zig").Character; + +const Self = @This(); + +charset: [256]bool = std.mem.zeroes([256]bool), + +pub inline fn is_set(self: *const Self, c: usize) bool { + return self.charset[c]; +} + +pub inline fn set(self: *Self, c: usize) void { + self.charset[c] = true; +} + +pub inline fn reset(self: *Self, c: usize) void { + self.charset[c] = false; +} + +pub inline fn set_if(self: *Self, c: usize, flag: bool) void { + self.charset[c] = self.charset[c] or flag; +} + +pub fn format( + self: *const Self, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = fmt; + _ = options; + + try writer.print("{{ ", .{}); + + for (self.charset, 0..) |is_first, index| { + if (is_first) { + if (index == Character.EPSILON) { + try writer.print("{} ", .{Character { .epsilon = void{} }}); + } else if (index == Character.END) { + try writer.print("$ ", .{}); + } else { + try writer.print("'{c}' ", .{@as(u8, @truncate(index))}); + } + } + } + + try writer.print("}}", .{}); +} + +pub fn expect(self: *const Self, expected: []const u8) !void { + var matrix = std.mem.zeroes([255]bool); + + for (expected) |c| { + matrix[c] = true; + } + + for (matrix, 0..) |contained, index| { + if (self.is_set(index) != contained) { + std.debug.print("expected {{ ", .{}); + + for (expected) |c| { + if (c == Character.EPSILON) { + std.debug.print("{} ", .{Character { .epsilon = void{} }}); + } else if (c == Character.END) { + std.debug.print("$ ", .{}); + } else { + std.debug.print("'{c}' ", .{c}); + } + } + + std.debug.print("}} but got {} (", .{self}); + + if (index == Character.EPSILON) { + std.debug.print("{}", .{Character { .epsilon = void{} }}); + } else if (index == Character.END) { + std.debug.print("$", .{}); + } else { + std.debug.print("'{c}'", .{@as(u8, @truncate(index))}); + } + if (contained) { + std.debug.print(" missing)\n", .{}); + } else { + std.debug.print(" not expected)\n", .{}); + } + + return error.ExpectEqualError; + } + } +} diff --git a/src/character.zig b/src/character.zig new file mode 100644 index 0000000..bd11ccd --- /dev/null +++ b/src/character.zig @@ -0,0 +1,38 @@ +const std = @import("std"); + +pub const Character = union(enum) { + const Self = @This(); + + pub const EPSILON: u8 = '_'; + pub const END: u8 = 0; + + epsilon: void, + terminal: u8, + non_terminal: u8, + + pub fn from_u8(c: u8) Self { + if (c == '_') { + return Self { .epsilon = void{} }; + } else if (std.ascii.isUpper(c)) { + return Self { .non_terminal = c }; + } else { + return Self { .terminal = c }; + } + } + + pub fn format( + self: *const Self, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, + ) !void { + _ = fmt; + _ = options; + + switch (self.*) { + .epsilon => _ = try writer.writeAll("ε"), + .terminal => |c| try writer.writeByte(c), + .non_terminal => |c| try writer.writeByte(c), + } + } +}; diff --git a/src/grammar.zig b/src/grammar.zig new file mode 100644 index 0000000..0f25095 --- /dev/null +++ b/src/grammar.zig @@ -0,0 +1,272 @@ +const std = @import("std"); + +const Character = @import("character.zig").Character; +const RuleList = @import("rule-list.zig"); + +const Grammar = struct { + const Self = @This(); + + rules: [26]RuleList, + + pub fn init(buffer: []const u8, allocator: std.mem.Allocator) !Self { + var lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + + var self: Self = undefined; + errdefer self.deinit(allocator); + + for (&self.rules, 'A'..) |*rule, name| { + rule.* = RuleList.init( + @truncate(name), + allocator + ); + } + + while (lines.next()) |line| { + const name, const rhs = try RuleList.parse_rule(line, allocator); + try self.rule_by_name(name).add_rule(rhs); + } + + self.generate_first(); + self.generate_follows(); + + return self; + } + + pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { + for (&self.rules) |*rule| { + rule.deinit(allocator); + } + + self.* = undefined; + } + + pub inline fn rule_by_name(self: *Self, name: u8) *RuleList { + return &self.rules[name - 'A']; + } + + pub inline fn entry_point(self: *Self) *RuleList { + return self.rule('S'); + } + + fn generate_first(self: *Self) void { + var modified = true; + + while (modified) { + modified = false; + + for (&self.rules) |*rule| { + for (rule.rhs.items) |rhs| { + switch (rhs[0]) { + .terminal => |t| { + modified = modified or !rule.first.is_set(t); + rule.first.set(t); + }, + + .non_terminal, .epsilon => { + if (rule.first.is_set(Character.EPSILON)) continue; + var insert_epsilon = true; + + for (rhs) |*char| { + switch (char.*) { + .terminal => |t| { + modified = modified or !rule.first.is_set(t); + rule.first.set(t); + insert_epsilon = false; + break; + }, + .non_terminal => |n| { + const first = self.rule_by_name(n).first; + for (first.charset, 0..) |child_first, index| { + if (index == Character.EPSILON) continue; + + modified = modified or (!rule.first.is_set(index) and child_first); + rule.first.set_if(index, child_first); + } + + if (!first.is_set(Character.EPSILON)) { + insert_epsilon = false; + break; + } + }, + .epsilon => {}, + } + } + + if (insert_epsilon) { + modified = true; + rule.first.set(Character.EPSILON); + } + }, + } + } + } + } + } + + fn generate_follows(self: *Self) void { + self.rule_by_name('S').follows.set(Character.END); + var modified = true; + + while (modified) { + modified = false; + + for (&self.rules) |*parent_rules| { + for (parent_rules.rhs.items) |rule| { + for (0..rule.len) |character_index| { + + switch (rule[character_index]) { + .non_terminal => |n| { + const current_rule = self.rule_by_name(n); + const ends_with_epsilon = brk: { + for (character_index + 1..rule.len) |peek_index| { + switch (rule[peek_index]) { + .terminal => |t| { + modified = modified or !current_rule.follows.is_set(t); + current_rule.follows.set(t); + break :brk false; + }, + .non_terminal => |peek_n| { + const peek_rule = self.rule_by_name(peek_n); + for (peek_rule.first.charset, 0..) |is_set, index| { + if (Character.EPSILON == index) continue; + + modified = modified or (!current_rule.follows.is_set(index) and is_set); + current_rule.follows.set_if(index, is_set); + } + + if (!peek_rule.first.is_set(Character.EPSILON)) { + break :brk false; + } + }, + .epsilon => {}, + } + } + break :brk true; + }; + + if (ends_with_epsilon) { + for (parent_rules.follows.charset, 0..) |is_set, char_index| { + modified = modified or (!current_rule.follows.is_set(char_index) and is_set); + current_rule.follows.set_if(char_index, is_set); + } + } + }, + else => {} + } + + } + } + } + } + } +}; + +test "expr" { + const text = + \\S -> B A + \\A -> + B A + \\A -> _ + \\B -> D C + \\C -> * D C + \\C -> _ + \\D -> ( S ) + \\D -> a + \\D -> b + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON }); + try grammar.rule_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { ')', Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { ')', Character.END }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END }); + try grammar.rule_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END }); +} + +test "sample 0" { + const text = + \\S -> A C B + \\S -> C b b + \\S -> B a + \\A -> d a + \\A -> B C + \\B -> g + \\B -> _ + \\C -> h + \\C -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END }); +} + +test "sample 1" { + const text = + \\S -> a A B b + \\A -> c + \\A -> _ + \\B -> d + \\B -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'b', 'd' }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'b' }); +} + +test "sample 2" { + const text = + \\S -> A B + \\S -> e D a + \\A -> a b + \\A -> c + \\B -> d C + \\C -> e C + \\C -> _ + \\D -> f D + \\D -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'a', 'c' }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd' }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON }); + try grammar.rule_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'd' }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('D').follows.expect(&[_]u8 { 'a' }); +} diff --git a/src/main.zig b/src/main.zig new file mode 100644 index 0000000..62877cb --- /dev/null +++ b/src/main.zig @@ -0,0 +1,10 @@ +const std = @import("std"); + +pub const grammar = @import("grammar.zig"); + +pub fn main() !void { +} + +test { + std.testing.refAllDecls(@This()); +} diff --git a/src/rule-list.zig b/src/rule-list.zig new file mode 100644 index 0000000..a438750 --- /dev/null +++ b/src/rule-list.zig @@ -0,0 +1,91 @@ +const std = @import("std"); +const CharSet = @import("char-set.zig"); +const Character = @import("character.zig").Character; + +const Self = @This(); +const Rhs = []Character; + +name: u8, +rhs: std.ArrayList(Rhs), +first: CharSet, +follows: CharSet, + +pub fn init(name: u8, allocator: std.mem.Allocator) Self { + return Self { + .name = name, + .rhs = std.ArrayList(Rhs).init(allocator), + .first = CharSet {}, + .follows = CharSet {}, + }; +} + +pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { + for (self.rhs.items) |item| { + allocator.free(item); + } + + self.rhs.deinit(); +} + +pub fn add_rule(self: *Self, rhs: Rhs) !void { + try self.rhs.append(rhs); +} + +pub fn parse_rule( + buffer: []const u8, + allocator: std.mem.Allocator +) !struct { u8, Rhs } { + + var rhs = std.ArrayList(Character).init(allocator); + errdefer rhs.deinit(); + + var tokens = std.mem.tokenizeAny(u8, buffer, &std.ascii.whitespace); + + const lhs = tokens.next() orelse return error.MissingLhs; + if (lhs.len > 1) return error.InvalidLhs; + + const arrow = tokens.next() orelse return error.MissingArrow; + if (!std.mem.eql(u8, arrow, "->")) return error.InvalidArrow; + + while (tokens.next()) |token| { + if (token.len > 1) return error.InvalidCharacter; + try rhs.append(Character.from_u8(token[0])); + } + + if (rhs.items.len == 0) { + return error.EmptyProduction; + } + + return .{ lhs[0], try rhs.toOwnedSlice() }; +} + +pub inline fn is_empty(self: *const Self) bool { + return self.rhs.items.len == 0; +} + +pub fn format( + self: *const Self, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = fmt; + _ = options; + + try writer.print("[{c} -> ", .{ self.name }); + + if (self.rhs.items.len > 0) { + for (self.rhs.items[0]) |c| { + try writer.print("{}", .{c}); + } + + for (self.rhs.items[1..]) |r| { + try writer.print(" | ", .{}); + for (r) |c| { + try writer.print("{}", .{c}); + } + } + } + + try writer.writeByte(']'); +} |