about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/char-set.zig 90
-rw-r--r-- src/character.zig 38
-rw-r--r-- src/grammar.zig 272
-rw-r--r-- src/main.zig 10
-rw-r--r-- src/rule-list.zig 91
5 files changed, 501 insertions, 0 deletions
diff --git a/src/char-set.zig b/src/char-set.zig
new file mode 100644
index 0000000..d244540
--- /dev/null
+++ b/src/char-set.zig
@@ -0,0 +1,90 @@
+const std = @import("std");
+
+const Character = @import("character.zig").Character;
+
+const Self = @This();
+
+charset: [256]bool = std.mem.zeroes([256]bool),
+
+/// Reports whether slot `c` of the set is marked.
+pub inline fn is_set(self: *const Self, c: usize) bool {
+    const marked = self.charset[c];
+    return marked;
+}
+
+/// Marks slot `c` as a member of the set.
+pub inline fn set(self: *Self, c: usize) void {
+    const slot = &self.charset[c];
+    slot.* = true;
+}
+
+/// Clears slot `c`, removing it from the set.
+pub inline fn reset(self: *Self, c: usize) void {
+    const slot = &self.charset[c];
+    slot.* = false;
+}
+
+/// Marks slot `c` when `flag` is true. A slot that is already marked is
+/// never cleared by this call.
+pub inline fn set_if(self: *Self, c: usize, flag: bool) void {
+    // Taking the element pointer first keeps the index checked even when
+    // `flag` is false.
+    const slot = &self.charset[c];
+    if (flag) slot.* = true;
+}
+
+/// `std.fmt` hook: writes the marked slots, in ascending index order, as
+/// `{ 'a' 'b' }`. The epsilon slot is rendered through `Character`'s own
+/// formatter and the end-of-input slot as `$`.
+pub fn format(
+    self: *const Self,
+    comptime fmt: []const u8,
+    options: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = fmt;
+    _ = options;
+
+    try writer.print("{{ ", .{});
+
+    for (self.charset, 0..) |member, code| {
+        if (!member) continue;
+
+        if (code == Character.EPSILON) {
+            try writer.print("{} ", .{Character{ .epsilon = {} }});
+        } else if (code == Character.END) {
+            try writer.print("$ ", .{});
+        } else {
+            try writer.print("'{c}' ", .{@as(u8, @truncate(code))});
+        }
+    }
+
+    try writer.print("}}", .{});
+}
+
+/// Prints one set member to stderr the way `expect` reports it: the epsilon
+/// slot through `Character`'s formatter, the end-of-input slot as `$`, and
+/// any other slot as the quoted byte.
+fn print_member(index: usize) void {
+    if (index == Character.EPSILON) {
+        std.debug.print("{}", .{Character{ .epsilon = {} }});
+    } else if (index == Character.END) {
+        std.debug.print("$", .{});
+    } else {
+        std.debug.print("'{c}'", .{@as(u8, @truncate(index))});
+    }
+}
+
+/// Test helper: checks that the set contains exactly the bytes listed in
+/// `expected`.
+///
+/// On the first slot that differs, prints the expected members, the actual
+/// set and the offending member (tagged "missing" or "not expected") to
+/// stderr and returns `error.ExpectEqualError`.
+pub fn expect(self: *const Self, expected: []const u8) !void {
+    // One slot per possible byte value, matching `charset`. With only 255
+    // slots an expected byte of 255 indexed out of bounds and the last slot
+    // of the set was never compared.
+    var wanted = std.mem.zeroes([256]bool);
+
+    for (expected) |c| {
+        wanted[c] = true;
+    }
+
+    for (wanted, 0..) |contained, index| {
+        if (self.is_set(index) == contained) continue;
+
+        std.debug.print("expected {{ ", .{});
+        for (expected) |c| {
+            print_member(c);
+            std.debug.print(" ", .{});
+        }
+        std.debug.print("}} but got {} (", .{self});
+
+        print_member(index);
+        if (contained) {
+            std.debug.print(" missing)\n", .{});
+        } else {
+            std.debug.print(" not expected)\n", .{});
+        }
+
+        return error.ExpectEqualError;
+    }
+}
diff --git a/src/character.zig b/src/character.zig
new file mode 100644
index 0000000..bd11ccd
--- /dev/null
+++ b/src/character.zig
@@ -0,0 +1,38 @@
+const std = @import("std");
+
+/// A single grammar symbol: the empty string (ε), a terminal byte, or a
+/// non-terminal named by an uppercase ASCII letter.
+pub const Character = union(enum) {
+    const Self = @This();
+
+    /// Byte that `from_u8` maps to the epsilon symbol.
+    pub const EPSILON: u8 = '_';
+    /// Byte value reserved for the end-of-input marker.
+    pub const END: u8 = 0;
+
+    epsilon: void,
+    terminal: u8,
+    non_terminal: u8,
+
+    /// Classifies a byte: `'_'` is epsilon, an uppercase ASCII letter is a
+    /// non-terminal, and every other byte is a terminal.
+    pub fn from_u8(c: u8) Self {
+        if (c == EPSILON) return .{ .epsilon = {} };
+        if (std.ascii.isUpper(c)) return .{ .non_terminal = c };
+        return .{ .terminal = c };
+    }
+
+    /// `std.fmt` hook: writes "ε" for epsilon and the raw byte for terminals
+    /// and non-terminals.
+    pub fn format(
+        self: *const Self,
+        comptime fmt: []const u8,
+        options: std.fmt.FormatOptions,
+        writer: anytype,
+    ) !void {
+        _ = fmt;
+        _ = options;
+
+        switch (self.*) {
+            .epsilon => try writer.writeAll("ε"),
+            .terminal, .non_terminal => |c| try writer.writeByte(c),
+        }
+    }
+};
diff --git a/src/grammar.zig b/src/grammar.zig
new file mode 100644
index 0000000..0f25095
--- /dev/null
+++ b/src/grammar.zig
@@ -0,0 +1,272 @@
+const std = @import("std");
+
+const Character = @import("character.zig").Character;
+const RuleList = @import("rule-list.zig");
+
+/// A context-free grammar over single-byte symbols together with the FIRST
+/// and FOLLOW sets of each of its non-terminals.
+///
+/// Non-terminals are the uppercase letters 'A'..'Z'; 'S' is the start symbol.
+const Grammar = struct {
+    const Self = @This();
+
+    /// One rule list per non-terminal: slot 0 is 'A', slot 25 is 'Z'.
+    rules: [26]RuleList,
+
+    /// Parses `buffer` (one production per non-empty line, in the form
+    /// accepted by `RuleList.parse_rule`) and computes the FIRST and FOLLOW
+    /// sets.
+    ///
+    /// Returns any error from `RuleList.parse_rule` or from allocation, and
+    /// `error.InvalidLhs` when a left-hand side is not an uppercase letter.
+    /// On error everything allocated so far is released. On success the
+    /// caller must call `deinit` with the same allocator.
+    pub fn init(buffer: []const u8, allocator: std.mem.Allocator) !Self {
+        var self: Self = undefined;
+        for (&self.rules, 'A'..) |*rule, name| {
+            rule.* = RuleList.init(@truncate(name), allocator);
+        }
+        // Every slot now holds a valid (empty) list, so tearing the whole
+        // grammar down is safe from here on.
+        errdefer self.deinit(allocator);
+
+        var lines = std.mem.tokenizeScalar(u8, buffer, '\n');
+        while (lines.next()) |line| {
+            const name, const rhs = try RuleList.parse_rule(line, allocator);
+            // `rhs` belongs to us until `add_rule` has stored it; without
+            // this it leaked when the name check or the append failed.
+            errdefer allocator.free(rhs);
+
+            // `rule_by_name` indexes with `name - 'A'`; anything outside
+            // 'A'..'Z' would be out of range.
+            if (!std.ascii.isUpper(name)) return error.InvalidLhs;
+            try self.rule_by_name(name).add_rule(rhs);
+        }
+
+        self.generate_first();
+        self.generate_follows();
+
+        return self;
+    }
+
+    /// Releases every rule list and invalidates `self`.
+    pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+        for (&self.rules) |*rule| {
+            rule.deinit(allocator);
+        }
+
+        self.* = undefined;
+    }
+
+    /// Returns the rule list of non-terminal `name`, which must be in
+    /// 'A'..'Z'.
+    pub inline fn rule_by_name(self: *Self, name: u8) *RuleList {
+        std.debug.assert(std.ascii.isUpper(name));
+        return &self.rules[name - 'A'];
+    }
+
+    /// Returns the rule list of the start symbol 'S'.
+    pub inline fn entry_point(self: *Self) *RuleList {
+        // Previously called `self.rule`, which does not exist.
+        return self.rule_by_name('S');
+    }
+
+    /// Computes the FIRST set of every non-terminal by iterating to a fixed
+    /// point.
+    ///
+    /// For each production the symbols are scanned left to right: a terminal
+    /// is added and ends the scan; a non-terminal contributes its FIRST set
+    /// minus ε and ends the scan unless it can derive ε; if the scan runs off
+    /// the end, ε is added.
+    fn generate_first(self: *Self) void {
+        var modified = true;
+
+        while (modified) {
+            modified = false;
+
+            for (&self.rules) |*rule| {
+                // Every production is re-examined on every pass. Skipping
+                // productions once ε was present left FIRST incomplete for
+                // rule lists such as `B -> _` followed by `B -> D`.
+                for (rule.rhs.items) |rhs| {
+                    var nullable = true;
+
+                    for (rhs) |symbol| {
+                        switch (symbol) {
+                            .epsilon => {},
+                            .terminal => |t| {
+                                modified = modified or !rule.first.is_set(t);
+                                rule.first.set(t);
+                                nullable = false;
+                                break;
+                            },
+                            .non_terminal => |n| {
+                                const child = self.rule_by_name(n).first;
+                                for (child.charset, 0..) |child_first, index| {
+                                    if (index == Character.EPSILON) continue;
+
+                                    modified = modified or (!rule.first.is_set(index) and child_first);
+                                    rule.first.set_if(index, child_first);
+                                }
+
+                                if (!child.is_set(Character.EPSILON)) {
+                                    nullable = false;
+                                    break;
+                                }
+                            },
+                        }
+                    }
+
+                    if (nullable) {
+                        // Only a newly added ε counts as a change, so the
+                        // outer loop still terminates.
+                        modified = modified or !rule.first.is_set(Character.EPSILON);
+                        rule.first.set(Character.EPSILON);
+                    }
+                }
+            }
+        }
+    }
+
+    /// Computes the FOLLOW set of every non-terminal by iterating to a fixed
+    /// point. Requires the FIRST sets to be complete.
+    ///
+    /// FOLLOW('S') starts with the end marker. For every occurrence of a
+    /// non-terminal X in a production `P -> ... X rest`, FIRST(rest) minus ε
+    /// is added to FOLLOW(X); if `rest` is empty or can derive ε, FOLLOW(P)
+    /// is added as well.
+    fn generate_follows(self: *Self) void {
+        self.rule_by_name('S').follows.set(Character.END);
+        var modified = true;
+
+        while (modified) {
+            modified = false;
+
+            for (&self.rules) |*parent| {
+                for (parent.rhs.items) |production| {
+                    for (production, 0..) |symbol, position| {
+                        const followed = switch (symbol) {
+                            .non_terminal => |n| n,
+                            else => continue,
+                        };
+                        const current = self.rule_by_name(followed);
+
+                        // Walk what comes after this occurrence; the block
+                        // yields true when all of it can vanish.
+                        const rest_is_nullable = blk: {
+                            for (production[position + 1 ..]) |next| {
+                                switch (next) {
+                                    .terminal => |t| {
+                                        modified = modified or !current.follows.is_set(t);
+                                        current.follows.set(t);
+                                        break :blk false;
+                                    },
+                                    .non_terminal => |peek_n| {
+                                        const peek = self.rule_by_name(peek_n);
+                                        for (peek.first.charset, 0..) |is_set, index| {
+                                            if (index == Character.EPSILON) continue;
+
+                                            modified = modified or (!current.follows.is_set(index) and is_set);
+                                            current.follows.set_if(index, is_set);
+                                        }
+
+                                        if (!peek.first.is_set(Character.EPSILON)) {
+                                            break :blk false;
+                                        }
+                                    },
+                                    .epsilon => {},
+                                }
+                            }
+                            break :blk true;
+                        };
+
+                        if (rest_is_nullable) {
+                            for (parent.follows.charset, 0..) |is_set, index| {
+                                modified = modified or (!current.follows.is_set(index) and is_set);
+                                current.follows.set_if(index, is_set);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+// Arithmetic-expression grammar: checks the FIRST sets, then the FOLLOW sets.
+test "expr" {
+    const text =
+        \\S -> B A
+        \\A -> + B A
+        \\A -> _
+        \\B -> D C
+        \\C -> * D C
+        \\C -> _
+        \\D -> ( S )
+        \\D -> a
+        \\D -> b
+    ;
+
+    const allocator = std.testing.allocator;
+    var grammar = try Grammar.init(text, allocator);
+    defer grammar.deinit(allocator);
+
+    const eps = Character.EPSILON;
+    const end = Character.END;
+    const Case = struct { name: u8, set: []const u8 };
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{ '(', 'a', 'b' } },
+        .{ .name = 'A', .set = &[_]u8{ '+', eps } },
+        .{ .name = 'B', .set = &[_]u8{ '(', 'a', 'b' } },
+        .{ .name = 'C', .set = &[_]u8{ '*', eps } },
+        .{ .name = 'D', .set = &[_]u8{ '(', 'a', 'b' } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).first.expect(case.set);
+    }
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{ ')', end } },
+        .{ .name = 'A', .set = &[_]u8{ ')', end } },
+        .{ .name = 'B', .set = &[_]u8{ '+', ')', end } },
+        .{ .name = 'C', .set = &[_]u8{ '+', ')', end } },
+        .{ .name = 'D', .set = &[_]u8{ '*', '+', ')', end } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).follows.expect(case.set);
+    }
+}
+
+// Grammar with several nullable non-terminals: checks the FIRST sets, then
+// the FOLLOW sets.
+test "sample 0" {
+    const text =
+        \\S -> A C B
+        \\S -> C b b
+        \\S -> B a
+        \\A -> d a
+        \\A -> B C
+        \\B -> g
+        \\B -> _
+        \\C -> h
+        \\C -> _
+    ;
+
+    const allocator = std.testing.allocator;
+    var grammar = try Grammar.init(text, allocator);
+    defer grammar.deinit(allocator);
+
+    const eps = Character.EPSILON;
+    const end = Character.END;
+    const Case = struct { name: u8, set: []const u8 };
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{ 'd', 'g', 'h', 'b', 'a', eps } },
+        .{ .name = 'A', .set = &[_]u8{ 'd', 'g', 'h', eps } },
+        .{ .name = 'B', .set = &[_]u8{ 'g', eps } },
+        .{ .name = 'C', .set = &[_]u8{ 'h', eps } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).first.expect(case.set);
+    }
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{end} },
+        .{ .name = 'A', .set = &[_]u8{ 'g', 'h', end } },
+        .{ .name = 'B', .set = &[_]u8{ 'a', 'g', 'h', end } },
+        .{ .name = 'C', .set = &[_]u8{ 'b', 'g', 'h', end } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).follows.expect(case.set);
+    }
+}
+
+// Two adjacent nullable non-terminals between terminals: checks the FIRST
+// sets, then the FOLLOW sets.
+test "sample 1" {
+    const text =
+        \\S -> a A B b
+        \\A -> c
+        \\A -> _
+        \\B -> d
+        \\B -> _
+    ;
+
+    const allocator = std.testing.allocator;
+    var grammar = try Grammar.init(text, allocator);
+    defer grammar.deinit(allocator);
+
+    const eps = Character.EPSILON;
+    const end = Character.END;
+    const Case = struct { name: u8, set: []const u8 };
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{'a'} },
+        .{ .name = 'A', .set = &[_]u8{ 'c', eps } },
+        .{ .name = 'B', .set = &[_]u8{ 'd', eps } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).first.expect(case.set);
+    }
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{end} },
+        .{ .name = 'A', .set = &[_]u8{ 'b', 'd' } },
+        .{ .name = 'B', .set = &[_]u8{'b'} },
+    }) |case| {
+        try grammar.rule_by_name(case.name).follows.expect(case.set);
+    }
+}
+
+// Right-recursive nullable tails: checks the FIRST sets, then the FOLLOW
+// sets.
+test "sample 2" {
+    const text =
+        \\S -> A B
+        \\S -> e D a
+        \\A -> a b
+        \\A -> c
+        \\B -> d C
+        \\C -> e C
+        \\C -> _
+        \\D -> f D
+        \\D -> _
+    ;
+
+    const allocator = std.testing.allocator;
+    var grammar = try Grammar.init(text, allocator);
+    defer grammar.deinit(allocator);
+
+    const eps = Character.EPSILON;
+    const end = Character.END;
+    const Case = struct { name: u8, set: []const u8 };
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{ 'a', 'c', 'e' } },
+        .{ .name = 'A', .set = &[_]u8{ 'a', 'c' } },
+        .{ .name = 'B', .set = &[_]u8{'d'} },
+        .{ .name = 'C', .set = &[_]u8{ 'e', eps } },
+        .{ .name = 'D', .set = &[_]u8{ 'f', eps } },
+    }) |case| {
+        try grammar.rule_by_name(case.name).first.expect(case.set);
+    }
+
+    for ([_]Case{
+        .{ .name = 'S', .set = &[_]u8{end} },
+        .{ .name = 'A', .set = &[_]u8{'d'} },
+        .{ .name = 'B', .set = &[_]u8{end} },
+        .{ .name = 'C', .set = &[_]u8{end} },
+        .{ .name = 'D', .set = &[_]u8{'a'} },
+    }) |case| {
+        try grammar.rule_by_name(case.name).follows.expect(case.set);
+    }
+}
diff --git a/src/main.zig b/src/main.zig
new file mode 100644
index 0000000..62877cb
--- /dev/null
+++ b/src/main.zig
@@ -0,0 +1,10 @@
+const std = @import("std");
+
+pub const grammar = @import("grammar.zig");
+
+/// Program entry point; it currently performs no work.
+pub fn main() !void {}
+
+// References every declaration of this file (including the `grammar`
+// import) through `std.testing.refAllDecls`.
+test {
+    const this_file = @This();
+    std.testing.refAllDecls(this_file);
+}
diff --git a/src/rule-list.zig b/src/rule-list.zig
new file mode 100644
index 0000000..a438750
--- /dev/null
+++ b/src/rule-list.zig
@@ -0,0 +1,91 @@
+const std = @import("std");
+const CharSet = @import("char-set.zig");
+const Character = @import("character.zig").Character;
+
+const Self = @This();
+const Rhs = []Character;
+
+name: u8,
+rhs: std.ArrayList(Rhs),
+first: CharSet,
+follows: CharSet,
+
+/// Creates a rule list for non-terminal `name` with no productions and empty
+/// FIRST and FOLLOW sets. `allocator` backs the list of productions.
+pub fn init(name: u8, allocator: std.mem.Allocator) Self {
+    const productions = std.ArrayList(Rhs).init(allocator);
+
+    return .{
+        .name = name,
+        .rhs = productions,
+        .first = .{},
+        .follows = .{},
+    };
+}
+
+/// Frees every stored right-hand side with `allocator`, then releases the
+/// list that held them.
+pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+    const productions = self.rhs.items;
+
+    var i: usize = 0;
+    while (i < productions.len) : (i += 1) {
+        allocator.free(productions[i]);
+    }
+
+    self.rhs.deinit();
+}
+
+/// Appends `rhs` as another alternative of this non-terminal. After a
+/// successful call the slice is freed by `deinit`.
+pub fn add_rule(self: *Self, rhs: Rhs) !void {
+    return self.rhs.append(rhs);
+}
+
+/// Parses one production of the form `X -> s1 s2 ...`, where every token is
+/// a single byte separated by ASCII whitespace.
+///
+/// Returns the left-hand-side name and the right-hand side as a slice
+/// allocated with `allocator`; the caller owns that slice.
+///
+/// Errors:
+/// - `error.MissingLhs` / `error.MissingArrow`: the line ends too early.
+/// - `error.InvalidLhs`: the left-hand side is not exactly one uppercase
+///   ASCII letter.
+/// - `error.InvalidArrow`: the second token is not `->`.
+/// - `error.InvalidCharacter`: a right-hand-side token is longer than one
+///   byte.
+/// - `error.EmptyProduction`: nothing follows the arrow.
+/// - allocation failures.
+pub fn parse_rule(
+    buffer: []const u8,
+    allocator: std.mem.Allocator,
+) !struct { u8, Rhs } {
+    var tokens = std.mem.tokenizeAny(u8, buffer, &std.ascii.whitespace);
+
+    const lhs = tokens.next() orelse return error.MissingLhs;
+    // `Character.from_u8` treats only uppercase letters as non-terminals, so
+    // any other byte cannot name a rule. Previously only the length was
+    // checked and such a name was returned to the caller.
+    if (lhs.len != 1 or !std.ascii.isUpper(lhs[0])) return error.InvalidLhs;
+
+    const arrow = tokens.next() orelse return error.MissingArrow;
+    if (!std.mem.eql(u8, arrow, "->")) return error.InvalidArrow;
+
+    // Allocate only after the header has been validated.
+    var rhs = std.ArrayList(Character).init(allocator);
+    errdefer rhs.deinit();
+
+    while (tokens.next()) |token| {
+        if (token.len > 1) return error.InvalidCharacter;
+        try rhs.append(Character.from_u8(token[0]));
+    }
+
+    if (rhs.items.len == 0) {
+        return error.EmptyProduction;
+    }
+
+    return .{ lhs[0], try rhs.toOwnedSlice() };
+}
+
+/// Reports whether this non-terminal has no productions.
+pub inline fn is_empty(self: *const Self) bool {
+    const count = self.rhs.items.len;
+    return count == 0;
+}
+
+/// `std.fmt` hook: writes the rule list as `[X -> abc | de]`, with the
+/// alternatives in insertion order separated by " | ".
+pub fn format(
+    self: *const Self,
+    comptime fmt: []const u8,
+    options: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = fmt;
+    _ = options;
+
+    try writer.print("[{c} -> ", .{self.name});
+
+    for (self.rhs.items, 0..) |production, position| {
+        // The separator goes between alternatives, never before the first.
+        if (position != 0) try writer.print(" | ", .{});
+
+        for (production) |symbol| {
+            try writer.print("{}", .{symbol});
+        }
+    }
+
+    try writer.writeByte(']');
+}