diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2025-05-06 10:43:09 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2025-05-06 10:43:09 +0200 |
| commit | eff19cc15a9bf4df60e7f90c3a7ee525c65266c0 (patch) | |
| tree | d6034b574e8e2218054d42caadbf25fa17862abf /src/grammar.zig | |
| parent | e2f01d5df22704bfe62396a0f9a260f86edbde0e (diff) | |
add full grammar parsing
Diffstat (limited to 'src/grammar.zig')
| -rw-r--r-- | src/grammar.zig | 248 |
1 files changed, 151 insertions, 97 deletions
diff --git a/src/grammar.zig b/src/grammar.zig index ed7eee6..298932c 100644 --- a/src/grammar.zig +++ b/src/grammar.zig @@ -5,24 +5,47 @@ const NonTerminal = @import("non-terminal.zig"); const Self = @This(); -non_terminals: [26]NonTerminal, +non_terminals: []NonTerminal, +entry_id: usize, +allocator: std.mem.Allocator, -pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self { +pub fn parse(entry: []const u8, buffer: []const u8, allocator: std.mem.Allocator) !Self { var lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); - var self: Self = undefined; - errdefer self.deinit(allocator); + while (lines.next()) |line| { + const name = try NonTerminal.parse_name(line); + + if (!names.contains(name)) { + try names.put(name, names.count()); + } + } + + lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + var self = Self { + .non_terminals = try allocator.alloc(NonTerminal, names.count()), + .entry_id = names.get(entry) orelse return error.MissingRule, + .allocator = allocator, + }; + errdefer self.deinit(); - for (&self.non_terminals, 'A'..) |*non_terminal, name| { - non_terminal.* = NonTerminal.init( - @truncate(name), - allocator - ); + { + var iterator = names.iterator(); + while (iterator.next()) |name_entry| { + self.non_terminals[name_entry.value_ptr.*] = try NonTerminal.init( + name_entry.key_ptr.*, + name_entry.value_ptr.*, + allocator + ); + } } while (lines.next()) |line| { - const name, const rule = try NonTerminal.parse_rule(line, allocator); - try self.non_terminal_by_name(name).add_rule(rule); + const name = try NonTerminal.parse_name(line); + try self.non_terminal_by_id( + names.get(name) orelse return error.MissingRule + ).parse_and_add_variants(line, &names); } self.generate_first(); @@ -31,20 +54,21 @@ pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self { return self; } -pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { - for (&self.non_terminals) |*non_terminal| { - non_terminal.deinit(allocator); +pub fn deinit(self: *Self) void { + for (self.non_terminals) |*non_terminal| { + non_terminal.deinit(); } + self.allocator.free(self.non_terminals); self.* = undefined; } -pub inline fn non_terminal_by_name(self: *Self, name: u8) *NonTerminal { - return &self.non_terminals[name - 'A']; +pub inline fn non_terminal_by_id(self: *Self, id: usize) *NonTerminal { + return &self.non_terminals[id]; } pub inline fn entry_point(self: *Self) *NonTerminal { - return self.non_terminal_by_name('S'); + return self.non_terminal_by_id(self.entry_id); } fn generate_first(self: *Self) void { @@ -53,7 +77,7 @@ fn generate_first(self: *Self) void { while (modified) { modified = false; - for (&self.non_terminals) |*non_terminal| { + for (self.non_terminals) |*non_terminal| { for (non_terminal.rules()) |rule| { switch (rule[0]) { .terminal => |t| { @@ -62,7 +86,6 @@ fn generate_first(self: *Self) void { }, .non_terminal, .epsilon => { - if (non_terminal.first.is_set(Character.EPSILON)) continue; var insert_epsilon = true; for (rule) |*char| { @@ -74,7 +97,7 @@ fn generate_first(self: *Self) void { break; }, .non_terminal => |n| { - const first = self.non_terminal_by_name(n).first; + const first = self.non_terminal_by_id(n).first; for (first.charset, 0..) |child_first, index| { if (index == Character.EPSILON) continue; @@ -91,7 +114,7 @@ fn generate_first(self: *Self) void { } } - if (insert_epsilon) { + if (insert_epsilon and !non_terminal.first.is_set(Character.EPSILON)) { modified = true; non_terminal.first.set(Character.EPSILON); } @@ -101,21 +124,20 @@ fn generate_first(self: *Self) void { } } } - fn generate_follows(self: *Self) void { - self.non_terminal_by_name('S').follows.set(Character.END); + self.entry_point().follows.set(Character.END); var modified = true; while (modified) { modified = false; - for (&self.non_terminals) |*non_terminal| { + for (self.non_terminals) |*non_terminal| { for (non_terminal.rules()) |rule| { for (0..rule.len) |character_index| { switch (rule[character_index]) { .non_terminal => |n| { - const current_non_terminal = self.non_terminal_by_name(n); + const current_non_terminal = self.non_terminal_by_id(n); const ends_with_epsilon = brk: { for (character_index + 1..rule.len) |peek_index| { switch (rule[peek_index]) { @@ -125,7 +147,7 @@ fn generate_follows(self: *Self) void { break :brk false; }, .non_terminal => |peek_n| { - const peek_non_terminal = self.non_terminal_by_name(peek_n); + const peek_non_terminal = self.non_terminal_by_id(peek_n); for (peek_non_terminal.first.charset, 0..) |is_set, index| { if (Character.EPSILON == index) continue; @@ -159,112 +181,144 @@ fn generate_follows(self: *Self) void { } } +pub fn format( + self: *const Self, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = fmt; + _ = options; + + for (self.non_terminals) |non_terminal| { + try writer.print("[{}] ", .{non_terminal}); + } +} + test "expr" { const text = \\S -> B A - \\A -> + B A - \\A -> _ + \\A -> '+' B A + \\A -> '' \\B -> D C - \\C -> * D C - \\C -> _ - \\D -> ( S ) - \\D -> a - \\D -> b + \\C -> '*' D C + \\C -> '' + \\D -> '(' S ')' + \\D -> 'a' | 'b' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON }); - try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { '+', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { '*', Character.EPSILON }); + try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { ')', Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { ')', Character.END }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END }); - try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { ')', Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { ')', Character.END }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { '+', ')', Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { '+', ')', Character.END }); + try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { '*', '+', ')', Character.END }); } test "sample 0" { const text = - \\S -> A C B - \\S -> C b b - \\S -> B a - \\A -> d a - \\A -> B C - \\B -> g - \\B -> _ - \\C -> h - \\C -> _ + \\S -> A C B | C 'bb' | B 'a' + \\A -> 'da' | B C + \\B -> 'g' | '' + \\C -> 'h' | '' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'd', 'g', 'h', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'g', Character.EPSILON }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'h', Character.EPSILON }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { 'b', 'g', 'h', Character.END }); } test "sample 1" { const text = - \\S -> a A B b - \\A -> c - \\A -> _ - \\B -> d - \\B -> _ + \\S -> 'a' A B 'b' + \\A -> 'c' + \\A -> '' + \\B -> 'd' + \\B -> '' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'c', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd', Character.EPSILON }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'b', 'd' }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'b' }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'b', 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'b' }); } test "sample 2" { const text = \\S -> A B - \\S -> e D a - \\A -> a b - \\A -> c - \\B -> d C - \\C -> e C - \\C -> _ - \\D -> f D - \\D -> _ + \\S -> 'e' D 'a' + \\A -> 'ab' + \\A -> 'c' + \\B -> 'd' C + \\C -> 'e' C + \\C -> '' + \\D -> 'f' D + \\D -> '' + ; + + const allocator = std.testing.allocator; + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); + + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'c', 'e' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd' }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'e', Character.EPSILON }); + try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { 'f', Character.EPSILON }); + + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { 'a' }); +} + +test "sample 3" { + const text = + \\S -> A S 'd' + \\S -> B S + \\S -> '' + \\A -> 'a' + \\A -> 'c' + \\B -> 'a' + \\B -> 'b' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'a', 'c' }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd' }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON }); - try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'b', 'c', Character.EPSILON }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'a', 'b' }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'd' }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { 'a' }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { 'd', Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd', Character.END }); } |