diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2025-04-24 15:10:14 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2025-04-24 15:10:14 +0200 |
| commit | 9456f4e2b569b393d6c01784a4545b11b32e16ef (patch) | |
| tree | 7b4eeab3d0dd4c273de025db19f6be52d9cd1f1d /src/grammar.zig | |
Implement grammar parser
This implements the grammar struct together with
some other helpful structures like char-set and character.
The grammar struct parses a grammar file and also generates
the FIRST and FOLLOWS tables.
Diffstat (limited to 'src/grammar.zig')
| -rw-r--r-- | src/grammar.zig | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/src/grammar.zig b/src/grammar.zig new file mode 100644 index 0000000..0f25095 --- /dev/null +++ b/src/grammar.zig @@ -0,0 +1,272 @@ +const std = @import("std"); + +const Character = @import("character.zig").Character; +const RuleList = @import("rule-list.zig"); + +const Grammar = struct { + const Self = @This(); + + rules: [26]RuleList, + + pub fn init(buffer: []const u8, allocator: std.mem.Allocator) !Self { + var lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + + var self: Self = undefined; + errdefer self.deinit(allocator); + + for (&self.rules, 'A'..) |*rule, name| { + rule.* = RuleList.init( + @truncate(name), + allocator + ); + } + + while (lines.next()) |line| { + const name, const rhs = try RuleList.parse_rule(line, allocator); + try self.rule_by_name(name).add_rule(rhs); + } + + self.generate_first(); + self.generate_follows(); + + return self; + } + + pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { + for (&self.rules) |*rule| { + rule.deinit(allocator); + } + + self.* = undefined; + } + + pub inline fn rule_by_name(self: *Self, name: u8) *RuleList { + return &self.rules[name - 'A']; + } + + pub inline fn entry_point(self: *Self) *RuleList { + return self.rule('S'); + } + + fn generate_first(self: *Self) void { + var modified = true; + + while (modified) { + modified = false; + + for (&self.rules) |*rule| { + for (rule.rhs.items) |rhs| { + switch (rhs[0]) { + .terminal => |t| { + modified = modified or !rule.first.is_set(t); + rule.first.set(t); + }, + + .non_terminal, .epsilon => { + if (rule.first.is_set(Character.EPSILON)) continue; + var insert_epsilon = true; + + for (rhs) |*char| { + switch (char.*) { + .terminal => |t| { + modified = modified or !rule.first.is_set(t); + rule.first.set(t); + insert_epsilon = false; + break; + }, + .non_terminal => |n| { + const first = self.rule_by_name(n).first; + for (first.charset, 0..) |child_first, index| { + if (index == Character.EPSILON) continue; + + modified = modified or (!rule.first.is_set(index) and child_first); + rule.first.set_if(index, child_first); + } + + if (!first.is_set(Character.EPSILON)) { + insert_epsilon = false; + break; + } + }, + .epsilon => {}, + } + } + + if (insert_epsilon) { + modified = true; + rule.first.set(Character.EPSILON); + } + }, + } + } + } + } + } + + fn generate_follows(self: *Self) void { + self.rule_by_name('S').follows.set(Character.END); + var modified = true; + + while (modified) { + modified = false; + + for (&self.rules) |*parent_rules| { + for (parent_rules.rhs.items) |rule| { + for (0..rule.len) |character_index| { + + switch (rule[character_index]) { + .non_terminal => |n| { + const current_rule = self.rule_by_name(n); + const ends_with_epsilon = brk: { + for (character_index + 1..rule.len) |peek_index| { + switch (rule[peek_index]) { + .terminal => |t| { + modified = modified or !current_rule.follows.is_set(t); + current_rule.follows.set(t); + break :brk false; + }, + .non_terminal => |peek_n| { + const peek_rule = self.rule_by_name(peek_n); + for (peek_rule.first.charset, 0..) |is_set, index| { + if (Character.EPSILON == index) continue; + + modified = modified or (!current_rule.follows.is_set(index) and is_set); + current_rule.follows.set_if(index, is_set); + } + + if (!peek_rule.first.is_set(Character.EPSILON)) { + break :brk false; + } + }, + .epsilon => {}, + } + } + break :brk true; + }; + + if (ends_with_epsilon) { + for (parent_rules.follows.charset, 0..) |is_set, char_index| { + modified = modified or (!current_rule.follows.is_set(char_index) and is_set); + current_rule.follows.set_if(char_index, is_set); + } + } + }, + else => {} + } + + } + } + } + } + } +}; + +test "expr" { + const text = + \\S -> B A + \\A -> + B A + \\A -> _ + \\B -> D C + \\C -> * D C + \\C -> _ + \\D -> ( S ) + \\D -> a + \\D -> b + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON }); + try grammar.rule_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { ')', Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { ')', Character.END }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END }); + try grammar.rule_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END }); +} + +test "sample 0" { + const text = + \\S -> A C B + \\S -> C b b + \\S -> B a + \\A -> d a + \\A -> B C + \\B -> g + \\B -> _ + \\C -> h + \\C -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END }); +} + +test "sample 1" { + const text = + \\S -> a A B b + \\A -> c + \\A -> _ + \\B -> d + \\B -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'b', 'd' }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { 'b' }); +} + +test "sample 2" { + const text = + \\S -> A B + \\S -> e D a + \\A -> a b + \\A -> c + \\B -> d C + \\C -> e C + \\C -> _ + \\D -> f D + \\D -> _ + ; + + const allocator = std.testing.allocator; + var grammar = try Grammar.init(text, allocator); + defer grammar.deinit(allocator); + + try grammar.rule_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' }); + try grammar.rule_by_name('A').first.expect(&[_]u8 { 'a', 'c' }); + try grammar.rule_by_name('B').first.expect(&[_]u8 { 'd' }); + try grammar.rule_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON }); + try grammar.rule_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON }); + + try grammar.rule_by_name('S').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('A').follows.expect(&[_]u8 { 'd' }); + try grammar.rule_by_name('B').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('C').follows.expect(&[_]u8 { Character.END }); + try grammar.rule_by_name('D').follows.expect(&[_]u8 { 'a' }); +} |