diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/argument.zig | 21 | ||||
| -rw-r--r-- | src/char-set.zig | 8 | ||||
| -rw-r--r-- | src/character.zig | 26 | ||||
| -rw-r--r-- | src/generator.zig | 6 | ||||
| -rw-r--r-- | src/grammar.zig | 248 | ||||
| -rw-r--r-- | src/main.zig | 3 | ||||
| -rw-r--r-- | src/non-terminal.zig | 240 | ||||
| -rw-r--r-- | src/recognizer.zig | 136 | ||||
| -rw-r--r-- | src/string.zig | 41 |
9 files changed, 521 insertions, 208 deletions
diff --git a/src/argument.zig b/src/argument.zig index b553352..a9c823d 100644 --- a/src/argument.zig +++ b/src/argument.zig @@ -15,6 +15,8 @@ fn help(err: ?anyerror) noreturn { \\ \\ generate [grammar] [options] \\ Options: + \\ -e, --entry label Name of the entry point. (default: main) + \\ \\ -o, --output entry Output string to file \\ By default stdout will be used. \\ @@ -26,6 +28,8 @@ fn help(err: ?anyerror) noreturn { \\ \\ recognize [grammar] [options] \\ Options: + \\ -e, --entry label Name of the entry point. (default: main) + \\ \\ -i, --input entry Specify input source, if the path \\ points to a directory it will scan \\ all files in it. By default stdin will @@ -114,6 +118,7 @@ pub const Entry = struct { pub const RecognizeArgs = struct { input: Entry, grammar: Grammar, + entry: []const u8, }; pub const GenerateArgs = struct { @@ -121,6 +126,7 @@ pub const GenerateArgs = struct { min_length: usize, output: Entry, grammar: Grammar, + entry: []const u8, }; pub const Args = union(Mode) { @@ -138,6 +144,7 @@ pub const Args = union(Mode) { const text = Entry.read_file(check(args.next()), allocator); defer allocator.free(text); const grammar = Grammar.parse( + "main", text, allocator ) catch |e| help(e); @@ -145,10 +152,13 @@ pub const Args = union(Mode) { switch (mode) { .recognize => { var input: ?Entry = null; + var entry: []const u8 = "main"; while (args.next()) |arg| { if (check_flags(arg, &[_][]const u8 { "-i", "--input" })) { input = Entry.open(check(args.next()), false); + } else if (check_flags(arg, &[_][]const u8 { "-e", "--entry" })) { + entry = check(args.next()); } else help(error.InvalidArgument); } @@ -156,6 +166,7 @@ pub const Args = union(Mode) { .recognize = .{ .input = input orelse Entry.stdin(), .grammar = grammar, + .entry = entry, }, }; }, @@ -164,6 +175,7 @@ pub const Args = union(Mode) { var count: usize = 1; var output: ?Entry = null; var min_length: usize = 0; + var entry: []const u8 = "main"; while (args.next()) |arg| { if (check_flags(arg, &[_][]const u8 { "-o", "--output" })) { @@ -174,6 +186,8 @@ pub const Args = union(Mode) { min_length = 1; } else if (check_flags(arg, &[_][]const u8 { "-m", "--min-length" })) { min_length = parse_int(check(args.next())); + } else if (check_flags(arg, &[_][]const u8 { "-e", "--entry" })) { + entry = check(args.next()); } else help(error.InvalidArgument); } @@ -183,19 +197,20 @@ pub const Args = union(Mode) { .min_length = min_length, .output = output orelse Entry.stdout(), .grammar = grammar, + .entry = entry, }, }; }, } } - pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { + pub fn deinit(self: *Self) void { switch (self.*) { .recognize => |*rec| { - rec.grammar.deinit(allocator); + rec.grammar.deinit(); }, .generate => |*gen| { - gen.grammar.deinit(allocator); + gen.grammar.deinit(); } } } diff --git a/src/char-set.zig b/src/char-set.zig index d244540..55c719c 100644 --- a/src/char-set.zig +++ b/src/char-set.zig @@ -4,7 +4,7 @@ const Character = @import("character.zig").Character; const Self = @This(); -charset: [256]bool = std.mem.zeroes([256]bool), +charset: [258]bool = std.mem.zeroes([258]bool), pub inline fn is_set(self: *const Self, c: usize) bool { return self.charset[c]; @@ -48,8 +48,8 @@ pub fn format( try writer.print("}}", .{}); } -pub fn expect(self: *const Self, expected: []const u8) !void { - var matrix = std.mem.zeroes([255]bool); +pub fn expect(self: *const Self, expected: []const u9) !void { + var matrix = std.mem.zeroes([258]bool); for (expected) |c| { matrix[c] = true; @@ -65,7 +65,7 @@ pub fn expect(self: *const Self, expected: []const u8) !void { } else if (c == Character.END) { std.debug.print("$ ", .{}); } else { - std.debug.print("'{c}' ", .{c}); + std.debug.print("'{c}' ", .{@as(u8, @truncate(c))}); } } diff --git a/src/character.zig b/src/character.zig index bd11ccd..10dd254 100644 --- a/src/character.zig +++ b/src/character.zig @@ -3,22 +3,12 @@ const std = @import("std"); pub const Character = union(enum) { const Self = @This(); - pub const EPSILON: u8 = '_'; - pub const END: u8 = 0; + pub const EPSILON: u9 = std.math.maxInt(u8) + 1; + pub const END: u9 = std.math.maxInt(u8) + 2; epsilon: void, terminal: u8, - non_terminal: u8, - - pub fn from_u8(c: u8) Self { - if (c == '_') { - return Self { .epsilon = void{} }; - } else if (std.ascii.isUpper(c)) { - return Self { .non_terminal = c }; - } else { - return Self { .terminal = c }; - } - } + non_terminal: usize, pub fn format( self: *const Self, @@ -31,8 +21,14 @@ pub const Character = union(enum) { switch (self.*) { .epsilon => _ = try writer.writeAll("ε"), - .terminal => |c| try writer.writeByte(c), - .non_terminal => |c| try writer.writeByte(c), + .terminal => |c| { + if (c == END) { + try writer.writeAll("$"); + } else { + try writer.print("'{c}'", .{c}); + } + }, + .non_terminal => |index| try writer.print("<{}>", .{index}), } } }; diff --git a/src/generator.zig b/src/generator.zig index 8d0c888..0676834 100644 --- a/src/generator.zig +++ b/src/generator.zig @@ -22,7 +22,7 @@ pub fn Generator(T: type) type { self.generate_sentential_recursive( grammar, &string, - grammar.entry_point().name, + grammar.entry_point().id, max_depth ) catch |err| switch (err) { error.MaxDepth => { @@ -41,14 +41,14 @@ pub fn Generator(T: type) type { self: *Self, grammar: *Grammar, string: *std.ArrayList(u8), - n: u8, + id: usize, max_depth: usize ) GenError!void { if (max_depth == 0) { return error.MaxDepth; } - const rules = grammar.non_terminal_by_name(n).rules(); + const rules = grammar.non_terminal_by_id(id).rules(); const index = self.inner.next(rules.len); const rule = rules[index]; diff --git a/src/grammar.zig b/src/grammar.zig index ed7eee6..298932c 100644 --- a/src/grammar.zig +++ b/src/grammar.zig @@ -5,24 +5,47 @@ const NonTerminal = @import("non-terminal.zig"); const Self = @This(); -non_terminals: [26]NonTerminal, +non_terminals: []NonTerminal, +entry_id: usize, +allocator: std.mem.Allocator, -pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self { +pub fn parse(entry: []const u8, buffer: []const u8, allocator: std.mem.Allocator) !Self { var lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); - var self: Self = undefined; - errdefer self.deinit(allocator); + while (lines.next()) |line| { + const name = try NonTerminal.parse_name(line); + + if (!names.contains(name)) { + try names.put(name, names.count()); + } + } + + lines = std.mem.tokenizeScalar(u8, buffer, '\n'); + var self = Self { + .non_terminals = try allocator.alloc(NonTerminal, names.count()), + .entry_id = names.get(entry) orelse return error.MissingRule, + .allocator = allocator, + }; + errdefer self.deinit(); - for (&self.non_terminals, 'A'..) |*non_terminal, name| { - non_terminal.* = NonTerminal.init( - @truncate(name), - allocator - ); + { + var iterator = names.iterator(); + while (iterator.next()) |name_entry| { + self.non_terminals[name_entry.value_ptr.*] = try NonTerminal.init( + name_entry.key_ptr.*, + name_entry.value_ptr.*, + allocator + ); + } } while (lines.next()) |line| { - const name, const rule = try NonTerminal.parse_rule(line, allocator); - try self.non_terminal_by_name(name).add_rule(rule); + const name = try NonTerminal.parse_name(line); + try self.non_terminal_by_id( + names.get(name) orelse return error.MissingRule + ).parse_and_add_variants(line, &names); } self.generate_first(); @@ -31,20 +54,21 @@ pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self { return self; } -pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { - for (&self.non_terminals) |*non_terminal| { - non_terminal.deinit(allocator); +pub fn deinit(self: *Self) void { + for (self.non_terminals) |*non_terminal| { + non_terminal.deinit(); } + self.allocator.free(self.non_terminals); self.* = undefined; } -pub inline fn non_terminal_by_name(self: *Self, name: u8) *NonTerminal { - return &self.non_terminals[name - 'A']; +pub inline fn non_terminal_by_id(self: *Self, id: usize) *NonTerminal { + return &self.non_terminals[id]; } pub inline fn entry_point(self: *Self) *NonTerminal { - return self.non_terminal_by_name('S'); + return self.non_terminal_by_id(self.entry_id); } fn generate_first(self: *Self) void { @@ -53,7 +77,7 @@ fn generate_first(self: *Self) void { while (modified) { modified = false; - for (&self.non_terminals) |*non_terminal| { + for (self.non_terminals) |*non_terminal| { for (non_terminal.rules()) |rule| { switch (rule[0]) { .terminal => |t| { @@ -62,7 +86,6 @@ fn generate_first(self: *Self) void { }, .non_terminal, .epsilon => { - if (non_terminal.first.is_set(Character.EPSILON)) continue; var insert_epsilon = true; for (rule) |*char| { @@ -74,7 +97,7 @@ fn generate_first(self: *Self) void { break; }, .non_terminal => |n| { - const first = self.non_terminal_by_name(n).first; + const first = self.non_terminal_by_id(n).first; for (first.charset, 0..) |child_first, index| { if (index == Character.EPSILON) continue; @@ -91,7 +114,7 @@ fn generate_first(self: *Self) void { } } - if (insert_epsilon) { + if (insert_epsilon and !non_terminal.first.is_set(Character.EPSILON)) { modified = true; non_terminal.first.set(Character.EPSILON); } @@ -101,21 +124,20 @@ fn generate_first(self: *Self) void { } } } - fn generate_follows(self: *Self) void { - self.non_terminal_by_name('S').follows.set(Character.END); + self.entry_point().follows.set(Character.END); var modified = true; while (modified) { modified = false; - for (&self.non_terminals) |*non_terminal| { + for (self.non_terminals) |*non_terminal| { for (non_terminal.rules()) |rule| { for (0..rule.len) |character_index| { switch (rule[character_index]) { .non_terminal => |n| { - const current_non_terminal = self.non_terminal_by_name(n); + const current_non_terminal = self.non_terminal_by_id(n); const ends_with_epsilon = brk: { for (character_index + 1..rule.len) |peek_index| { switch (rule[peek_index]) { @@ -125,7 +147,7 @@ fn generate_follows(self: *Self) void { break :brk false; }, .non_terminal => |peek_n| { - const peek_non_terminal = self.non_terminal_by_name(peek_n); + const peek_non_terminal = self.non_terminal_by_id(peek_n); for (peek_non_terminal.first.charset, 0..) |is_set, index| { if (Character.EPSILON == index) continue; @@ -159,112 +181,144 @@ fn generate_follows(self: *Self) void { } } +pub fn format( + self: *const Self, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, +) !void { + _ = fmt; + _ = options; + + for (self.non_terminals) |non_terminal| { + try writer.print("[{}] ", .{non_terminal}); + } +} + test "expr" { const text = \\S -> B A - \\A -> + B A - \\A -> _ + \\A -> '+' B A + \\A -> '' \\B -> D C - \\C -> * D C - \\C -> _ - \\D -> ( S ) - \\D -> a - \\D -> b + \\C -> '*' D C + \\C -> '' + \\D -> '(' S ')' + \\D -> 'a' | 'b' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON }); - try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { '+', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { '(', 'a', 'b' }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { '*', Character.EPSILON }); + try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { '(', 'a', 'b' }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { ')', Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { ')', Character.END }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END }); - try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { ')', Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { ')', Character.END }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { '+', ')', Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { '+', ')', Character.END }); + try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { '*', '+', ')', Character.END }); } test "sample 0" { const text = - \\S -> A C B - \\S -> C b b - \\S -> B a - \\A -> d a - \\A -> B C - \\B -> g - \\B -> _ - \\C -> h - \\C -> _ + \\S -> A C B | C 'bb' | B 'a' + \\A -> 'da' | B C + \\B -> 'g' | '' + \\C -> 'h' | '' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'd', 'g', 'h', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'g', Character.EPSILON }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'h', Character.EPSILON }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'g', 'h', Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { 'b', 'g', 'h', Character.END }); } test "sample 1" { const text = - \\S -> a A B b - \\A -> c - \\A -> _ - \\B -> d - \\B -> _ + \\S -> 'a' A B 'b' + \\A -> 'c' + \\A -> '' + \\B -> 'd' + \\B -> '' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'c', Character.EPSILON }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd', Character.EPSILON }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'b', 'd' }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'b' }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'b', 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'b' }); } test "sample 2" { const text = \\S -> A B - \\S -> e D a - \\A -> a b - \\A -> c - \\B -> d C - \\C -> e C - \\C -> _ - \\D -> f D - \\D -> _ + \\S -> 'e' D 'a' + \\A -> 'ab' + \\A -> 'c' + \\B -> 'd' C + \\C -> 'e' C + \\C -> '' + \\D -> 'f' D + \\D -> '' + ; + + const allocator = std.testing.allocator; + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); + + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'c', 'e' }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd' }); + try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'e', Character.EPSILON }); + try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { 'f', Character.EPSILON }); + + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { Character.END }); + try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { 'a' }); +} + +test "sample 3" { + const text = + \\S -> A S 'd' + \\S -> B S + \\S -> '' + \\A -> 'a' + \\A -> 'c' + \\B -> 'a' + \\B -> 'b' ; const allocator = std.testing.allocator; - var grammar = try Self.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Self.parse("S", text, allocator); + defer grammar.deinit(); - try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' }); - try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'a', 'c' }); - try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd' }); - try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON }); - try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON }); + try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'b', 'c', Character.EPSILON }); + try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' }); + try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'a', 'b' }); - try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'd' }); - try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { Character.END }); - try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { 'a' }); + try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { 'd', Character.END }); + try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd' }); + try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd', Character.END }); } diff --git a/src/main.zig b/src/main.zig index ac52f8a..f350154 100644 --- a/src/main.zig +++ b/src/main.zig @@ -77,6 +77,7 @@ fn recognize(args: *RecognizeArgs, allocator: std.mem.Allocator) !void { index += 1; if (args.input.file.isTty()) { + try bufwriter.flush(); try stderr.writeAll("> "); } } @@ -141,7 +142,7 @@ pub fn main() !void { } var arguments = Args.parse(allocator); - defer arguments.deinit(allocator); + defer arguments.deinit(); try switch(arguments) { .recognize => |*args| recognize(args, allocator), diff --git a/src/non-terminal.zig b/src/non-terminal.zig index eed30e3..1665a80 100644 --- a/src/non-terminal.zig +++ b/src/non-terminal.zig @@ -1,30 +1,36 @@ const std = @import("std"); const CharSet = @import("char-set.zig"); const Character = @import("character.zig").Character; +const string = @import("string.zig"); const Self = @This(); pub const Rule = []Character; -name: u8, +id: usize, +name: []const u8, rule_list: std.ArrayList(Rule), -first: CharSet, -follows: CharSet, +first: CharSet = CharSet {}, +follows: CharSet = CharSet {}, +allocator: std.mem.Allocator, -pub fn init(name: u8, allocator: std.mem.Allocator) Self { +pub fn init(name: []const u8, id: usize, allocator: std.mem.Allocator) !Self { + const target = try allocator.alloc(u8, name.len); + @memcpy(target, name); return Self { - .name = name, + .name = target, + .id = id, .rule_list = std.ArrayList(Rule).init(allocator), - .first = CharSet {}, - .follows = CharSet {}, + .allocator = allocator, }; } -pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { +pub fn deinit(self: *Self) void { for (self.rules()) |rule| { - allocator.free(rule); + self.allocator.free(rule); } self.rule_list.deinit(); + self.allocator.free(self.name); } pub inline fn rules(self: *Self) []Rule { @@ -35,32 +41,78 @@ pub fn add_rule(self: *Self, rule: Rule) !void { try self.rule_list.append(rule); } -pub fn parse_rule( +pub fn parse_name(buffer: []const u8) ![]const u8 { + const arrow_start = std.mem.indexOf(u8, buffer, "->") orelse return error.MissingArrow; + + if (arrow_start == 0) { + return error.MissingRuleName; + } + + return std.mem.trim(u8, buffer[0..arrow_start], &std.ascii.whitespace); +} + +pub fn parse_and_add_variants( + self: *Self, buffer: []const u8, - allocator: std.mem.Allocator -) !struct { u8, Rule } { + names: *const std.StringHashMap(usize), +) !void { + const arrow_start = std.mem.indexOf(u8, buffer, "->") orelse return error.MissingArrow; + var head: []const u8 = buffer[arrow_start + 2..]; - var rule = std.ArrayList(Character).init(allocator); + if (std.mem.trimLeft(u8, head, &std.ascii.whitespace).len == 0) { + return error.MissingBody; + } + + var rule = std.ArrayList(Character).init(self.allocator); errdefer rule.deinit(); - var tokens = std.mem.tokenizeAny(u8, buffer, &std.ascii.whitespace); + head = std.mem.trimLeft(u8, head, &std.ascii.whitespace); + while (head.len > 0) { + if (head[0] == '|') { + try self.rule_list.append(try rule.toOwnedSlice()); + rule = std.ArrayList(Character).init(self.allocator); + head = head[1..]; + } else if (head[0] == '\'') { + head = head[1..]; + + if (head.len > 0 and head[0] == '\'') { + try rule.append(Character { .epsilon = void{} }); + } + + while (head.len > 0 and head[0] != '\'') { + const char, head = try string.read_first_from_literal(head); + try rule.append(Character { .terminal = char }); + } + + if (head.len == 0) { + return error.UnterminatedString; + } + + head = head[1..]; + } else { - const lhs = tokens.next() orelse return error.MissingLhs; - if (lhs.len > 1) return error.InvalidLhs; + const child_name, head = string.read_name(head); - const arrow = tokens.next() orelse return error.MissingArrow; - if (!std.mem.eql(u8, arrow, "->")) return error.InvalidArrow; + if (child_name.len == 0) { + std.debug.print("{s}\n", .{buffer}); + return error.EmptyVariant; + } + + try rule.append(Character { + .non_terminal = names.get(child_name) orelse { + return error.MissingRule; + } + }); + } - while (tokens.next()) |token| { - if (token.len > 1) return error.InvalidCharacter; - try rule.append(Character.from_u8(token[0])); + head = std.mem.trimLeft(u8, head, &std.ascii.whitespace); } if (rule.items.len == 0) { - return error.EmptyProduction; + return error.EmptyVariant; } - return .{ lhs[0], try rule.toOwnedSlice() }; + try self.rule_list.append(try rule.toOwnedSlice()); } pub inline fn is_empty(self: *const Self) bool { @@ -76,20 +128,152 @@ pub fn format( _ = fmt; _ = options; - try writer.print("[{c} -> ", .{ self.name }); + try writer.print("{s} ->", .{ self.name }); if (self.rule_list.items.len > 0) { for (self.rule_list.items[0]) |c| { - try writer.print("{}", .{c}); + try writer.print(" {}", .{c}); } for (self.rule_list.items[1..]) |r| { - try writer.print(" | ", .{}); + try writer.print(" |", .{}); for (r) |c| { - try writer.print("{}", .{c}); + try writer.print(" {}", .{c}); } } } +} + +test "single-non-terminal" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + try names.put("one", 1); + + try non_terminal.parse_and_add_variants("main -> one", &names); + + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { Character { .non_terminal = 1 } } + }, non_terminal.rules()); +} + +test "multiple-non-terminal" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + try names.put("one", 1); + try names.put("two", 2); + try names.put("three", 3); + + try non_terminal.parse_and_add_variants("main -> one | two | three", &names); + + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { Character { .non_terminal = 1 } }, + &[_] Character { Character { .non_terminal = 2 } }, + &[_] Character { Character { .non_terminal = 3 } }, + }, non_terminal.rules()); +} + +test "multiple-non-terminal-sequence" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + try names.put("one", 1); + try names.put("two", 2); + try names.put("three", 3); + + try non_terminal.parse_and_add_variants("main -> one two | two | three", &names); + + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { Character { .non_terminal = 1 }, Character { .non_terminal = 2 }, }, + &[_] Character { Character { .non_terminal = 2 } }, + &[_] Character { Character { .non_terminal = 3 } }, + }, non_terminal.rules()); +} + +test "single-terminal" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + try names.put("one", 1); + + try non_terminal.parse_and_add_variants("main -> 'one'", &names); + + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { + Character { .terminal = 'o' }, + Character { .terminal = 'n' }, + Character { .terminal = 'e' } + }, + }, non_terminal.rules()); +} + +test "single-terminal-with-escapes" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + + try non_terminal.parse_and_add_variants("main -> 'o\\r\\nn\\'e\\\\'", &names); + + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { + Character { .terminal = 'o' }, + Character { .terminal = '\r' }, + Character { .terminal = '\n' }, + Character { .terminal = 'n' }, + Character { .terminal = '\'' }, + Character { .terminal = 'e' }, + Character { .terminal = '\\' }, + }, + }, non_terminal.rules()); +} + +test "mix" { + const allocator = std.testing.allocator; + var non_terminal = try Self.init("main", 0, allocator); + defer non_terminal.deinit(); + + var names = std.StringHashMap(usize).init(allocator); + defer names.deinit(); + try names.put("main", 0); + try names.put("other", 1); + try names.put("some", 2); + + try non_terminal.parse_and_add_variants("main -> 'ab' other | other some 'a' | main", &names); - try writer.writeByte(']'); + try std.testing.expectEqualDeep(&[_][]const Character{ + &[_] Character { + Character { .terminal = 'a' }, + Character { .terminal = 'b' }, + Character { .non_terminal = 1 }, + }, + &[_] Character { + Character { .non_terminal = 1 }, + Character { .non_terminal = 2 }, + Character { .terminal = 'a' }, + }, + &[_] Character { + Character { .non_terminal = 0 }, + }, + }, non_terminal.rules()); } diff --git a/src/recognizer.zig b/src/recognizer.zig index 1783db1..25a7944 100644 --- a/src/recognizer.zig +++ b/src/recognizer.zig @@ -8,8 +8,8 @@ const gss = @import("gss.zig"); const State = struct { const Self = @This(); - name: u8, - rule: NonTerminal.Rule, + id: usize, + rule_index: usize, inner_position: usize, input_position: usize, @@ -22,13 +22,34 @@ const State = struct { _ = fmt; _ = options; - try writer.print("[ {c}, {s}, {}, {} ]", .{ - self.name, - self.rule, + try writer.print("[ {}, {}, {}, {} ]", .{ + self.id, + self.rule_index, self.inner_position, self.input_position, }); } + + pub fn debug(self: *const Self, grammar: *Grammar, input: []const u8) void { + const rule = grammar.non_terminal_by_id(self.id).rules()[self.rule_index]; + std.debug.print("input ({s}○{s}) state (", .{ + input[0..self.input_position], + input[self.input_position..], + }); + + for (rule, 0..) |char, index| { + if (index == self.inner_position) { + std.debug.print("○", .{}); + } + + std.debug.print("{}", .{char}); + } + + if (rule.len == self.inner_position) { + std.debug.print("○", .{}); + } + std.debug.print(")\n", .{}); + } }; pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allocator) !bool { @@ -44,11 +65,11 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo defer for (queues) |queue| queue.deinit(); var processing_queue = &queues[0]; - for (entry.rules()) |rule| { + for (0..entry.rules().len) |index| { const node = try allocator.create(gss.Node(State)); node.* = gss.Node(State).init(State { - .name = entry.name, - .rule = rule, + .id = entry.id, + .rule_index = index, .inner_position = 0, .input_position = 0, }); @@ -60,10 +81,12 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo while (processing_queue.items.len > 0) { for (processing_queue.items) |node| { - if (node.state.inner_position == node.state.rule.len) { + const rule = grammar.non_terminal_by_id(node.state.id).rules()[node.state.rule_index]; + + if (node.state.inner_position == rule.len) { if ( node.parent == null and - node.state.name == entry.name and + node.state.id == entry.id and node.state.input_position == input.len ) { return true; @@ -72,8 +95,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo const old_state, const parent_node = node.pop(allocator); if (parent_node) |parent| { const sibling = try parent.clone(State { - .name = parent.state.name, - .rule = parent.state.rule, + .id = parent.state.id, + .rule_index = parent.state.rule_index, .inner_position = parent.state.inner_position + 1, .input_position = old_state.input_position, }, allocator); @@ -82,12 +105,12 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo } else { const epsilon_only = node.state.input_position == input.len; - switch (node.state.rule[node.state.inner_position]) { + switch (rule[node.state.inner_position]) { .terminal => |t| { if (!epsilon_only and input[node.state.input_position] == t) { const sibling = try node.clone(State { - .name = node.state.name, - .rule = node.state.rule, + .id = node.state.id, + .rule_index = node.state.rule_index, .inner_position = node.state.inner_position + 1, .input_position = node.state.input_position + 1, }, allocator); @@ -95,14 +118,13 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo } }, .non_terminal => |n| { - const non_terminal = grammar.non_terminal_by_name(n); - const rules = non_terminal.rules(); + const non_terminal = grammar.non_terminal_by_id(n); if (!epsilon_only and non_terminal.first.is_set(input[node.state.input_position])) { - for (rules) |rule| { + for (0..non_terminal.rules().len) |index| { const child = try node.push(State { - .name = non_terminal.name, - .rule = rule, + .id = non_terminal.id, + .rule_index = index, .inner_position = 0, .input_position = node.state.input_position, }, allocator); @@ -112,8 +134,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo if (non_terminal.first.is_set(Character.EPSILON)) { const sibling = try node.clone(State { - .name = node.state.name, - .rule = node.state.rule, + .id = node.state.id, + .rule_index = node.state.rule_index, .inner_position = node.state.inner_position + 1, .input_position = node.state.input_position }, allocator); @@ -122,8 +144,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo }, .epsilon => { const sibling = try node.clone(State { - .name = node.state.name, - .rule = node.state.rule, + .id = node.state.id, + .rule_index = node.state.rule_index, .inner_position = node.state.inner_position + 1, .input_position = node.state.input_position }, allocator); @@ -145,83 +167,83 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo test "expr" { const text = \\S -> B A - \\A -> + B A - \\A -> _ + \\A -> '+' B A + \\A -> '' \\B -> D C - \\C -> * D C - \\C -> _ - \\D -> ( S ) - \\D -> a - \\D -> b + \\C -> '*' D C + \\C -> '' + \\D -> '(' S ')' + \\D -> 'a' + \\D -> 'b' ; const input = "b+a*b"; const allocator = std.testing.allocator; - var grammar = try Grammar.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Grammar.parse("S", text, allocator); + defer grammar.deinit(); try std.testing.expect(try check(&grammar, input, allocator)); } test "simple 0 - success" { const text = - \\S -> A S d + \\S -> A S 'd' \\S -> B S - \\S -> _ - \\A -> a - \\A -> c - \\B -> a - \\B -> b + \\S -> '' + \\A -> 'a' + \\A -> 'c' + \\B -> 'a' + \\B -> 'b' ; const input = "aad"; const allocator = std.testing.allocator; - var grammar = try Grammar.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Grammar.parse("S", text, allocator); + defer grammar.deinit(); try std.testing.expect(try check(&grammar, input, allocator)); } test "simple 0 - fail" { const text = - \\S -> A S d + \\S -> A S 'd' \\S -> B S - \\S -> _ - \\A -> a - \\A -> c - \\B -> a - \\B -> b + \\S -> '' + \\A -> 'a' + \\A -> 'c' + \\B -> 'a' + \\B -> 'b' ; const input = "accd"; const allocator = std.testing.allocator; - var grammar = try Grammar.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Grammar.parse("S", text, allocator); + defer grammar.deinit(); try std.testing.expect(!try check(&grammar, input, allocator)); } test "simple 0 - fuzzy" { const text = - \\S -> A S d + \\S -> A S 'd' \\S -> B S - \\S -> _ - \\A -> a - \\A -> c - \\B -> a - \\B -> b + \\S -> '' + \\A -> 'a' + \\A -> 'c' + \\B -> 'a' + \\B -> 'b' ; const allocator = std.testing.allocator; - var grammar = try Grammar.parse(text, allocator); - defer grammar.deinit(allocator); + var grammar = try Grammar.parse("S", text, allocator); + defer grammar.deinit(); var generator = Generator(struct { const Self = @This(); diff --git a/src/string.zig b/src/string.zig new file mode 100644 index 0000000..8217a77 --- /dev/null +++ b/src/string.zig @@ -0,0 +1,41 @@ +const std = @import("std"); + +pub fn isNameChar(char: u8) bool { + return switch(char) { + '0'...'9', 'A'...'Z', 'a'...'z', '_' => true, + else => false + }; +} + +pub fn read_name(buffer: []const u8) struct { []const u8, []const u8 } { + for (buffer, 0..) |char, index| { + if (!isNameChar(char)) { + return .{ buffer[0..index], buffer[index..] }; + } + } + + return .{ buffer[0..buffer.len], buffer[buffer.len..] }; +} + +pub fn read_first_from_literal(buffer: []const u8) !struct { u8, []const u8 } { + if (buffer[0] != '\\') { + return .{ buffer[0], buffer[1..] } ; + } + + if (buffer.len > 1 and buffer[1] != 'x') { + return switch(buffer[1]) { + 't' => return .{ '\t', buffer[2..] }, + 'n' => return .{ '\n', buffer[2..] }, + 'r' => return .{ '\r', buffer[2..] }, + '\'' => return .{ '\'', buffer[2..] }, + '\\' => return .{ '\\', buffer[2..] }, + else => return error.InvalidEscapeSequence, + }; + } + + if (buffer.len > 4 and buffer[1] == 'x') { + return .{ try std.fmt.parseInt(u8, buffer[2..4], 16), buffer[4..] }; + } + + return error.InvalidEscapeSequnce; +} |