aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorNathan Reiner <nathan@nathanreiner.xyz>2025-05-06 10:43:09 +0200
committerNathan Reiner <nathan@nathanreiner.xyz>2025-05-06 10:43:09 +0200
commiteff19cc15a9bf4df60e7f90c3a7ee525c65266c0 (patch)
treed6034b574e8e2218054d42caadbf25fa17862abf /src
parente2f01d5df22704bfe62396a0f9a260f86edbde0e (diff)
add full grammar parsing
Diffstat (limited to 'src')
-rw-r--r--src/argument.zig21
-rw-r--r--src/char-set.zig8
-rw-r--r--src/character.zig26
-rw-r--r--src/generator.zig6
-rw-r--r--src/grammar.zig248
-rw-r--r--src/main.zig3
-rw-r--r--src/non-terminal.zig240
-rw-r--r--src/recognizer.zig136
-rw-r--r--src/string.zig41
9 files changed, 521 insertions, 208 deletions
diff --git a/src/argument.zig b/src/argument.zig
index b553352..a9c823d 100644
--- a/src/argument.zig
+++ b/src/argument.zig
@@ -15,6 +15,8 @@ fn help(err: ?anyerror) noreturn {
\\
\\ generate [grammar] [options]
\\ Options:
+ \\ -e, --entry label Name of the entry point. (default: main)
+ \\
\\ -o, --output entry Output string to file
\\ By default stdout will be used.
\\
@@ -26,6 +28,8 @@ fn help(err: ?anyerror) noreturn {
\\
\\ recognize [grammar] [options]
\\ Options:
+ \\ -e, --entry label Name of the entry point. (default: main)
+ \\
\\ -i, --input entry Specify input source, if the path
\\ points to a directory it will scan
\\ all files in it. By default stdin will
@@ -114,6 +118,7 @@ pub const Entry = struct {
pub const RecognizeArgs = struct {
input: Entry,
grammar: Grammar,
+ entry: []const u8,
};
pub const GenerateArgs = struct {
@@ -121,6 +126,7 @@ pub const GenerateArgs = struct {
min_length: usize,
output: Entry,
grammar: Grammar,
+ entry: []const u8,
};
pub const Args = union(Mode) {
@@ -138,6 +144,7 @@ pub const Args = union(Mode) {
const text = Entry.read_file(check(args.next()), allocator);
defer allocator.free(text);
const grammar = Grammar.parse(
+ "main",
text,
allocator
) catch |e| help(e);
@@ -145,10 +152,13 @@ pub const Args = union(Mode) {
switch (mode) {
.recognize => {
var input: ?Entry = null;
+ var entry: []const u8 = "main";
while (args.next()) |arg| {
if (check_flags(arg, &[_][]const u8 { "-i", "--input" })) {
input = Entry.open(check(args.next()), false);
+ } else if (check_flags(arg, &[_][]const u8 { "-e", "--entry" })) {
+ entry = check(args.next());
} else help(error.InvalidArgument);
}
@@ -156,6 +166,7 @@ pub const Args = union(Mode) {
.recognize = .{
.input = input orelse Entry.stdin(),
.grammar = grammar,
+ .entry = entry,
},
};
},
@@ -164,6 +175,7 @@ pub const Args = union(Mode) {
var count: usize = 1;
var output: ?Entry = null;
var min_length: usize = 0;
+ var entry: []const u8 = "main";
while (args.next()) |arg| {
if (check_flags(arg, &[_][]const u8 { "-o", "--output" })) {
@@ -174,6 +186,8 @@ pub const Args = union(Mode) {
min_length = 1;
} else if (check_flags(arg, &[_][]const u8 { "-m", "--min-length" })) {
min_length = parse_int(check(args.next()));
+ } else if (check_flags(arg, &[_][]const u8 { "-e", "--entry" })) {
+ entry = check(args.next());
} else help(error.InvalidArgument);
}
@@ -183,19 +197,20 @@ pub const Args = union(Mode) {
.min_length = min_length,
.output = output orelse Entry.stdout(),
.grammar = grammar,
+ .entry = entry,
},
};
},
}
}
- pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+ pub fn deinit(self: *Self) void {
switch (self.*) {
.recognize => |*rec| {
- rec.grammar.deinit(allocator);
+ rec.grammar.deinit();
},
.generate => |*gen| {
- gen.grammar.deinit(allocator);
+ gen.grammar.deinit();
}
}
}
diff --git a/src/char-set.zig b/src/char-set.zig
index d244540..55c719c 100644
--- a/src/char-set.zig
+++ b/src/char-set.zig
@@ -4,7 +4,7 @@ const Character = @import("character.zig").Character;
const Self = @This();
-charset: [256]bool = std.mem.zeroes([256]bool),
+charset: [258]bool = std.mem.zeroes([258]bool),
pub inline fn is_set(self: *const Self, c: usize) bool {
return self.charset[c];
@@ -48,8 +48,8 @@ pub fn format(
try writer.print("}}", .{});
}
-pub fn expect(self: *const Self, expected: []const u8) !void {
- var matrix = std.mem.zeroes([255]bool);
+pub fn expect(self: *const Self, expected: []const u9) !void {
+ var matrix = std.mem.zeroes([258]bool);
for (expected) |c| {
matrix[c] = true;
@@ -65,7 +65,7 @@ pub fn expect(self: *const Self, expected: []const u8) !void {
} else if (c == Character.END) {
std.debug.print("$ ", .{});
} else {
- std.debug.print("'{c}' ", .{c});
+ std.debug.print("'{c}' ", .{@as(u8, @truncate(c))});
}
}
diff --git a/src/character.zig b/src/character.zig
index bd11ccd..10dd254 100644
--- a/src/character.zig
+++ b/src/character.zig
@@ -3,22 +3,12 @@ const std = @import("std");
pub const Character = union(enum) {
const Self = @This();
- pub const EPSILON: u8 = '_';
- pub const END: u8 = 0;
+ pub const EPSILON: u9 = std.math.maxInt(u8) + 1;
+ pub const END: u9 = std.math.maxInt(u8) + 2;
epsilon: void,
terminal: u8,
- non_terminal: u8,
-
- pub fn from_u8(c: u8) Self {
- if (c == '_') {
- return Self { .epsilon = void{} };
- } else if (std.ascii.isUpper(c)) {
- return Self { .non_terminal = c };
- } else {
- return Self { .terminal = c };
- }
- }
+ non_terminal: usize,
pub fn format(
self: *const Self,
@@ -31,8 +21,14 @@ pub const Character = union(enum) {
switch (self.*) {
.epsilon => _ = try writer.writeAll("ε"),
- .terminal => |c| try writer.writeByte(c),
- .non_terminal => |c| try writer.writeByte(c),
+ .terminal => |c| {
+ if (c == END) {
+ try writer.writeAll("$");
+ } else {
+ try writer.print("'{c}'", .{c});
+ }
+ },
+ .non_terminal => |index| try writer.print("<{}>", .{index}),
}
}
};
diff --git a/src/generator.zig b/src/generator.zig
index 8d0c888..0676834 100644
--- a/src/generator.zig
+++ b/src/generator.zig
@@ -22,7 +22,7 @@ pub fn Generator(T: type) type {
self.generate_sentential_recursive(
grammar,
&string,
- grammar.entry_point().name,
+ grammar.entry_point().id,
max_depth
) catch |err| switch (err) {
error.MaxDepth => {
@@ -41,14 +41,14 @@ pub fn Generator(T: type) type {
self: *Self,
grammar: *Grammar,
string: *std.ArrayList(u8),
- n: u8,
+ id: usize,
max_depth: usize
) GenError!void {
if (max_depth == 0) {
return error.MaxDepth;
}
- const rules = grammar.non_terminal_by_name(n).rules();
+ const rules = grammar.non_terminal_by_id(id).rules();
const index = self.inner.next(rules.len);
const rule = rules[index];
diff --git a/src/grammar.zig b/src/grammar.zig
index ed7eee6..298932c 100644
--- a/src/grammar.zig
+++ b/src/grammar.zig
@@ -5,24 +5,47 @@ const NonTerminal = @import("non-terminal.zig");
const Self = @This();
-non_terminals: [26]NonTerminal,
+non_terminals: []NonTerminal,
+entry_id: usize,
+allocator: std.mem.Allocator,
-pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self {
+pub fn parse(entry: []const u8, buffer: []const u8, allocator: std.mem.Allocator) !Self {
var lines = std.mem.tokenizeScalar(u8, buffer, '\n');
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
- var self: Self = undefined;
- errdefer self.deinit(allocator);
+ while (lines.next()) |line| {
+ const name = try NonTerminal.parse_name(line);
+
+ if (!names.contains(name)) {
+ try names.put(name, names.count());
+ }
+ }
+
+ lines = std.mem.tokenizeScalar(u8, buffer, '\n');
+ var self = Self {
+ .non_terminals = try allocator.alloc(NonTerminal, names.count()),
+ .entry_id = names.get(entry) orelse return error.MissingRule,
+ .allocator = allocator,
+ };
+ errdefer self.deinit();
- for (&self.non_terminals, 'A'..) |*non_terminal, name| {
- non_terminal.* = NonTerminal.init(
- @truncate(name),
- allocator
- );
+ {
+ var iterator = names.iterator();
+ while (iterator.next()) |name_entry| {
+ self.non_terminals[name_entry.value_ptr.*] = try NonTerminal.init(
+ name_entry.key_ptr.*,
+ name_entry.value_ptr.*,
+ allocator
+ );
+ }
}
while (lines.next()) |line| {
- const name, const rule = try NonTerminal.parse_rule(line, allocator);
- try self.non_terminal_by_name(name).add_rule(rule);
+ const name = try NonTerminal.parse_name(line);
+ try self.non_terminal_by_id(
+ names.get(name) orelse return error.MissingRule
+ ).parse_and_add_variants(line, &names);
}
self.generate_first();
@@ -31,20 +54,21 @@ pub fn parse(buffer: []const u8, allocator: std.mem.Allocator) !Self {
return self;
}
-pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
- for (&self.non_terminals) |*non_terminal| {
- non_terminal.deinit(allocator);
+pub fn deinit(self: *Self) void {
+ for (self.non_terminals) |*non_terminal| {
+ non_terminal.deinit();
}
+ self.allocator.free(self.non_terminals);
self.* = undefined;
}
-pub inline fn non_terminal_by_name(self: *Self, name: u8) *NonTerminal {
- return &self.non_terminals[name - 'A'];
+pub inline fn non_terminal_by_id(self: *Self, id: usize) *NonTerminal {
+ return &self.non_terminals[id];
}
pub inline fn entry_point(self: *Self) *NonTerminal {
- return self.non_terminal_by_name('S');
+ return self.non_terminal_by_id(self.entry_id);
}
fn generate_first(self: *Self) void {
@@ -53,7 +77,7 @@ fn generate_first(self: *Self) void {
while (modified) {
modified = false;
- for (&self.non_terminals) |*non_terminal| {
+ for (self.non_terminals) |*non_terminal| {
for (non_terminal.rules()) |rule| {
switch (rule[0]) {
.terminal => |t| {
@@ -62,7 +86,6 @@ fn generate_first(self: *Self) void {
},
.non_terminal, .epsilon => {
- if (non_terminal.first.is_set(Character.EPSILON)) continue;
var insert_epsilon = true;
for (rule) |*char| {
@@ -74,7 +97,7 @@ fn generate_first(self: *Self) void {
break;
},
.non_terminal => |n| {
- const first = self.non_terminal_by_name(n).first;
+ const first = self.non_terminal_by_id(n).first;
for (first.charset, 0..) |child_first, index| {
if (index == Character.EPSILON) continue;
@@ -91,7 +114,7 @@ fn generate_first(self: *Self) void {
}
}
- if (insert_epsilon) {
+ if (insert_epsilon and !non_terminal.first.is_set(Character.EPSILON)) {
modified = true;
non_terminal.first.set(Character.EPSILON);
}
@@ -101,21 +124,20 @@ fn generate_first(self: *Self) void {
}
}
}
-
fn generate_follows(self: *Self) void {
- self.non_terminal_by_name('S').follows.set(Character.END);
+ self.entry_point().follows.set(Character.END);
var modified = true;
while (modified) {
modified = false;
- for (&self.non_terminals) |*non_terminal| {
+ for (self.non_terminals) |*non_terminal| {
for (non_terminal.rules()) |rule| {
for (0..rule.len) |character_index| {
switch (rule[character_index]) {
.non_terminal => |n| {
- const current_non_terminal = self.non_terminal_by_name(n);
+ const current_non_terminal = self.non_terminal_by_id(n);
const ends_with_epsilon = brk: {
for (character_index + 1..rule.len) |peek_index| {
switch (rule[peek_index]) {
@@ -125,7 +147,7 @@ fn generate_follows(self: *Self) void {
break :brk false;
},
.non_terminal => |peek_n| {
- const peek_non_terminal = self.non_terminal_by_name(peek_n);
+ const peek_non_terminal = self.non_terminal_by_id(peek_n);
for (peek_non_terminal.first.charset, 0..) |is_set, index| {
if (Character.EPSILON == index) continue;
@@ -159,112 +181,144 @@ fn generate_follows(self: *Self) void {
}
}
+pub fn format(
+ self: *const Self,
+ comptime fmt: []const u8,
+ options: std.fmt.FormatOptions,
+ writer: anytype,
+) !void {
+ _ = fmt;
+ _ = options;
+
+ for (self.non_terminals) |non_terminal| {
+ try writer.print("[{}] ", .{non_terminal});
+ }
+}
+
test "expr" {
const text =
\\S -> B A
- \\A -> + B A
- \\A -> _
+ \\A -> '+' B A
+ \\A -> ''
\\B -> D C
- \\C -> * D C
- \\C -> _
- \\D -> ( S )
- \\D -> a
- \\D -> b
+ \\C -> '*' D C
+ \\C -> ''
+ \\D -> '(' S ')'
+ \\D -> 'a' | 'b'
;
const allocator = std.testing.allocator;
- var grammar = try Self.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Self.parse("S", text, allocator);
+ defer grammar.deinit();
- try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { '(', 'a', 'b' });
- try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { '+', Character.EPSILON });
- try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { '(', 'a', 'b' });
- try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { '*', Character.EPSILON });
- try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { '(', 'a', 'b' });
+ try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { '(', 'a', 'b' });
+ try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { '+', Character.EPSILON });
+ try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { '(', 'a', 'b' });
+ try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { '*', Character.EPSILON });
+ try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { '(', 'a', 'b' });
- try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { ')', Character.END });
- try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { ')', Character.END });
- try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { '+', ')', Character.END });
- try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { '+', ')', Character.END });
- try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { '*', '+', ')', Character.END });
+ try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { ')', Character.END });
+ try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { ')', Character.END });
+ try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { '+', ')', Character.END });
+ try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { '+', ')', Character.END });
+ try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { '*', '+', ')', Character.END });
}
test "sample 0" {
const text =
- \\S -> A C B
- \\S -> C b b
- \\S -> B a
- \\A -> d a
- \\A -> B C
- \\B -> g
- \\B -> _
- \\C -> h
- \\C -> _
+ \\S -> A C B | C 'bb' | B 'a'
+ \\A -> 'da' | B C
+ \\B -> 'g' | ''
+ \\C -> 'h' | ''
;
const allocator = std.testing.allocator;
- var grammar = try Self.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Self.parse("S", text, allocator);
+ defer grammar.deinit();
- try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON });
- try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'd', 'g', 'h', Character.EPSILON });
- try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'g', Character.EPSILON });
- try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'h', Character.EPSILON });
+ try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'd', 'g', 'h', 'b', 'a', Character.EPSILON });
+ try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'd', 'g', 'h', Character.EPSILON });
+ try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'g', Character.EPSILON });
+ try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'h', Character.EPSILON });
- try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END });
- try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'g', 'h', Character.END });
- try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'a', 'g', 'h', Character.END });
- try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { 'b', 'g', 'h', Character.END });
+ try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END });
+ try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'g', 'h', Character.END });
+ try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'g', 'h', Character.END });
+ try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { 'b', 'g', 'h', Character.END });
}
test "sample 1" {
const text =
- \\S -> a A B b
- \\A -> c
- \\A -> _
- \\B -> d
- \\B -> _
+ \\S -> 'a' A B 'b'
+ \\A -> 'c'
+ \\A -> ''
+ \\B -> 'd'
+ \\B -> ''
;
const allocator = std.testing.allocator;
- var grammar = try Self.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Self.parse("S", text, allocator);
+ defer grammar.deinit();
- try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a' });
- try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'c', Character.EPSILON });
- try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd', Character.EPSILON });
+ try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a' });
+ try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'c', Character.EPSILON });
+ try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd', Character.EPSILON });
- try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END });
- try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'b', 'd' });
- try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { 'b' });
+ try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END });
+ try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'b', 'd' });
+ try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'b' });
}
test "sample 2" {
const text =
\\S -> A B
- \\S -> e D a
- \\A -> a b
- \\A -> c
- \\B -> d C
- \\C -> e C
- \\C -> _
- \\D -> f D
- \\D -> _
+ \\S -> 'e' D 'a'
+ \\A -> 'ab'
+ \\A -> 'c'
+ \\B -> 'd' C
+ \\C -> 'e' C
+ \\C -> ''
+ \\D -> 'f' D
+ \\D -> ''
+ ;
+
+ const allocator = std.testing.allocator;
+ var grammar = try Self.parse("S", text, allocator);
+ defer grammar.deinit();
+
+ try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'c', 'e' });
+ try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' });
+ try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'd' });
+ try grammar.non_terminal_by_id(3).first.expect(&[_]u9 { 'e', Character.EPSILON });
+ try grammar.non_terminal_by_id(4).first.expect(&[_]u9 { 'f', Character.EPSILON });
+
+ try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { Character.END });
+ try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'd' });
+ try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { Character.END });
+ try grammar.non_terminal_by_id(3).follows.expect(&[_]u9 { Character.END });
+ try grammar.non_terminal_by_id(4).follows.expect(&[_]u9 { 'a' });
+}
+
+test "sample 3" {
+ const text =
+ \\S -> A S 'd'
+ \\S -> B S
+ \\S -> ''
+ \\A -> 'a'
+ \\A -> 'c'
+ \\B -> 'a'
+ \\B -> 'b'
;
const allocator = std.testing.allocator;
- var grammar = try Self.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Self.parse("S", text, allocator);
+ defer grammar.deinit();
- try grammar.non_terminal_by_name('S').first.expect(&[_]u8 { 'a', 'c', 'e' });
- try grammar.non_terminal_by_name('A').first.expect(&[_]u8 { 'a', 'c' });
- try grammar.non_terminal_by_name('B').first.expect(&[_]u8 { 'd' });
- try grammar.non_terminal_by_name('C').first.expect(&[_]u8 { 'e', Character.EPSILON });
- try grammar.non_terminal_by_name('D').first.expect(&[_]u8 { 'f', Character.EPSILON });
+ try grammar.non_terminal_by_id(0).first.expect(&[_]u9 { 'a', 'b', 'c', Character.EPSILON });
+ try grammar.non_terminal_by_id(1).first.expect(&[_]u9 { 'a', 'c' });
+ try grammar.non_terminal_by_id(2).first.expect(&[_]u9 { 'a', 'b' });
- try grammar.non_terminal_by_name('S').follows.expect(&[_]u8 { Character.END });
- try grammar.non_terminal_by_name('A').follows.expect(&[_]u8 { 'd' });
- try grammar.non_terminal_by_name('B').follows.expect(&[_]u8 { Character.END });
- try grammar.non_terminal_by_name('C').follows.expect(&[_]u8 { Character.END });
- try grammar.non_terminal_by_name('D').follows.expect(&[_]u8 { 'a' });
+ try grammar.non_terminal_by_id(0).follows.expect(&[_]u9 { 'd', Character.END });
+ try grammar.non_terminal_by_id(1).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd' });
+ try grammar.non_terminal_by_id(2).follows.expect(&[_]u9 { 'a', 'b', 'c', 'd', Character.END });
}
diff --git a/src/main.zig b/src/main.zig
index ac52f8a..f350154 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -77,6 +77,7 @@ fn recognize(args: *RecognizeArgs, allocator: std.mem.Allocator) !void {
index += 1;
if (args.input.file.isTty()) {
+ try bufwriter.flush();
try stderr.writeAll("> ");
}
}
@@ -141,7 +142,7 @@ pub fn main() !void {
}
var arguments = Args.parse(allocator);
- defer arguments.deinit(allocator);
+ defer arguments.deinit();
try switch(arguments) {
.recognize => |*args| recognize(args, allocator),
diff --git a/src/non-terminal.zig b/src/non-terminal.zig
index eed30e3..1665a80 100644
--- a/src/non-terminal.zig
+++ b/src/non-terminal.zig
@@ -1,30 +1,36 @@
const std = @import("std");
const CharSet = @import("char-set.zig");
const Character = @import("character.zig").Character;
+const string = @import("string.zig");
const Self = @This();
pub const Rule = []Character;
-name: u8,
+id: usize,
+name: []const u8,
rule_list: std.ArrayList(Rule),
-first: CharSet,
-follows: CharSet,
+first: CharSet = CharSet {},
+follows: CharSet = CharSet {},
+allocator: std.mem.Allocator,
-pub fn init(name: u8, allocator: std.mem.Allocator) Self {
+pub fn init(name: []const u8, id: usize, allocator: std.mem.Allocator) !Self {
+ const target = try allocator.alloc(u8, name.len);
+ @memcpy(target, name);
return Self {
- .name = name,
+ .name = target,
+ .id = id,
.rule_list = std.ArrayList(Rule).init(allocator),
- .first = CharSet {},
- .follows = CharSet {},
+ .allocator = allocator,
};
}
-pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+pub fn deinit(self: *Self) void {
for (self.rules()) |rule| {
- allocator.free(rule);
+ self.allocator.free(rule);
}
self.rule_list.deinit();
+ self.allocator.free(self.name);
}
pub inline fn rules(self: *Self) []Rule {
@@ -35,32 +41,78 @@ pub fn add_rule(self: *Self, rule: Rule) !void {
try self.rule_list.append(rule);
}
-pub fn parse_rule(
+pub fn parse_name(buffer: []const u8) ![]const u8 {
+ const arrow_start = std.mem.indexOf(u8, buffer, "->") orelse return error.MissingArrow;
+
+ if (arrow_start == 0) {
+ return error.MissingRuleName;
+ }
+
+ return std.mem.trim(u8, buffer[0..arrow_start], &std.ascii.whitespace);
+}
+
+pub fn parse_and_add_variants(
+ self: *Self,
buffer: []const u8,
- allocator: std.mem.Allocator
-) !struct { u8, Rule } {
+ names: *const std.StringHashMap(usize),
+) !void {
+ const arrow_start = std.mem.indexOf(u8, buffer, "->") orelse return error.MissingArrow;
+ var head: []const u8 = buffer[arrow_start + 2..];
- var rule = std.ArrayList(Character).init(allocator);
+ if (std.mem.trimLeft(u8, head, &std.ascii.whitespace).len == 0) {
+ return error.MissingBody;
+ }
+
+ var rule = std.ArrayList(Character).init(self.allocator);
errdefer rule.deinit();
- var tokens = std.mem.tokenizeAny(u8, buffer, &std.ascii.whitespace);
+ head = std.mem.trimLeft(u8, head, &std.ascii.whitespace);
+ while (head.len > 0) {
+ if (head[0] == '|') {
+ try self.rule_list.append(try rule.toOwnedSlice());
+ rule = std.ArrayList(Character).init(self.allocator);
+ head = head[1..];
+ } else if (head[0] == '\'') {
+ head = head[1..];
+
+ if (head.len > 0 and head[0] == '\'') {
+ try rule.append(Character { .epsilon = void{} });
+ }
+
+ while (head.len > 0 and head[0] != '\'') {
+ const char, head = try string.read_first_from_literal(head);
+ try rule.append(Character { .terminal = char });
+ }
+
+ if (head.len == 0) {
+ return error.UnterminatedString;
+ }
+
+ head = head[1..];
+ } else {
- const lhs = tokens.next() orelse return error.MissingLhs;
- if (lhs.len > 1) return error.InvalidLhs;
+ const child_name, head = string.read_name(head);
- const arrow = tokens.next() orelse return error.MissingArrow;
- if (!std.mem.eql(u8, arrow, "->")) return error.InvalidArrow;
+ if (child_name.len == 0) {
+ std.debug.print("{s}\n", .{buffer});
+ return error.EmptyVariant;
+ }
+
+ try rule.append(Character {
+ .non_terminal = names.get(child_name) orelse {
+ return error.MissingRule;
+ }
+ });
+ }
- while (tokens.next()) |token| {
- if (token.len > 1) return error.InvalidCharacter;
- try rule.append(Character.from_u8(token[0]));
+ head = std.mem.trimLeft(u8, head, &std.ascii.whitespace);
}
if (rule.items.len == 0) {
- return error.EmptyProduction;
+ return error.EmptyVariant;
}
- return .{ lhs[0], try rule.toOwnedSlice() };
+ try self.rule_list.append(try rule.toOwnedSlice());
}
pub inline fn is_empty(self: *const Self) bool {
@@ -76,20 +128,152 @@ pub fn format(
_ = fmt;
_ = options;
- try writer.print("[{c} -> ", .{ self.name });
+ try writer.print("{s} ->", .{ self.name });
if (self.rule_list.items.len > 0) {
for (self.rule_list.items[0]) |c| {
- try writer.print("{}", .{c});
+ try writer.print(" {}", .{c});
}
for (self.rule_list.items[1..]) |r| {
- try writer.print(" | ", .{});
+ try writer.print(" |", .{});
for (r) |c| {
- try writer.print("{}", .{c});
+ try writer.print(" {}", .{c});
}
}
}
+}
+
+test "single-non-terminal" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+ try names.put("one", 1);
+
+ try non_terminal.parse_and_add_variants("main -> one", &names);
+
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character { Character { .non_terminal = 1 } }
+ }, non_terminal.rules());
+}
+
+test "multiple-non-terminal" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+ try names.put("one", 1);
+ try names.put("two", 2);
+ try names.put("three", 3);
+
+ try non_terminal.parse_and_add_variants("main -> one | two | three", &names);
+
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character { Character { .non_terminal = 1 } },
+ &[_] Character { Character { .non_terminal = 2 } },
+ &[_] Character { Character { .non_terminal = 3 } },
+ }, non_terminal.rules());
+}
+
+test "multiple-non-terminal-sequence" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+ try names.put("one", 1);
+ try names.put("two", 2);
+ try names.put("three", 3);
+
+ try non_terminal.parse_and_add_variants("main -> one two | two | three", &names);
+
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character { Character { .non_terminal = 1 }, Character { .non_terminal = 2 }, },
+ &[_] Character { Character { .non_terminal = 2 } },
+ &[_] Character { Character { .non_terminal = 3 } },
+ }, non_terminal.rules());
+}
+
+test "single-terminal" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+ try names.put("one", 1);
+
+ try non_terminal.parse_and_add_variants("main -> 'one'", &names);
+
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character {
+ Character { .terminal = 'o' },
+ Character { .terminal = 'n' },
+ Character { .terminal = 'e' }
+ },
+ }, non_terminal.rules());
+}
+
+test "single-terminal-with-escapes" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+
+ try non_terminal.parse_and_add_variants("main -> 'o\\r\\nn\\'e\\\\'", &names);
+
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character {
+ Character { .terminal = 'o' },
+ Character { .terminal = '\r' },
+ Character { .terminal = '\n' },
+ Character { .terminal = 'n' },
+ Character { .terminal = '\'' },
+ Character { .terminal = 'e' },
+ Character { .terminal = '\\' },
+ },
+ }, non_terminal.rules());
+}
+
+test "mix" {
+ const allocator = std.testing.allocator;
+ var non_terminal = try Self.init("main", 0, allocator);
+ defer non_terminal.deinit();
+
+ var names = std.StringHashMap(usize).init(allocator);
+ defer names.deinit();
+ try names.put("main", 0);
+ try names.put("other", 1);
+ try names.put("some", 2);
+
+ try non_terminal.parse_and_add_variants("main -> 'ab' other | other some 'a' | main", &names);
- try writer.writeByte(']');
+ try std.testing.expectEqualDeep(&[_][]const Character{
+ &[_] Character {
+ Character { .terminal = 'a' },
+ Character { .terminal = 'b' },
+ Character { .non_terminal = 1 },
+ },
+ &[_] Character {
+ Character { .non_terminal = 1 },
+ Character { .non_terminal = 2 },
+ Character { .terminal = 'a' },
+ },
+ &[_] Character {
+ Character { .non_terminal = 0 },
+ },
+ }, non_terminal.rules());
}
diff --git a/src/recognizer.zig b/src/recognizer.zig
index 1783db1..25a7944 100644
--- a/src/recognizer.zig
+++ b/src/recognizer.zig
@@ -8,8 +8,8 @@ const gss = @import("gss.zig");
const State = struct {
const Self = @This();
- name: u8,
- rule: NonTerminal.Rule,
+ id: usize,
+ rule_index: usize,
inner_position: usize,
input_position: usize,
@@ -22,13 +22,34 @@ const State = struct {
_ = fmt;
_ = options;
- try writer.print("[ {c}, {s}, {}, {} ]", .{
- self.name,
- self.rule,
+ try writer.print("[ {}, {}, {}, {} ]", .{
+ self.id,
+ self.rule_index,
self.inner_position,
self.input_position,
});
}
+
+ pub fn debug(self: *const Self, grammar: *Grammar, input: []const u8) void {
+ const rule = grammar.non_terminal_by_id(self.id).rules()[self.rule_index];
+ std.debug.print("input ({s}○{s}) state (", .{
+ input[0..self.input_position],
+ input[self.input_position..],
+ });
+
+ for (rule, 0..) |char, index| {
+ if (index == self.inner_position) {
+ std.debug.print("○", .{});
+ }
+
+ std.debug.print("{}", .{char});
+ }
+
+ if (rule.len == self.inner_position) {
+ std.debug.print("○", .{});
+ }
+ std.debug.print(")\n", .{});
+ }
};
pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allocator) !bool {
@@ -44,11 +65,11 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
defer for (queues) |queue| queue.deinit();
var processing_queue = &queues[0];
- for (entry.rules()) |rule| {
+ for (0..entry.rules().len) |index| {
const node = try allocator.create(gss.Node(State));
node.* = gss.Node(State).init(State {
- .name = entry.name,
- .rule = rule,
+ .id = entry.id,
+ .rule_index = index,
.inner_position = 0,
.input_position = 0,
});
@@ -60,10 +81,12 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
while (processing_queue.items.len > 0) {
for (processing_queue.items) |node| {
- if (node.state.inner_position == node.state.rule.len) {
+ const rule = grammar.non_terminal_by_id(node.state.id).rules()[node.state.rule_index];
+
+ if (node.state.inner_position == rule.len) {
if (
node.parent == null and
- node.state.name == entry.name and
+ node.state.id == entry.id and
node.state.input_position == input.len
) {
return true;
@@ -72,8 +95,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
const old_state, const parent_node = node.pop(allocator);
if (parent_node) |parent| {
const sibling = try parent.clone(State {
- .name = parent.state.name,
- .rule = parent.state.rule,
+ .id = parent.state.id,
+ .rule_index = parent.state.rule_index,
.inner_position = parent.state.inner_position + 1,
.input_position = old_state.input_position,
}, allocator);
@@ -82,12 +105,12 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
} else {
const epsilon_only = node.state.input_position == input.len;
- switch (node.state.rule[node.state.inner_position]) {
+ switch (rule[node.state.inner_position]) {
.terminal => |t| {
if (!epsilon_only and input[node.state.input_position] == t) {
const sibling = try node.clone(State {
- .name = node.state.name,
- .rule = node.state.rule,
+ .id = node.state.id,
+ .rule_index = node.state.rule_index,
.inner_position = node.state.inner_position + 1,
.input_position = node.state.input_position + 1,
}, allocator);
@@ -95,14 +118,13 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
}
},
.non_terminal => |n| {
- const non_terminal = grammar.non_terminal_by_name(n);
- const rules = non_terminal.rules();
+ const non_terminal = grammar.non_terminal_by_id(n);
if (!epsilon_only and non_terminal.first.is_set(input[node.state.input_position])) {
- for (rules) |rule| {
+ for (0..non_terminal.rules().len) |index| {
const child = try node.push(State {
- .name = non_terminal.name,
- .rule = rule,
+ .id = non_terminal.id,
+ .rule_index = index,
.inner_position = 0,
.input_position = node.state.input_position,
}, allocator);
@@ -112,8 +134,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
if (non_terminal.first.is_set(Character.EPSILON)) {
const sibling = try node.clone(State {
- .name = node.state.name,
- .rule = node.state.rule,
+ .id = node.state.id,
+ .rule_index = node.state.rule_index,
.inner_position = node.state.inner_position + 1,
.input_position = node.state.input_position
}, allocator);
@@ -122,8 +144,8 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
},
.epsilon => {
const sibling = try node.clone(State {
- .name = node.state.name,
- .rule = node.state.rule,
+ .id = node.state.id,
+ .rule_index = node.state.rule_index,
.inner_position = node.state.inner_position + 1,
.input_position = node.state.input_position
}, allocator);
@@ -145,83 +167,83 @@ pub fn check(grammar: *Grammar, input: []const u8, inner_allocator: std.mem.Allo
test "expr" {
const text =
\\S -> B A
- \\A -> + B A
- \\A -> _
+ \\A -> '+' B A
+ \\A -> ''
\\B -> D C
- \\C -> * D C
- \\C -> _
- \\D -> ( S )
- \\D -> a
- \\D -> b
+ \\C -> '*' D C
+ \\C -> ''
+ \\D -> '(' S ')'
+ \\D -> 'a'
+ \\D -> 'b'
;
const input = "b+a*b";
const allocator = std.testing.allocator;
- var grammar = try Grammar.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Grammar.parse("S", text, allocator);
+ defer grammar.deinit();
try std.testing.expect(try check(&grammar, input, allocator));
}
test "simple 0 - success" {
const text =
- \\S -> A S d
+ \\S -> A S 'd'
\\S -> B S
- \\S -> _
- \\A -> a
- \\A -> c
- \\B -> a
- \\B -> b
+ \\S -> ''
+ \\A -> 'a'
+ \\A -> 'c'
+ \\B -> 'a'
+ \\B -> 'b'
;
const input = "aad";
const allocator = std.testing.allocator;
- var grammar = try Grammar.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Grammar.parse("S", text, allocator);
+ defer grammar.deinit();
try std.testing.expect(try check(&grammar, input, allocator));
}
test "simple 0 - fail" {
const text =
- \\S -> A S d
+ \\S -> A S 'd'
\\S -> B S
- \\S -> _
- \\A -> a
- \\A -> c
- \\B -> a
- \\B -> b
+ \\S -> ''
+ \\A -> 'a'
+ \\A -> 'c'
+ \\B -> 'a'
+ \\B -> 'b'
;
const input = "accd";
const allocator = std.testing.allocator;
- var grammar = try Grammar.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Grammar.parse("S", text, allocator);
+ defer grammar.deinit();
try std.testing.expect(!try check(&grammar, input, allocator));
}
test "simple 0 - fuzzy" {
const text =
- \\S -> A S d
+ \\S -> A S 'd'
\\S -> B S
- \\S -> _
- \\A -> a
- \\A -> c
- \\B -> a
- \\B -> b
+ \\S -> ''
+ \\A -> 'a'
+ \\A -> 'c'
+ \\B -> 'a'
+ \\B -> 'b'
;
const allocator = std.testing.allocator;
- var grammar = try Grammar.parse(text, allocator);
- defer grammar.deinit(allocator);
+ var grammar = try Grammar.parse("S", text, allocator);
+ defer grammar.deinit();
var generator = Generator(struct {
const Self = @This();
diff --git a/src/string.zig b/src/string.zig
new file mode 100644
index 0000000..8217a77
--- /dev/null
+++ b/src/string.zig
@@ -0,0 +1,41 @@
+const std = @import("std");
+
+pub fn isNameChar(char: u8) bool {
+ return switch(char) {
+ '0'...'9', 'A'...'Z', 'a'...'z', '_' => true,
+ else => false
+ };
+}
+
+pub fn read_name(buffer: []const u8) struct { []const u8, []const u8 } {
+ for (buffer, 0..) |char, index| {
+ if (!isNameChar(char)) {
+ return .{ buffer[0..index], buffer[index..] };
+ }
+ }
+
+ return .{ buffer[0..buffer.len], buffer[buffer.len..] };
+}
+
+pub fn read_first_from_literal(buffer: []const u8) !struct { u8, []const u8 } {
+ if (buffer[0] != '\\') {
+ return .{ buffer[0], buffer[1..] } ;
+ }
+
+ if (buffer.len > 1 and buffer[1] != 'x') {
+ return switch(buffer[1]) {
+ 't' => return .{ '\t', buffer[2..] },
+ 'n' => return .{ '\n', buffer[2..] },
+ 'r' => return .{ '\r', buffer[2..] },
+ '\'' => return .{ '\'', buffer[2..] },
+ '\\' => return .{ '\\', buffer[2..] },
+ else => return error.InvalidEscapeSequence,
+ };
+ }
+
+ if (buffer.len > 4 and buffer[1] == 'x') {
+ return .{ try std.fmt.parseInt(u8, buffer[2..4], 16), buffer[4..] };
+ }
+
+ return error.InvalidEscapeSequnce;
+}