From fef3d201479fef6530b72a17e893920797c77121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 30 Oct 2023 13:43:23 +0100 Subject: [PATCH 01/20] Starts to implement ptkgen grammar compiler, refactors ptk.Matcher to not return an optional anymore --- .envrc | 1 + build.zig | 104 ++++- build.zig.zon | 10 + design/ptkdefv/design.md | 4 + design/ptkdefv/grammar.ptk | 43 ++ design/ptkdefv/mapping-concept-01.ptk | 37 ++ examples/ptkgen/ast-with-unions.ptk | 62 +++ shell.nix | 11 + src/ptkgen/main.zig | 106 +++++ src/ptkgen/parser.zig | 370 ++++++++++++++++++ src/{ => toolkit}/Diagnostics.zig | 9 +- src/{ => toolkit}/Error.zig | 0 src/{ => toolkit}/Location.zig | 0 src/{ => toolkit}/StringCache.zig | 0 src/{ => toolkit}/main.zig | 4 + src/{ => toolkit}/parser_core.zig | 0 src/{ => toolkit}/token.zig | 10 + src/{ => toolkit}/tokenizer.zig | 68 ++-- .../accept/empty-with-comment-linefeed.ptk | 1 + test/parser/accept/empty-with-comment.ptk | 1 + test/parser/accept/empty.ptk | 0 test/parser/accept/identifiers.ptk | 8 + 22 files changed, 797 insertions(+), 52 deletions(-) create mode 100644 .envrc create mode 100644 build.zig.zon create mode 100644 design/ptkdefv/design.md create mode 100644 design/ptkdefv/grammar.ptk create mode 100644 design/ptkdefv/mapping-concept-01.ptk create mode 100644 examples/ptkgen/ast-with-unions.ptk create mode 100644 shell.nix create mode 100644 src/ptkgen/main.zig create mode 100644 src/ptkgen/parser.zig rename src/{ => toolkit}/Diagnostics.zig (84%) rename src/{ => toolkit}/Error.zig (100%) rename src/{ => toolkit}/Location.zig (100%) rename src/{ => toolkit}/StringCache.zig (100%) rename src/{ => toolkit}/main.zig (87%) rename src/{ => toolkit}/parser_core.zig (100%) rename src/{ => toolkit}/token.zig (56%) rename src/{ => toolkit}/tokenizer.zig (89%) create mode 100644 test/parser/accept/empty-with-comment-linefeed.ptk create mode 100644 test/parser/accept/empty-with-comment.ptk create mode 100644 test/parser/accept/empty.ptk create mode 100644 test/parser/accept/identifiers.ptk diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..17d6464 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use_nix \ No newline at end of file diff --git a/build.zig b/build.zig index 00a2907..12b0ce7 100644 --- a/build.zig +++ b/build.zig @@ -1,31 +1,99 @@ const std = @import("std"); pub fn build(b: *std.build.Builder) void { + // build options: + + const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); - _ = b.addModule("parser-toolkit", .{ - .source_file = .{ .path = "src/main.zig" }, + const test_step = b.step("test", "Run library tests"); + const examples_step = b.step("examples", "Builds and installs examples"); + const run_calc_step = b.step("run-calculator", "Runs calculator example"); + + const all_step = b.step("all", "Builds everything, tests everything"); + all_step.dependOn(b.getInstallStep()); + all_step.dependOn(test_step); + all_step.dependOn(examples_step); + + // dependencies + + const args_dep = b.dependency("args", .{}); + + // external modules + + const args_mod = args_dep.module("args"); + + // internal modules + + const ptk_mod = b.addModule("parser-toolkit", .{ + .source_file = .{ .path = "src/toolkit/main.zig" }, .dependencies = &.{}, }); - var main_tests = b.addTest(.{ - .root_source_file = .{ .path = "src/main.zig" }, - .optimize = optimize, - }); + // Applications + const ptkdef_exe = blk: { + const ptkdef = b.addExecutable(.{ + .name = "ptkgen", + .root_source_file = .{ .path = 
"src/ptkgen/main.zig" }, + .optimize = optimize, + .target = target, + }); - const test_step = b.step("test", "Run library tests"); - test_step.dependOn(&b.addRunArtifact(main_tests).step); + ptkdef.addModule("parser-toolkit", ptk_mod); + ptkdef.addModule("args", args_mod); - const calculator_example = b.addExecutable(.{ - .root_source_file = .{ .path = "examples/calculator.zig" }, - .name = "calculator", - .optimize = optimize, - }); + b.installArtifact(ptkdef); - b.installArtifact(calculator_example); - calculator_example.addAnonymousModule("parser-toolkit", .{ - .source_file = .{ .path = "src/main.zig" }, - }); + break :blk ptkdef; + }; + + // test suite + { + // unit tests for ptk: + var ptk_tests = b.addTest(.{ + .root_source_file = ptk_mod.source_file, + .optimize = optimize, + }); + for (ptk_mod.dependencies.keys()) |dep_name| { + ptk_tests.addModule(dep_name, ptk_mod.dependencies.get(dep_name).?); + } + test_step.dependOn(&b.addRunArtifact(ptk_tests).step); - b.step("run", "Runs the calculator example").dependOn(&b.addRunArtifact(calculator_example).step); + // unit tests for ptkgen: + var ptkgen_tests = b.addTest(.{ + .root_source_file = .{ .path = "src/ptkgen/main.zig" }, + .optimize = optimize, + }); + ptkgen_tests.addModule("parser-toolkit", ptk_mod); + test_step.dependOn(&b.addRunArtifact(ptkgen_tests).step); + + // Integration tests for ptkgen: + for (parser_ok_files) |file| { + const run = b.addRunArtifact(ptkdef_exe); + run.addArg("--test_mode=parse_only"); + run.addFileArg(.{ .path = file }); + test_step.dependOn(&run.step); + } + } + + // examples + { + const calculator_example = b.addExecutable(.{ + .root_source_file = .{ .path = "examples/calculator.zig" }, + .name = "calculator", + .optimize = optimize, + }); + calculator_example.addModule("parser-toolkit", ptk_mod); + examples_step.dependOn(&b.addInstallArtifact(calculator_example, .{}).step); + + run_calc_step.dependOn(&b.addRunArtifact(calculator_example).step); + } } + +const parser_ok_files = [_][]const u8{ + "test/parser/accept/empty.ptk", + "test/parser/accept/empty-with-comment-linefeed.ptk", + "test/parser/accept/empty-with-comment.ptk", + "test/parser/accept/identifiers.ptk", + "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples +}; diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..5cbec5c --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,10 @@ +.{ + .name = "parser-toolkit", + .version = "0.2.0", + .dependencies = .{ + .args = .{ + .url = "https://github.com/MasterQ32/zig-args/archive/7989929d055ef7618e60de84cc54644046516fdb.tar.gz", + .hash = "12207752d975a7f5d7cc65662ed1c6b117da8dec6d1bd7af9a39e1b65d90bf86e833", + }, + }, +} diff --git a/design/ptkdefv/design.md b/design/ptkdefv/design.md new file mode 100644 index 0000000..e017c98 --- /dev/null +++ b/design/ptkdefv/design.md @@ -0,0 +1,4 @@ +# Parser Generator Language + +Create basic recursive descent parsers with "well-known" patterns that output a Zig AST data structure. + diff --git a/design/ptkdefv/grammar.ptk b/design/ptkdefv/grammar.ptk new file mode 100644 index 0000000..e50f519 --- /dev/null +++ b/design/ptkdefv/grammar.ptk @@ -0,0 +1,43 @@ + + +root ; # <...> is a "rule reference" + +token identifier = regex "[A-Za-z_][A-Za-z0-9_]*"; # defines token "identifier" to match this regex + +token line-comment = regex "//[^\n]*" skip; # ignores this token when parsing, but tokenizer recognizes it +token whitespace = regex "[ \t\r\n]" skip; + +rule document = + # [ ... 
] is a loop construct, can appear several times + [ ] [ ]* +; + +rule toplevel-decl = + # | is a "either/or" scenario, with precedence from left to right (first come, first serve) + | | +; + +rule interface-decl = + "interface" $identifier "(" ... ")" ";"; +; + +rule module-decl = + "module" $identifier "(" ... ")" "{" ... "}" ";"; +; + +rule using = + # "bla" is a literal token + # $bla is an explicitly defined token reference + # ...? is an optional part of a parse + "using" ";" ( "as" $identifier )? +; + +rule namespace-decl = + "namespace" ";" +; + +rule compound-identifier = + $identifier [ "." $identifier ]* +; + + diff --git a/design/ptkdefv/mapping-concept-01.ptk b/design/ptkdefv/mapping-concept-01.ptk new file mode 100644 index 0000000..9e4ccf9 --- /dev/null +++ b/design/ptkdefv/mapping-concept-01.ptk @@ -0,0 +1,37 @@ + +# "!id" is a type reference +# "$id" is a token reference +# "" is a rule reference + +# maps type "array" to a slice/arraylist of whatever "int" is +node array = sequence !int; + +# "int" is the Zig type "i32" +node int = literal "i32"; + +# the initial rule is "list", also determines the root type of the ast +start ; + +# "decimal" token is a decimal number sequence token +token decimal = regex "\d+"; + +# "list" is a sequence of decimals with comma separated, potential trailing comma, +# enclosed by square brackets +rule list = "[" [ $decimal "," ] $decimal? "]"; +# $0 $1______________ $2_______ $3 + +# the rule "list" is mapped to the type "array" +# as a sequence of the second element (unwrapped into items) and +# the third item appended. square brackets in a map are the "construct array operator". +# if the array is not sequence of optionals, optional items are skipped in construction +map !array = [ $1..., $2 ]; + +# the "decimal" token is mapped to i32 by invoking a Zig function called +# "parse" that takes the token as input and returns "i32": +map $decimal !int = @parse($0); + + + + + + diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk new file mode 100644 index 0000000..b8c093f --- /dev/null +++ b/examples/ptkgen/ast-with-unions.ptk @@ -0,0 +1,62 @@ +# parse a construct like this into a single type: +# var name = value; +# const name = value; +# var name: type = value; +# const name: type = value; + +node declaration = struct + is_const: literal `bool`, + name: !identifier, + type: optional !type, + value: !value +; + +node identifier = literal `[]const u8`; +node type = custom `TypeId`; # enum { int, float, string } +node value = custom `Value`; + +start ; + +rule decl : !declaration = + ( ":" )? "=" => { + is_const = $0, + name = $1, + type = $2, + value = $4 + } +# $0_________ $1__ $2_____________ $3 $4_____ +; + +rule : literal `bool` = + "var" => `false` + | "const" => `true` +; + +rule : !identifier = "name" => tostring $0; + +rule : !type = + "int" => `.int` + | "float" => `.float` + | "string" => `.string` +; + +rule : !value = + "10" => @parseInt($0) + | "3.14" => @parseFloat($0) + | "\"nice\"" => @parseStringLiteral($0) +; + + + +# Unions have can only have a single option active at a time +node TLDeclaration = union + ns : !namespace, + interface : !interface, + module : !module, +; + +rule toplevel-decl : !TLDeclaration = + => ns: $0, # this is syntax for a union field selector as unions are not compounds + | => interface: $0, + | => module: $0, +; diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..664d354 --- /dev/null +++ b/shell.nix @@ -0,0 +1,11 @@ +{ pkgs ? 
import { } }: +pkgs.mkShell { + nativeBuildInputs = [ + # zig + pkgs.zig_0_11 + ]; + buildInputs = [ ]; + shellHook = '' + # put your shell hook here + ''; +} diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig new file mode 100644 index 0000000..3303955 --- /dev/null +++ b/src/ptkgen/main.zig @@ -0,0 +1,106 @@ +//! +//! Parser Toolkit Grammar Compiler +//! + +const std = @import("std"); +const args_parser = @import("args"); +const ptk = @import("parser-toolkit"); + +const parser = @import("parser.zig"); + +comptime { + // reference for unit tests: + _ = parser; +} + +pub const CliOptions = struct { + help: bool = false, + output: ?[]const u8 = null, + test_mode: TestMode = .none, + + pub const shorthands = .{ + .h = "help", + .o = "output", + }; + + pub const meta = .{ + .full_text = "Compiles a .ptk grammar file into Zig code.", + + .usage_summary = "[-h] [-o ] []", + + .option_docs = .{ + .help = "Prints this help.", + .output = "If given, will print the generated code into ", + + .test_mode = "(internal use only, required for testing)", + }, + }; +}; + +const TestMode = enum { + none, + parse_only, +}; + +pub fn main() !u8 { + var stdout = std.io.getStdOut(); + var stdin = std.io.getStdIn(); + var stderr = std.io.getStdErr(); + + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + + var arena = std.heap.ArenaAllocator.init(gpa.allocator()); + defer arena.deinit(); + + const dynamic_allocator = gpa.allocator(); + const static_allocator = arena.allocator(); + + var cli = args_parser.parseForCurrentProcess(CliOptions, static_allocator, .print) catch return 1; + defer cli.deinit(); + + if (cli.options.help) { + try args_parser.printHelp(CliOptions, cli.executable_name orelse "ptkgen", stdout.writer()); + return 0; + } + + var diagnostics = ptk.Diagnostics.init(dynamic_allocator); + defer diagnostics.deinit(); + + // From here on, always print the diagnostics on exit! + defer diagnostics.print(stderr.writer()) catch {}; + + var input_file = switch (cli.positionals.len) { + 0 => stdin, + 1 => std.fs.cwd().openFile(cli.positionals[0], .{}) catch |err| { + try stderr.writer().print("failed to open file {s}: {s}\n", .{ + cli.positionals[0], + @errorName(err), + }); + return 1; + }, + else => { + try stderr.writeAll("Expects either a single positional file or none.\nSee --help for usage!\n"); + return 1; + }, + }; + defer input_file.close(); + + var ast = try parser.parse( + dynamic_allocator, + &diagnostics, + if (cli.positionals.len > 0) + cli.positionals[0] + else + "stdint", + input_file.reader(), + ); + defer ast.deinit(); + + if (cli.options.test_mode == .parse_only) { + // we're done if we're here + return 0; + } + + return 0; +} diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig new file mode 100644 index 0000000..f1402ee --- /dev/null +++ b/src/ptkgen/parser.zig @@ -0,0 +1,370 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +pub const Document = struct { + arena: std.heap.ArenaAllocator, + + file_name: []const u8, + source_text: []const u8, + + pub fn deinit(ts: *Document) void { + ts.arena.deinit(); + ts.* = undefined; + } +}; + +pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, file_name: []const u8, stream: anytype) !Document { + var arena = std.heap.ArenaAllocator.init(allocator); + errdefer arena.deinit(); + + const file_name_copy = try arena.allocator().dupe(u8, file_name); + + const text = try stream.readAllAlloc(arena.allocator(), 4 << 20); // 4 MB should be enough for now... 
+ + var tokenizer = Tokenizer.init(text, file_name_copy); + + while (true) { + const token_or_none = tokenizer.next() catch |err| switch (err) { + error.UnexpectedCharacter => { + try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ + std.zig.fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), + }); + return error.SyntaxError; + }, + + else => |e| return e, + }; + const token = token_or_none orelse break; + + std.log.info("token: {}", .{token}); + } + + return Document{ + .arena = arena, + .file_name = file_name_copy, + .source_text = text, + }; +} + +pub const TokenType = enum { + + // keywords + + node, + @"struct", + optional, + start, + rule, + token, + + literal, + custom, + regex, + skip, + + // user values + + raw_identifier, // foo-bar_bam + node_ref, // !node + rule_ref, // + token_ref, // $token + value_ref, // $0 + builtin_ref, // @builtin + + // values + + string_literal, // "string" + code_literal, // `code` + + // operators + + @"=", + @",", + @".", + @"*", + @"+", + @":", + @";", + @"|", + @"!", + @"?", + @"[", + @"]", + @"(", + @")", + @"{", + @"}", + @"=>", + + // auxiliary + + line_comment, + whitespace, +}; + +pub const Token = Tokenizer.Token; + +const match = ptk.matchers; + +const Pattern = ptk.Pattern(TokenType); + +const ParserCore = ptk.ParserCore(TokenType, .{ .whitespace, .line_comment }); + +const Tokenizer = ptk.Tokenizer(TokenType, &.{ + Pattern.create(.line_comment, match.sequenceOf(.{ match.literal("#"), match.takeNoneOf("\r\n") })), + + Pattern.create(.node, match.word("node")), + Pattern.create(.@"struct", match.word("struct")), + Pattern.create(.optional, match.word("optional")), + Pattern.create(.start, match.word("start")), + Pattern.create(.rule, match.word("rule")), + Pattern.create(.token, match.word("token")), + Pattern.create(.literal, match.word("literal")), + Pattern.create(.custom, match.word("custom")), + Pattern.create(.regex, match.word("regex")), + Pattern.create(.skip, match.word("skip")), + + Pattern.create(.@"=>", match.literal("=>")), + + Pattern.create(.@"=", match.literal("=")), + Pattern.create(.@",", match.literal(",")), + Pattern.create(.@".", match.literal(".")), + Pattern.create(.@"*", match.literal("*")), + Pattern.create(.@"+", match.literal("+")), + Pattern.create(.@":", match.literal(":")), + Pattern.create(.@";", match.literal(";")), + Pattern.create(.@"|", match.literal("|")), + Pattern.create(.@"!", match.literal("!")), + Pattern.create(.@"?", match.literal("?")), + Pattern.create(.@"[", match.literal("[")), + Pattern.create(.@"]", match.literal("]")), + Pattern.create(.@"(", match.literal("(")), + Pattern.create(.@")", match.literal(")")), + Pattern.create(.@"{", match.literal("{")), + Pattern.create(.@"}", match.literal("}")), + + Pattern.create(.string_literal, matchStringLiteral), + Pattern.create(.code_literal, matchCodeLiteral), + + // identifiers must come after keywords: + Pattern.create(.raw_identifier, matchRawIdentifier), + Pattern.create(.node_ref, matchNodeRef), + Pattern.create(.rule_ref, matchRuleRef), + Pattern.create(.token_ref, matchTokenRef), + Pattern.create(.value_ref, matchValueRef), + Pattern.create(.builtin_ref, matchBuiltinRef), + + // Whitespace is the "kitchen sink" at the end: + Pattern.create(.whitespace, match.takeAnyOf(" \r\n\t")), +}); + +/// Accepts a basic identifier without any prefix or suffix. 
+/// The regex that matches this pattern is roughly this: +/// +/// (@\"[^"]+\")|([A-Za-z_][A-Za-z0-9_\-]*) +/// +fn matchRawIdentifier(text: []const u8) usize { + if (text.len < 1) + return 0; + + if (std.mem.startsWith(u8, text, "@\"")) { + if (text.len < 3) + return 0; + + var i: usize = 2; // skip `@"` + while (i < text.len) : (i += 1) { + if (text[i] == '\"') + return i + 1; + if (text[i] == '\\') + i += 1; + } + + return 0; + } else { + const prefix_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_"; + const suffix_chars = prefix_chars ++ "0123456789"; + const inner_chars = suffix_chars ++ "-"; + + if (std.mem.indexOfScalar(u8, prefix_chars, text[0]) == null) + return 0; // invalid start char + + // Suffix check is done in "postprocessing" by checking if any identifier ends with "-" + + var len: usize = 1; + while (len < text.len and std.mem.indexOfScalar(u8, inner_chars, text[len]) != null) { + len += 1; + } + + return len; + } + + return 0; +} + +test matchRawIdentifier { + try ptk.testing.validateMatcher(matchRawIdentifier, &.{ + // good: + "a", + "a-z", + "items10", + "_foo", + "_", + "_cheese-cake", + }, &.{ + // bad: + "-", + "-10", + "10", + "1-2", + "10items", + }); +} + +const matchNodeRef = match.sequenceOf(.{ match.literal("!"), matchRawIdentifier }); + +test matchNodeRef { + try ptk.testing.validateMatcher(matchNodeRef, &.{ + // good: + "!a", + "!foo_bar", + }, &.{ + // bad: + "a", + "!", + }); +} + +const matchRuleRef = match.sequenceOf(.{ match.literal("<"), matchRawIdentifier, match.literal(">") }); + +test matchRuleRef { + try ptk.testing.validateMatcher(matchRuleRef, &.{ + // good: + "", + "", + "", + "<@\"very exiting boy\">", + }, &.{ + // bad: + "", + }); +} + +const matchTokenRef = match.sequenceOf(.{ match.literal("$"), matchRawIdentifier }); + +test matchTokenRef { + try ptk.testing.validateMatcher(matchTokenRef, &.{ + // good: + "$token", + "$user-token", + "$user_token", + "$@\"wtf\"", + }, &.{ + // bad: + "$\"wtf\"", + "bad boy", + "bad-boy", + "$0", + "$100", + }); +} + +const matchValueRef = match.sequenceOf(.{ match.literal("$"), match.decimalNumber }); + +test matchValueRef { + try ptk.testing.validateMatcher(matchValueRef, &.{ + // good: + "$0", + "$10", + "$99999999", + }, &.{ + // bad: + "9", + "$", + "$foo", + }); +} + +const matchBuiltinRef = match.sequenceOf(.{ match.literal("@"), matchRawIdentifier }); + +test matchBuiltinRef { + try ptk.testing.validateMatcher(matchBuiltinRef, &.{ + // good: + "@token", + "@user-token", + "@user_token", + "@@\"wtf\"", + }, &.{ + // bad: + "@\"wtf\"", + "bad boy", + "bad-boy", + "@0", + "@100", + }); +} + +fn matchStringLiteral(text: []const u8) usize { + if (text.len < 2) + return 0; + + if (text[0] != '"') + return 0; + + var i: usize = 1; // skip `"` + while (i < text.len) : (i += 1) { + if (text[i] == '\"') + return i + 1; + if (text[i] == '\\') + i += 1; + } + + return 0; +} + +test matchStringLiteral { + try ptk.testing.validateMatcher(matchStringLiteral, &.{ + // good: + "\"\"", + "\"x\"", + "\" \"", + "\" hello \\\"world\\\"\"", + }, &.{ + // bad: + "\"", + "\"\\\"", + "\"", + "foo\"", + }); +} + +fn matchCodeLiteral(text: []const u8) usize { + var prefix_len: usize = 0; + while (prefix_len < text.len and text[prefix_len] == '`') { + prefix_len += 1; + } + + if (prefix_len == 0 or 2 * prefix_len >= text.len) + return 0; + + const body_len = std.mem.indexOf(u8, text[prefix_len..], text[0..prefix_len]) orelse return 0; + + return 2 * prefix_len + body_len; +} + +test matchCodeLiteral { + try 
ptk.testing.validateMatcher(matchCodeLiteral, &.{ + // good: + "`x`", + "`\"hello, World!\"`", + "`\n\n`", + "`\x00`", + "``you can write a `code` snippet like this!``", + }, &.{ + // bad: + "`", + "``", + "```hello, world!``", + }); +} diff --git a/src/Diagnostics.zig b/src/toolkit/Diagnostics.zig similarity index 84% rename from src/Diagnostics.zig rename to src/toolkit/Diagnostics.zig index 0a93c19..bf3a842 100644 --- a/src/Diagnostics.zig +++ b/src/toolkit/Diagnostics.zig @@ -38,8 +38,15 @@ pub fn emit(self: *Self, location: Location, level: Error.Level, comptime fmt: [ const str = try std.fmt.allocPrintZ(allocator, fmt, args); errdefer allocator.free(str); + var cloned_location = location; + if (location.source) |source| { + cloned_location.source = try allocator.dupe(u8, source); + } + errdefer if (cloned_location.source) |source| + allocator.free(source); + try self.errors.append(allocator, Error{ - .location = location, + .location = cloned_location, .level = level, .message = str, }); diff --git a/src/Error.zig b/src/toolkit/Error.zig similarity index 100% rename from src/Error.zig rename to src/toolkit/Error.zig diff --git a/src/Location.zig b/src/toolkit/Location.zig similarity index 100% rename from src/Location.zig rename to src/toolkit/Location.zig diff --git a/src/StringCache.zig b/src/toolkit/StringCache.zig similarity index 100% rename from src/StringCache.zig rename to src/toolkit/StringCache.zig diff --git a/src/main.zig b/src/toolkit/main.zig similarity index 87% rename from src/main.zig rename to src/toolkit/main.zig index 784dec5..09b1ba8 100644 --- a/src/main.zig +++ b/src/toolkit/main.zig @@ -18,6 +18,10 @@ pub const Error = @import("Error.zig"); pub const Diagnostics = @import("Diagnostics.zig"); pub const StringCache = @import("StringCache.zig"); +pub const testing = struct { + pub const validateMatcher = tok.testMatcher; +}; + test { _ = Location; _ = tok; diff --git a/src/parser_core.zig b/src/toolkit/parser_core.zig similarity index 100% rename from src/parser_core.zig rename to src/toolkit/parser_core.zig diff --git a/src/token.zig b/src/toolkit/token.zig similarity index 56% rename from src/token.zig rename to src/toolkit/token.zig index 60ae8fa..028272c 100644 --- a/src/token.zig +++ b/src/toolkit/token.zig @@ -14,5 +14,15 @@ pub fn Token(comptime Type: type) type { /// The type of the token that was matched by a matching function type: Type, + + pub fn format(token: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = fmt; + _ = options; + try writer.print("Token {{ .type = {}, .text = \"{}\", .location = {} }}", .{ + token.type, + std.zig.fmtEscapes(token.text), + token.location, + }); + } }; } diff --git a/src/tokenizer.zig b/src/toolkit/tokenizer.zig similarity index 89% rename from src/tokenizer.zig rename to src/toolkit/tokenizer.zig index 1ee859c..ec20f18 100644 --- a/src/tokenizer.zig +++ b/src/toolkit/tokenizer.zig @@ -3,7 +3,9 @@ const std = @import("std"); const Location = @import("Location.zig"); const GenericToken = @import("token.zig").Token; -pub const Matcher = *const fn (str: []const u8) ?usize; +/// This is a function that will either accept a `text` as a token +/// of a non-zero length or returns `0` if the text does not match the token. 
+pub const Matcher = *const fn (text: []const u8) usize; pub fn Pattern(comptime TokenType: type) type { return struct { @@ -66,14 +68,13 @@ pub fn Tokenizer(comptime TokenTypeT: type, comptime patterns: []const Pattern(T if (rest.len == 0) return null; const maybe_token = for (patterns) |pat| { - if (pat.match(rest)) |len| { - if (len > 0) { - break Token{ - .location = self.current_location, - .text = rest[0..len], - .type = pat.type, - }; - } + const len = pat.match(rest); + if (len > 0) { + break Token{ + .location = self.current_location, + .text = rest[0..len], + .type = pat.type, + }; } } else null; if (maybe_token) |token| { @@ -91,11 +92,11 @@ pub const matchers = struct { /// Matches the literal `text`. pub fn literal(comptime text: []const u8) Matcher { return struct { - fn match(str: []const u8) ?usize { + fn match(str: []const u8) usize { return if (std.mem.startsWith(u8, str, text)) text.len else - null; + 0; } }.match; } @@ -103,17 +104,17 @@ pub const matchers = struct { /// Matches any "word" that is "text\b" pub fn word(comptime text: []const u8) Matcher { return struct { - fn match(input: []const u8) ?usize { + fn match(input: []const u8) usize { if (std.mem.startsWith(u8, input, text)) { if (text.len == input.len) return text.len; const c = input[text.len]; if (std.ascii.isAlphanumeric(c) or (c == '_')) // matches regex \w\W - return null; + return 0; return text.len; } - return null; + return 0; } }.match; } @@ -121,7 +122,7 @@ pub const matchers = struct { /// Takes characters while they are any of the given `chars`. pub fn takeAnyOf(comptime chars: []const u8) Matcher { return struct { - fn match(str: []const u8) ?usize { + fn match(str: []const u8) usize { for (str, 0..) |c, i| { if (std.mem.indexOfScalar(u8, chars, c) == null) { return i; @@ -140,7 +141,7 @@ pub const matchers = struct { }; return struct { - fn match(str: []const u8) ?usize { + fn match(str: []const u8) usize { for (str, 0..) |c, i| { const lc = std.ascii.toLower(c); if (std.mem.indexOfScalar(u8, lower_chars, lc) == null) { @@ -155,7 +156,7 @@ pub const matchers = struct { /// Takes characters while they are not any of the given `chars`. pub fn takeNoneOf(comptime chars: []const u8) Matcher { return struct { - fn match(str: []const u8) ?usize { + fn match(str: []const u8) usize { for (str, 0..) 
|c, i| { if (std.mem.indexOfScalar(u8, chars, c) != null) { return i; @@ -168,10 +169,12 @@ pub const matchers = struct { pub fn withPrefix(comptime prefix: []const u8, comptime matcher: Matcher) Matcher { return struct { - fn match(str: []const u8) ?usize { + fn match(str: []const u8) usize { if (!std.mem.startsWith(u8, str, prefix)) - return null; - const pattern_len = matcher(str[prefix.len..]) orelse return null; + return 0; + const pattern_len = matcher(str[prefix.len..]); + if (pattern_len == 0) + return 0; return prefix.len + pattern_len; } }.match; @@ -183,12 +186,12 @@ pub const matchers = struct { if (sequence.len == 0) @compileError("Empty sequence not allowed!"); return struct { - fn match(input: []const u8) ?usize { + fn match(input: []const u8) usize { var total_len: usize = 0; for (sequence) |seq_match| { - const len = seq_match(input[total_len..]) orelse return null; + const len = seq_match(input[total_len..]); if (len == 0) - return null; + return 0; total_len += len; } return total_len; @@ -198,7 +201,7 @@ pub const matchers = struct { // pre-shipped typical patterns - pub fn identifier(str: []const u8) ?usize { + pub fn identifier(str: []const u8) usize { const first_char = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; const all_chars = first_char ++ "0123456789"; for (str, 0..) |c, i| { @@ -209,7 +212,7 @@ pub const matchers = struct { return str.len; } - pub fn whitespace(str: []const u8) ?usize { + pub fn whitespace(str: []const u8) usize { for (str, 0..) |c, i| { if (!std.ascii.isWhitespace(c)) return i; @@ -217,12 +220,12 @@ pub const matchers = struct { return str.len; } - pub fn linefeed(str: []const u8) ?usize { + pub fn linefeed(str: []const u8) usize { if (std.mem.startsWith(u8, str, "\r\n")) return 2; if (std.mem.startsWith(u8, str, "\n")) return 1; - return null; + return 0; } pub fn numberOfBase(comptime base: comptime_int) Matcher { @@ -321,12 +324,11 @@ test "save/restore tokenization" { try std.testing.expectEqual(Location{ .source = null, .line = 2, .column = 1 }, id1.location); } -fn testMatcher(match: Matcher, good: []const []const u8, bad: []const []const u8) !void { +pub fn testMatcher(match: Matcher, good: []const []const u8, bad: []const []const u8) !void { + std.debug.assert(good.len > 0); + std.debug.assert(bad.len > 0); for (good) |str| { - const v = match(str) orelse { - std.log.err("Didn't match pattern '{s}'", .{str}); - return error.MissedGoodPattern; - }; + const v = match(str); if (v == 0) { std.log.err("Didn't match pattern '{s}'", .{str}); return error.MissedGoodPattern; @@ -334,7 +336,7 @@ fn testMatcher(match: Matcher, good: []const []const u8, bad: []const []const u8 } for (bad) |str| { const v = match(str); - if (v != null and v.? > 0) { + if (v > 0) { std.log.err("Matched pattern '{s}'", .{str}); return error.MissedBadPattern; } diff --git a/test/parser/accept/empty-with-comment-linefeed.ptk b/test/parser/accept/empty-with-comment-linefeed.ptk new file mode 100644 index 0000000..a1e7613 --- /dev/null +++ b/test/parser/accept/empty-with-comment-linefeed.ptk @@ -0,0 +1 @@ +# hello, world! diff --git a/test/parser/accept/empty-with-comment.ptk b/test/parser/accept/empty-with-comment.ptk new file mode 100644 index 0000000..0017949 --- /dev/null +++ b/test/parser/accept/empty-with-comment.ptk @@ -0,0 +1 @@ +# hello, world! 
\ No newline at end of file diff --git a/test/parser/accept/empty.ptk b/test/parser/accept/empty.ptk new file mode 100644 index 0000000..e69de29 diff --git a/test/parser/accept/identifiers.ptk b/test/parser/accept/identifiers.ptk new file mode 100644 index 0000000..521db6f --- /dev/null +++ b/test/parser/accept/identifiers.ptk @@ -0,0 +1,8 @@ + +rule a = literal `whatever`; +rule _ = literal `whatever`; +rule a0 = literal `whatever`; +rule a-z = literal `whatever`; +rule _10 = literal `whatever`; +rule @"x" = literal `whatever`; +rule @"hello, world!" = literal `whatever`; From 8ae8684fca1f72315827076cf0bf3e530355cfaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 30 Oct 2023 17:46:17 +0100 Subject: [PATCH 02/20] Starts working on the parser, can already recognize some basic nodes --- build.zig | 9 +- docs/grammar.md | 40 +++ examples/ptkgen/ast-with-unions.ptk | 6 +- src/ptkgen/ast.zig | 166 +++++++++ src/ptkgen/main.zig | 176 +++++++++- src/ptkgen/parser.zig | 330 +++++++++++++++++- src/toolkit/main.zig | 4 +- src/toolkit/parser_core.zig | 1 + src/toolkit/strings.zig | 156 +++++++++ test/analysis/accept/match-literal-rule.ptk | 2 + .../accept/match-literal-sequence.ptk | 2 + 11 files changed, 855 insertions(+), 37 deletions(-) create mode 100644 docs/grammar.md create mode 100644 src/ptkgen/ast.zig create mode 100644 src/toolkit/strings.zig create mode 100644 test/analysis/accept/match-literal-rule.ptk create mode 100644 test/analysis/accept/match-literal-sequence.ptk diff --git a/build.zig b/build.zig index 12b0ce7..d09acee 100644 --- a/build.zig +++ b/build.zig @@ -94,6 +94,11 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/empty.ptk", "test/parser/accept/empty-with-comment-linefeed.ptk", "test/parser/accept/empty-with-comment.ptk", - "test/parser/accept/identifiers.ptk", - "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples + // "test/parser/accept/identifiers.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples +} ++ analyis_ok_files; + +const analyis_ok_files = [_][]const u8{ + "test/analysis/accept/match-literal-rule.ptk", + "test/analysis/accept/match-literal-sequence.ptk", }; diff --git a/docs/grammar.md b/docs/grammar.md new file mode 100644 index 0000000..031d096 --- /dev/null +++ b/docs/grammar.md @@ -0,0 +1,40 @@ +# Parser Toolkit Grammar + +## Syntax + +```rb + +@Identifier # references Identifier from the user context. can be used for types, functions, values + # references another rule named Rule +!Node # references another ast node called Node + + +``` + +## Types + +```rb +literal `text` # pastes text into the code +optional ... # makes ... an optional type + +struct # constructs a structure type, having two fields: + field: !type, + field: !type + +union # constructs a type for alternatives, here with two variants: + Foo: !type, # alternative called Foo + Bar: !type # alternative called Bar + +``` + +## Strings + +- `\x00 ... \xFF` => Hexadecimal escape +- `\000 ... 
\377` => Octal escape +- `\n` => LF (0x0A) +- `\r` => CR (0x0D) +- `\'` => single quote (0x27) +- `\"` => double quote (0x22) +- `\\` => back slash (0x5C) +- `\u????` => UTF-16 +- `\U????????` => UTF-32 diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index b8c093f..fa170fc 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -12,8 +12,8 @@ node declaration = struct ; node identifier = literal `[]const u8`; -node type = custom `TypeId`; # enum { int, float, string } -node value = custom `Value`; +node type = @TypeId; # enum { int, float, string } +node value = @Value; start ; @@ -32,7 +32,7 @@ rule : literal `bool` = | "const" => `true` ; -rule : !identifier = "name" => tostring $0; +rule : !identifier = "name" => tostring($0); rule : !type = "int" => `.int` diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig new file mode 100644 index 0000000..0bda109 --- /dev/null +++ b/src/ptkgen/ast.zig @@ -0,0 +1,166 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const Location = ptk.Location; + +pub fn List(comptime T: type) type { + return struct { + pub const Item = T; + + pub const Node = std.TailQueue(T).Node; + + inner: std.TailQueue(T) = .{}, + + pub fn append(list: *@This(), item: *@This().Node) void { + list.inner.append(item); + } + + pub fn len(list: @This()) usize { + return list.inner.len; + } + + pub fn only(list: @This()) ?T { + return if (list.inner.len == 1) + list.inner.first.?.data + else + null; + } + }; +} + +pub fn Iterator(comptime T: type) type { + return struct { + node: ?*List(T).Node, + + pub fn next(iter: *@This()) ?T { + const current = iter.node orelse return null; + iter.node = current.next; + return current.data; + } + }; +} + +pub fn iterate(list: anytype) Iterator(@TypeOf(list).Item) { + return Iterator(@TypeOf(list).Item){ .node = list.inner.first }; +} + +pub fn Reference(comptime T: type) type { + return struct { + pub const Referenced = T; + + location: Location, + identifier: ptk.strings.String, + }; +} + +fn String(comptime Tag: anytype) type { + return struct { + pub const tag = Tag; + + location: Location, + value: ptk.strings.String, + }; +} + +pub const Identifier = String(.identifier); +pub const StringLiteral = String(.string); +pub const CodeLiteral = String(.code); +pub const BuiltinLiteral = String(.builtin); + +pub const Document = List(TopLevelDeclaration); + +pub const TopLevelDeclaration = union(enum) { + start: NodeRef, + rule: Rule, + node: Node, +}; + +pub const NodeRef = Reference(Node); // !mynode +pub const RuleRef = Reference(Rule); // +pub const TokenRef = Reference(Token); // $mytoken +pub const ValueRef = struct { // $0 + location: Location, + index: u32, +}; + +pub const Node = struct { // node = ...; + name: Identifier, + value: TypeSpec, +}; + +pub const Rule = struct { // rule ( : )? = ...; + name: Identifier, // + ast_type: ?TypeSpec, // if specified, defines the ast node of the rule + productions: List(MappedProduction), // all alternatives of the rule +}; + +pub const Token = struct { // token = ...; + name: Identifier, + pattern: Pattern, +}; + +pub const MappedProduction = struct { // ... => value + production: Production, // the thing before "=>" + mapping: ?AstMapping, // the thing after "=>" +}; + +pub const Production = union(enum) { + literal: StringLiteral, // "text" + terminal: TokenRef, // $token + recursion: RuleRef, // + sequence: List(Production), // ( ... ) + optional: *Production, // ...? 
+ repetition_zero: *Production, // [ ... ]* + repetition_one: *Production, // [ ... ]+ +}; + +pub const AstMapping = union(enum) { + constructor: List(FieldAssignment), // { field = ..., field = ... } + literal: CodeLiteral, // field: value + context_reference: ValueRef, // $0 + user_reference: BuiltinLiteral, // @field + function_call: FunctionCall, // ...(a,b,c) + union_init: UnionInitializer, +}; + +pub const UnionInitializer = struct { + field: Identifier, + value: *AstMapping, +}; + +pub const FunctionCall = struct { + function: *AstMapping, + arguments: List(AstMapping), +}; + +pub const FieldAssignment = struct { + location: Location, + field: Identifier, + value: *AstMapping, +}; + +pub const Pattern = union(enum) { + literal: StringLiteral, // literal "+" + word: StringLiteral, // word "while" + regex: StringLiteral, // regex "string" + external: CodeLiteral, // custom `matchMe` +}; + +pub const TypeSpec = union(enum) { + reference: NodeRef, // !type + literal: CodeLiteral, // literal `bool` + custom: CodeLiteral, // custom `Custom` + @"struct": CompoundType, // struct + @"union": CompoundType, // union +}; + +pub const CompoundType = struct { + location: Location, + fields: List(Field), +}; + +pub const Field = struct { + location: Location, + name: Identifier, + type: TypeSpec, +}; diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 3303955..852fa0f 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -6,6 +6,7 @@ const std = @import("std"); const args_parser = @import("args"); const ptk = @import("parser-toolkit"); +const ast = @import("ast.zig"); const parser = @import("parser.zig"); comptime { @@ -64,12 +65,12 @@ pub fn main() !u8 { return 0; } + var string_pool = try ptk.strings.Pool.init(dynamic_allocator); + defer string_pool.deinit(); + var diagnostics = ptk.Diagnostics.init(dynamic_allocator); defer diagnostics.deinit(); - // From here on, always print the diagnostics on exit! 
- defer diagnostics.print(stderr.writer()) catch {}; - var input_file = switch (cli.positionals.len) { 0 => stdin, 1 => std.fs.cwd().openFile(cli.positionals[0], .{}) catch |err| { @@ -86,21 +87,172 @@ pub fn main() !u8 { }; defer input_file.close(); - var ast = try parser.parse( + const file_name = if (cli.positionals.len > 0) + cli.positionals[0] + else + "stdint"; + + compileFile( dynamic_allocator, &diagnostics, - if (cli.positionals.len > 0) - cli.positionals[0] - else - "stdint", + &string_pool, + input_file, + file_name, + cli.options.test_mode, + ) catch |err| switch (err) { + // syntax errors must produce diagnostics: + error.SyntaxError => std.debug.assert(diagnostics.hasErrors()), + + error.OutOfMemory => { + try diagnostics.emit(.{ + .source = file_name, + .line = 1, + .column = 1, + }, .@"error", "out of memory", .{}); + }, + + error.StreamTooLong => { + try diagnostics.emit(.{ + .source = file_name, + .line = 1, + .column = 1, + }, .@"error", "input file too large", .{}); + }, + + error.InputOutput, + error.AccessDenied, + error.BrokenPipe, + error.SystemResources, + error.OperationAborted, + error.WouldBlock, + error.ConnectionResetByPeer, + error.Unexpected, + error.IsDir, + error.ConnectionTimedOut, + error.NotOpenForReading, + error.NetNameDeleted, + => { + try diagnostics.emit(.{ + .source = file_name, + .line = 1, + .column = 1, + }, .@"error", "i/o error: {s}", .{@errorName(err)}); + }, + }; + + try diagnostics.print(stderr.writer()); + + return if (diagnostics.hasErrors()) + 1 + else + 0; +} + +fn compileFile( + allocator: std.mem.Allocator, + diagnostics: *ptk.Diagnostics, + string_pool: *ptk.strings.Pool, + input_file: std.fs.File, + file_name: []const u8, + mode: TestMode, +) !void { + var tree = try parser.parse( + allocator, + diagnostics, + string_pool, + file_name, input_file.reader(), ); - defer ast.deinit(); + defer tree.deinit(); + + dumpAst(string_pool, tree.top_level_declarations); - if (cli.options.test_mode == .parse_only) { + if (mode == .parse_only) { // we're done if we're here - return 0; + return; + } +} + +fn dumpAst(strings: *const ptk.strings.Pool, decls: ast.List(ast.TopLevelDeclaration)) void { + std.debug.print("ast dump:\n", .{}); + + var iter = ast.iterate(decls); + while (iter.next()) |decl| { + switch (decl) { + .start => |item| std.debug.print("start {s}\n", .{strings.get(item.identifier)}), + + .rule => |rule| { + std.debug.print("rule {s}", .{strings.get(rule.name.value)}); + + if (rule.ast_type) |ast_type| { + std.debug.print(" : ", .{}); + dumpAstType(strings, ast_type); + } + + std.debug.print(" = \n", .{}); + + var prods = ast.iterate(rule.productions); + var first = true; + while (prods.next()) |prod| { + defer first = false; + if (!first) { + std.debug.print(" | ", .{}); + } else { + std.debug.print(" ", .{}); + } + dumpMappedProd(strings, prod); + } + + std.debug.print("\n;\n", .{}); + }, + + .node => |node| { + std.debug.print("node {s}", .{strings.get(node.name.value)}); + + std.debug.print(";\n", .{}); + }, + } + } +} + +fn dumpAstType(strings: *const ptk.strings.Pool, typespec: ast.TypeSpec) void { + _ = strings; + _ = typespec; + std.debug.print("", .{}); +} + +fn dumpMappedProd(strings: *const ptk.strings.Pool, mapped_prod: ast.MappedProduction) void { + dumpProd(strings, mapped_prod.production); + + if (mapped_prod.mapping) |mapping| { + dumpMapping(strings, mapping); + } +} + +fn dumpProd(strings: *const ptk.strings.Pool, production: ast.Production) void { + switch (production) { + .literal => |lit| 
std.debug.print("\"{}\"", .{std.zig.fmtEscapes(strings.get(lit.value))}), + .terminal => |term| std.debug.print("<{}>", .{std.zig.fmtId(strings.get(term.identifier))}), + .recursion => std.debug.print("", .{}), + .sequence => |seq| { + std.debug.print("(", .{}); + + var iter = ast.iterate(seq); + while (iter.next()) |item| { + std.debug.print(" ", .{}); + dumpProd(strings, item); + } + + std.debug.print(" )", .{}); + }, + .optional => std.debug.print("", .{}), + .repetition_zero => std.debug.print("", .{}), + .repetition_one => std.debug.print("", .{}), } +} - return 0; +fn dumpMapping(strings: *const ptk.strings.Pool, mapping: ast.AstMapping) void { + _ = strings; + _ = mapping; + std.debug.print("", .{}); } diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index f1402ee..5a5167a 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -1,5 +1,8 @@ const std = @import("std"); const ptk = @import("parser-toolkit"); +const ast = @import("ast.zig"); + +const fmtEscapes = std.zig.fmtEscapes; pub const Document = struct { arena: std.heap.ArenaAllocator, @@ -7,13 +10,15 @@ pub const Document = struct { file_name: []const u8, source_text: []const u8, + top_level_declarations: ast.Document, + pub fn deinit(ts: *Document) void { ts.arena.deinit(); ts.* = undefined; } }; -pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, file_name: []const u8, stream: anytype) !Document { +pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, stream: anytype) !Document { var arena = std.heap.ArenaAllocator.init(allocator); errdefer arena.deinit(); @@ -23,31 +28,54 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, file_n var tokenizer = Tokenizer.init(text, file_name_copy); - while (true) { - const token_or_none = tokenizer.next() catch |err| switch (err) { - error.UnexpectedCharacter => { - try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ - std.zig.fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), - }); - return error.SyntaxError; - }, + var parser = Parser{ + .core = ParserCore.init(&tokenizer), + .arena = arena.allocator(), + .pool = string_pool, + .diagnostics = diagnostics, + }; - else => |e| return e, - }; - const token = token_or_none orelse break; + const document_node = parser.acceptDocument() catch |err| switch (err) { + error.UnexpectedCharacter => { + try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ + fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), + }); + return error.SyntaxError; + }, - std.log.info("token: {}", .{token}); + error.EndOfStream, error.UnexpectedToken => @panic("Error handling is fucked up, something escaped"), + + // Unrecoverable syntax error, must have created diagnostics already + error.SyntaxError => |e| { + std.debug.assert(diagnostics.hasErrors()); + return e; + }, + + error.OutOfMemory => |e| return e, + }; + + if (tokenizer.next()) |token_or_null| { + if (token_or_null) |token| { + try diagnostics.emit(token.location, .@"error", "Excess token at the end of the file: {s}", .{@tagName(token.type)}); + return error.SyntaxError; + } + } else |_| { + try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ + fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), + }); + return error.SyntaxError; } return Document{ .arena = arena, .file_name = file_name_copy, .source_text = text, + + .top_level_declarations 
= document_node, }; } pub const TokenType = enum { - // keywords node, @@ -64,7 +92,7 @@ pub const TokenType = enum { // user values - raw_identifier, // foo-bar_bam + identifier, // foo-bar_bam node_ref, // !node rule_ref, // token_ref, // $token @@ -104,12 +132,276 @@ pub const TokenType = enum { pub const Token = Tokenizer.Token; -const match = ptk.matchers; +const ParserCore = ptk.ParserCore(Tokenizer, .{ .whitespace, .line_comment }); -const Pattern = ptk.Pattern(TokenType); +const Parser = struct { + const RS = ptk.RuleSet(TokenType); + const String = ptk.strings.String; + + core: ParserCore, + arena: std.mem.Allocator, + pool: *ptk.strings.Pool, + diagnostics: *ptk.Diagnostics, + + pub fn acceptDocument(parser: *Parser) !ast.Document { + var doc = ast.Document{}; + + while (true) { + const decl_or_eof = try parser.acceptTopLevelDecl(); + + const decl = decl_or_eof orelse break; + + try parser.append(ast.TopLevelDeclaration, &doc, decl); + } + + return doc; + } + + fn emitDiagnostic(parser: *Parser, loc: ?ptk.Location, comptime fmt: []const u8, args: anytype) !void { + // Anything detected here is always an error + try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, .@"error", fmt, args); + } + + fn acceptTopLevelDecl(parser: *Parser) !?ast.TopLevelDeclaration { + if (parser.acceptLiteral(.rule)) |_| { + return .{ + .rule = try parser.acceptRule(), + }; + } else |err| try filterAcceptError(err); + + // Detect any excess tokens on the top level: + if (try parser.core.nextToken()) |token| { + try parser.emitDiagnostic(null, "Unexpected token '{}'", .{fmtEscapes(token.text)}); + return error.SyntaxError; + } + + return null; + } + + fn acceptRule(parser: *Parser) !ast.Rule { + var state = parser.save(); + errdefer parser.restore(state); + + const identifier = try parser.acceptIdentifier(); + + const rule_type = if (parser.acceptLiteral(.@":")) + try parser.acceptTypeSpec() + else |_| + null; + + try parser.acceptLiteral(.@"="); + + var list: ast.List(ast.MappedProduction) = .{}; + + while (true) { + var production = try parser.acceptMappedProduction(); + + try parser.append(ast.MappedProduction, &list, production); + + // TODO: Improve error reporting here + if (parser.acceptLiteral(.@";")) { + break; + } else |_| {} + + try parser.acceptLiteral(.@"|"); + } + + return ast.Rule{ + .ast_type = rule_type, + .productions = list, + .name = identifier, + }; + } + + fn acceptMappedProduction(parser: *Parser) !ast.MappedProduction { + var sequence = try parser.acceptProductionSequence(); -const ParserCore = ptk.ParserCore(TokenType, .{ .whitespace, .line_comment }); + const mapping = if (parser.acceptLiteral(.@"=>")) + try parser.acceptAstMapping() + else |_| + null; + return ast.MappedProduction{ + .production = if (sequence.only()) |item| + item + else + .{ .sequence = sequence }, + .mapping = mapping, + }; + } + + fn acceptProductionSequence(parser: *Parser) !ast.List(ast.Production) { + var list: ast.List(ast.Production) = .{}; + + while (true) { + if (parser.acceptProduction()) |prod| { + try parser.append(ast.Production, &list, prod); + } else |err| switch (err) { + error.UnexpectedToken => break, + else => |e| return e, + } + } + + return list; + } + + fn acceptProduction(parser: *Parser) !ast.Production { + const str = try parser.acceptStringLiteral(); + + return ast.Production{ + .literal = str, + }; + } + + fn acceptAstMapping(parser: *Parser) !ast.AstMapping { + _ = parser; + return error.UnexpectedToken; + } + + fn acceptTypeSpec(parser: *Parser) 
!ast.TypeSpec { + _ = parser; + return error.UnexpectedToken; + } + + fn acceptStringLiteral(parser: *Parser) !ast.StringLiteral { + const token = try parser.core.accept(RS.is(.string_literal)); + + std.debug.assert(token.text.len >= 2); + + return ast.StringLiteral{ + .location = token.location, + .value = try parser.unwrapString(token.location, token.text[1 .. token.text.len - 1]), + }; + } + + fn acceptIdentifier(parser: *Parser) !ast.Identifier { + const token = try parser.core.accept(RS.is(.identifier)); + + return ast.Identifier{ + .location = token.location, + .value = try parser.unwrapIdentifierString(token.location, token.text), + }; + } + + fn acceptLiteral(parser: *Parser, comptime token_type: TokenType) !void { + _ = try parser.core.accept(RS.is(token_type)); + } + + // management: + + fn unwrapIdentifierString(parser: *Parser, loc: ptk.Location, raw: []const u8) !ptk.strings.String { + std.debug.assert(raw.len > 0); + if (raw[0] == '@') { + std.debug.assert(raw[1] == '"'); + std.debug.assert(raw[raw.len - 1] == '"'); + // string-escaped identifier + return try parser.unwrapString(loc, raw[2 .. raw.len - 1]); + } else { + return try parser.pool.insert(raw); + } + } + + fn unwrapString(parser: *Parser, loc: ptk.Location, raw: []const u8) !ptk.strings.String { + var fallback = std.heap.stackFallback(512, parser.arena); + + var working_space = std.ArrayList(u8).init(fallback.get()); + defer working_space.deinit(); + + var i: usize = 0; + while (i < raw.len) { + const c = raw[i]; + if (c == '\\') { + i += 1; + if (i >= raw.len) { + try parser.emitDiagnostic(loc, "Invalid string escape: Missing escaped character!", .{}); + return error.SyntaxError; + } + const escape = raw[i]; + const slice = switch (escape) { + 'n' => "\n", + 'r' => "\r", + '\"' => "\"", + '\'' => "\'", + '\\' => "\\", + + 'x' => @panic("Implement hex escape \\x??"), + 'u' => @panic("Implement utf-16 \\u????"), + 'U' => @panic("Implement utf-32 \\U????????"), + + '0'...'3' => @panic("Implement octal escape \\???"), + + else => { + if (std.ascii.isPrint(c)) { + try parser.emitDiagnostic(loc, "Invalid string escape \\{c}", .{escape}); + } else { + try parser.emitDiagnostic(loc, "Invalid string escape \\x{X:0>2}", .{escape}); + } + return error.SyntaxError; + }, + }; + try working_space.appendSlice(slice); + } else { + try working_space.append(c); + } + i += 1; + } + + return try parser.pool.insert(working_space.items); + } + + fn save(parser: Parser) ParserCore.State { + return parser.core.saveState(); + } + + fn restore(parser: *Parser, state: ParserCore.State) void { + parser.core.restoreState(state); + } + + fn internString(parser: *Parser, string: []const u8) !String { + return try parser.pool.insert(string); + } + + fn append(parser: *Parser, comptime T: type, list: *ast.List(T), item: T) !void { + const node = try parser.arena.create(ast.List(T).Node); + errdefer parser.arena.destroy(node); + + node.data = item; + + list.append(node); + } + + pub const FatalAcceptError = error{ + // We're out of memory accepting some rule. We cannot recover from this. + OutOfMemory, + + // We found a character the tokenizer does not accept, we cannot recover from this ever. 
+ UnexpectedCharacter, + }; + + pub const AcceptError = FatalAcceptError || error{ + + // The token stream is too short to accept this rule + EndOfStream, + + // The token stream contains an unexpected token, this is a syntax error + UnexpectedToken, + }; + + fn filterAcceptError(err: AcceptError) FatalAcceptError!void { + return switch (err) { + error.EndOfStream, + error.UnexpectedToken, + => {}, + + error.OutOfMemory, + error.UnexpectedCharacter, + => |e| return e, + }; + } +}; + +const match = ptk.matchers; +const Pattern = ptk.Pattern(TokenType); const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.line_comment, match.sequenceOf(.{ match.literal("#"), match.takeNoneOf("\r\n") })), @@ -147,7 +439,7 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.code_literal, matchCodeLiteral), // identifiers must come after keywords: - Pattern.create(.raw_identifier, matchRawIdentifier), + Pattern.create(.identifier, matchRawIdentifier), Pattern.create(.node_ref, matchNodeRef), Pattern.create(.rule_ref, matchRuleRef), Pattern.create(.token_ref, matchTokenRef), diff --git a/src/toolkit/main.zig b/src/toolkit/main.zig index 09b1ba8..9a5d40b 100644 --- a/src/toolkit/main.zig +++ b/src/toolkit/main.zig @@ -17,13 +17,15 @@ pub const RuleSet = pcore.RuleSet; pub const Error = @import("Error.zig"); pub const Diagnostics = @import("Diagnostics.zig"); pub const StringCache = @import("StringCache.zig"); +pub const strings = @import("strings.zig"); pub const testing = struct { pub const validateMatcher = tok.testMatcher; }; -test { +comptime { _ = Location; _ = tok; _ = pcore; + _ = strings; } diff --git a/src/toolkit/parser_core.zig b/src/toolkit/parser_core.zig index 394d679..9bfcf42 100644 --- a/src/toolkit/parser_core.zig +++ b/src/toolkit/parser_core.zig @@ -52,6 +52,7 @@ pub fn ParserCore(comptime TokenizerT: type, comptime ignore_list: anytype) type } pub const AcceptError = error{ EndOfStream, UnexpectedToken } || Tokenizer.NextError; + /// Accepts a token that matches `rule`. Otherwise returns /// - `error.EndOfStream` when no tokens are available /// - `error.UnexpectedToken` when an invalid token was encountered diff --git a/src/toolkit/strings.zig b/src/toolkit/strings.zig new file mode 100644 index 0000000..9c41933 --- /dev/null +++ b/src/toolkit/strings.zig @@ -0,0 +1,156 @@ +pub const std = @import("std"); + +pub const String = enum(u32) { + empty, + + _, + + pub fn format(string: String, fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = fmt; + _ = options; + if (string == .empty) { + try writer.writeAll("String(empty)"); + } else { + try writer.print("String({})", .{ + @intFromEnum(string), + }); + } + } +}; + +/// A string pool that can store up to 4 GB of text and deduplicate instances. +/// +/// Use this to reduce the memory footprint of your AST and allow quick comparison of strings +/// by using the `String` type instead of doing a `std.mem.eql`. +pub const Pool = struct { + data: std.ArrayList(u8), + count: usize = 0, + + pub fn init(allocator: std.mem.Allocator) !Pool { + var pool = Pool{ + .data = std.ArrayList(u8).init(allocator), + }; + errdefer pool.deinit(); + + std.debug.assert(try pool.insert("") == .empty); + + return pool; + } + + pub fn deinit(pool: *Pool) void { + pool.data.deinit(); + pool.* = undefined; + } + + pub fn insert(pool: *Pool, string: []const u8) error{OutOfMemory}!String { + std.debug.assert(std.mem.indexOfScalar(u8, string, 0) == null); // Interned strings must not contain NUL! 
+ + const storage = pool.data.items; + + var search_index: usize = 0; + while (search_index < storage.len) { + const index = std.mem.indexOfPos(u8, storage, search_index, string) orelse break; + + if (index + string.len + 1 > storage.len) + break; + + if (storage[index + string.len] == 0) + return @enumFromInt(index); + + // starts with `string`, but doesn't end with NUL. + search_index = index + string.len; + } + + const index = storage.len; + + if (index > std.math.maxInt(u32)) { + return error.OutOfMemory; + } + + try pool.data.ensureUnusedCapacity(string.len + 1); // invalidates storage + pool.data.appendSliceAssumeCapacity(string); + pool.data.appendAssumeCapacity(0); + pool.count += 1; + + return @enumFromInt(index); + } + + /// Returns the string in the pool. + pub fn get(pool: *const Pool, string: String) [:0]const u8 { + const storage = pool.data.items; + const index: usize = @intFromEnum(string); + std.debug.assert(index < storage.len); + const slice = std.mem.sliceTo(storage[index..], 0); + return slice.ptr[0..slice.len :0]; + } + + pub fn format(pool: Pool, fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = fmt; + _ = options; + try writer.print("StringPool(count={}, size={:.2f})", .{ + pool.count, + std.fmt.fmtIntSizeBin(pool.data.items.len), + }); + } +}; + +/// Very simplistic string deduplicator, returns the same slice for each string. +/// Does only perform deduplication, no fancy storage strategy. +pub const Dedupe = struct { + arena: std.heap.ArenaAllocator, + items: std.StringHashMapUnmanaged(void), + + pub fn init(allocator: std.mem.Allocator) Dedupe { + return Dedupe{ + .arena = std.heap.ArenaAllocator.init(allocator), + .items = .{}, + }; + } + + pub fn deinit(cache: *Dedupe) void { + cache.items.deinit(cache.arena.child_allocator); + cache.arena.deinit(); + cache.* = undefined; + } + + /// Gets or inserts a string into the cache. `string` might be a short-lived value, + /// the returned value is guaranteed to have the livetime of the string cache. 
+ pub fn fetch(cache: *Dedupe, string: []const u8) ![]const u8 { + const allocator = cache.arena.child_allocator; + const gop = try cache.items.getOrPut(allocator, string); + if (!gop.found_existing) { + errdefer _ = cache.items.remove(string); + gop.key_ptr.* = try cache.arena.allocator().dupe(u8, string); + } + return gop.key_ptr.*; + } +}; + +test Pool { + var pool = try Pool.init(std.testing.allocator); + defer pool.deinit(); + + try std.testing.expectEqualStrings("", pool.get(.empty)); + + try std.testing.expectEqual(String.empty, try pool.insert("")); + + const a = try pool.insert("hello, world!"); + const b = try pool.insert("world!"); // suffix of a + const c = try pool.insert("world"); // non-suffix + + // All strings must be unique: + try std.testing.expect(a != b); + try std.testing.expect(a != c); + try std.testing.expect(b != c); + + // But must retain their qualities: + try std.testing.expectEqualStrings("hello, world!", pool.get(a)); + try std.testing.expectEqualStrings("world!", pool.get(b)); + try std.testing.expectEqualStrings("world", pool.get(c)); + + // sequential inserts may never return different values: + try std.testing.expectEqual(a, try pool.insert("hello, world!")); + try std.testing.expectEqual(a, try pool.insert("hello, world!")); + try std.testing.expectEqual(a, try pool.insert("hello, world!")); + try std.testing.expectEqual(a, try pool.insert("hello, world!")); +} diff --git a/test/analysis/accept/match-literal-rule.ptk b/test/analysis/accept/match-literal-rule.ptk new file mode 100644 index 0000000..3cda9a8 --- /dev/null +++ b/test/analysis/accept/match-literal-rule.ptk @@ -0,0 +1,2 @@ +# This file contains a single rule with no well-defined start point: +rule basic = "basic"; \ No newline at end of file diff --git a/test/analysis/accept/match-literal-sequence.ptk b/test/analysis/accept/match-literal-sequence.ptk new file mode 100644 index 0000000..555a2dc --- /dev/null +++ b/test/analysis/accept/match-literal-sequence.ptk @@ -0,0 +1,2 @@ +# This file contains a single rule with no well-defined start point: +rule basic = "basic" "words" "after" "another"; \ No newline at end of file From f53a16d8db5ab4f9f12c0243dc4a70c7be25972c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Wed, 1 Nov 2023 15:28:18 +0100 Subject: [PATCH 03/20] Improves error reporting --- src/ptkgen/ast_dump.zig | 127 +++++++++++++++++++++++++++++++++++++ src/ptkgen/main.zig | 87 +------------------------- src/ptkgen/parser.zig | 134 ++++++++++++++++++++++++++++------------ 3 files changed, 224 insertions(+), 124 deletions(-) create mode 100644 src/ptkgen/ast_dump.zig diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig new file mode 100644 index 0000000..835dadd --- /dev/null +++ b/src/ptkgen/ast_dump.zig @@ -0,0 +1,127 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const ast = @import("ast.zig"); +const parser = @import("parser.zig"); + +pub fn dump(strings: *const ptk.strings.Pool, decls: parser.Document) void { + var printer = AstPrinter{ + .strings = strings, + }; + + printer.dumpRoot(decls.top_level_declarations); +} + +const AstPrinter = struct { + const print = std.debug.print; + + strings: *const ptk.strings.Pool, + + fn dumpRoot(printer: AstPrinter, decls: ast.List(ast.TopLevelDeclaration)) void { + print("ast dump:\n", .{}); + + var iter = ast.iterate(decls); + while (iter.next()) |decl| { + switch (decl) { + .start => |item| print("start {}\n", .{printer.fmtId(item.identifier)}), + + .rule => |rule| { + 
print("rule {s}", .{printer.fmtId(rule.name.value)}); + + if (rule.ast_type) |ast_type| { + print(" : ", .{}); + printer.dumpAstType(ast_type); + } + + print(" = \n", .{}); + + var prods = ast.iterate(rule.productions); + var first = true; + while (prods.next()) |prod| { + defer first = false; + if (!first) { + print(" | ", .{}); + } else { + print(" ", .{}); + } + printer.dumpMappedProd(prod); + } + + print("\n;\n", .{}); + }, + + .node => |node| { + print("node {s}", .{printer.fmtId(node.name.value)}); + print(";\n", .{}); + }, + } + } + } + + fn dumpAstType(printer: AstPrinter, typespec: ast.TypeSpec) void { + _ = printer; + _ = typespec; + std.debug.print("", .{}); + } + + fn dumpMappedProd(printer: AstPrinter, mapped_prod: ast.MappedProduction) void { + printer.dumpProd(mapped_prod.production); + + if (mapped_prod.mapping) |mapping| { + printer.dumpMapping(mapping); + } + } + + fn dumpProd(printer: AstPrinter, production: ast.Production) void { + switch (production) { + .literal => |lit| print("\"{}\"", .{printer.fmtString(lit.value)}), + .terminal => |term| print("<{}>", .{printer.fmtId(term.identifier)}), + .recursion => print("", .{}), + .sequence => |seq| { + print("(", .{}); + + var iter = ast.iterate(seq); + while (iter.next()) |item| { + print(" ", .{}); + printer.dumpProd(item); + } + + print(" )", .{}); + }, + .optional => print("", .{}), + .repetition_zero => print("", .{}), + .repetition_one => print("", .{}), + } + } + + fn dumpMapping(printer: AstPrinter, mapping: ast.AstMapping) void { + _ = printer; + _ = mapping; + print("", .{}); + } + + fn fmtString(printer: AstPrinter, str: ptk.strings.String) StringPrinter { + return StringPrinter{ .printer = printer, .str = str, .mode = .text }; + } + + fn fmtId(printer: AstPrinter, str: ptk.strings.String) StringPrinter { + return StringPrinter{ .printer = printer, .str = str, .mode = .id }; + } + + const StringPrinter = struct { + printer: AstPrinter, + str: ptk.strings.String, + mode: enum { id, text }, + + pub fn format(strpr: StringPrinter, fmt: []const u8, opt: std.fmt.FormatOptions, writer: anytype) !void { + _ = opt; + _ = fmt; + + const text = strpr.printer.strings.get(strpr.str); + switch (strpr.mode) { + .id => try writer.print("\"{}\"", .{std.zig.fmtId(text)}), + .text => try writer.print("\"{}\"", .{std.zig.fmtEscapes(text)}), + } + } + }; +}; diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 852fa0f..15316ac 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -8,6 +8,7 @@ const ptk = @import("parser-toolkit"); const ast = @import("ast.zig"); const parser = @import("parser.zig"); +const ast_dump = @import("ast_dump.zig"); comptime { // reference for unit tests: @@ -165,94 +166,10 @@ fn compileFile( ); defer tree.deinit(); - dumpAst(string_pool, tree.top_level_declarations); - if (mode == .parse_only) { // we're done if we're here return; } -} - -fn dumpAst(strings: *const ptk.strings.Pool, decls: ast.List(ast.TopLevelDeclaration)) void { - std.debug.print("ast dump:\n", .{}); - - var iter = ast.iterate(decls); - while (iter.next()) |decl| { - switch (decl) { - .start => |item| std.debug.print("start {s}\n", .{strings.get(item.identifier)}), - - .rule => |rule| { - std.debug.print("rule {s}", .{strings.get(rule.name.value)}); - - if (rule.ast_type) |ast_type| { - std.debug.print(" : ", .{}); - dumpAstType(strings, ast_type); - } - - std.debug.print(" = \n", .{}); - - var prods = ast.iterate(rule.productions); - var first = true; - while (prods.next()) |prod| { - defer first = false; - if (!first) { 
- std.debug.print(" | ", .{}); - } else { - std.debug.print(" ", .{}); - } - dumpMappedProd(strings, prod); - } - - std.debug.print("\n;\n", .{}); - }, - - .node => |node| { - std.debug.print("node {s}", .{strings.get(node.name.value)}); - - std.debug.print(";\n", .{}); - }, - } - } -} - -fn dumpAstType(strings: *const ptk.strings.Pool, typespec: ast.TypeSpec) void { - _ = strings; - _ = typespec; - std.debug.print("", .{}); -} - -fn dumpMappedProd(strings: *const ptk.strings.Pool, mapped_prod: ast.MappedProduction) void { - dumpProd(strings, mapped_prod.production); - - if (mapped_prod.mapping) |mapping| { - dumpMapping(strings, mapping); - } -} - -fn dumpProd(strings: *const ptk.strings.Pool, production: ast.Production) void { - switch (production) { - .literal => |lit| std.debug.print("\"{}\"", .{std.zig.fmtEscapes(strings.get(lit.value))}), - .terminal => |term| std.debug.print("<{}>", .{std.zig.fmtId(strings.get(term.identifier))}), - .recursion => std.debug.print("", .{}), - .sequence => |seq| { - std.debug.print("(", .{}); - - var iter = ast.iterate(seq); - while (iter.next()) |item| { - std.debug.print(" ", .{}); - dumpProd(strings, item); - } - - std.debug.print(" )", .{}); - }, - .optional => std.debug.print("", .{}), - .repetition_zero => std.debug.print("", .{}), - .repetition_one => std.debug.print("", .{}), - } -} -fn dumpMapping(strings: *const ptk.strings.Pool, mapping: ast.AstMapping) void { - _ = strings; - _ = mapping; - std.debug.print("", .{}); + ast_dump.dump(string_pool, tree); } diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 5a5167a..3d6bfe0 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -36,14 +36,6 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, string }; const document_node = parser.acceptDocument() catch |err| switch (err) { - error.UnexpectedCharacter => { - try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ - fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), - }); - return error.SyntaxError; - }, - - error.EndOfStream, error.UnexpectedToken => @panic("Error handling is fucked up, something escaped"), // Unrecoverable syntax error, must have created diagnostics already error.SyntaxError => |e| { @@ -163,15 +155,18 @@ const Parser = struct { } fn acceptTopLevelDecl(parser: *Parser) !?ast.TopLevelDeclaration { - if (parser.acceptLiteral(.rule)) |_| { - return .{ - .rule = try parser.acceptRule(), - }; + if (parser.acceptRule()) |rule| { + return .{ .rule = rule }; } else |err| try filterAcceptError(err); // Detect any excess tokens on the top level: - if (try parser.core.nextToken()) |token| { - try parser.emitDiagnostic(null, "Unexpected token '{}'", .{fmtEscapes(token.text)}); + const excess_tokens = if (parser.core.nextToken()) |token| + (token != null) + else |err| switch (err) { + error.UnexpectedCharacter => true, + }; + if (excess_tokens) { + try parser.emitDiagnostic(null, "Unexpected end of file", .{}); return error.SyntaxError; } @@ -182,14 +177,16 @@ const Parser = struct { var state = parser.save(); errdefer parser.restore(state); - const identifier = try parser.acceptIdentifier(); + try parser.acceptLiteral(.rule, .recover); + + const identifier = try parser.acceptIdentifier(.fail); - const rule_type = if (parser.acceptLiteral(.@":")) + const rule_type = if (try parser.tryAcceptLiteral(.@":")) try parser.acceptTypeSpec() - else |_| + else null; - try parser.acceptLiteral(.@"="); + try parser.acceptLiteral(.@"=", .fail); var list: 
ast.List(ast.MappedProduction) = .{}; @@ -199,11 +196,11 @@ const Parser = struct { try parser.append(ast.MappedProduction, &list, production); // TODO: Improve error reporting here - if (parser.acceptLiteral(.@";")) { + if (try parser.tryAcceptLiteral(.@";")) { break; - } else |_| {} + } - try parser.acceptLiteral(.@"|"); + try parser.acceptLiteral(.@"|", .fail); } return ast.Rule{ @@ -216,9 +213,9 @@ const Parser = struct { fn acceptMappedProduction(parser: *Parser) !ast.MappedProduction { var sequence = try parser.acceptProductionSequence(); - const mapping = if (parser.acceptLiteral(.@"=>")) + const mapping = if (try parser.tryAcceptLiteral(.@"=>")) try parser.acceptAstMapping() - else |_| + else null; return ast.MappedProduction{ @@ -237,8 +234,8 @@ const Parser = struct { if (parser.acceptProduction()) |prod| { try parser.append(ast.Production, &list, prod); } else |err| switch (err) { - error.UnexpectedToken => break, - else => |e| return e, + error.UnexpectedTokenRecoverable => break, + error.OutOfMemory, error.SyntaxError => |e| return e, } } @@ -246,7 +243,7 @@ const Parser = struct { } fn acceptProduction(parser: *Parser) !ast.Production { - const str = try parser.acceptStringLiteral(); + const str = try parser.acceptStringLiteral(.recover); return ast.Production{ .literal = str, @@ -255,16 +252,16 @@ const Parser = struct { fn acceptAstMapping(parser: *Parser) !ast.AstMapping { _ = parser; - return error.UnexpectedToken; + @panic("not implemented yet"); } fn acceptTypeSpec(parser: *Parser) !ast.TypeSpec { _ = parser; - return error.UnexpectedToken; + @panic("not implemented yet"); } - fn acceptStringLiteral(parser: *Parser) !ast.StringLiteral { - const token = try parser.core.accept(RS.is(.string_literal)); + fn acceptStringLiteral(parser: *Parser, accept_mode: AcceptMode) !ast.StringLiteral { + const token = try parser.acceptToken(.string_literal, accept_mode); std.debug.assert(token.text.len >= 2); @@ -274,8 +271,8 @@ const Parser = struct { }; } - fn acceptIdentifier(parser: *Parser) !ast.Identifier { - const token = try parser.core.accept(RS.is(.identifier)); + fn acceptIdentifier(parser: *Parser, accept_mode: AcceptMode) !ast.Identifier { + const token = try parser.acceptToken(.identifier, accept_mode); return ast.Identifier{ .location = token.location, @@ -283,10 +280,69 @@ const Parser = struct { }; } - fn acceptLiteral(parser: *Parser, comptime token_type: TokenType) !void { - _ = try parser.core.accept(RS.is(token_type)); + fn acceptLiteral(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) !void { + _ = try parser.acceptToken(token_type, accept_mode); + } + + fn tryAcceptLiteral(parser: *Parser, comptime token_type: TokenType) !bool { + _ = parser.acceptToken(token_type, .recover) catch |err| switch (err) { + error.UnexpectedTokenRecoverable => return false, + error.OutOfMemory, error.SyntaxError => |e| return e, + }; + return true; + } + + /// Tries to accept a given token and will emit a diagnostic if it fails. 
+ fn acceptToken(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) !Token { + const saved_state = parser.save(); + errdefer parser.restore(saved_state); + + const source_offset = parser.core.tokenizer.offset; + const location = parser.core.tokenizer.current_location; + + if (parser.core.accept(RS.any)) |token| { + // std.log.debug("token trace: {}", .{token}); + + if (token.type != token_type) { + switch (accept_mode) { + .fail => { + try parser.emitDiagnostic(location, "Expected token {s}, but discovered token {s} ('{}')", .{ + @tagName(token_type), + @tagName(token.type), + std.zig.fmtEscapes(token.text), + }); + return error.SyntaxError; + }, + .recover => return error.UnexpectedTokenRecoverable, + } + } + return token; + } else |err| switch (err) { + error.UnexpectedToken => unreachable, // RS.any will always accept the token + error.EndOfStream => switch (accept_mode) { + .fail => { + try parser.emitDiagnostic(location, "Expected token {s}, but end of file was discovered", .{@tagName(token_type)}); + return error.SyntaxError; + }, + .recover => return error.UnexpectedTokenRecoverable, + }, + error.UnexpectedCharacter => { + try parser.emitDiagnostic(location, "Unexpected character: '{}'", .{ + fmtEscapes(parser.core.tokenizer.source[source_offset..][0..1]), + }); + return error.SyntaxError; + }, + } } + const AcceptMode = enum { + /// Will emit a syntax error with diagnostic + fail, + + /// Is recoverable + recover, + }; + // management: fn unwrapIdentifierString(parser: *Parser, loc: ptk.Location, raw: []const u8) !ptk.strings.String { @@ -374,8 +430,8 @@ const Parser = struct { // We're out of memory accepting some rule. We cannot recover from this. OutOfMemory, - // We found a character the tokenizer does not accept, we cannot recover from this ever. - UnexpectedCharacter, + // Something could not be accepted. 
+ SyntaxError, }; pub const AcceptError = FatalAcceptError || error{ @@ -384,17 +440,17 @@ const Parser = struct { EndOfStream, // The token stream contains an unexpected token, this is a syntax error - UnexpectedToken, + UnexpectedTokenRecoverable, }; fn filterAcceptError(err: AcceptError) FatalAcceptError!void { return switch (err) { error.EndOfStream, - error.UnexpectedToken, + error.UnexpectedTokenRecoverable, => {}, error.OutOfMemory, - error.UnexpectedCharacter, + error.SyntaxError, => |e| return e, }; } From b47acc5a6dd15049948df5e2b09a745847cfc3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Thu, 2 Nov 2023 11:39:52 +0100 Subject: [PATCH 04/20] Adds basic i18n, makes diagnostics be errorcode-based instead of using literals --- src/ptkgen/Diagnostics.zig | 249 +++++++++++++++++++++++++++++++++++++ src/ptkgen/intl.zig | 73 +++++++++++ src/ptkgen/intl/en.json | 33 +++++ src/ptkgen/main.zig | 18 +-- src/ptkgen/parser.zig | 74 ++++++----- 5 files changed, 406 insertions(+), 41 deletions(-) create mode 100644 src/ptkgen/Diagnostics.zig create mode 100644 src/ptkgen/intl.zig create mode 100644 src/ptkgen/intl/en.json diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig new file mode 100644 index 0000000..d5b75f5 --- /dev/null +++ b/src/ptkgen/Diagnostics.zig @@ -0,0 +1,249 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const intl = @import("intl.zig"); +const parser = @import("parser.zig"); + +const Diagnostics = @This(); + +pub const Code = enum(u16) { + pub const first_error = 1000; + pub const first_warning = 4000; + pub const first_note = 8000; + pub const last_item = 10000; + + out_of_memory = 1000, + file_limit_exceeded = 1001, + io_error = 1002, + + invalid_source_encoding = 1003, + unexpected_token_eof = 1004, + unexpected_token = 1005, + unexpected_character = 1006, + unexpected_eof = 1007, + + bad_string_escape = 1008, + + invalid_string_escape = 1009, + + excess_tokens = 1010, + + comptime { + std.debug.assert(first_error < first_warning); + std.debug.assert(first_warning < first_note); + std.debug.assert(first_note < last_item); + } + + pub fn isError(code: Code) bool { + const int = @intFromEnum(code); + return @intFromEnum(code) >= first_error and int < first_warning; + } + + pub fn isWarning(code: Code) bool { + const int = @intFromEnum(code); + return int >= first_warning and int < first_note; + } + + pub fn isNote(code: Code) bool { + const int = @intFromEnum(code); + return int >= first_note and int < last_item; + } +}; + +const NoDiagnosticData = struct {}; +pub fn Data(comptime code: Code) type { + return switch (code) { + .out_of_memory => NoDiagnosticData, + .file_limit_exceeded => NoDiagnosticData, + .io_error => struct { error_code: intl.FormattableError }, + + .unexpected_token_eof => struct { + expected_type: parser.TokenType, + }, + .unexpected_token => struct { + expected_type: parser.TokenType, + actual_type: parser.TokenType, + actual_text: []const u8, + }, + .unexpected_eof => NoDiagnosticData, + + .invalid_source_encoding => NoDiagnosticData, + .unexpected_character => struct { character: u21 }, + + .bad_string_escape => NoDiagnosticData, + .invalid_string_escape => struct { escape: u21 }, + .excess_tokens => struct { token_type: parser.TokenType }, + + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), + }; +} + +pub const Message = struct { + level: ptk.Error.Level, + location: ptk.Location, + text: []const u8, +}; + 
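+// Wrapped parser-toolkit diagnostics sink: `emit` (below) looks up the localized
+// format string for a `Code`, renders the typed `Data(code)` payload into it, and
+// records the message with an "E"/"W"/"D" plus four-digit code prefix. For example,
+// `try diag.emit(loc, .unexpected_eof, .{})` records "E1007: Unexpected end of file".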
+inner: ptk.Diagnostics, + +pub fn init(allocator: std.mem.Allocator) Diagnostics { + return Diagnostics{ + .inner = ptk.Diagnostics.init(allocator), + }; +} + +pub fn deinit(diag: *Diagnostics) void { + diag.inner.deinit(); + diag.* = undefined; +} + +pub fn hasErrors(diag: Diagnostics) bool { + return diag.inner.hasErrors(); +} + +pub fn hasWarnings(diag: Diagnostics) bool { + return diag.inner.hasWarnings(); +} + +fn Formatter(comptime T: type) type { + return switch (T) { + // text and unicode: + []const u8 => struct { + // TODO: Distinct between "string body" and "string literal" + + value: T, + + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + try writer.print("{}", .{std.zig.fmtEscapes(item.value)}); + } + }, + + u21 => struct { + value: T, + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + + if (item.value < 0x80) { + const ascii: u8 = @intCast(item.value); + + if (std.ascii.isPrint(ascii)) { + try writer.print("{c}", .{ascii}); + } else { + try writer.print("[nonprint: 0x{X:0>2}]", .{ascii}); + } + } else { + var buf: [4]u8 = undefined; + if (std.unicode.utf8Encode(item.value, &buf)) |len| { + try writer.print("{s}", .{buf[0..len]}); + } else |_| { + try writer.print("4}>", .{item.value}); + } + } + } + }, + + // enums: + parser.TokenType => struct { + value: T, + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + try writer.print("{s}", .{@tagName(item.value)}); + } + }, + + intl.FormattableError => struct { + value: T, + + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + + inline for (@typeInfo(intl.FormattableError).ErrorSet.?) 
|err| { + if (item.value == @field(intl.FormattableError, err.name)) { + try writer.writeAll(@field(intl.localization.errors, err.name)); + return; + } + } else unreachable; + } + }, + + else => @compileError(std.fmt.comptimePrint("{s} is not a supported diagnostic type!", .{@typeName(T)})), + }; +} + +fn createFormatter(comptime T: type, value: T) Formatter(T) { + return Formatter(T){ .value = value }; +} + +fn FormattedData(comptime code: Code) type { + const Field = std.builtin.Type.StructField; + const D = Data(code); + + const src_fields = @typeInfo(D).Struct.fields; + + var dst_fields: [src_fields.len]Field = undefined; + + for (&dst_fields, src_fields) |*dst, src| { + dst.* = .{ + .name = src.name, + .type = Formatter(src.type), + .default_value = null, + .is_comptime = false, + .alignment = @alignOf(Formatter(src.type)), + }; + } + + return @Type(.{ + .Struct = .{ + .layout = .Auto, + .fields = &dst_fields, + .decls = &.{}, + .is_tuple = false, + }, + }); +} + +fn formatData(comptime code: Code, params: Data(code)) FormattedData(code) { + var formatted: FormattedData(code) = undefined; + inline for (std.meta.fields(Data(code))) |fld| { + @field(formatted, fld.name) = createFormatter(fld.type, @field(params, fld.name)); + } + return formatted; +} + +pub fn emit(diag: *Diagnostics, location: ptk.Location, comptime code: Code, params: Data(code)) error{OutOfMemory}!void { + const level = if (code.isError()) + ptk.Error.Level.@"error" + else if (code.isWarning()) + ptk.Error.Level.warning + else if (code.isNote()) + ptk.Error.Level.info + else + unreachable; + + const fmt_string = @field(intl.localization.diagnostics, @tagName(code)); + + var stack_fallback = std.heap.stackFallback(1024, diag.inner.memory.allocator()); + const stack_fallback_allocator = stack_fallback.get(); + + const formatted_params = formatData(code, params); + + const message_text = try std.fmt.allocPrint(stack_fallback_allocator, fmt_string, formatted_params); + defer stack_fallback_allocator.free(message_text); + + const code_prefix = switch (level) { + .@"error" => "E", + .warning => "W", + .info => "D", + }; + + try diag.inner.emit(location, level, "{s}{d:0>4}: {s}", .{ code_prefix, @intFromEnum(code), message_text }); +} + +pub fn render(diag: Diagnostics, stream: anytype) !void { + try diag.inner.print(stream); +} diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig new file mode 100644 index 0000000..51623eb --- /dev/null +++ b/src/ptkgen/intl.zig @@ -0,0 +1,73 @@ +const std = @import("std"); + +pub const Language = enum { + en, +}; + +pub const language: Language = .en; + +pub const localization = @field(localizations, @tagName(language)); + +pub const localizations = struct { + pub const en = Localization.generate(@embedFile("intl/en.json")); +}; + +pub const FormattableError = blk: { + const list = @typeInfo(std.meta.fieldInfo(Localization, .errors).type).Struct.fields; + + var errors: [list.len]std.builtin.Type.Error = undefined; + for (&errors, list) |*dst, src| { + dst.* = .{ .name = src.name }; + } + + break :blk @Type(.{ + .ErrorSet = &errors, + }); +}; + +pub const Localization = struct { + diagnostics: struct { + out_of_memory: []const u8, + file_limit_exceeded: []const u8, + io_error: []const u8, + invalid_source_encoding: []const u8, + unexpected_token_eof: []const u8, + unexpected_token: []const u8, + unexpected_character: []const u8, + unexpected_eof: []const u8, + bad_string_escape: []const u8, + invalid_string_escape: []const u8, + excess_tokens: []const u8, + }, + + errors: struct { + 
Unexpected: []const u8, + + OutOfMemory: []const u8, + + InputOutput: []const u8, + AccessDenied: []const u8, + BrokenPipe: []const u8, + SystemResources: []const u8, + OperationAborted: []const u8, + WouldBlock: []const u8, + ConnectionResetByPeer: []const u8, + IsDir: []const u8, + ConnectionTimedOut: []const u8, + NotOpenForReading: []const u8, + NetNameDeleted: []const u8, + + StreamTooLong: []const u8, + SyntaxError: []const u8, + InvalidSourceEncoding: []const u8, + }, + + pub fn generate(comptime buffer: []const u8) Localization { + @setEvalBranchQuota(1_000_000); + + var alloc_buf: [buffer.len]u8 = undefined; + var fba = std.heap.FixedBufferAllocator.init(&alloc_buf); + + return std.json.parseFromSliceLeaky(Localization, fba.allocator(), buffer, .{}) catch @compileError("failed to parse json"); + } +}; diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json new file mode 100644 index 0000000..72ed39a --- /dev/null +++ b/src/ptkgen/intl/en.json @@ -0,0 +1,33 @@ +{ + "diagnostics": { + "out_of_memory": "Out of memory", + "file_limit_exceeded": "Input file exceeds maximum file size", + "io_error": "I/O error: {[error_code]}", + "invalid_source_encoding": "Invalid source code encoding detected", + "unexpected_token_eof": "Expected token {[expected_type]}, but end of file was discovered", + "unexpected_token": "Expected token {[expected_type]}, but discovered token {[actual_type]} ('{[actual_text]}')", + "unexpected_character": "Unexpected character: '{[character]}'", + "unexpected_eof": "Unexpected end of file", + "bad_string_escape": "Invalid string escape: Escape sequence at the end of string", + "invalid_string_escape": "Invalid string escape \\{[escape]}", + "excess_tokens": "Excess token at the end of the file: {[token_type]}" + }, + "errors": { + "Unexpected": "unexpected error encountered", + "OutOfMemory": "out of memory", + "InputOutput": "input output", + "AccessDenied": "access denied", + "BrokenPipe": "broken pipe", + "SystemResources": "system resources", + "OperationAborted": "operation aborted", + "WouldBlock": "would block", + "ConnectionResetByPeer": "connection reset by peer", + "IsDir": "path points to directory", + "ConnectionTimedOut": "connection timed out", + "NotOpenForReading": "not open for reading", + "NetNameDeleted": "net name deleted", + "StreamTooLong": "stream too long", + "SyntaxError": "syntax error", + "InvalidSourceEncoding": "invalid source encoding" + } +} \ No newline at end of file diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 15316ac..b5e2741 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -10,6 +10,8 @@ const ast = @import("ast.zig"); const parser = @import("parser.zig"); const ast_dump = @import("ast_dump.zig"); +const Diagnostics = @import("Diagnostics.zig"); + comptime { // reference for unit tests: _ = parser; @@ -45,6 +47,8 @@ const TestMode = enum { }; pub fn main() !u8 { + // errdefer |e| @compileLog(@TypeOf(e)); + var stdout = std.io.getStdOut(); var stdin = std.io.getStdIn(); var stderr = std.io.getStdErr(); @@ -69,7 +73,7 @@ pub fn main() !u8 { var string_pool = try ptk.strings.Pool.init(dynamic_allocator); defer string_pool.deinit(); - var diagnostics = ptk.Diagnostics.init(dynamic_allocator); + var diagnostics = Diagnostics.init(dynamic_allocator); defer diagnostics.deinit(); var input_file = switch (cli.positionals.len) { @@ -102,14 +106,14 @@ pub fn main() !u8 { cli.options.test_mode, ) catch |err| switch (err) { // syntax errors must produce diagnostics: - error.SyntaxError => 
std.debug.assert(diagnostics.hasErrors()), + error.SyntaxError, error.InvalidSourceEncoding => std.debug.assert(diagnostics.hasErrors()), error.OutOfMemory => { try diagnostics.emit(.{ .source = file_name, .line = 1, .column = 1, - }, .@"error", "out of memory", .{}); + }, .out_of_memory, .{}); }, error.StreamTooLong => { @@ -117,7 +121,7 @@ pub fn main() !u8 { .source = file_name, .line = 1, .column = 1, - }, .@"error", "input file too large", .{}); + }, .file_limit_exceeded, .{}); }, error.InputOutput, @@ -137,11 +141,11 @@ pub fn main() !u8 { .source = file_name, .line = 1, .column = 1, - }, .@"error", "i/o error: {s}", .{@errorName(err)}); + }, .io_error, .{ .error_code = err }); }, }; - try diagnostics.print(stderr.writer()); + try diagnostics.render(stderr.writer()); return if (diagnostics.hasErrors()) 1 @@ -151,7 +155,7 @@ pub fn main() !u8 { fn compileFile( allocator: std.mem.Allocator, - diagnostics: *ptk.Diagnostics, + diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, input_file: std.fs.File, file_name: []const u8, diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 3d6bfe0..0b462c3 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -2,6 +2,8 @@ const std = @import("std"); const ptk = @import("parser-toolkit"); const ast = @import("ast.zig"); +const Diagnostics = @import("Diagnostics.zig"); + const fmtEscapes = std.zig.fmtEscapes; pub const Document = struct { @@ -18,7 +20,7 @@ pub const Document = struct { } }; -pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, stream: anytype) !Document { +pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, stream: anytype) !Document { var arena = std.heap.ArenaAllocator.init(allocator); errdefer arena.deinit(); @@ -38,7 +40,7 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, string const document_node = parser.acceptDocument() catch |err| switch (err) { // Unrecoverable syntax error, must have created diagnostics already - error.SyntaxError => |e| { + error.SyntaxError, error.InvalidSourceEncoding => |e| { std.debug.assert(diagnostics.hasErrors()); return e; }, @@ -48,13 +50,11 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *ptk.Diagnostics, string if (tokenizer.next()) |token_or_null| { if (token_or_null) |token| { - try diagnostics.emit(token.location, .@"error", "Excess token at the end of the file: {s}", .{@tagName(token.type)}); + try diagnostics.emit(token.location, .excess_tokens, .{ .token_type = token.type }); return error.SyntaxError; } } else |_| { - try diagnostics.emit(tokenizer.current_location, .@"error", "Unexpected character: '{}'", .{ - fmtEscapes(tokenizer.source[tokenizer.offset..][0..1]), - }); + try parser.emitUnexpectedCharacter(tokenizer.current_location, tokenizer.offset); return error.SyntaxError; } @@ -133,7 +133,7 @@ const Parser = struct { core: ParserCore, arena: std.mem.Allocator, pool: *ptk.strings.Pool, - diagnostics: *ptk.Diagnostics, + diagnostics: *Diagnostics, pub fn acceptDocument(parser: *Parser) !ast.Document { var doc = ast.Document{}; @@ -149,9 +149,23 @@ const Parser = struct { return doc; } - fn emitDiagnostic(parser: *Parser, loc: ?ptk.Location, comptime fmt: []const u8, args: anytype) !void { + fn emitDiagnostic(parser: Parser, loc: ?ptk.Location, comptime code: Diagnostics.Code, data: Diagnostics.Data(code)) !void { // Anything detected here is always an error - try 
parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, .@"error", fmt, args); + std.debug.assert(code.isError()); + try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, code, data); + } + + fn emitUnexpectedCharacter(parser: Parser, location: ptk.Location, source_offset: usize) !void { + var utf8_view = std.unicode.Utf8View.init(parser.core.tokenizer.source[source_offset..]) catch { + try parser.emitDiagnostic(location, .invalid_source_encoding, .{}); + return error.InvalidSourceEncoding; + }; + + var iter = utf8_view.iterator(); + + try parser.emitDiagnostic(location, .unexpected_character, .{ + .character = iter.nextCodepoint() orelse @panic("very unexpected end of file"), + }); } fn acceptTopLevelDecl(parser: *Parser) !?ast.TopLevelDeclaration { @@ -166,7 +180,7 @@ const Parser = struct { error.UnexpectedCharacter => true, }; if (excess_tokens) { - try parser.emitDiagnostic(null, "Unexpected end of file", .{}); + try parser.emitDiagnostic(null, .unexpected_eof, .{}); return error.SyntaxError; } @@ -235,7 +249,7 @@ const Parser = struct { try parser.append(ast.Production, &list, prod); } else |err| switch (err) { error.UnexpectedTokenRecoverable => break, - error.OutOfMemory, error.SyntaxError => |e| return e, + error.OutOfMemory, error.InvalidSourceEncoding, error.SyntaxError => |e| return e, } } @@ -287,13 +301,13 @@ const Parser = struct { fn tryAcceptLiteral(parser: *Parser, comptime token_type: TokenType) !bool { _ = parser.acceptToken(token_type, .recover) catch |err| switch (err) { error.UnexpectedTokenRecoverable => return false, - error.OutOfMemory, error.SyntaxError => |e| return e, + error.OutOfMemory, error.InvalidSourceEncoding, error.SyntaxError => |e| return e, }; return true; } /// Tries to accept a given token and will emit a diagnostic if it fails. 
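+    /// With `accept_mode == .fail`, a mismatch emits a diagnostic and returns
+    /// `error.SyntaxError`; with `.recover`, the parser state is rolled back and
+    /// `error.UnexpectedTokenRecoverable` is returned so the caller can try an alternative.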
- fn acceptToken(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) !Token { + fn acceptToken(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) AcceptError!Token { const saved_state = parser.save(); errdefer parser.restore(saved_state); @@ -306,10 +320,10 @@ const Parser = struct { if (token.type != token_type) { switch (accept_mode) { .fail => { - try parser.emitDiagnostic(location, "Expected token {s}, but discovered token {s} ('{}')", .{ - @tagName(token_type), - @tagName(token.type), - std.zig.fmtEscapes(token.text), + try parser.emitDiagnostic(location, .unexpected_token, .{ + .expected_type = token_type, + .actual_type = token.type, + .actual_text = token.text, }); return error.SyntaxError; }, @@ -321,15 +335,13 @@ const Parser = struct { error.UnexpectedToken => unreachable, // RS.any will always accept the token error.EndOfStream => switch (accept_mode) { .fail => { - try parser.emitDiagnostic(location, "Expected token {s}, but end of file was discovered", .{@tagName(token_type)}); + try parser.emitDiagnostic(location, .unexpected_token_eof, .{ .expected_type = token_type }); return error.SyntaxError; }, .recover => return error.UnexpectedTokenRecoverable, }, error.UnexpectedCharacter => { - try parser.emitDiagnostic(location, "Unexpected character: '{}'", .{ - fmtEscapes(parser.core.tokenizer.source[source_offset..][0..1]), - }); + try parser.emitUnexpectedCharacter(location, source_offset); return error.SyntaxError; }, } @@ -369,7 +381,7 @@ const Parser = struct { if (c == '\\') { i += 1; if (i >= raw.len) { - try parser.emitDiagnostic(loc, "Invalid string escape: Missing escaped character!", .{}); + try parser.emitDiagnostic(loc, .bad_string_escape, .{}); return error.SyntaxError; } const escape = raw[i]; @@ -387,11 +399,7 @@ const Parser = struct { '0'...'3' => @panic("Implement octal escape \\???"), else => { - if (std.ascii.isPrint(c)) { - try parser.emitDiagnostic(loc, "Invalid string escape \\{c}", .{escape}); - } else { - try parser.emitDiagnostic(loc, "Invalid string escape \\x{X:0>2}", .{escape}); - } + try parser.emitDiagnostic(loc, .invalid_string_escape, .{ .escape = escape }); return error.SyntaxError; }, }; @@ -432,25 +440,23 @@ const Parser = struct { // Something could not be accepted. 
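+        // Whenever this is returned, a diagnostic has already been emitted.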
SyntaxError, + + // The source code contained invalid bytes + InvalidSourceEncoding, }; pub const AcceptError = FatalAcceptError || error{ - - // The token stream is too short to accept this rule - EndOfStream, - // The token stream contains an unexpected token, this is a syntax error UnexpectedTokenRecoverable, }; fn filterAcceptError(err: AcceptError) FatalAcceptError!void { return switch (err) { - error.EndOfStream, - error.UnexpectedTokenRecoverable, - => {}, + error.UnexpectedTokenRecoverable => {}, error.OutOfMemory, error.SyntaxError, + error.InvalidSourceEncoding, => |e| return e, }; } From d7b0050adf3ffdd0feedc48d817905105a5662eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Thu, 2 Nov 2023 15:20:11 +0100 Subject: [PATCH 05/20] Starts creating ptkgen grammar grammar file, adds support for basic group constructs --- build.zig | 9 ++- examples/ptkgen/grammar.ptk | 38 ++++++++++ src/ptkgen/ast.zig | 8 +- src/ptkgen/ast_dump.zig | 6 +- src/ptkgen/parser.zig | 76 +++++++++++-------- .../analysis/accept/match-group-many-item.ptk | 1 + .../accept/match-group-many-sequence.ptk | 1 + test/analysis/accept/match-group-nested.ptk | 1 + test/analysis/accept/match-group-one-item.ptk | 1 + .../accept/match-group-one-sequence.ptk | 1 + test/analysis/accept/match-literal-rule.ptk | 1 - .../accept/match-literal-sequence-variant.ptk | 4 + .../accept/match-literal-sequence.ptk | 1 - .../accept/match-literal-variants.ptk | 1 + test/parser/accept/identifiers.ptk | 14 ++-- 15 files changed, 116 insertions(+), 47 deletions(-) create mode 100644 examples/ptkgen/grammar.ptk create mode 100644 test/analysis/accept/match-group-many-item.ptk create mode 100644 test/analysis/accept/match-group-many-sequence.ptk create mode 100644 test/analysis/accept/match-group-nested.ptk create mode 100644 test/analysis/accept/match-group-one-item.ptk create mode 100644 test/analysis/accept/match-group-one-sequence.ptk create mode 100644 test/analysis/accept/match-literal-sequence-variant.ptk create mode 100644 test/analysis/accept/match-literal-variants.ptk diff --git a/build.zig b/build.zig index d09acee..c6b0c55 100644 --- a/build.zig +++ b/build.zig @@ -94,11 +94,18 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/empty.ptk", "test/parser/accept/empty-with-comment-linefeed.ptk", "test/parser/accept/empty-with-comment.ptk", - // "test/parser/accept/identifiers.ptk", + "test/parser/accept/identifiers.ptk", // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-literal-rule.ptk", "test/analysis/accept/match-literal-sequence.ptk", + "test/analysis/accept/match-literal-variants.ptk", + "test/analysis/accept/match-literal-sequence-variant.ptk", + "test/analysis/accept/match-group-one-item.ptk", + "test/analysis/accept/match-group-one-sequence.ptk", + "test/analysis/accept/match-group-many-item.ptk", + "test/analysis/accept/match-group-many-sequence.ptk", + "test/analysis/accept/match-group-nested.ptk", }; diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk new file mode 100644 index 0000000..2177628 --- /dev/null +++ b/examples/ptkgen/grammar.ptk @@ -0,0 +1,38 @@ + + + +rule document = [ ]* ; + +rule top_level = + + | + | + | +; + +# rule start_decl = "start" $rule_ref ";" ; + +# rule token_decl = "token" $identifier "=" ";" ; + +# rule node_decl = "node" $identifier "=" ";" ; + +rule rule_decl = "rule" $identifier ( ":" )? 
"=" ";" ; + +rule mapped_productions = ( "|" )* ; + +rule mapped_production = ( "=>" )? ; + +rule production_sequence = ( )+; + +rule production = + $string_literal + | $rule_ref + | "(" ")" "?" + | "(" ")" "*" + | "(" ")" "+" + | "(" ")" +; + +rule mapping = + # TODO +; \ No newline at end of file diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index 0bda109..821e13e 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -108,10 +108,10 @@ pub const Production = union(enum) { literal: StringLiteral, // "text" terminal: TokenRef, // $token recursion: RuleRef, // - sequence: List(Production), // ( ... ) - optional: *Production, // ...? - repetition_zero: *Production, // [ ... ]* - repetition_one: *Production, // [ ... ]+ + sequence: List(Production), // ... + optional: List(Production), // ( ... )? + repetition_zero: List(Production), // [ ... ]* + repetition_one: List(Production), // [ ... ]+ }; pub const AstMapping = union(enum) { diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 835dadd..2b8f08a 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -40,7 +40,7 @@ const AstPrinter = struct { while (prods.next()) |prod| { defer first = false; if (!first) { - print(" | ", .{}); + print("\n | ", .{}); } else { print(" ", .{}); } @@ -119,8 +119,8 @@ const AstPrinter = struct { const text = strpr.printer.strings.get(strpr.str); switch (strpr.mode) { - .id => try writer.print("\"{}\"", .{std.zig.fmtId(text)}), - .text => try writer.print("\"{}\"", .{std.zig.fmtEscapes(text)}), + .id => try writer.print("{}", .{std.zig.fmtId(text)}), + .text => try writer.print("{}", .{std.zig.fmtEscapes(text)}), } } }; diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 0b462c3..fd8665b 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -135,7 +135,7 @@ const Parser = struct { pool: *ptk.strings.Pool, diagnostics: *Diagnostics, - pub fn acceptDocument(parser: *Parser) !ast.Document { + pub fn acceptDocument(parser: *Parser) FatalAcceptError!ast.Document { var doc = ast.Document{}; while (true) { @@ -149,26 +149,7 @@ const Parser = struct { return doc; } - fn emitDiagnostic(parser: Parser, loc: ?ptk.Location, comptime code: Diagnostics.Code, data: Diagnostics.Data(code)) !void { - // Anything detected here is always an error - std.debug.assert(code.isError()); - try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, code, data); - } - - fn emitUnexpectedCharacter(parser: Parser, location: ptk.Location, source_offset: usize) !void { - var utf8_view = std.unicode.Utf8View.init(parser.core.tokenizer.source[source_offset..]) catch { - try parser.emitDiagnostic(location, .invalid_source_encoding, .{}); - return error.InvalidSourceEncoding; - }; - - var iter = utf8_view.iterator(); - - try parser.emitDiagnostic(location, .unexpected_character, .{ - .character = iter.nextCodepoint() orelse @panic("very unexpected end of file"), - }); - } - - fn acceptTopLevelDecl(parser: *Parser) !?ast.TopLevelDeclaration { + fn acceptTopLevelDecl(parser: *Parser) FatalAcceptError!?ast.TopLevelDeclaration { if (parser.acceptRule()) |rule| { return .{ .rule = rule }; } else |err| try filterAcceptError(err); @@ -187,7 +168,7 @@ const Parser = struct { return null; } - fn acceptRule(parser: *Parser) !ast.Rule { + fn acceptRule(parser: *Parser) AcceptError!ast.Rule { var state = parser.save(); errdefer parser.restore(state); @@ -224,7 +205,7 @@ const Parser = struct { }; } - fn acceptMappedProduction(parser: *Parser) !ast.MappedProduction { + 
fn acceptMappedProduction(parser: *Parser) AcceptError!ast.MappedProduction { var sequence = try parser.acceptProductionSequence(); const mapping = if (try parser.tryAcceptLiteral(.@"=>")) @@ -233,6 +214,7 @@ const Parser = struct { null; return ast.MappedProduction{ + // Auto-flatten the "tree" here if the top level production is a "sequence" of one .production = if (sequence.only()) |item| item else @@ -241,7 +223,7 @@ const Parser = struct { }; } - fn acceptProductionSequence(parser: *Parser) !ast.List(ast.Production) { + fn acceptProductionSequence(parser: *Parser) AcceptError!ast.List(ast.Production) { var list: ast.List(ast.Production) = .{}; while (true) { @@ -256,7 +238,22 @@ const Parser = struct { return list; } - fn acceptProduction(parser: *Parser) !ast.Production { + fn acceptProduction(parser: *Parser) AcceptError!ast.Production { + if (try parser.tryAcceptLiteral(.@"(")) { + var sequence = try parser.acceptProductionSequence(); + try parser.acceptLiteral(.@")", .fail); + + if (try parser.tryAcceptLiteral(.@"?")) { + return .{ .optional = sequence }; + } else if (try parser.tryAcceptLiteral(.@"+")) { + return .{ .repetition_one = sequence }; + } else if (try parser.tryAcceptLiteral(.@"*")) { + return .{ .repetition_zero = sequence }; + } else { + return .{ .sequence = sequence }; + } + } + const str = try parser.acceptStringLiteral(.recover); return ast.Production{ @@ -264,17 +261,17 @@ const Parser = struct { }; } - fn acceptAstMapping(parser: *Parser) !ast.AstMapping { + fn acceptAstMapping(parser: *Parser) AcceptError!ast.AstMapping { _ = parser; @panic("not implemented yet"); } - fn acceptTypeSpec(parser: *Parser) !ast.TypeSpec { + fn acceptTypeSpec(parser: *Parser) AcceptError!ast.TypeSpec { _ = parser; @panic("not implemented yet"); } - fn acceptStringLiteral(parser: *Parser, accept_mode: AcceptMode) !ast.StringLiteral { + fn acceptStringLiteral(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.StringLiteral { const token = try parser.acceptToken(.string_literal, accept_mode); std.debug.assert(token.text.len >= 2); @@ -285,7 +282,7 @@ const Parser = struct { }; } - fn acceptIdentifier(parser: *Parser, accept_mode: AcceptMode) !ast.Identifier { + fn acceptIdentifier(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.Identifier { const token = try parser.acceptToken(.identifier, accept_mode); return ast.Identifier{ @@ -294,7 +291,7 @@ const Parser = struct { }; } - fn acceptLiteral(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) !void { + fn acceptLiteral(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) AcceptError!void { _ = try parser.acceptToken(token_type, accept_mode); } @@ -357,6 +354,25 @@ const Parser = struct { // management: + fn emitDiagnostic(parser: Parser, loc: ?ptk.Location, comptime code: Diagnostics.Code, data: Diagnostics.Data(code)) !void { + // Anything detected here is always an error + std.debug.assert(code.isError()); + try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, code, data); + } + + fn emitUnexpectedCharacter(parser: Parser, location: ptk.Location, source_offset: usize) !void { + var utf8_view = std.unicode.Utf8View.init(parser.core.tokenizer.source[source_offset..]) catch { + try parser.emitDiagnostic(location, .invalid_source_encoding, .{}); + return error.InvalidSourceEncoding; + }; + + var iter = utf8_view.iterator(); + + try parser.emitDiagnostic(location, .unexpected_character, .{ + .character = iter.nextCodepoint() orelse @panic("very 
unexpected end of file"), + }); + } + fn unwrapIdentifierString(parser: *Parser, loc: ptk.Location, raw: []const u8) !ptk.strings.String { std.debug.assert(raw.len > 0); if (raw[0] == '@') { diff --git a/test/analysis/accept/match-group-many-item.ptk b/test/analysis/accept/match-group-many-item.ptk new file mode 100644 index 0000000..5e1e31f --- /dev/null +++ b/test/analysis/accept/match-group-many-item.ptk @@ -0,0 +1 @@ +rule mode = ( "first" "second" "third" ); \ No newline at end of file diff --git a/test/analysis/accept/match-group-many-sequence.ptk b/test/analysis/accept/match-group-many-sequence.ptk new file mode 100644 index 0000000..40902e7 --- /dev/null +++ b/test/analysis/accept/match-group-many-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "one" "two" "three" ) "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-group-nested.ptk b/test/analysis/accept/match-group-nested.ptk new file mode 100644 index 0000000..d35091c --- /dev/null +++ b/test/analysis/accept/match-group-nested.ptk @@ -0,0 +1 @@ +rule mode = "L0:0" ( "L1:0" ( "L2:0" "L2:1" "L2:2" ) "L1:2" ) "L0:2"; \ No newline at end of file diff --git a/test/analysis/accept/match-group-one-item.ptk b/test/analysis/accept/match-group-one-item.ptk new file mode 100644 index 0000000..faa24e7 --- /dev/null +++ b/test/analysis/accept/match-group-one-item.ptk @@ -0,0 +1 @@ +rule mode = ( "item" ); \ No newline at end of file diff --git a/test/analysis/accept/match-group-one-sequence.ptk b/test/analysis/accept/match-group-one-sequence.ptk new file mode 100644 index 0000000..e34f909 --- /dev/null +++ b/test/analysis/accept/match-group-one-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "second" ) "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-literal-rule.ptk b/test/analysis/accept/match-literal-rule.ptk index 3cda9a8..a0b8dc0 100644 --- a/test/analysis/accept/match-literal-rule.ptk +++ b/test/analysis/accept/match-literal-rule.ptk @@ -1,2 +1 @@ -# This file contains a single rule with no well-defined start point: rule basic = "basic"; \ No newline at end of file diff --git a/test/analysis/accept/match-literal-sequence-variant.ptk b/test/analysis/accept/match-literal-sequence-variant.ptk new file mode 100644 index 0000000..842274e --- /dev/null +++ b/test/analysis/accept/match-literal-sequence-variant.ptk @@ -0,0 +1,4 @@ +rule mode = + "basic" "item" + | "extended" "item" +; \ No newline at end of file diff --git a/test/analysis/accept/match-literal-sequence.ptk b/test/analysis/accept/match-literal-sequence.ptk index 555a2dc..245add7 100644 --- a/test/analysis/accept/match-literal-sequence.ptk +++ b/test/analysis/accept/match-literal-sequence.ptk @@ -1,2 +1 @@ -# This file contains a single rule with no well-defined start point: rule basic = "basic" "words" "after" "another"; \ No newline at end of file diff --git a/test/analysis/accept/match-literal-variants.ptk b/test/analysis/accept/match-literal-variants.ptk new file mode 100644 index 0000000..28ff569 --- /dev/null +++ b/test/analysis/accept/match-literal-variants.ptk @@ -0,0 +1 @@ +rule mode = "basic" | "extended"; \ No newline at end of file diff --git a/test/parser/accept/identifiers.ptk b/test/parser/accept/identifiers.ptk index 521db6f..3c4baaa 100644 --- a/test/parser/accept/identifiers.ptk +++ b/test/parser/accept/identifiers.ptk @@ -1,8 +1,8 @@ -rule a = literal `whatever`; -rule _ = literal `whatever`; -rule a0 = literal `whatever`; -rule a-z = literal `whatever`; -rule _10 = literal `whatever`; -rule @"x" = 
literal `whatever`; -rule @"hello, world!" = literal `whatever`; +rule a = "whatever"; +rule _ = "whatever"; +rule a0 = "whatever"; +rule a-z = "whatever"; +rule _10 = "whatever"; +rule @"x" = "whatever"; +rule @"hello, world!" = "whatever"; From d0a08c7c3d4f6118c7e058d23fcbdb5631279dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Thu, 2 Nov 2023 16:42:57 +0100 Subject: [PATCH 06/20] Implements more tests, adds support for parser rejection tests --- build.zig | 38 ++++++ src/ptkgen/Diagnostics.zig | 27 +++-- src/ptkgen/ast_dump.zig | 12 +- src/ptkgen/intl.zig | 3 +- src/ptkgen/intl/en.json | 5 +- src/ptkgen/main.zig | 108 ++++++++++++++++-- src/ptkgen/parser.zig | 16 ++- .../accept/match-optional-many-item.ptk | 1 + .../accept/match-optional-many-sequence.ptk | 1 + .../analysis/accept/match-optional-nested.ptk | 1 + .../accept/match-optional-one-item.ptk | 1 + .../accept/match-optional-one-sequence.ptk | 1 + .../accept/match-rep_one-many-item.ptk | 1 + .../accept/match-rep_one-many-sequence.ptk | 1 + test/analysis/accept/match-rep_one-nested.ptk | 1 + .../accept/match-rep_one-one-item.ptk | 1 + .../accept/match-rep_one-one-sequence.ptk | 1 + .../accept/match-rep_zero-many-item.ptk | 1 + .../accept/match-rep_zero-many-sequence.ptk | 1 + .../analysis/accept/match-rep_zero-nested.ptk | 1 + .../accept/match-rep_zero-one-item.ptk | 1 + .../accept/match-rep_zero-one-sequence.ptk | 1 + test/parser/accept/optional-nospace.ptk | 1 + test/parser/accept/optional-space.ptk | 1 + test/parser/accept/rep_one-nospace.ptk | 1 + test/parser/accept/rep_one-space.ptk | 1 + test/parser/accept/rep_zero-nospace.ptk | 1 + test/parser/accept/rep_zero-space.ptk | 1 + test/parser/reject/empty-group.rule | 2 + test/parser/reject/empty-optional.rule | 2 + test/parser/reject/empty-rep_one.rule | 2 + test/parser/reject/empty-rep_zero.rule | 2 + test/parser/reject/empty-rule.rule | 2 + 33 files changed, 204 insertions(+), 36 deletions(-) create mode 100644 test/analysis/accept/match-optional-many-item.ptk create mode 100644 test/analysis/accept/match-optional-many-sequence.ptk create mode 100644 test/analysis/accept/match-optional-nested.ptk create mode 100644 test/analysis/accept/match-optional-one-item.ptk create mode 100644 test/analysis/accept/match-optional-one-sequence.ptk create mode 100644 test/analysis/accept/match-rep_one-many-item.ptk create mode 100644 test/analysis/accept/match-rep_one-many-sequence.ptk create mode 100644 test/analysis/accept/match-rep_one-nested.ptk create mode 100644 test/analysis/accept/match-rep_one-one-item.ptk create mode 100644 test/analysis/accept/match-rep_one-one-sequence.ptk create mode 100644 test/analysis/accept/match-rep_zero-many-item.ptk create mode 100644 test/analysis/accept/match-rep_zero-many-sequence.ptk create mode 100644 test/analysis/accept/match-rep_zero-nested.ptk create mode 100644 test/analysis/accept/match-rep_zero-one-item.ptk create mode 100644 test/analysis/accept/match-rep_zero-one-sequence.ptk create mode 100644 test/parser/accept/optional-nospace.ptk create mode 100644 test/parser/accept/optional-space.ptk create mode 100644 test/parser/accept/rep_one-nospace.ptk create mode 100644 test/parser/accept/rep_one-space.ptk create mode 100644 test/parser/accept/rep_zero-nospace.ptk create mode 100644 test/parser/accept/rep_zero-space.ptk create mode 100644 test/parser/reject/empty-group.rule create mode 100644 test/parser/reject/empty-optional.rule create mode 100644 test/parser/reject/empty-rep_one.rule create mode 100644 
test/parser/reject/empty-rep_zero.rule create mode 100644 test/parser/reject/empty-rule.rule diff --git a/build.zig b/build.zig index c6b0c55..54c2ae2 100644 --- a/build.zig +++ b/build.zig @@ -74,6 +74,13 @@ pub fn build(b: *std.build.Builder) void { run.addFileArg(.{ .path = file }); test_step.dependOn(&run.step); } + + for (parser_reject_files) |file| { + const run = b.addRunArtifact(ptkdef_exe); + run.addArg("--test_mode=parse_only"); + run.addFileArg(.{ .path = file }); + test_step.dependOn(&run.step); + } } // examples @@ -95,6 +102,14 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/empty-with-comment-linefeed.ptk", "test/parser/accept/empty-with-comment.ptk", "test/parser/accept/identifiers.ptk", + + "test/parser/accept/optional-nospace.ptk", + "test/parser/accept/optional-space.ptk", + "test/parser/accept/rep_one-nospace.ptk", + "test/parser/accept/rep_one-space.ptk", + "test/parser/accept/rep_zero-nospace.ptk", + "test/parser/accept/rep_zero-space.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; @@ -108,4 +123,27 @@ const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-group-many-item.ptk", "test/analysis/accept/match-group-many-sequence.ptk", "test/analysis/accept/match-group-nested.ptk", + "test/analysis/accept/match-optional-one-item.ptk", + "test/analysis/accept/match-optional-one-sequence.ptk", + "test/analysis/accept/match-optional-many-item.ptk", + "test/analysis/accept/match-optional-many-sequence.ptk", + "test/analysis/accept/match-optional-nested.ptk", + "test/analysis/accept/match-rep_zero-one-item.ptk", + "test/analysis/accept/match-rep_zero-one-sequence.ptk", + "test/analysis/accept/match-rep_zero-many-item.ptk", + "test/analysis/accept/match-rep_zero-many-sequence.ptk", + "test/analysis/accept/match-rep_zero-nested.ptk", + "test/analysis/accept/match-rep_one-one-item.ptk", + "test/analysis/accept/match-rep_one-one-sequence.ptk", + "test/analysis/accept/match-rep_one-many-item.ptk", + "test/analysis/accept/match-rep_one-many-sequence.ptk", + "test/analysis/accept/match-rep_one-nested.ptk", +}; + +const parser_reject_files = [_][]const u8{ + "test/parser/reject/empty-rule.rule", + "test/parser/reject/empty-group.rule", + "test/parser/reject/empty-optional.rule", + "test/parser/reject/empty-rep_one.rule", + "test/parser/reject/empty-rep_zero.rule", }; diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index d5b75f5..12b8fa1 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -12,21 +12,24 @@ pub const Code = enum(u16) { pub const first_note = 8000; pub const last_item = 10000; + // generic failures: out_of_memory = 1000, file_limit_exceeded = 1001, io_error = 1002, - invalid_source_encoding = 1003, - unexpected_token_eof = 1004, - unexpected_token = 1005, - unexpected_character = 1006, - unexpected_eof = 1007, + // non-recoverable syntax errors: - bad_string_escape = 1008, + invalid_source_encoding = 1100, + unexpected_token_eof = 1101, + unexpected_token = 1102, + unexpected_character = 1103, + unexpected_eof = 1104, + bad_string_escape = 1105, + invalid_string_escape = 1106, + excess_tokens = 1107, - invalid_string_escape = 1009, - - excess_tokens = 1010, + // recoverable syntax errors: + illegal_empty_group = 1200, comptime { std.debug.assert(first_error < first_warning); @@ -74,6 +77,8 @@ pub fn Data(comptime code: Code) type { .invalid_string_escape => struct { escape: u21 }, .excess_tokens => struct { token_type: parser.TokenType }, + 
.illegal_empty_group => NoDiagnosticData, + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; } @@ -85,14 +90,17 @@ pub const Message = struct { }; inner: ptk.Diagnostics, +codes: std.ArrayList(Code), pub fn init(allocator: std.mem.Allocator) Diagnostics { return Diagnostics{ .inner = ptk.Diagnostics.init(allocator), + .codes = std.ArrayList(Code).init(allocator), }; } pub fn deinit(diag: *Diagnostics) void { + diag.codes.deinit(); diag.inner.deinit(); diag.* = undefined; } @@ -242,6 +250,7 @@ pub fn emit(diag: *Diagnostics, location: ptk.Location, comptime code: Code, par }; try diag.inner.emit(location, level, "{s}{d:0>4}: {s}", .{ code_prefix, @intFromEnum(code), message_text }); + try diag.codes.append(code); } pub fn render(diag: Diagnostics, stream: anytype) !void { diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 2b8f08a..9bff39e 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -77,7 +77,7 @@ const AstPrinter = struct { .literal => |lit| print("\"{}\"", .{printer.fmtString(lit.value)}), .terminal => |term| print("<{}>", .{printer.fmtId(term.identifier)}), .recursion => print("", .{}), - .sequence => |seq| { + .sequence, .optional, .repetition_zero, .repetition_one => |seq| { print("(", .{}); var iter = ast.iterate(seq); @@ -87,10 +87,14 @@ const AstPrinter = struct { } print(" )", .{}); + switch (production) { + .sequence => {}, + .optional => print("?", .{}), + .repetition_zero => print("*", .{}), + .repetition_one => print("+", .{}), + else => unreachable, + } }, - .optional => print("", .{}), - .repetition_zero => print("", .{}), - .repetition_one => print("", .{}), } } diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig index 51623eb..f6c2705 100644 --- a/src/ptkgen/intl.zig +++ b/src/ptkgen/intl.zig @@ -38,6 +38,7 @@ pub const Localization = struct { bad_string_escape: []const u8, invalid_string_escape: []const u8, excess_tokens: []const u8, + illegal_empty_group: []const u8, }, errors: struct { @@ -57,7 +58,7 @@ pub const Localization = struct { NotOpenForReading: []const u8, NetNameDeleted: []const u8, - StreamTooLong: []const u8, + FileTooBig: []const u8, SyntaxError: []const u8, InvalidSourceEncoding: []const u8, }, diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 72ed39a..762bb4c 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -10,7 +10,8 @@ "unexpected_eof": "Unexpected end of file", "bad_string_escape": "Invalid string escape: Escape sequence at the end of string", "invalid_string_escape": "Invalid string escape \\{[escape]}", - "excess_tokens": "Excess token at the end of the file: {[token_type]}" + "excess_tokens": "Excess token at the end of the file: {[token_type]}", + "illegal_empty_group": "Production sequence may not be empty" }, "errors": { "Unexpected": "unexpected error encountered", @@ -26,7 +27,7 @@ "ConnectionTimedOut": "connection timed out", "NotOpenForReading": "not open for reading", "NetNameDeleted": "net name deleted", - "StreamTooLong": "stream too long", + "FileTooBig": "Input file exceeds resources", "SyntaxError": "syntax error", "InvalidSourceEncoding": "invalid source encoding" } diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index b5e2741..169f457 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -116,7 +116,7 @@ pub fn main() !u8 { }, .out_of_memory, .{}); }, - error.StreamTooLong => { + error.FileTooBig => { try diagnostics.emit(.{ .source = file_name, .line = 1, @@ -136,21 +136,74 
@@ pub fn main() !u8 { error.ConnectionTimedOut, error.NotOpenForReading, error.NetNameDeleted, - => { + => |e| { try diagnostics.emit(.{ .source = file_name, .line = 1, .column = 1, - }, .io_error, .{ .error_code = err }); + }, .io_error, .{ .error_code = e }); }, + + error.TestExpectationMismatched => return 1, // this is a shortcut we can take to not render the diagnostics on test failure }; - try diagnostics.render(stderr.writer()); + if (cli.options.test_mode == .none) { + try diagnostics.render(stderr.writer()); - return if (diagnostics.hasErrors()) - 1 - else - 0; + return if (diagnostics.hasErrors()) + 1 + else + 0; + } else { + // test fails through `error.TestExpectationMismatched`, not through diagnostics + return 0; + } +} + +const TestExpectation = struct { + code: Diagnostics.Code, +}; + +fn validateDiagnostics(allocator: std.mem.Allocator, diagnostics: Diagnostics, expectations: []const TestExpectation) !void { + var available = std.ArrayList(Diagnostics.Code).init(allocator); + defer available.deinit(); + + var expected = std.ArrayList(Diagnostics.Code).init(allocator); + defer expected.deinit(); + + try available.appendSlice(diagnostics.codes.items); + try expected.resize(expectations.len); + + for (expected.items, expectations) |*dst, src| { + dst.* = src.code; + } + + // Remove everything from expected and available that is present in both: + { + var i: usize = 0; + while (i < expected.items.len) { + const e = expected.items[i]; + + if (std.mem.indexOfScalar(Diagnostics.Code, available.items, e)) |index| { + _ = available.swapRemove(index); + _ = expected.swapRemove(i); + } else { + i += 1; + } + } + } + + const ok = (available.items.len == 0) and (expected.items.len == 0); + + for (available.items) |code| { + std.log.err("unexpected diagnostic: {0}", .{code}); + } + for (expected.items) |code| { + std.log.err("unmatched diagnostic: {0}", .{code}); + } + + if (!ok) + return error.TestExpectationMismatched; } fn compileFile( @@ -161,19 +214,52 @@ fn compileFile( file_name: []const u8, mode: TestMode, ) !void { + var source_code = try input_file.readToEndAlloc(allocator, 4 << 20); // 4 MB should be enough for now... 
+ defer allocator.free(source_code); + + var expectations = std.ArrayList(TestExpectation).init(allocator); + defer expectations.deinit(); + + if (mode != .none) { + // parse expectations from source code: + var lines = std.mem.tokenize(u8, source_code, "\n"); + while (lines.next()) |line| { + const prefix = "# expected:"; + if (std.mem.startsWith(u8, line, prefix)) { + var items = std.mem.tokenize(u8, line[prefix.len..], " \t,"); + while (items.next()) |error_code| { + if (error_code.len == 0 or (error_code[0] != 'E' and error_code[0] != 'W' and error_code[0] != 'D')) + @panic("invalid error code!"); + const id = std.fmt.parseInt(u16, error_code[1..], 10) catch @panic("bad integer"); + const code = std.meta.intToEnum(Diagnostics.Code, id) catch @panic("bad diagnostic code"); + try expectations.append(.{ .code = code }); + } + } + } + } + var tree = try parser.parse( allocator, diagnostics, string_pool, file_name, - input_file.reader(), + source_code, ); defer tree.deinit(); if (mode == .parse_only) { - // we're done if we're here + try validateDiagnostics(allocator, diagnostics.*, expectations.items); return; } - ast_dump.dump(string_pool, tree); + // TODO: Implement sema + + // TODO: Implement parsergen / tablegen / highlightergen + + if (mode == .none) { + ast_dump.dump(string_pool, tree); + } else { + // we need to validate against test expectations when doing *any* test mode + try validateDiagnostics(allocator, diagnostics.*, expectations.items); + } } diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index fd8665b..c586b67 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -8,10 +8,7 @@ const fmtEscapes = std.zig.fmtEscapes; pub const Document = struct { arena: std.heap.ArenaAllocator, - file_name: []const u8, - source_text: []const u8, - top_level_declarations: ast.Document, pub fn deinit(ts: *Document) void { @@ -20,15 +17,13 @@ pub const Document = struct { } }; -pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, stream: anytype) !Document { +pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, source_code: []const u8) !Document { var arena = std.heap.ArenaAllocator.init(allocator); errdefer arena.deinit(); const file_name_copy = try arena.allocator().dupe(u8, file_name); - const text = try stream.readAllAlloc(arena.allocator(), 4 << 20); // 4 MB should be enough for now... 
- - var tokenizer = Tokenizer.init(text, file_name_copy); + var tokenizer = Tokenizer.init(source_code, file_name_copy); var parser = Parser{ .core = ParserCore.init(&tokenizer), @@ -61,8 +56,6 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_poo return Document{ .arena = arena, .file_name = file_name_copy, - .source_text = text, - .top_level_declarations = document_node, }; } @@ -235,6 +228,11 @@ const Parser = struct { } } + if (list.len() == 0) { + // Empty list is a non-recoverable syntax error: + try parser.emitDiagnostic(null, .illegal_empty_group, .{}); + } + return list; } diff --git a/test/analysis/accept/match-optional-many-item.ptk b/test/analysis/accept/match-optional-many-item.ptk new file mode 100644 index 0000000..fb4b409 --- /dev/null +++ b/test/analysis/accept/match-optional-many-item.ptk @@ -0,0 +1 @@ +rule mode = ( "first" "second" "third" )?; \ No newline at end of file diff --git a/test/analysis/accept/match-optional-many-sequence.ptk b/test/analysis/accept/match-optional-many-sequence.ptk new file mode 100644 index 0000000..2c49812 --- /dev/null +++ b/test/analysis/accept/match-optional-many-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "one" "two" "three" )? "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-optional-nested.ptk b/test/analysis/accept/match-optional-nested.ptk new file mode 100644 index 0000000..18bf0d9 --- /dev/null +++ b/test/analysis/accept/match-optional-nested.ptk @@ -0,0 +1 @@ +rule mode = "L0:0" ( "L1:0" ( "L2:0" "L2:1" "L2:2" )? "L1:2" )? "L0:2"; \ No newline at end of file diff --git a/test/analysis/accept/match-optional-one-item.ptk b/test/analysis/accept/match-optional-one-item.ptk new file mode 100644 index 0000000..3c5ccc0 --- /dev/null +++ b/test/analysis/accept/match-optional-one-item.ptk @@ -0,0 +1 @@ +rule mode = ( "item" )?; \ No newline at end of file diff --git a/test/analysis/accept/match-optional-one-sequence.ptk b/test/analysis/accept/match-optional-one-sequence.ptk new file mode 100644 index 0000000..c5fd167 --- /dev/null +++ b/test/analysis/accept/match-optional-one-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "second" )? 
"third"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_one-many-item.ptk b/test/analysis/accept/match-rep_one-many-item.ptk new file mode 100644 index 0000000..89961d7 --- /dev/null +++ b/test/analysis/accept/match-rep_one-many-item.ptk @@ -0,0 +1 @@ +rule mode = ( "first" "second" "third" )+; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_one-many-sequence.ptk b/test/analysis/accept/match-rep_one-many-sequence.ptk new file mode 100644 index 0000000..0568546 --- /dev/null +++ b/test/analysis/accept/match-rep_one-many-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "one" "two" "three" )+ "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_one-nested.ptk b/test/analysis/accept/match-rep_one-nested.ptk new file mode 100644 index 0000000..99fbc2f --- /dev/null +++ b/test/analysis/accept/match-rep_one-nested.ptk @@ -0,0 +1 @@ +rule mode = "L0:0" ( "L1:0" ( "L2:0" "L2:1" "L2:2" )+ "L1:2" )+ "L0:2"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_one-one-item.ptk b/test/analysis/accept/match-rep_one-one-item.ptk new file mode 100644 index 0000000..7f273d5 --- /dev/null +++ b/test/analysis/accept/match-rep_one-one-item.ptk @@ -0,0 +1 @@ +rule mode = ( "item" )+; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_one-one-sequence.ptk b/test/analysis/accept/match-rep_one-one-sequence.ptk new file mode 100644 index 0000000..64af460 --- /dev/null +++ b/test/analysis/accept/match-rep_one-one-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "second" )+ "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_zero-many-item.ptk b/test/analysis/accept/match-rep_zero-many-item.ptk new file mode 100644 index 0000000..5d9b366 --- /dev/null +++ b/test/analysis/accept/match-rep_zero-many-item.ptk @@ -0,0 +1 @@ +rule mode = ( "first" "second" "third" )*; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_zero-many-sequence.ptk b/test/analysis/accept/match-rep_zero-many-sequence.ptk new file mode 100644 index 0000000..cadf2c5 --- /dev/null +++ b/test/analysis/accept/match-rep_zero-many-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "one" "two" "three" )* "third"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_zero-nested.ptk b/test/analysis/accept/match-rep_zero-nested.ptk new file mode 100644 index 0000000..fee0799 --- /dev/null +++ b/test/analysis/accept/match-rep_zero-nested.ptk @@ -0,0 +1 @@ +rule mode = "L0:0" ( "L1:0" ( "L2:0" "L2:1" "L2:2" )* "L1:2" )* "L0:2"; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_zero-one-item.ptk b/test/analysis/accept/match-rep_zero-one-item.ptk new file mode 100644 index 0000000..d058aee --- /dev/null +++ b/test/analysis/accept/match-rep_zero-one-item.ptk @@ -0,0 +1 @@ +rule mode = ( "item" )*; \ No newline at end of file diff --git a/test/analysis/accept/match-rep_zero-one-sequence.ptk b/test/analysis/accept/match-rep_zero-one-sequence.ptk new file mode 100644 index 0000000..34e3a06 --- /dev/null +++ b/test/analysis/accept/match-rep_zero-one-sequence.ptk @@ -0,0 +1 @@ +rule mode = "first" ( "second" )* "third"; \ No newline at end of file diff --git a/test/parser/accept/optional-nospace.ptk b/test/parser/accept/optional-nospace.ptk new file mode 100644 index 0000000..c72723f --- /dev/null +++ b/test/parser/accept/optional-nospace.ptk @@ -0,0 +1 @@ +rule group=("word")?; \ No newline at end of file diff --git a/test/parser/accept/optional-space.ptk 
b/test/parser/accept/optional-space.ptk new file mode 100644 index 0000000..b95fdab --- /dev/null +++ b/test/parser/accept/optional-space.ptk @@ -0,0 +1 @@ +rule group = ( "word" ) ? ; \ No newline at end of file diff --git a/test/parser/accept/rep_one-nospace.ptk b/test/parser/accept/rep_one-nospace.ptk new file mode 100644 index 0000000..9a8646d --- /dev/null +++ b/test/parser/accept/rep_one-nospace.ptk @@ -0,0 +1 @@ +rule group=("word")+; \ No newline at end of file diff --git a/test/parser/accept/rep_one-space.ptk b/test/parser/accept/rep_one-space.ptk new file mode 100644 index 0000000..c624039 --- /dev/null +++ b/test/parser/accept/rep_one-space.ptk @@ -0,0 +1 @@ +rule group = ( "word" ) + ; \ No newline at end of file diff --git a/test/parser/accept/rep_zero-nospace.ptk b/test/parser/accept/rep_zero-nospace.ptk new file mode 100644 index 0000000..3bfb157 --- /dev/null +++ b/test/parser/accept/rep_zero-nospace.ptk @@ -0,0 +1 @@ +rule group=("word")*; \ No newline at end of file diff --git a/test/parser/accept/rep_zero-space.ptk b/test/parser/accept/rep_zero-space.ptk new file mode 100644 index 0000000..3696d95 --- /dev/null +++ b/test/parser/accept/rep_zero-space.ptk @@ -0,0 +1 @@ +rule group = ( "word" ) * ; \ No newline at end of file diff --git a/test/parser/reject/empty-group.rule b/test/parser/reject/empty-group.rule new file mode 100644 index 0000000..2860712 --- /dev/null +++ b/test/parser/reject/empty-group.rule @@ -0,0 +1,2 @@ +# expected: E1200 +rule group = ( ); \ No newline at end of file diff --git a/test/parser/reject/empty-optional.rule b/test/parser/reject/empty-optional.rule new file mode 100644 index 0000000..82ac677 --- /dev/null +++ b/test/parser/reject/empty-optional.rule @@ -0,0 +1,2 @@ +# expected: E1200 +rule group = ( )?; \ No newline at end of file diff --git a/test/parser/reject/empty-rep_one.rule b/test/parser/reject/empty-rep_one.rule new file mode 100644 index 0000000..82ac677 --- /dev/null +++ b/test/parser/reject/empty-rep_one.rule @@ -0,0 +1,2 @@ +# expected: E1200 +rule group = ( )?; \ No newline at end of file diff --git a/test/parser/reject/empty-rep_zero.rule b/test/parser/reject/empty-rep_zero.rule new file mode 100644 index 0000000..82ac677 --- /dev/null +++ b/test/parser/reject/empty-rep_zero.rule @@ -0,0 +1,2 @@ +# expected: E1200 +rule group = ( )?; \ No newline at end of file diff --git a/test/parser/reject/empty-rule.rule b/test/parser/reject/empty-rule.rule new file mode 100644 index 0000000..8d32fe9 --- /dev/null +++ b/test/parser/reject/empty-rule.rule @@ -0,0 +1,2 @@ +# expected: E1200 +rule group = ; \ No newline at end of file From 66527580a7e62cb44e8260dcedf93385bb6e6e31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Thu, 2 Nov 2023 17:43:54 +0100 Subject: [PATCH 07/20] Implements parsing of start rule. 
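
This patch starts accepting `start <rule>;` declarations as well as rule
references (<name>) and token references ($name) inside productions. As a
rough sketch, a grammar the parser accepts after this change could look
like the following (rule and token names are made up for the example; the
new files under test/parser/accept/ contain the actual minimal cases):

    start <output>;
    rule output = "literal" $terminal <other>;
    rule other  = $terminal;

Reject tests keep documenting the diagnostics they must trigger in a
leading comment, e.g. `# expected: E1108` for an unexpected top-level
token.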
--- build.zig | 44 +++--- examples/ptkgen/grammar.ptk | 11 +- src/ptkgen/Diagnostics.zig | 5 + src/ptkgen/ast.zig | 3 +- src/ptkgen/ast_dump.zig | 12 +- src/ptkgen/intl.zig | 1 + src/ptkgen/intl/en.json | 1 + src/ptkgen/main.zig | 133 ++++++++++-------- src/ptkgen/parser.zig | 87 ++++++++++-- test/parser/accept/basic-rule-ref.ptk | 1 + test/parser/accept/basic-token-ref.ptk | 1 + test/parser/accept/document-start.ptk | 1 + .../parser/accept/rule-primitive-sequence.ptk | 1 + .../reject/unexpected-token-string.rule | 2 + 14 files changed, 207 insertions(+), 96 deletions(-) create mode 100644 test/parser/accept/basic-rule-ref.ptk create mode 100644 test/parser/accept/basic-token-ref.ptk create mode 100644 test/parser/accept/document-start.ptk create mode 100644 test/parser/accept/rule-primitive-sequence.ptk create mode 100644 test/parser/reject/unexpected-token-string.rule diff --git a/build.zig b/build.zig index 54c2ae2..e2ba020 100644 --- a/build.zig +++ b/build.zig @@ -97,21 +97,9 @@ pub fn build(b: *std.build.Builder) void { } } -const parser_ok_files = [_][]const u8{ - "test/parser/accept/empty.ptk", - "test/parser/accept/empty-with-comment-linefeed.ptk", - "test/parser/accept/empty-with-comment.ptk", - "test/parser/accept/identifiers.ptk", - - "test/parser/accept/optional-nospace.ptk", - "test/parser/accept/optional-space.ptk", - "test/parser/accept/rep_one-nospace.ptk", - "test/parser/accept/rep_one-space.ptk", - "test/parser/accept/rep_zero-nospace.ptk", - "test/parser/accept/rep_zero-space.ptk", - - // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples -} ++ analyis_ok_files; +const example_files = [_][]const u8{ + "/home/felix/projects/parser-toolkit/examples/ptkgen/grammar.ptk", +}; const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-literal-rule.ptk", @@ -138,7 +126,29 @@ const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-rep_one-many-item.ptk", "test/analysis/accept/match-rep_one-many-sequence.ptk", "test/analysis/accept/match-rep_one-nested.ptk", -}; +} ++ example_files; + +const parser_ok_files = [_][]const u8{ + "test/parser/accept/empty.ptk", + "test/parser/accept/empty-with-comment-linefeed.ptk", + "test/parser/accept/empty-with-comment.ptk", + "test/parser/accept/identifiers.ptk", + + "test/parser/accept/optional-nospace.ptk", + "test/parser/accept/optional-space.ptk", + "test/parser/accept/rep_one-nospace.ptk", + "test/parser/accept/rep_one-space.ptk", + "test/parser/accept/rep_zero-nospace.ptk", + "test/parser/accept/rep_zero-space.ptk", + + "test/parser/accept/basic-rule-ref.ptk", + "test/parser/accept/basic-token-ref.ptk", + "test/parser/accept/rule-primitive-sequence.ptk", + + "test/parser/accept/document-start.ptk", + + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples +} ++ analyis_ok_files; const parser_reject_files = [_][]const u8{ "test/parser/reject/empty-rule.rule", @@ -146,4 +156,6 @@ const parser_reject_files = [_][]const u8{ "test/parser/reject/empty-optional.rule", "test/parser/reject/empty-rep_one.rule", "test/parser/reject/empty-rep_zero.rule", + + "test/parser/reject/unexpected-token-string.rule", }; diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 2177628..07edaec 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -1,7 +1,7 @@ +start ; - -rule document = [ ]* ; +rule document = ( )* ; rule top_level = @@ -27,12 +27,13 @@ rule production_sequence = ( )+; rule production = $string_literal | $rule_ref + | $token_ref | "(" ")" "?" 
| "(" ")" "*" | "(" ")" "+" | "(" ")" ; -rule mapping = - # TODO -; \ No newline at end of file +# rule mapping = +# # TODO +# ; \ No newline at end of file diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 12b8fa1..555ac8d 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -27,6 +27,7 @@ pub const Code = enum(u16) { bad_string_escape = 1105, invalid_string_escape = 1106, excess_tokens = 1107, + unexpected_toplevel_token = 1108, // recoverable syntax errors: illegal_empty_group = 1200, @@ -68,6 +69,10 @@ pub fn Data(comptime code: Code) type { actual_type: parser.TokenType, actual_text: []const u8, }, + .unexpected_toplevel_token => struct { + actual_type: parser.TokenType, + actual_text: []const u8, + }, .unexpected_eof => NoDiagnosticData, .invalid_source_encoding => NoDiagnosticData, diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index 821e13e..1bfac2a 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -70,9 +70,10 @@ pub const BuiltinLiteral = String(.builtin); pub const Document = List(TopLevelDeclaration); pub const TopLevelDeclaration = union(enum) { - start: NodeRef, + start: RuleRef, rule: Rule, node: Node, + token: Token, }; pub const NodeRef = Reference(Node); // !mynode diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 9bff39e..166b173 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -23,7 +23,7 @@ const AstPrinter = struct { var iter = ast.iterate(decls); while (iter.next()) |decl| { switch (decl) { - .start => |item| print("start {}\n", .{printer.fmtId(item.identifier)}), + .start => |item| print("start <{}>;\n", .{printer.fmtId(item.identifier)}), .rule => |rule| { print("rule {s}", .{printer.fmtId(rule.name.value)}); @@ -54,6 +54,11 @@ const AstPrinter = struct { print("node {s}", .{printer.fmtId(node.name.value)}); print(";\n", .{}); }, + + .token => |token| { + print("token {s}", .{printer.fmtId(token.name.value)}); + print(";\n", .{}); + }, } } } @@ -75,8 +80,9 @@ const AstPrinter = struct { fn dumpProd(printer: AstPrinter, production: ast.Production) void { switch (production) { .literal => |lit| print("\"{}\"", .{printer.fmtString(lit.value)}), - .terminal => |term| print("<{}>", .{printer.fmtId(term.identifier)}), - .recursion => print("", .{}), + .terminal => |term| print("${}", .{printer.fmtId(term.identifier)}), + .recursion => |term| print("<{}>", .{printer.fmtId(term.identifier)}), + .sequence, .optional, .repetition_zero, .repetition_one => |seq| { print("(", .{}); diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig index f6c2705..981120c 100644 --- a/src/ptkgen/intl.zig +++ b/src/ptkgen/intl.zig @@ -39,6 +39,7 @@ pub const Localization = struct { invalid_string_escape: []const u8, excess_tokens: []const u8, illegal_empty_group: []const u8, + unexpected_toplevel_token: []const u8, }, errors: struct { diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 762bb4c..2fb57b2 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -6,6 +6,7 @@ "invalid_source_encoding": "Invalid source code encoding detected", "unexpected_token_eof": "Expected token {[expected_type]}, but end of file was discovered", "unexpected_token": "Expected token {[expected_type]}, but discovered token {[actual_type]} ('{[actual_text]}')", + "unexpected_toplevel_token": "Expected token 'start', 'rule', 'node' or 'token', but discovered token {[actual_type]} ('{[actual_text]}')", "unexpected_character": "Unexpected character: '{[character]}'", "unexpected_eof": 
"Unexpected end of file", "bad_string_escape": "Invalid string escape: Escape sequence at the end of string", diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 169f457..151ea33 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -7,6 +7,7 @@ const args_parser = @import("args"); const ptk = @import("parser-toolkit"); const ast = @import("ast.zig"); +const intl = @import("intl.zig"); const parser = @import("parser.zig"); const ast_dump = @import("ast_dump.zig"); @@ -22,6 +23,8 @@ pub const CliOptions = struct { output: ?[]const u8 = null, test_mode: TestMode = .none, + @"max-file-size": u32 = 4 * 1024, // 4 MB of source code is a lot! + pub const shorthands = .{ .h = "help", .o = "output", @@ -37,6 +40,8 @@ pub const CliOptions = struct { .output = "If given, will print the generated code into ", .test_mode = "(internal use only, required for testing)", + + .@"max-file-size" = "Maximum input file size in KiB (default: 4096)", }, }; }; @@ -46,7 +51,8 @@ const TestMode = enum { parse_only, }; -pub fn main() !u8 { +const AppError = error{OutOfMemory} || std.fs.File.WriteError; +pub fn main() AppError!u8 { // errdefer |e| @compileLog(@TypeOf(e)); var stdout = std.io.getStdOut(); @@ -97,14 +103,74 @@ pub fn main() !u8 { else "stdint"; - compileFile( - dynamic_allocator, - &diagnostics, - &string_pool, - input_file, - file_name, - cli.options.test_mode, - ) catch |err| switch (err) { + var expectations = std.ArrayList(TestExpectation).init(dynamic_allocator); + defer expectations.deinit(); + + const processing_ok = process_file: { + // 4 MB should be enough for now... + var source_code = input_file.readToEndAlloc(static_allocator, 1024 * cli.options.@"max-file-size") catch |err| { + try convertErrorToDiagnostics(&diagnostics, file_name, err); + break :process_file false; + }; + + defer static_allocator.free(source_code); + + if (cli.options.test_mode != .none) { + // in test mode, parse expectations from source code: + var lines = std.mem.tokenize(u8, source_code, "\n"); + while (lines.next()) |line| { + const prefix = "# expected:"; + if (std.mem.startsWith(u8, line, prefix)) { + var items = std.mem.tokenize(u8, line[prefix.len..], " \t,"); + while (items.next()) |error_code| { + if (error_code.len == 0 or (error_code[0] != 'E' and error_code[0] != 'W' and error_code[0] != 'D')) + @panic("invalid error code!"); + const id = std.fmt.parseInt(u16, error_code[1..], 10) catch @panic("bad integer"); + const code = std.meta.intToEnum(Diagnostics.Code, id) catch @panic("bad diagnostic code"); + try expectations.append(.{ .code = code }); + } + } + } + } + + compileFile( + dynamic_allocator, + &diagnostics, + &string_pool, + source_code, + file_name, + cli.options.test_mode, + ) catch |err| { + try convertErrorToDiagnostics(&diagnostics, file_name, err); + break :process_file false; + }; + + // Todo: continue from here? 
+ + break :process_file true; + }; + + if (cli.options.test_mode == .none) { + try diagnostics.render(stderr.writer()); + + return if (processing_ok and !diagnostics.hasErrors()) + 0 // exit code for success + else + 1; // exit code for failure + } else { + // test fails through `error.TestExpectationMismatched`, not through diagnostics: + validateDiagnostics(dynamic_allocator, diagnostics, expectations.items) catch { + try stderr.writeAll("Full diagnostics:\n"); + try diagnostics.render(stderr.writer()); + + return 1; + }; + return 0; + } +} + +fn convertErrorToDiagnostics(diagnostics: *Diagnostics, file_name: []const u8, err: intl.FormattableError) error{OutOfMemory}!void { + switch (err) { // syntax errors must produce diagnostics: error.SyntaxError, error.InvalidSourceEncoding => std.debug.assert(diagnostics.hasErrors()), @@ -143,20 +209,6 @@ pub fn main() !u8 { .column = 1, }, .io_error, .{ .error_code = e }); }, - - error.TestExpectationMismatched => return 1, // this is a shortcut we can take to not render the diagnostics on test failure - }; - - if (cli.options.test_mode == .none) { - try diagnostics.render(stderr.writer()); - - return if (diagnostics.hasErrors()) - 1 - else - 0; - } else { - // test fails through `error.TestExpectationMismatched`, not through diagnostics - return 0; } } @@ -187,6 +239,7 @@ fn validateDiagnostics(allocator: std.mem.Allocator, diagnostics: Diagnostics, e if (std.mem.indexOfScalar(Diagnostics.Code, available.items, e)) |index| { _ = available.swapRemove(index); _ = expected.swapRemove(i); + // std.log.info("found matching diagnostic {s}", .{@tagName(e)}); } else { i += 1; } @@ -210,34 +263,10 @@ fn compileFile( allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, - input_file: std.fs.File, + source_code: []const u8, file_name: []const u8, mode: TestMode, ) !void { - var source_code = try input_file.readToEndAlloc(allocator, 4 << 20); // 4 MB should be enough for now... 
- defer allocator.free(source_code); - - var expectations = std.ArrayList(TestExpectation).init(allocator); - defer expectations.deinit(); - - if (mode != .none) { - // parse expectations from source code: - var lines = std.mem.tokenize(u8, source_code, "\n"); - while (lines.next()) |line| { - const prefix = "# expected:"; - if (std.mem.startsWith(u8, line, prefix)) { - var items = std.mem.tokenize(u8, line[prefix.len..], " \t,"); - while (items.next()) |error_code| { - if (error_code.len == 0 or (error_code[0] != 'E' and error_code[0] != 'W' and error_code[0] != 'D')) - @panic("invalid error code!"); - const id = std.fmt.parseInt(u16, error_code[1..], 10) catch @panic("bad integer"); - const code = std.meta.intToEnum(Diagnostics.Code, id) catch @panic("bad diagnostic code"); - try expectations.append(.{ .code = code }); - } - } - } - } - var tree = try parser.parse( allocator, diagnostics, @@ -247,19 +276,11 @@ fn compileFile( ); defer tree.deinit(); - if (mode == .parse_only) { - try validateDiagnostics(allocator, diagnostics.*, expectations.items); - return; - } - // TODO: Implement sema // TODO: Implement parsergen / tablegen / highlightergen if (mode == .none) { ast_dump.dump(string_pool, tree); - } else { - // we need to validate against test expectations when doing *any* test mode - try validateDiagnostics(allocator, diagnostics.*, expectations.items); } } diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index c586b67..f739f85 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -143,24 +143,45 @@ const Parser = struct { } fn acceptTopLevelDecl(parser: *Parser) FatalAcceptError!?ast.TopLevelDeclaration { + if (parser.acceptStartDecl()) |root_rule| { + return .{ .start = root_rule }; + } else |err| try filterAcceptError(err); + if (parser.acceptRule()) |rule| { return .{ .rule = rule }; } else |err| try filterAcceptError(err); // Detect any excess tokens on the top level: - const excess_tokens = if (parser.core.nextToken()) |token| - (token != null) - else |err| switch (err) { - error.UnexpectedCharacter => true, - }; - if (excess_tokens) { - try parser.emitDiagnostic(null, .unexpected_eof, .{}); - return error.SyntaxError; + if (parser.core.nextToken()) |maybe_token| { + if (maybe_token) |token| { + try parser.emitDiagnostic(token.location, .unexpected_toplevel_token, .{ + .actual_type = token.type, + .actual_text = token.text, + }); + return error.SyntaxError; + } else { + // This is actually the good path here, as only if we don't find any token or tokenization error, + // we reached the end of the file. 
+ } + } else |err| switch (err) { + error.UnexpectedCharacter => { + try parser.emitUnexpectedCharacter(parser.core.tokenizer.current_location, parser.core.tokenizer.offset); + return error.SyntaxError; + }, } return null; } + fn acceptStartDecl(parser: *Parser) AcceptError!ast.RuleRef { + try parser.acceptLiteral(.start, .recover); + const init_rule = try parser.acceptRuleReference(.fail); + + try parser.acceptLiteral(.@";", .fail); + + return init_rule; + } + fn acceptRule(parser: *Parser) AcceptError!ast.Rule { var state = parser.save(); errdefer parser.restore(state); @@ -229,7 +250,7 @@ const Parser = struct { } if (list.len() == 0) { - // Empty list is a non-recoverable syntax error: + // Empty list is a recoverable syntax error: try parser.emitDiagnostic(null, .illegal_empty_group, .{}); } @@ -252,11 +273,20 @@ const Parser = struct { } } - const str = try parser.acceptStringLiteral(.recover); + if (parser.acceptStringLiteral(.recover)) |str| { + return ast.Production{ .literal = str }; + } else |err| try filterAcceptError(err); - return ast.Production{ - .literal = str, - }; + if (parser.acceptTokenReference(.recover)) |ref| { + return ast.Production{ .terminal = ref }; + } else |err| try filterAcceptError(err); + + if (parser.acceptRuleReference(.recover)) |ref| { + return ast.Production{ .recursion = ref }; + } else |err| try filterAcceptError(err); + + // We're done with out list + return error.UnexpectedTokenRecoverable; } fn acceptAstMapping(parser: *Parser) AcceptError!ast.AstMapping { @@ -282,18 +312,45 @@ const Parser = struct { fn acceptIdentifier(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.Identifier { const token = try parser.acceptToken(.identifier, accept_mode); - return ast.Identifier{ .location = token.location, .value = try parser.unwrapIdentifierString(token.location, token.text), }; } + fn acceptRuleReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.RuleRef { + const token = try parser.acceptToken(.rule_ref, accept_mode); + std.debug.assert(std.mem.startsWith(u8, token.text, "<")); + std.debug.assert(std.mem.endsWith(u8, token.text, ">")); + return ast.RuleRef{ + .location = token.location, + .identifier = try parser.unwrapIdentifierString(token.location, token.text[1 .. 
token.text.len - 1]), + }; + } + + fn acceptTokenReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.TokenRef { + const token = try parser.acceptToken(.token_ref, accept_mode); + std.debug.assert(std.mem.startsWith(u8, token.text, "$")); + return ast.TokenRef{ + .location = token.location, + .identifier = try parser.unwrapIdentifierString(token.location, token.text[1..]), + }; + } + + fn acceptNodeReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.NodeRef { + const token = try parser.acceptToken(.node_ref, accept_mode); + std.debug.assert(std.mem.startsWith(u8, token.text, "!")); + return ast.NodeRef{ + .location = token.location, + .identifier = try parser.unwrapIdentifierString(token.location, token.text[1..]), + }; + } + fn acceptLiteral(parser: *Parser, comptime token_type: TokenType, accept_mode: AcceptMode) AcceptError!void { _ = try parser.acceptToken(token_type, accept_mode); } - fn tryAcceptLiteral(parser: *Parser, comptime token_type: TokenType) !bool { + fn tryAcceptLiteral(parser: *Parser, comptime token_type: TokenType) FatalAcceptError!bool { _ = parser.acceptToken(token_type, .recover) catch |err| switch (err) { error.UnexpectedTokenRecoverable => return false, error.OutOfMemory, error.InvalidSourceEncoding, error.SyntaxError => |e| return e, diff --git a/test/parser/accept/basic-rule-ref.ptk b/test/parser/accept/basic-rule-ref.ptk new file mode 100644 index 0000000..e31192c --- /dev/null +++ b/test/parser/accept/basic-rule-ref.ptk @@ -0,0 +1 @@ +rule output = ; \ No newline at end of file diff --git a/test/parser/accept/basic-token-ref.ptk b/test/parser/accept/basic-token-ref.ptk new file mode 100644 index 0000000..29f9ce7 --- /dev/null +++ b/test/parser/accept/basic-token-ref.ptk @@ -0,0 +1 @@ +rule output = $terminal; \ No newline at end of file diff --git a/test/parser/accept/document-start.ptk b/test/parser/accept/document-start.ptk new file mode 100644 index 0000000..0623db6 --- /dev/null +++ b/test/parser/accept/document-start.ptk @@ -0,0 +1 @@ +start ; \ No newline at end of file diff --git a/test/parser/accept/rule-primitive-sequence.ptk b/test/parser/accept/rule-primitive-sequence.ptk new file mode 100644 index 0000000..0067313 --- /dev/null +++ b/test/parser/accept/rule-primitive-sequence.ptk @@ -0,0 +1 @@ +rule sequence = "literal" $terminal "literal" $terminal ; \ No newline at end of file diff --git a/test/parser/reject/unexpected-token-string.rule b/test/parser/reject/unexpected-token-string.rule new file mode 100644 index 0000000..4848c41 --- /dev/null +++ b/test/parser/reject/unexpected-token-string.rule @@ -0,0 +1,2 @@ +# expected: E1108 +"bad toplevel token!" \ No newline at end of file From 45a467f038a9e9c8f6f4dfca0186574cb5ab5ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 3 Nov 2023 12:53:20 +0100 Subject: [PATCH 08/20] Starts parsing of ast mappings. 
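
Productions can now be followed by `=>` and a first set of mapping values.
A sketch of the accepted forms (rule names and mapped values are
placeholders, mirroring the new test/parser/accept/mapping-*.ptk files):

    rule r = "word" => $0;       # value reference
    rule s = "word" => `.word`;  # code literal
    rule t = "word" => @word;    # user-defined value

An empty mapping after `=>` is rejected with E1201; a token that cannot
start a mapping value is rejected with E1109.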
--- build.zig | 20 ++- examples/ptkgen/ast-with-unions.ptk | 6 +- examples/ptkgen/grammar.ptk | 30 ++++- src/ptkgen/Diagnostics.zig | 13 ++ src/ptkgen/ast.zig | 17 ++- src/ptkgen/intl.zig | 44 ++++--- src/ptkgen/intl/en.json | 5 +- src/ptkgen/parser.zig | 121 +++++++++++++++++- test/parser/accept/mapping-code-literal.ptk | 1 + test/parser/accept/mapping-user-value.ptk | 1 + test/parser/accept/mapping-value-ref.ptk | 1 + .../reject/bad-mapping-invalid-token.ptk | 2 + test/parser/reject/bad-mapping-too-long.ptk | 2 + .../{empty-group.rule => empty-group.ptk} | 0 test/parser/reject/empty-mapping.ptk | 2 + ...empty-optional.rule => empty-optional.ptk} | 0 .../{empty-rep_one.rule => empty-rep_one.ptk} | 0 ...empty-rep_zero.rule => empty-rep_zero.ptk} | 0 .../{empty-rule.rule => empty-rule.ptk} | 0 ...tring.rule => unexpected-token-string.ptk} | 0 20 files changed, 226 insertions(+), 39 deletions(-) create mode 100644 test/parser/accept/mapping-code-literal.ptk create mode 100644 test/parser/accept/mapping-user-value.ptk create mode 100644 test/parser/accept/mapping-value-ref.ptk create mode 100644 test/parser/reject/bad-mapping-invalid-token.ptk create mode 100644 test/parser/reject/bad-mapping-too-long.ptk rename test/parser/reject/{empty-group.rule => empty-group.ptk} (100%) create mode 100644 test/parser/reject/empty-mapping.ptk rename test/parser/reject/{empty-optional.rule => empty-optional.ptk} (100%) rename test/parser/reject/{empty-rep_one.rule => empty-rep_one.ptk} (100%) rename test/parser/reject/{empty-rep_zero.rule => empty-rep_zero.ptk} (100%) rename test/parser/reject/{empty-rule.rule => empty-rule.ptk} (100%) rename test/parser/reject/{unexpected-token-string.rule => unexpected-token-string.ptk} (100%) diff --git a/build.zig b/build.zig index e2ba020..05e948e 100644 --- a/build.zig +++ b/build.zig @@ -147,15 +147,23 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/document-start.ptk", + "test/parser/accept/mapping-value-ref.ptk", + "test/parser/accept/mapping-code-literal.ptk", + "test/parser/accept/mapping-user-value.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; const parser_reject_files = [_][]const u8{ - "test/parser/reject/empty-rule.rule", - "test/parser/reject/empty-group.rule", - "test/parser/reject/empty-optional.rule", - "test/parser/reject/empty-rep_one.rule", - "test/parser/reject/empty-rep_zero.rule", + "test/parser/reject/empty-rule.ptk", + "test/parser/reject/empty-group.ptk", + "test/parser/reject/empty-optional.ptk", + "test/parser/reject/empty-rep_one.ptk", + "test/parser/reject/empty-rep_zero.ptk", + + "test/parser/reject/unexpected-token-string.ptk", - "test/parser/reject/unexpected-token-string.rule", + "test/parser/reject/empty-mapping.ptk", + "test/parser/reject/bad-mapping-invalid-token.ptk", + "test/parser/reject/bad-mapping-too-long.ptk", }; diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index fa170fc..1b041d7 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -56,7 +56,7 @@ node TLDeclaration = union ; rule toplevel-decl : !TLDeclaration = - => ns: $0, # this is syntax for a union field selector as unions are not compounds - | => interface: $0, - | => module: $0, + => ns: $0 # this is syntax for a union field selector as unions are not compounds + | => interface: $0 + | => module: $0 ; diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 07edaec..03caa88 100644 --- 
a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -34,6 +34,30 @@ rule production = | "(" ")" ; -# rule mapping = -# # TODO -# ; \ No newline at end of file +rule mapping = + $identifier ":" + | +; + +rule mapped_value = + # { field = , field = , ... } + | # { , , ... } + | $code_literal # `code` + | $value_ref # $0 + | $userval "(" ")" # @func(...) + | $userval # @value +; + +rule struct_ctor = + "{" ( "," )* "}" +; + +rule assign_field = + $identifier "=" $mapped_value +; + +rule list_ctor = "{" "}"; + +rule value_list = + ( "," )* +; \ No newline at end of file diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 555ac8d..e9de60f 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -28,9 +28,12 @@ pub const Code = enum(u16) { invalid_string_escape = 1106, excess_tokens = 1107, unexpected_toplevel_token = 1108, + unexpected_token_no_context = 1109, // recoverable syntax errors: illegal_empty_group = 1200, + empty_mapping = 1201, + integer_overflow = 1202, comptime { std.debug.assert(first_error < first_warning); @@ -73,6 +76,9 @@ pub fn Data(comptime code: Code) type { actual_type: parser.TokenType, actual_text: []const u8, }, + .unexpected_token_no_context => struct { + actual_type: parser.TokenType, + }, .unexpected_eof => NoDiagnosticData, .invalid_source_encoding => NoDiagnosticData, @@ -83,6 +89,13 @@ pub fn Data(comptime code: Code) type { .excess_tokens => struct { token_type: parser.TokenType }, .illegal_empty_group => NoDiagnosticData, + .empty_mapping => NoDiagnosticData, + + .integer_overflow => struct { + min: []const u8, + max: []const u8, + actual: []const u8, + }, // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index 1bfac2a..163d94b 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -65,7 +65,7 @@ fn String(comptime Tag: anytype) type { pub const Identifier = String(.identifier); pub const StringLiteral = String(.string); pub const CodeLiteral = String(.code); -pub const BuiltinLiteral = String(.builtin); +pub const UserDefinedIdentifier = String(.user_defined); pub const Document = List(TopLevelDeclaration); @@ -119,8 +119,9 @@ pub const AstMapping = union(enum) { constructor: List(FieldAssignment), // { field = ..., field = ... 
} literal: CodeLiteral, // field: value context_reference: ValueRef, // $0 - user_reference: BuiltinLiteral, // @field - function_call: FunctionCall, // ...(a,b,c) + user_reference: UserDefinedIdentifier, // @field + user_function_call: FunctionCall(UserDefinedIdentifier), // @builtin(a,b,c) + function_call: FunctionCall(Identifier), // identifier(a,b,c) union_init: UnionInitializer, }; @@ -129,10 +130,12 @@ pub const UnionInitializer = struct { value: *AstMapping, }; -pub const FunctionCall = struct { - function: *AstMapping, - arguments: List(AstMapping), -}; +pub fn FunctionCall(comptime Name: type) type { + return struct { + function: Name, + arguments: List(AstMapping), + }; +} pub const FieldAssignment = struct { location: Location, diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig index 981120c..70af58f 100644 --- a/src/ptkgen/intl.zig +++ b/src/ptkgen/intl.zig @@ -1,5 +1,7 @@ const std = @import("std"); +const Diagnostics = @import("Diagnostics.zig"); + pub const Language = enum { en, }; @@ -12,7 +14,7 @@ pub const localizations = struct { pub const en = Localization.generate(@embedFile("intl/en.json")); }; -pub const FormattableError = blk: { +pub const FormattableError: type = blk: { const list = @typeInfo(std.meta.fieldInfo(Localization, .errors).type).Struct.fields; var errors: [list.len]std.builtin.Type.Error = undefined; @@ -25,22 +27,32 @@ pub const FormattableError = blk: { }); }; +pub const DiagnosticStrings: type = blk: { + const list = @typeInfo(Diagnostics.Code).Enum.fields; + + var dst_fields: [list.len]std.builtin.Type.StructField = undefined; + for (&dst_fields, list) |*dst, src| { + dst.* = .{ + .name = src.name, + .type = []const u8, + .default_value = null, + .is_comptime = false, + .alignment = @alignOf([]const u8), + }; + } + + break :blk @Type(.{ + .Struct = .{ + .layout = .Auto, + .fields = &dst_fields, + .decls = &.{}, + .is_tuple = false, + }, + }); +}; + pub const Localization = struct { - diagnostics: struct { - out_of_memory: []const u8, - file_limit_exceeded: []const u8, - io_error: []const u8, - invalid_source_encoding: []const u8, - unexpected_token_eof: []const u8, - unexpected_token: []const u8, - unexpected_character: []const u8, - unexpected_eof: []const u8, - bad_string_escape: []const u8, - invalid_string_escape: []const u8, - excess_tokens: []const u8, - illegal_empty_group: []const u8, - unexpected_toplevel_token: []const u8, - }, + diagnostics: DiagnosticStrings, errors: struct { Unexpected: []const u8, diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 2fb57b2..dd32bbe 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -12,7 +12,10 @@ "bad_string_escape": "Invalid string escape: Escape sequence at the end of string", "invalid_string_escape": "Invalid string escape \\{[escape]}", "excess_tokens": "Excess token at the end of the file: {[token_type]}", - "illegal_empty_group": "Production sequence may not be empty" + "illegal_empty_group": "Production sequence may not be empty", + "unexpected_token_no_context": "Unexpected token '{[actual_type]}'", + "empty_mapping": "Empty mappings are not allowed", + "integer_overflow": "Integer value {[actual]} out of range. 
Values must be between {[min]} and {[max]}" }, "errors": { "Unexpected": "unexpected error encountered", diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index f739f85..1f7b0bd 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -82,7 +82,7 @@ pub const TokenType = enum { rule_ref, // token_ref, // $token value_ref, // $0 - builtin_ref, // @builtin + userval_ref, // @userval // values @@ -290,8 +290,98 @@ const Parser = struct { } fn acceptAstMapping(parser: *Parser) AcceptError!ast.AstMapping { + const position = parser.core.tokenizer.current_location; + + if (parser.acceptUnionInit()) |init| { + return .{ .union_init = init }; + } else |err| try filterAcceptError(err); + + if (parser.acceptCodeLiteral()) |literal| { + return .{ .literal = literal }; + } else |err| try filterAcceptError(err); + + if (parser.acceptValueReference()) |literal| { + return .{ .context_reference = literal }; + } else |err| try filterAcceptError(err); + + if (parser.acceptBuiltinCall()) |call| { + return .{ .function_call = call }; + } else |err| try filterAcceptError(err); + + if (parser.acceptUserCall()) |call| { + return .{ .user_function_call = call }; + } else |err| try filterAcceptError(err); + + if (parser.acceptUserReference()) |ref| { + return .{ .user_reference = ref }; + } else |err| try filterAcceptError(err); + + if (try parser.tryAcceptLiteral(.@";") or try parser.tryAcceptLiteral(.@"|")) { + try parser.emitDiagnostic(position, .empty_mapping, .{}); + return error.SyntaxError; + } + + return parser.emitUnexpectedToken(); + } + + fn acceptUnionInit(parser: *Parser) AcceptError!ast.UnionInitializer { _ = parser; - @panic("not implemented yet"); + return error.UnexpectedTokenRecoverable; + } + + fn acceptCodeLiteral(parser: *Parser) AcceptError!ast.CodeLiteral { + const token = try parser.acceptToken(.code_literal, .recover); + + std.debug.assert(std.mem.startsWith(u8, token.text, "`")); + std.debug.assert(std.mem.endsWith(u8, token.text, "`")); + + var prefix_len: usize = 0; + while (token.text[prefix_len] == '`') { + prefix_len += 1; + } + + return ast.CodeLiteral{ + .location = token.location, + .value = try parser.pool.insert(token.text[prefix_len .. 
token.text.len - prefix_len]), + }; + } + + fn acceptValueReference(parser: *Parser) AcceptError!ast.ValueRef { + const token = try parser.acceptToken(.value_ref, .recover); + std.debug.assert(std.mem.startsWith(u8, token.text, "$")); + return ast.ValueRef{ + .location = token.location, + .index = std.fmt.parseInt(u32, token.text[1..], 10) catch |err| switch (err) { + error.InvalidCharacter => unreachable, // ensured by tokenizer, + error.Overflow => blk: { + try parser.emitDiagnostic(token.location, .integer_overflow, .{ + .min = comptime std.fmt.comptimePrint("{}", .{std.math.minInt(u32)}), + .max = comptime std.fmt.comptimePrint("{}", .{std.math.maxInt(u32)}), + .actual = token.text[1..], + }); + break :blk 0; + }, + }, + }; + } + + fn acceptBuiltinCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.Identifier) { + _ = parser; + return error.UnexpectedTokenRecoverable; + } + + fn acceptUserCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.UserDefinedIdentifier) { + _ = parser; + return error.UnexpectedTokenRecoverable; + } + + fn acceptUserReference(parser: *Parser) AcceptError!ast.UserDefinedIdentifier { + const token = try parser.acceptToken(.userval_ref, .recover); + std.debug.assert(std.mem.startsWith(u8, token.text, "@")); + return ast.UserDefinedIdentifier{ + .location = token.location, + .value = try parser.pool.insert(token.text[1..]), + }; } fn acceptTypeSpec(parser: *Parser) AcceptError!ast.TypeSpec { @@ -415,6 +505,31 @@ const Parser = struct { try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, code, data); } + fn emitUnexpectedToken(parser: *Parser) AcceptError { + const state = parser.save(); + defer parser.restore(state); + + const location = parser.core.tokenizer.current_location; + const offset = parser.core.tokenizer.offset; + + const token_or_null = parser.core.nextToken() catch |err| switch (err) { + error.UnexpectedCharacter => { + try parser.emitUnexpectedCharacter(location, offset); + return error.SyntaxError; + }, + }; + + const token = token_or_null orelse { + try parser.emitDiagnostic(location, .unexpected_eof, .{}); + return error.SyntaxError; + }; + + try parser.emitDiagnostic(location, .unexpected_token_no_context, .{ + .actual_type = token.type, + }); + return error.SyntaxError; + } + fn emitUnexpectedCharacter(parser: Parser, location: ptk.Location, source_offset: usize) !void { var utf8_view = std.unicode.Utf8View.init(parser.core.tokenizer.source[source_offset..]) catch { try parser.emitDiagnostic(location, .invalid_source_encoding, .{}); @@ -577,7 +692,7 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.rule_ref, matchRuleRef), Pattern.create(.token_ref, matchTokenRef), Pattern.create(.value_ref, matchValueRef), - Pattern.create(.builtin_ref, matchBuiltinRef), + Pattern.create(.userval_ref, matchBuiltinRef), // Whitespace is the "kitchen sink" at the end: Pattern.create(.whitespace, match.takeAnyOf(" \r\n\t")), diff --git a/test/parser/accept/mapping-code-literal.ptk b/test/parser/accept/mapping-code-literal.ptk new file mode 100644 index 0000000..b18e2b9 --- /dev/null +++ b/test/parser/accept/mapping-code-literal.ptk @@ -0,0 +1 @@ +rule r = "" => `.item`; \ No newline at end of file diff --git a/test/parser/accept/mapping-user-value.ptk b/test/parser/accept/mapping-user-value.ptk new file mode 100644 index 0000000..2183ab2 --- /dev/null +++ b/test/parser/accept/mapping-user-value.ptk @@ -0,0 +1 @@ +rule r = "" => @value; \ No newline at end of file diff --git a/test/parser/accept/mapping-value-ref.ptk 
b/test/parser/accept/mapping-value-ref.ptk new file mode 100644 index 0000000..b2293b8 --- /dev/null +++ b/test/parser/accept/mapping-value-ref.ptk @@ -0,0 +1 @@ +rule r = "" => $0; \ No newline at end of file diff --git a/test/parser/reject/bad-mapping-invalid-token.ptk b/test/parser/reject/bad-mapping-invalid-token.ptk new file mode 100644 index 0000000..5d783df --- /dev/null +++ b/test/parser/reject/bad-mapping-invalid-token.ptk @@ -0,0 +1,2 @@ +# expected: E1109 +rule group = "value" => "bad" ; \ No newline at end of file diff --git a/test/parser/reject/bad-mapping-too-long.ptk b/test/parser/reject/bad-mapping-too-long.ptk new file mode 100644 index 0000000..1ecf764 --- /dev/null +++ b/test/parser/reject/bad-mapping-too-long.ptk @@ -0,0 +1,2 @@ +# expected: E1102 +rule group = "value" => $0 whatever ; \ No newline at end of file diff --git a/test/parser/reject/empty-group.rule b/test/parser/reject/empty-group.ptk similarity index 100% rename from test/parser/reject/empty-group.rule rename to test/parser/reject/empty-group.ptk diff --git a/test/parser/reject/empty-mapping.ptk b/test/parser/reject/empty-mapping.ptk new file mode 100644 index 0000000..6479ae9 --- /dev/null +++ b/test/parser/reject/empty-mapping.ptk @@ -0,0 +1,2 @@ +# expected: E1201 +rule group = "value" => ; \ No newline at end of file diff --git a/test/parser/reject/empty-optional.rule b/test/parser/reject/empty-optional.ptk similarity index 100% rename from test/parser/reject/empty-optional.rule rename to test/parser/reject/empty-optional.ptk diff --git a/test/parser/reject/empty-rep_one.rule b/test/parser/reject/empty-rep_one.ptk similarity index 100% rename from test/parser/reject/empty-rep_one.rule rename to test/parser/reject/empty-rep_one.ptk diff --git a/test/parser/reject/empty-rep_zero.rule b/test/parser/reject/empty-rep_zero.ptk similarity index 100% rename from test/parser/reject/empty-rep_zero.rule rename to test/parser/reject/empty-rep_zero.ptk diff --git a/test/parser/reject/empty-rule.rule b/test/parser/reject/empty-rule.ptk similarity index 100% rename from test/parser/reject/empty-rule.rule rename to test/parser/reject/empty-rule.ptk diff --git a/test/parser/reject/unexpected-token-string.rule b/test/parser/reject/unexpected-token-string.ptk similarity index 100% rename from test/parser/reject/unexpected-token-string.rule rename to test/parser/reject/unexpected-token-string.ptk From 73b5c85ea7a003f820c87c852dc4c30380a33021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 3 Nov 2023 19:01:39 +0100 Subject: [PATCH 09/20] Implements a good amount of ast mappings --- build.zig | 17 +++ examples/ptkgen/grammar.ptk | 6 +- src/ptkgen/ast.zig | 8 +- src/ptkgen/ast_dump.zig | 58 +++++++- src/ptkgen/parser.zig | 126 ++++++++++++++++-- test/parser/accept/mapping-array-a0.ptk | 1 + test/parser/accept/mapping-array-a1.ptk | 1 + test/parser/accept/mapping-array-a5.ptk | 1 + test/parser/accept/mapping-array-nested.ptk | 1 + .../accept/mapping-builtin-function-a0.ptk | 1 + .../accept/mapping-builtin-function-a1.ptk | 1 + .../accept/mapping-builtin-function-a5.ptk | 1 + .../accept/mapping-builtin-function-nest.ptk | 1 + .../accept/mapping-user-function-a0.ptk | 1 + .../accept/mapping-user-function-a1.ptk | 1 + .../accept/mapping-user-function-a5.ptk | 1 + .../accept/mapping-user-function-nest.ptk | 1 + test/parser/accept/mapping-variant-init.ptk | 1 + 18 files changed, 207 insertions(+), 21 deletions(-) create mode 100644 test/parser/accept/mapping-array-a0.ptk create mode 100644 
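
Besides the simple values, list initializers, variant initializers and
function calls are now parsed behind `=>`. A rough sketch of the accepted
forms (rule, field and function names are placeholders):

    rule a = "x" "y" => { $0, $1 };      # list initializer
    rule b = "x"     => variant: $0;     # variant initializer
    rule c = "x"     => @make($0, `1`);  # user function call
    rule d = "x"     => builtin($0);     # builtin function call

Record initializers are modeled in the AST already, but acceptRecordInit
is still a stub that only returns a recoverable error.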
test/parser/accept/mapping-array-a1.ptk create mode 100644 test/parser/accept/mapping-array-a5.ptk create mode 100644 test/parser/accept/mapping-array-nested.ptk create mode 100644 test/parser/accept/mapping-builtin-function-a0.ptk create mode 100644 test/parser/accept/mapping-builtin-function-a1.ptk create mode 100644 test/parser/accept/mapping-builtin-function-a5.ptk create mode 100644 test/parser/accept/mapping-builtin-function-nest.ptk create mode 100644 test/parser/accept/mapping-user-function-a0.ptk create mode 100644 test/parser/accept/mapping-user-function-a1.ptk create mode 100644 test/parser/accept/mapping-user-function-a5.ptk create mode 100644 test/parser/accept/mapping-user-function-nest.ptk create mode 100644 test/parser/accept/mapping-variant-init.ptk diff --git a/build.zig b/build.zig index 05e948e..9cfc229 100644 --- a/build.zig +++ b/build.zig @@ -151,6 +151,23 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/mapping-code-literal.ptk", "test/parser/accept/mapping-user-value.ptk", + "test/parser/accept/mapping-builtin-function-a0.ptk", + "test/parser/accept/mapping-builtin-function-a1.ptk", + "test/parser/accept/mapping-builtin-function-a5.ptk", + "test/parser/accept/mapping-builtin-function-nest.ptk", + + "test/parser/accept/mapping-user-function-a0.ptk", + "test/parser/accept/mapping-user-function-a1.ptk", + "test/parser/accept/mapping-user-function-a5.ptk", + "test/parser/accept/mapping-user-function-nest.ptk", + + "test/parser/accept/mapping-array-a0.ptk", + "test/parser/accept/mapping-array-a1.ptk", + "test/parser/accept/mapping-array-a5.ptk", + "test/parser/accept/mapping-array-nested.ptk", + + "test/parser/accept/mapping-variant-init.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 03caa88..30b0a2a 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -35,12 +35,12 @@ rule production = ; rule mapping = - $identifier ":" - | + $identifier ":" # variant init + | # regular init ; rule mapped_value = - # { field = , field = , ... } + # { field = , field = , ... } | # { , , ... } | $code_literal # `code` | $value_ref # $0 diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index 163d94b..5b7c715 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -116,16 +116,18 @@ pub const Production = union(enum) { }; pub const AstMapping = union(enum) { - constructor: List(FieldAssignment), // { field = ..., field = ... } + record: List(FieldAssignment), // { field = ..., field = ... } + list: List(AstMapping), // { ..., ..., ... } + variant: VariantInitializer, // field: ... 
+ literal: CodeLiteral, // field: value context_reference: ValueRef, // $0 user_reference: UserDefinedIdentifier, // @field user_function_call: FunctionCall(UserDefinedIdentifier), // @builtin(a,b,c) function_call: FunctionCall(Identifier), // identifier(a,b,c) - union_init: UnionInitializer, }; -pub const UnionInitializer = struct { +pub const VariantInitializer = struct { field: Identifier, value: *AstMapping, }; diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 166b173..64571a1 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -66,13 +66,14 @@ const AstPrinter = struct { fn dumpAstType(printer: AstPrinter, typespec: ast.TypeSpec) void { _ = printer; _ = typespec; - std.debug.print("", .{}); + print("", .{}); } fn dumpMappedProd(printer: AstPrinter, mapped_prod: ast.MappedProduction) void { printer.dumpProd(mapped_prod.production); if (mapped_prod.mapping) |mapping| { + print(" => ", .{}); printer.dumpMapping(mapping); } } @@ -105,9 +106,58 @@ const AstPrinter = struct { } fn dumpMapping(printer: AstPrinter, mapping: ast.AstMapping) void { - _ = printer; - _ = mapping; - print("", .{}); + switch (mapping) { + .record => |record| { + _ = record; + @panic("printing not implemented yet"); + }, + + .list => |list| { + if (list.len() > 0) { + print("{{ ", .{}); + printer.dumpMappingList(list); + print(" }}", .{}); + } else { + print("{{}}", .{}); + } + }, + + .variant => |variant| { + print("{}: ", .{printer.fmtId(variant.field.value)}); + printer.dumpMapping(variant.value.*); + }, + + .literal => |literal| print("`{s}`", .{printer.strings.get(literal.value)}), + + .context_reference => |context_reference| print("${}", .{context_reference.index}), + + .user_reference => |user_reference| print("@{}", .{printer.fmtId(user_reference.value)}), + + .user_function_call => |user_function_call| { + print("@{}(", .{printer.fmtId(user_function_call.function.value)}); + printer.dumpMappingList(user_function_call.arguments); + print(")", .{}); + }, + + .function_call => |function_call| { + print("{}(", .{printer.fmtId(function_call.function.value)}); + printer.dumpMappingList(function_call.arguments); + print(")", .{}); + }, + } + } + + fn dumpMappingList(printer: AstPrinter, list: ast.List(ast.AstMapping)) void { + var first = true; + var iter = ast.iterate(list); + while (iter.next()) |arg| { + if (!first) { + print(", ", .{}); + } + first = false; + + printer.dumpMapping(arg); + } } fn fmtString(printer: AstPrinter, str: ptk.strings.String) StringPrinter { diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 1f7b0bd..f47e7cc 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -223,7 +223,7 @@ const Parser = struct { var sequence = try parser.acceptProductionSequence(); const mapping = if (try parser.tryAcceptLiteral(.@"=>")) - try parser.acceptAstMapping() + try parser.acceptAstMapping(.fail) else null; @@ -289,11 +289,22 @@ const Parser = struct { return error.UnexpectedTokenRecoverable; } - fn acceptAstMapping(parser: *Parser) AcceptError!ast.AstMapping { + fn acceptAstMapping(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.AstMapping { + const state = parser.save(); + errdefer parser.restore(state); + const position = parser.core.tokenizer.current_location; - if (parser.acceptUnionInit()) |init| { - return .{ .union_init = init }; + if (parser.acceptVariantInit()) |init| { + return .{ .variant = init }; + } else |err| try filterAcceptError(err); + + if (parser.acceptRecordInit()) |init| { + return .{ .record = init }; + } 
else |err| try filterAcceptError(err); + + if (parser.acceptListInit()) |init| { + return .{ .list = init }; } else |err| try filterAcceptError(err); if (parser.acceptCodeLiteral()) |literal| { @@ -321,14 +332,51 @@ const Parser = struct { return error.SyntaxError; } - return parser.emitUnexpectedToken(); + switch (accept_mode) { + .recover => return error.UnexpectedTokenRecoverable, + .fail => return parser.emitUnexpectedToken(), + } } - fn acceptUnionInit(parser: *Parser) AcceptError!ast.UnionInitializer { - _ = parser; + fn acceptVariantInit(parser: *Parser) AcceptError!ast.VariantInitializer { + const state = parser.save(); + errdefer parser.restore(state); + + const field = try parser.acceptIdentifier(.recover); + + try parser.acceptLiteral(.@":", .recover); + + const value = try parser.acceptAstMapping(.fail); + + const clone = try parser.arena.create(ast.AstMapping); + clone.* = value; + + return .{ + .field = field, + .value = clone, + }; + } + + fn acceptRecordInit(parser: *Parser) AcceptError!ast.List(ast.FieldAssignment) { + const state = parser.save(); + errdefer parser.restore(state); + return error.UnexpectedTokenRecoverable; } + fn acceptListInit(parser: *Parser) AcceptError!ast.List(ast.AstMapping) { + const state = parser.save(); + errdefer parser.restore(state); + + try parser.acceptLiteral(.@"{", .recover); + + var items = try parser.acceptMappingList(); + + try parser.acceptLiteral(.@"}", .fail); + + return items; + } + fn acceptCodeLiteral(parser: *Parser) AcceptError!ast.CodeLiteral { const token = try parser.acceptToken(.code_literal, .recover); @@ -366,13 +414,40 @@ const Parser = struct { } fn acceptBuiltinCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.Identifier) { - _ = parser; - return error.UnexpectedTokenRecoverable; + const state = parser.save(); + errdefer parser.restore(state); + + const id = try parser.acceptIdentifier(.recover); + + try parser.acceptLiteral(.@"(", .fail); // a builtin function is the only legal way to use an identifier here, so we fail unrecoverably + + const list = try parser.acceptMappingList(); + + try parser.acceptLiteral(.@")", .fail); + + return .{ + .function = id, + .arguments = list, + }; } fn acceptUserCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.UserDefinedIdentifier) { - _ = parser; - return error.UnexpectedTokenRecoverable; + const state = parser.save(); + errdefer parser.restore(state); + + const id = try parser.acceptUserReference(); + + // If we only accept a user value, fail and fall back to regular user value acceptance later + try parser.acceptLiteral(.@"(", .recover); + + const list = try parser.acceptMappingList(); + + try parser.acceptLiteral(.@")", .fail); + + return .{ + .function = id, + .arguments = list, + }; } fn acceptUserReference(parser: *Parser) AcceptError!ast.UserDefinedIdentifier { @@ -384,6 +459,35 @@ const Parser = struct { }; } + fn acceptMappingList(parser: *Parser) AcceptError!ast.List(ast.AstMapping) { + const list_state = parser.save(); + errdefer parser.restore(list_state); + + var list = ast.List(ast.AstMapping){}; + + var accept_mode: AcceptMode = .recover; + while (true) { + // first item is allowed to be failing, otherwise comma separation must be done! 
+ defer accept_mode = .fail; + + const item_state = parser.save(); + + if (parser.acceptAstMapping(accept_mode)) |mapping| { + try parser.append(ast.AstMapping, &list, mapping); + } else |err| { + try filterAcceptError(err); + parser.restore(item_state); // rollback to the previous item + break; + } + + if (!try parser.tryAcceptLiteral(.@",")) { + break; + } + } + + return list; + } + fn acceptTypeSpec(parser: *Parser) AcceptError!ast.TypeSpec { _ = parser; @panic("not implemented yet"); diff --git a/test/parser/accept/mapping-array-a0.ptk b/test/parser/accept/mapping-array-a0.ptk new file mode 100644 index 0000000..3ef8c33 --- /dev/null +++ b/test/parser/accept/mapping-array-a0.ptk @@ -0,0 +1 @@ +rule r = "" => { }; \ No newline at end of file diff --git a/test/parser/accept/mapping-array-a1.ptk b/test/parser/accept/mapping-array-a1.ptk new file mode 100644 index 0000000..48a6912 --- /dev/null +++ b/test/parser/accept/mapping-array-a1.ptk @@ -0,0 +1 @@ +rule r = "" => { $0 }; \ No newline at end of file diff --git a/test/parser/accept/mapping-array-a5.ptk b/test/parser/accept/mapping-array-a5.ptk new file mode 100644 index 0000000..a46ab16 --- /dev/null +++ b/test/parser/accept/mapping-array-a5.ptk @@ -0,0 +1 @@ +rule r = "" => { $0, $1, $2, $3, $4 }; \ No newline at end of file diff --git a/test/parser/accept/mapping-array-nested.ptk b/test/parser/accept/mapping-array-nested.ptk new file mode 100644 index 0000000..be8a59a --- /dev/null +++ b/test/parser/accept/mapping-array-nested.ptk @@ -0,0 +1 @@ +rule r = "" => { $0, { $10, $11, $12 }, $2 }; \ No newline at end of file diff --git a/test/parser/accept/mapping-builtin-function-a0.ptk b/test/parser/accept/mapping-builtin-function-a0.ptk new file mode 100644 index 0000000..478e220 --- /dev/null +++ b/test/parser/accept/mapping-builtin-function-a0.ptk @@ -0,0 +1 @@ +rule r = "" => tostring(); \ No newline at end of file diff --git a/test/parser/accept/mapping-builtin-function-a1.ptk b/test/parser/accept/mapping-builtin-function-a1.ptk new file mode 100644 index 0000000..58e9623 --- /dev/null +++ b/test/parser/accept/mapping-builtin-function-a1.ptk @@ -0,0 +1 @@ +rule r = "" => tostring($0); \ No newline at end of file diff --git a/test/parser/accept/mapping-builtin-function-a5.ptk b/test/parser/accept/mapping-builtin-function-a5.ptk new file mode 100644 index 0000000..acf6f75 --- /dev/null +++ b/test/parser/accept/mapping-builtin-function-a5.ptk @@ -0,0 +1 @@ +rule r = "" => tostring($0, $1, $2, $3, $4); \ No newline at end of file diff --git a/test/parser/accept/mapping-builtin-function-nest.ptk b/test/parser/accept/mapping-builtin-function-nest.ptk new file mode 100644 index 0000000..c7457fe --- /dev/null +++ b/test/parser/accept/mapping-builtin-function-nest.ptk @@ -0,0 +1 @@ +rule r = "" => tostring($0, tostring($1), $4); \ No newline at end of file diff --git a/test/parser/accept/mapping-user-function-a0.ptk b/test/parser/accept/mapping-user-function-a0.ptk new file mode 100644 index 0000000..12d6fce --- /dev/null +++ b/test/parser/accept/mapping-user-function-a0.ptk @@ -0,0 +1 @@ +rule r = "" => @convert(); \ No newline at end of file diff --git a/test/parser/accept/mapping-user-function-a1.ptk b/test/parser/accept/mapping-user-function-a1.ptk new file mode 100644 index 0000000..0c51664 --- /dev/null +++ b/test/parser/accept/mapping-user-function-a1.ptk @@ -0,0 +1 @@ +rule r = "" => @convert($0); \ No newline at end of file diff --git a/test/parser/accept/mapping-user-function-a5.ptk b/test/parser/accept/mapping-user-function-a5.ptk 
new file mode 100644 index 0000000..684e3f3 --- /dev/null +++ b/test/parser/accept/mapping-user-function-a5.ptk @@ -0,0 +1 @@ +rule r = "" => @convert($0, $1, $2, $3, $4); \ No newline at end of file diff --git a/test/parser/accept/mapping-user-function-nest.ptk b/test/parser/accept/mapping-user-function-nest.ptk new file mode 100644 index 0000000..f78963b --- /dev/null +++ b/test/parser/accept/mapping-user-function-nest.ptk @@ -0,0 +1 @@ +rule r = "" => @convert($0, tostring($1), $4); \ No newline at end of file diff --git a/test/parser/accept/mapping-variant-init.ptk b/test/parser/accept/mapping-variant-init.ptk new file mode 100644 index 0000000..0fc50e8 --- /dev/null +++ b/test/parser/accept/mapping-variant-init.ptk @@ -0,0 +1 @@ +rule r = "" => child: $0; \ No newline at end of file From a7e57259ce61aab61c30b6693f5060f5ad62bcf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 3 Nov 2023 19:52:38 +0100 Subject: [PATCH 10/20] Implements acceptance of record rules. --- build.zig | 3 + src/ptkgen/ast_dump.zig | 20 +- src/ptkgen/main.zig | 18 +- src/ptkgen/parser.zig | 204 ++++++++++++++++-- test/parser/accept/mapping-record-init-f1.ptk | 1 + test/parser/accept/mapping-record-init-f3.ptk | 1 + 6 files changed, 227 insertions(+), 20 deletions(-) create mode 100644 test/parser/accept/mapping-record-init-f1.ptk create mode 100644 test/parser/accept/mapping-record-init-f3.ptk diff --git a/build.zig b/build.zig index 9cfc229..60a0d97 100644 --- a/build.zig +++ b/build.zig @@ -168,6 +168,9 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/mapping-variant-init.ptk", + "test/parser/accept/mapping-record-init-f1.ptk", + "test/parser/accept/mapping-record-init-f3.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 64571a1..c554abf 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -108,8 +108,24 @@ const AstPrinter = struct { fn dumpMapping(printer: AstPrinter, mapping: ast.AstMapping) void { switch (mapping) { .record => |record| { - _ = record; - @panic("printing not implemented yet"); + std.debug.assert(record.len() > 0); + + print("{{ ", .{}); + + var first = true; + var iter = ast.iterate(record); + while (iter.next()) |arg| { + if (!first) { + print(", ", .{}); + } + first = false; + + print("{} = ", .{printer.fmtId(arg.field.value)}); + + printer.dumpMapping(arg.value.*); + } + + print(" }}", .{}); }, .list => |list| { diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 151ea33..bfb273c 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -22,6 +22,7 @@ pub const CliOptions = struct { help: bool = false, output: ?[]const u8 = null, test_mode: TestMode = .none, + trace: bool = false, @"max-file-size": u32 = 4 * 1024, // 4 MB of source code is a lot! 
@@ -42,6 +43,8 @@ pub const CliOptions = struct { .test_mode = "(internal use only, required for testing)", .@"max-file-size" = "Maximum input file size in KiB (default: 4096)", + + .trace = "Prints a parse trace", }, }; }; @@ -140,6 +143,7 @@ pub fn main() AppError!u8 { source_code, file_name, cli.options.test_mode, + cli.options.trace, ) catch |err| { try convertErrorToDiagnostics(&diagnostics, file_name, err); break :process_file false; @@ -266,13 +270,17 @@ fn compileFile( source_code: []const u8, file_name: []const u8, mode: TestMode, + trace_enabled: bool, ) !void { var tree = try parser.parse( - allocator, - diagnostics, - string_pool, - file_name, - source_code, + .{ + .allocator = allocator, + .diagnostics = diagnostics, + .string_pool = string_pool, + .file_name = file_name, + .source_code = source_code, + .trace_enabled = trace_enabled, + }, ); defer tree.deinit(); diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index f47e7cc..852355a 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -17,26 +17,46 @@ pub const Document = struct { } }; -pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_pool: *ptk.strings.Pool, file_name: []const u8, source_code: []const u8) !Document { - var arena = std.heap.ArenaAllocator.init(allocator); +pub fn parse(opt: struct { + allocator: std.mem.Allocator, + diagnostics: *Diagnostics, + string_pool: *ptk.strings.Pool, + file_name: []const u8, + source_code: []const u8, + trace_enabled: bool, +}) !Document { + var arena = std.heap.ArenaAllocator.init(opt.allocator); errdefer arena.deinit(); - const file_name_copy = try arena.allocator().dupe(u8, file_name); + const file_name_copy = try arena.allocator().dupe(u8, opt.file_name); - var tokenizer = Tokenizer.init(source_code, file_name_copy); + var tokenizer = Tokenizer.init(opt.source_code, file_name_copy); var parser = Parser{ .core = ParserCore.init(&tokenizer), .arena = arena.allocator(), - .pool = string_pool, - .diagnostics = diagnostics, + .pool = opt.string_pool, + .diagnostics = opt.diagnostics, + .trace_enabled = opt.trace_enabled, }; const document_node = parser.acceptDocument() catch |err| switch (err) { // Unrecoverable syntax error, must have created diagnostics already - error.SyntaxError, error.InvalidSourceEncoding => |e| { - std.debug.assert(diagnostics.hasErrors()); + error.SyntaxError => |e| { + std.debug.assert(opt.diagnostics.hasErrors()); + + if (opt.trace_enabled) { + if (@errorReturnTrace()) |trace| { + std.debug.dumpStackTrace(trace.*); + } + } + + return e; + }, + error.InvalidSourceEncoding => |e| { + std.debug.assert(opt.diagnostics.hasErrors()); + return e; }, @@ -45,7 +65,7 @@ pub fn parse(allocator: std.mem.Allocator, diagnostics: *Diagnostics, string_poo if (tokenizer.next()) |token_or_null| { if (token_or_null) |token| { - try diagnostics.emit(token.location, .excess_tokens, .{ .token_type = token.type }); + try opt.diagnostics.emit(token.location, .excess_tokens, .{ .token_type = token.type }); return error.SyntaxError; } } else |_| { @@ -128,7 +148,13 @@ const Parser = struct { pool: *ptk.strings.Pool, diagnostics: *Diagnostics, + trace_enabled: bool, + trace_depth: u32 = 0, + pub fn acceptDocument(parser: *Parser) FatalAcceptError!ast.Document { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + var doc = ast.Document{}; while (true) { @@ -143,6 +169,9 @@ const Parser = struct { } fn acceptTopLevelDecl(parser: *Parser) FatalAcceptError!?ast.TopLevelDeclaration { + parser.traceEnterRule(@src()); + defer 
parser.popTrace(); + if (parser.acceptStartDecl()) |root_rule| { return .{ .start = root_rule }; } else |err| try filterAcceptError(err); @@ -174,6 +203,9 @@ const Parser = struct { } fn acceptStartDecl(parser: *Parser) AcceptError!ast.RuleRef { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + try parser.acceptLiteral(.start, .recover); const init_rule = try parser.acceptRuleReference(.fail); @@ -183,6 +215,9 @@ const Parser = struct { } fn acceptRule(parser: *Parser) AcceptError!ast.Rule { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + var state = parser.save(); errdefer parser.restore(state); @@ -220,6 +255,9 @@ const Parser = struct { } fn acceptMappedProduction(parser: *Parser) AcceptError!ast.MappedProduction { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + var sequence = try parser.acceptProductionSequence(); const mapping = if (try parser.tryAcceptLiteral(.@"=>")) @@ -238,6 +276,9 @@ const Parser = struct { } fn acceptProductionSequence(parser: *Parser) AcceptError!ast.List(ast.Production) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + var list: ast.List(ast.Production) = .{}; while (true) { @@ -258,6 +299,9 @@ const Parser = struct { } fn acceptProduction(parser: *Parser) AcceptError!ast.Production { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + if (try parser.tryAcceptLiteral(.@"(")) { var sequence = try parser.acceptProductionSequence(); try parser.acceptLiteral(.@")", .fail); @@ -290,6 +334,9 @@ const Parser = struct { } fn acceptAstMapping(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.AstMapping { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); errdefer parser.restore(state); @@ -339,6 +386,9 @@ const Parser = struct { } fn acceptVariantInit(parser: *Parser) AcceptError!ast.VariantInitializer { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); errdefer parser.restore(state); @@ -358,13 +408,65 @@ const Parser = struct { } fn acceptRecordInit(parser: *Parser) AcceptError!ast.List(ast.FieldAssignment) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); errdefer parser.restore(state); - return error.UnexpectedTokenRecoverable; + try parser.acceptLiteral(.@"{", .recover); + + var mode: AcceptMode = .recover; + + var list = ast.List(ast.FieldAssignment){}; + while (true) { + // First item might fail, then it's not a record initializer, but + // afterwards, all fields must comply + defer mode = .fail; + + const node = try parser.acceptFieldInit(mode); + + try parser.append(ast.FieldAssignment, &list, node); + + if (!try parser.tryAcceptLiteral(.@",")) { + break; + } + } + + try parser.acceptLiteral(.@"}", .fail); + + return list; + } + + fn acceptFieldInit(parser: *Parser, mode: AcceptMode) AcceptError!ast.FieldAssignment { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + const state = parser.save(); + errdefer parser.restore(state); + + const location = parser.core.tokenizer.current_location; + + const field = try parser.acceptIdentifier(mode); + + try parser.acceptLiteral(.@"=", .fail); + + const value = try parser.acceptAstMapping(.fail); + + const clone = try parser.arena.create(ast.AstMapping); + clone.* = value; + + return .{ + .location = location, + .field = field, + .value = clone, + }; } fn acceptListInit(parser: *Parser) AcceptError!ast.List(ast.AstMapping) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); 
errdefer parser.restore(state); @@ -378,6 +480,9 @@ const Parser = struct { } fn acceptCodeLiteral(parser: *Parser) AcceptError!ast.CodeLiteral { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.code_literal, .recover); std.debug.assert(std.mem.startsWith(u8, token.text, "`")); @@ -395,6 +500,9 @@ const Parser = struct { } fn acceptValueReference(parser: *Parser) AcceptError!ast.ValueRef { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.value_ref, .recover); std.debug.assert(std.mem.startsWith(u8, token.text, "$")); return ast.ValueRef{ @@ -414,6 +522,9 @@ const Parser = struct { } fn acceptBuiltinCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.Identifier) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); errdefer parser.restore(state); @@ -432,6 +543,9 @@ const Parser = struct { } fn acceptUserCall(parser: *Parser) AcceptError!ast.FunctionCall(ast.UserDefinedIdentifier) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const state = parser.save(); errdefer parser.restore(state); @@ -451,6 +565,9 @@ const Parser = struct { } fn acceptUserReference(parser: *Parser) AcceptError!ast.UserDefinedIdentifier { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.userval_ref, .recover); std.debug.assert(std.mem.startsWith(u8, token.text, "@")); return ast.UserDefinedIdentifier{ @@ -460,6 +577,9 @@ const Parser = struct { } fn acceptMappingList(parser: *Parser) AcceptError!ast.List(ast.AstMapping) { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const list_state = parser.save(); errdefer parser.restore(list_state); @@ -489,11 +609,16 @@ const Parser = struct { } fn acceptTypeSpec(parser: *Parser) AcceptError!ast.TypeSpec { - _ = parser; + parser.traceEnterRule(@src()); + defer parser.popTrace(); + @panic("not implemented yet"); } fn acceptStringLiteral(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.StringLiteral { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.string_literal, accept_mode); std.debug.assert(token.text.len >= 2); @@ -505,6 +630,9 @@ const Parser = struct { } fn acceptIdentifier(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.Identifier { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.identifier, accept_mode); return ast.Identifier{ .location = token.location, @@ -513,6 +641,9 @@ const Parser = struct { } fn acceptRuleReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.RuleRef { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.rule_ref, accept_mode); std.debug.assert(std.mem.startsWith(u8, token.text, "<")); std.debug.assert(std.mem.endsWith(u8, token.text, ">")); @@ -523,6 +654,9 @@ const Parser = struct { } fn acceptTokenReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.TokenRef { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.token_ref, accept_mode); std.debug.assert(std.mem.startsWith(u8, token.text, "$")); return ast.TokenRef{ @@ -532,6 +666,9 @@ const Parser = struct { } fn acceptNodeReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.NodeRef { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + const token = try parser.acceptToken(.node_ref, accept_mode); 
std.debug.assert(std.mem.startsWith(u8, token.text, "!")); return ast.NodeRef{ @@ -561,8 +698,7 @@ const Parser = struct { const location = parser.core.tokenizer.current_location; if (parser.core.accept(RS.any)) |token| { - // std.log.debug("token trace: {}", .{token}); - + errdefer parser.emitTrace(.{ .token_reject = .{ .actual = token, .expected = token_type } }); if (token.type != token_type) { switch (accept_mode) { .fail => { @@ -576,6 +712,7 @@ const Parser = struct { .recover => return error.UnexpectedTokenRecoverable, } } + parser.emitTrace(.{ .token_accept = token }); return token; } else |err| switch (err) { error.UnexpectedToken => unreachable, // RS.any will always accept the token @@ -602,6 +739,47 @@ const Parser = struct { }; // management: + const TraceKind = union(enum) { + token_accept: Token, + token_reject: struct { actual: Token, expected: TokenType }, + rule: []const u8, + }; + + const Trace = struct { + depth: u32, + kind: TraceKind, + + pub fn format(trace: Trace, fmt: []const u8, opt: std.fmt.FormatOptions, writer: anytype) !void { + _ = fmt; + _ = opt; + try writer.writeByteNTimes(' ', 4 * trace.depth); + try writer.print("{s}:", .{@tagName(trace.kind)}); + switch (trace.kind) { + .token_accept => |item| try writer.print("accept {}", .{item}), + .token_reject => |item| try writer.print("reject {}, expected '{s}'", .{ item.actual, @tagName(item.expected) }), + .rule => |item| try writer.print("{s}", .{item}), + } + } + }; + + fn traceEnterRule(parser: *Parser, loc: std.builtin.SourceLocation) void { + parser.emitTrace(.{ .rule = loc.fn_name }); + parser.trace_depth += 1; + } + + fn popTrace(parser: *Parser) void { + parser.trace_depth -= 1; + } + + fn emitTrace(parser: Parser, trace: TraceKind) void { + if (!parser.trace_enabled) { + return; + } + std.log.debug("rule trace: {}", .{Trace{ + .depth = parser.trace_depth, + .kind = trace, + }}); + } fn emitDiagnostic(parser: Parser, loc: ?ptk.Location, comptime code: Diagnostics.Code, data: Diagnostics.Data(code)) !void { // Anything detected here is always an error diff --git a/test/parser/accept/mapping-record-init-f1.ptk b/test/parser/accept/mapping-record-init-f1.ptk new file mode 100644 index 0000000..dcce273 --- /dev/null +++ b/test/parser/accept/mapping-record-init-f1.ptk @@ -0,0 +1 @@ +rule r = "" => { x = $0 }; \ No newline at end of file diff --git a/test/parser/accept/mapping-record-init-f3.ptk b/test/parser/accept/mapping-record-init-f3.ptk new file mode 100644 index 0000000..22d7640 --- /dev/null +++ b/test/parser/accept/mapping-record-init-f3.ptk @@ -0,0 +1 @@ +rule r = "" => { x = $0, y = $1, z = $2 }; \ No newline at end of file From 25220b2877e2524b40ea8c3087271a30a1f5a4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 3 Nov 2023 20:11:42 +0100 Subject: [PATCH 11/20] Starts to implement typespec parsing. 
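For reference, the type specifier forms accepted after this change, mirroring
the new accept tests (compound record/variant specifiers are not parsed yet;
trailing comments added here for readability):

    rule r : !farpointer = "";   # node reference
    rule r : `bool` = "";        # literal Zig type
    rule r : @Point = "";        # custom user-defined type

The tokenizer keywords change accordingly: `struct` becomes `record`,
`variant` is added for the union-style compounds, and the `literal` keyword
is dropped, so literal types are written as a bare backtick code literal.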
--- build.zig | 4 ++ examples/ptkgen/ast-with-unions.ptk | 36 ++++++++--------- examples/ptkgen/grammar.ptk | 23 ++++++----- src/ptkgen/ast.zig | 6 +-- src/ptkgen/ast_dump.zig | 10 +++-- src/ptkgen/parser.zig | 42 +++++++++++++------- test/parser/accept/rule-typespec-custom.ptk | 1 + test/parser/accept/rule-typespec-literal.ptk | 1 + test/parser/accept/rule-typespec-ref.ptk | 1 + 9 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 test/parser/accept/rule-typespec-custom.ptk create mode 100644 test/parser/accept/rule-typespec-literal.ptk create mode 100644 test/parser/accept/rule-typespec-ref.ptk diff --git a/build.zig b/build.zig index 60a0d97..f8528cd 100644 --- a/build.zig +++ b/build.zig @@ -171,6 +171,10 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/mapping-record-init-f1.ptk", "test/parser/accept/mapping-record-init-f3.ptk", + "test/parser/accept/rule-typespec-custom.ptk", + "test/parser/accept/rule-typespec-ref.ptk", + "test/parser/accept/rule-typespec-literal.ptk", + // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples } ++ analyis_ok_files; diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index 1b041d7..1e8f226 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -4,16 +4,16 @@ # var name: type = value; # const name: type = value; -node declaration = struct - is_const: literal `bool`, - name: !identifier, - type: optional !type, - value: !value -; +# node declaration = struct +# is_const: `bool`, +# name: !identifier, +# type: optional !type, +# value: !value +# ; -node identifier = literal `[]const u8`; -node type = @TypeId; # enum { int, float, string } -node value = @Value; +# node identifier = `[]const u8`; +# node type = @TypeId; # enum { int, float, string } +# node value = @Value; start ; @@ -27,20 +27,20 @@ rule decl : !declaration = # $0_________ $1__ $2_____________ $3 $4_____ ; -rule : literal `bool` = +rule decl-type : `bool` = "var" => `false` | "const" => `true` ; -rule : !identifier = "name" => tostring($0); +rule id : !identifier = "name" => tostring($0); -rule : !type = +rule type : !type = "int" => `.int` | "float" => `.float` | "string" => `.string` ; -rule : !value = +rule value : !value = "10" => @parseInt($0) | "3.14" => @parseFloat($0) | "\"nice\"" => @parseStringLiteral($0) @@ -49,11 +49,11 @@ rule : !value = # Unions have can only have a single option active at a time -node TLDeclaration = union - ns : !namespace, - interface : !interface, - module : !module, -; +# node TLDeclaration = union +# ns : !namespace, +# interface : !interface, +# module : !module, +# ; rule toplevel-decl : !TLDeclaration = => ns: $0 # this is syntax for a union field selector as unions are not compounds diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 30b0a2a..7dff596 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -10,11 +10,11 @@ rule top_level = | ; -# rule start_decl = "start" $rule_ref ";" ; +rule start_decl = "start" $rule_ref ";" ; -# rule token_decl = "token" $identifier "=" ";" ; +rule token_decl = "token" $identifier "=" ";" ; -# rule node_decl = "node" $identifier "=" ";" ; +rule node_decl = "node" $identifier "=" ";" ; rule rule_decl = "rule" $identifier ( ":" )? "=" ";" ; @@ -40,12 +40,13 @@ rule mapping = ; rule mapped_value = - # { field = , field = , ... } - | # { , , ... } - | $code_literal # `code` - | $value_ref # $0 - | $userval "(" ")" # @func(...) 
- | $userval # @value + # { field = , field = , ... } + | # { , , ... } + | $code_literal # `code` + | $value_ref # $0 + | $identifier "(" ")" # builtin(...) + | $userval "(" ")" # @func(...) + | $userval # @value ; rule struct_ctor = @@ -56,8 +57,8 @@ rule assign_field = $identifier "=" $mapped_value ; -rule list_ctor = "{" "}"; +rule list_ctor = "{" ( )? "}"; rule value_list = ( "," )* -; \ No newline at end of file +; diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index 5b7c715..e62d664 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -155,9 +155,9 @@ pub const Pattern = union(enum) { pub const TypeSpec = union(enum) { reference: NodeRef, // !type literal: CodeLiteral, // literal `bool` - custom: CodeLiteral, // custom `Custom` - @"struct": CompoundType, // struct - @"union": CompoundType, // union + custom: UserDefinedIdentifier, // custom `Custom` + record: CompoundType, // struct + variant: CompoundType, // union }; pub const CompoundType = struct { diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index c554abf..6b2451b 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -64,9 +64,13 @@ const AstPrinter = struct { } fn dumpAstType(printer: AstPrinter, typespec: ast.TypeSpec) void { - _ = printer; - _ = typespec; - print("", .{}); + switch (typespec) { + .reference => |ref| print("!{}", .{printer.fmtId(ref.identifier)}), + .literal => |lit| print("literal `{s}`", .{printer.strings.get(lit.value)}), + .custom => @panic("not done yet"), + .record => @panic("not done yet"), + .variant => @panic("not done yet"), + } } fn dumpMappedProd(printer: AstPrinter, mapped_prod: ast.MappedProduction) void { diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 852355a..ef45197 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -84,13 +84,13 @@ pub const TokenType = enum { // keywords node, - @"struct", + record, + variant, optional, start, rule, token, - literal, custom, regex, skip, @@ -612,6 +612,18 @@ const Parser = struct { parser.traceEnterRule(@src()); defer parser.popTrace(); + if (parser.acceptCodeLiteral()) |code| { + return .{ .literal = code }; + } else |err| try filterAcceptError(err); + + if (parser.acceptUserReference()) |ref| { + return .{ .custom = ref }; + } else |err| try filterAcceptError(err); + + if (parser.acceptNodeReference(.fail)) |ref| { + return .{ .reference = ref }; + } else |err| try filterAcceptError(err); + @panic("not implemented yet"); } @@ -936,16 +948,27 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.line_comment, match.sequenceOf(.{ match.literal("#"), match.takeNoneOf("\r\n") })), Pattern.create(.node, match.word("node")), - Pattern.create(.@"struct", match.word("struct")), + Pattern.create(.record, match.word("record")), + Pattern.create(.variant, match.word("variant")), Pattern.create(.optional, match.word("optional")), Pattern.create(.start, match.word("start")), Pattern.create(.rule, match.word("rule")), Pattern.create(.token, match.word("token")), - Pattern.create(.literal, match.word("literal")), Pattern.create(.custom, match.word("custom")), Pattern.create(.regex, match.word("regex")), Pattern.create(.skip, match.word("skip")), + Pattern.create(.string_literal, matchStringLiteral), + Pattern.create(.code_literal, matchCodeLiteral), + + // identifiers must come after keywords: + Pattern.create(.identifier, matchRawIdentifier), + Pattern.create(.node_ref, matchNodeRef), + Pattern.create(.rule_ref, matchRuleRef), + Pattern.create(.token_ref, matchTokenRef), + 
Pattern.create(.value_ref, matchValueRef), + Pattern.create(.userval_ref, matchBuiltinRef), + Pattern.create(.@"=>", match.literal("=>")), Pattern.create(.@"=", match.literal("=")), @@ -965,17 +988,6 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.@"{", match.literal("{")), Pattern.create(.@"}", match.literal("}")), - Pattern.create(.string_literal, matchStringLiteral), - Pattern.create(.code_literal, matchCodeLiteral), - - // identifiers must come after keywords: - Pattern.create(.identifier, matchRawIdentifier), - Pattern.create(.node_ref, matchNodeRef), - Pattern.create(.rule_ref, matchRuleRef), - Pattern.create(.token_ref, matchTokenRef), - Pattern.create(.value_ref, matchValueRef), - Pattern.create(.userval_ref, matchBuiltinRef), - // Whitespace is the "kitchen sink" at the end: Pattern.create(.whitespace, match.takeAnyOf(" \r\n\t")), }); diff --git a/test/parser/accept/rule-typespec-custom.ptk b/test/parser/accept/rule-typespec-custom.ptk new file mode 100644 index 0000000..3df8de4 --- /dev/null +++ b/test/parser/accept/rule-typespec-custom.ptk @@ -0,0 +1 @@ +rule r : @Point = ""; \ No newline at end of file diff --git a/test/parser/accept/rule-typespec-literal.ptk b/test/parser/accept/rule-typespec-literal.ptk new file mode 100644 index 0000000..7a700d7 --- /dev/null +++ b/test/parser/accept/rule-typespec-literal.ptk @@ -0,0 +1 @@ +rule r : `bool` = ""; \ No newline at end of file diff --git a/test/parser/accept/rule-typespec-ref.ptk b/test/parser/accept/rule-typespec-ref.ptk new file mode 100644 index 0000000..1af0072 --- /dev/null +++ b/test/parser/accept/rule-typespec-ref.ptk @@ -0,0 +1 @@ +rule r : !farpointer = ""; \ No newline at end of file From ce25d3c6a0297504f52969e49ff797d80c199838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 6 Nov 2023 10:59:20 +0100 Subject: [PATCH 12/20] Adds declaration of nodes, and parsing of compound types. 
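For reference, the node declaration forms covered by the new accept tests
(each example is its own file under test/parser/accept/; trailing comments
added here for readability):

    node Alias = !OtherType;              # alias of another node
    node String = `[]const u8`;           # literal Zig type
    node String = @StringIdentifier;      # custom user-defined type
    node Struct = record field: `bool`;   # single-field record

A multi-field compound may span several lines:

    node Struct = variant
        x: `i32`,
        y: `i32`,
        z: `i32`,
        location: !Location
    ;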
--- build.zig | 12 +- examples/ptkgen/ast-with-unions.ptk | 28 ++-- src/ptkgen/Diagnostics.zig | 40 ++++-- src/ptkgen/ast_dump.zig | 33 ++++- src/ptkgen/intl/en.json | 27 ++-- src/ptkgen/parser.zig | 136 ++++++++++++++++-- test/parser/accept/node-alias.ptk | 1 + test/parser/accept/node-custom.ptk | 1 + test/parser/accept/node-literal.ptk | 1 + test/parser/accept/node-record-f1.ptk | 1 + test/parser/accept/node-record-f4.ptk | 6 + test/parser/accept/node-variant-f1.ptk | 1 + test/parser/accept/node-variant-f4.ptk | 6 + .../reject/bad-mapping-invalid-token.ptk | 2 +- 14 files changed, 242 insertions(+), 53 deletions(-) create mode 100644 test/parser/accept/node-alias.ptk create mode 100644 test/parser/accept/node-custom.ptk create mode 100644 test/parser/accept/node-literal.ptk create mode 100644 test/parser/accept/node-record-f1.ptk create mode 100644 test/parser/accept/node-record-f4.ptk create mode 100644 test/parser/accept/node-variant-f1.ptk create mode 100644 test/parser/accept/node-variant-f4.ptk diff --git a/build.zig b/build.zig index f8528cd..562412c 100644 --- a/build.zig +++ b/build.zig @@ -126,6 +126,8 @@ const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-rep_one-many-item.ptk", "test/analysis/accept/match-rep_one-many-sequence.ptk", "test/analysis/accept/match-rep_one-nested.ptk", + + "examples/ptkgen/ast-with-unions.ptk", } ++ example_files; const parser_ok_files = [_][]const u8{ @@ -175,7 +177,15 @@ const parser_ok_files = [_][]const u8{ "test/parser/accept/rule-typespec-ref.ptk", "test/parser/accept/rule-typespec-literal.ptk", - // "examples/ptkgen/ast-with-unions.ptk", // TODO: Move to examples + "test/parser/accept/node-alias.ptk", + "test/parser/accept/node-custom.ptk", + "test/parser/accept/node-literal.ptk", + + "test/parser/accept/node-record-f1.ptk", + "test/parser/accept/node-record-f4.ptk", + + "test/parser/accept/node-variant-f4.ptk", + "test/parser/accept/node-variant-f1.ptk", } ++ analyis_ok_files; const parser_reject_files = [_][]const u8{ diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index 1e8f226..9435a51 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -4,16 +4,16 @@ # var name: type = value; # const name: type = value; -# node declaration = struct -# is_const: `bool`, -# name: !identifier, -# type: optional !type, -# value: !value -# ; +node declaration = record + is_const: `bool`, + name: !identifier, + # type: optional !type, + value: !value +; -# node identifier = `[]const u8`; -# node type = @TypeId; # enum { int, float, string } -# node value = @Value; +node identifier = `[]const u8`; +node type = @TypeId; # enum { int, float, string } +node value = @Value; start ; @@ -49,11 +49,11 @@ rule value : !value = # Unions have can only have a single option active at a time -# node TLDeclaration = union -# ns : !namespace, -# interface : !interface, -# module : !module, -# ; +node TLDeclaration = variant + ns : !namespace, + interface : !interface, + module : !module +; rule toplevel-decl : !TLDeclaration = => ns: $0 # this is syntax for a union field selector as unions are not compounds diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index e9de60f..ecccd61 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -29,11 +29,14 @@ pub const Code = enum(u16) { excess_tokens = 1107, unexpected_toplevel_token = 1108, unexpected_token_no_context = 1109, + unexpected_token_type_spec = 1110, + unexpected_token_mapping = 1111, 
// recoverable syntax errors: illegal_empty_group = 1200, empty_mapping = 1201, integer_overflow = 1202, + empty_typespec = 1203, comptime { std.debug.assert(first_error < first_warning); @@ -58,6 +61,11 @@ pub const Code = enum(u16) { }; const NoDiagnosticData = struct {}; + +const UnexpectedTokenMessage = struct { + actual: parser.Token, +}; + pub fn Data(comptime code: Code) type { return switch (code) { .out_of_memory => NoDiagnosticData, @@ -69,16 +77,14 @@ pub fn Data(comptime code: Code) type { }, .unexpected_token => struct { expected_type: parser.TokenType, - actual_type: parser.TokenType, - actual_text: []const u8, - }, - .unexpected_toplevel_token => struct { - actual_type: parser.TokenType, - actual_text: []const u8, - }, - .unexpected_token_no_context => struct { - actual_type: parser.TokenType, + actual: parser.Token, }, + + .unexpected_toplevel_token => UnexpectedTokenMessage, + .unexpected_token_no_context => UnexpectedTokenMessage, + .unexpected_token_type_spec => UnexpectedTokenMessage, + .unexpected_token_mapping => UnexpectedTokenMessage, + .unexpected_eof => NoDiagnosticData, .invalid_source_encoding => NoDiagnosticData, @@ -97,6 +103,8 @@ pub fn Data(comptime code: Code) type { actual: []const u8, }, + .empty_typespec => NoDiagnosticData, + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; } @@ -173,7 +181,7 @@ fn Formatter(comptime T: type) type { // enums: parser.TokenType => struct { - value: T, + value: parser.TokenType, pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { _ = options; _ = fmt; @@ -181,6 +189,18 @@ fn Formatter(comptime T: type) type { } }, + parser.Token => struct { + value: parser.Token, + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + try writer.print("{s} ('{}')", .{ + @tagName(item.value.type), + std.zig.fmtEscapes(item.value.text), + }); + } + }, + intl.FormattableError => struct { value: T, diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 6b2451b..eabfa11 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -51,7 +51,8 @@ const AstPrinter = struct { }, .node => |node| { - print("node {s}", .{printer.fmtId(node.name.value)}); + print("node {s} = ", .{printer.fmtId(node.name.value)}); + printer.dumpAstType(node.value); print(";\n", .{}); }, @@ -66,10 +67,32 @@ const AstPrinter = struct { fn dumpAstType(printer: AstPrinter, typespec: ast.TypeSpec) void { switch (typespec) { .reference => |ref| print("!{}", .{printer.fmtId(ref.identifier)}), - .literal => |lit| print("literal `{s}`", .{printer.strings.get(lit.value)}), - .custom => @panic("not done yet"), - .record => @panic("not done yet"), - .variant => @panic("not done yet"), + .literal => |lit| print("`{s}`", .{printer.strings.get(lit.value)}), + .custom => |custom| print("@{}", .{printer.fmtId(custom.value)}), + .record, .variant => |compound| { + const multi_field = compound.fields.len() > 1; + + print("{s} ", .{@tagName(typespec)}); + var iter = ast.iterate(compound.fields); + + if (multi_field) { + var line_prefix: []const u8 = "\n "; + while (iter.next()) |field| { + print("{s}{}: ", .{ line_prefix, printer.fmtId(field.name.value) }); + printer.dumpAstType(field.type); + + if (multi_field) { + line_prefix = ",\n "; + } + } + print("\n", .{}); + } else { + const field = iter.next().?; + + print("{}: ", .{printer.fmtId(field.name.value)}); + 
printer.dumpAstType(field.type); + } + }, } } diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index dd32bbe..0275d39 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -4,18 +4,21 @@ "file_limit_exceeded": "Input file exceeds maximum file size", "io_error": "I/O error: {[error_code]}", "invalid_source_encoding": "Invalid source code encoding detected", - "unexpected_token_eof": "Expected token {[expected_type]}, but end of file was discovered", - "unexpected_token": "Expected token {[expected_type]}, but discovered token {[actual_type]} ('{[actual_text]}')", - "unexpected_toplevel_token": "Expected token 'start', 'rule', 'node' or 'token', but discovered token {[actual_type]} ('{[actual_text]}')", - "unexpected_character": "Unexpected character: '{[character]}'", - "unexpected_eof": "Unexpected end of file", - "bad_string_escape": "Invalid string escape: Escape sequence at the end of string", - "invalid_string_escape": "Invalid string escape \\{[escape]}", - "excess_tokens": "Excess token at the end of the file: {[token_type]}", - "illegal_empty_group": "Production sequence may not be empty", - "unexpected_token_no_context": "Unexpected token '{[actual_type]}'", - "empty_mapping": "Empty mappings are not allowed", - "integer_overflow": "Integer value {[actual]} out of range. Values must be between {[min]} and {[max]}" + "bad_string_escape": "Invalid string escape: Escape sequence at the end of string.", + "invalid_string_escape": "Invalid string escape '\\{[escape]}'.", + "excess_tokens": "Excess token at the end of the file: {[token_type]}-", + "illegal_empty_group": "Production sequence may not be empty.", + "integer_overflow": "Integer value {[actual]} out of range. Values must be between {[min]} and {[max]}.", + "empty_mapping": "Empty mappings are not allowed.", + "empty_typespec": "A type specifier is missing.", + "unexpected_token_eof": "Expected a token of type '{[expected_type]}', but the end of file was discovered.", + "unexpected_token": "Expected a token of type '{[expected_type]}', but found token {[actual]}.", + "unexpected_character": "Unexpected character '{[character]}' found.", + "unexpected_eof": "Unexpected end of file.", + "unexpected_toplevel_token": "Expected a top level declaration ('start', 'rule', 'node' or 'token'), but found token {[actual]}", + "unexpected_token_no_context": "Unexpected token '{[actual]}'.", + "unexpected_token_type_spec": "Expected a type specifier, but found token '{[actual]}'.", + "unexpected_token_mapping": "Expected an AST mapping, but found token '{[actual]}'." 
}, "errors": { "Unexpected": "unexpected error encountered", diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index ef45197..e33e692 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -6,6 +6,8 @@ const Diagnostics = @import("Diagnostics.zig"); const fmtEscapes = std.zig.fmtEscapes; +const BAD_TYPE_SPEC: ast.TypeSpec = undefined; + pub const Document = struct { arena: std.heap.ArenaAllocator, file_name: []const u8, @@ -180,12 +182,15 @@ const Parser = struct { return .{ .rule = rule }; } else |err| try filterAcceptError(err); + if (parser.acceptNode()) |node| { + return .{ .node = node }; + } else |err| try filterAcceptError(err); + // Detect any excess tokens on the top level: if (parser.core.nextToken()) |maybe_token| { if (maybe_token) |token| { try parser.emitDiagnostic(token.location, .unexpected_toplevel_token, .{ - .actual_type = token.type, - .actual_text = token.text, + .actual = token, }); return error.SyntaxError; } else { @@ -214,6 +219,29 @@ const Parser = struct { return init_rule; } + fn acceptNode(parser: *Parser) AcceptError!ast.Node { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + var state = parser.save(); + errdefer parser.restore(state); + + try parser.acceptLiteral(.node, .recover); + + const identifier = try parser.acceptIdentifier(.fail); + + try parser.acceptLiteral(.@"=", .fail); + + const value = try parser.acceptTypeSpec(); + + try parser.acceptLiteral(.@";", .fail); + + return .{ + .name = identifier, + .value = value, + }; + } + fn acceptRule(parser: *Parser) AcceptError!ast.Rule { parser.traceEnterRule(@src()); defer parser.popTrace(); @@ -381,7 +409,9 @@ const Parser = struct { switch (accept_mode) { .recover => return error.UnexpectedTokenRecoverable, - .fail => return parser.emitUnexpectedToken(), + .fail => return parser.emitUnexpectedToken(.{ + .unexpected_token = .unexpected_token_mapping, + }), } } @@ -612,6 +642,11 @@ const Parser = struct { parser.traceEnterRule(@src()); defer parser.popTrace(); + const list_state = parser.save(); + errdefer parser.restore(list_state); + + const position = parser.core.tokenizer.current_location; + if (parser.acceptCodeLiteral()) |code| { return .{ .literal = code }; } else |err| try filterAcceptError(err); @@ -620,11 +655,86 @@ const Parser = struct { return .{ .custom = ref }; } else |err| try filterAcceptError(err); - if (parser.acceptNodeReference(.fail)) |ref| { + if (parser.acceptNodeReference(.recover)) |ref| { return .{ .reference = ref }; } else |err| try filterAcceptError(err); - @panic("not implemented yet"); + if (parser.acceptCompoundType(.record)) |record| { + return .{ .record = record }; + } else |err| try filterAcceptError(err); + + if (parser.acceptCompoundType(.variant)) |variant| { + return .{ .variant = variant }; + } else |err| try filterAcceptError(err); + + if (try parser.tryAcceptLiteral(.@";") or try parser.tryAcceptLiteral(.@"|") or try parser.tryAcceptLiteral(.@"=")) { + try parser.emitDiagnostic(position, .empty_typespec, .{}); + return BAD_TYPE_SPEC; + } + + // switch (accept_mode) { + // .recover => return error.UnexpectedTokenRecoverable, + // .fail => + return parser.emitUnexpectedToken(.{ + .unexpected_token = .unexpected_token_type_spec, + }); + // } + } + + fn acceptCompoundType(parser: *Parser, comptime designator: TokenType) AcceptError!ast.CompoundType { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + const list_state = parser.save(); + errdefer parser.restore(list_state); + + const current_location = 
parser.core.tokenizer.current_location; + + // we can recover "struct"/"record", afterwards you must follow the rules + try parser.acceptLiteral(designator, .recover); + + var fields = ast.List(ast.Field){}; + + while (true) { + const field = try parser.acceptField(); + + try parser.append(ast.Field, &fields, field); + + if (try parser.tryAcceptLiteral(.@",")) { + // Comma means we're having another field + continue; + } else { + // Otherwise, the list is over. + break; + } + } + + return .{ + .location = current_location, + .fields = fields, + }; + } + + fn acceptField(parser: *Parser) AcceptError!ast.Field { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + const list_state = parser.save(); + errdefer parser.restore(list_state); + + const current_location = parser.core.tokenizer.current_location; + + const name = try parser.acceptIdentifier(.fail); + + try parser.acceptLiteral(.@":", .fail); + + const type_spec = try parser.acceptTypeSpec(); + + return .{ + .location = current_location, + .name = name, + .type = type_spec, + }; } fn acceptStringLiteral(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.StringLiteral { @@ -716,8 +826,7 @@ const Parser = struct { .fail => { try parser.emitDiagnostic(location, .unexpected_token, .{ .expected_type = token_type, - .actual_type = token.type, - .actual_text = token.text, + .actual = token, }); return error.SyntaxError; }, @@ -799,7 +908,14 @@ const Parser = struct { try parser.diagnostics.emit(loc orelse parser.core.tokenizer.current_location, code, data); } - fn emitUnexpectedToken(parser: *Parser) AcceptError { + const UnexpectedTokenOptions = struct { + unexpected_token: Diagnostics.Code = .unexpected_token_no_context, + }; + fn emitUnexpectedToken(parser: *Parser, comptime opt: UnexpectedTokenOptions) AcceptError { + if (Diagnostics.Data(opt.unexpected_token) != Diagnostics.Data(.unexpected_token_no_context)) { + @compileError("Generic unexpected token must use the same type as 'unexpected_token_no_context' diagnostic."); + } + const state = parser.save(); defer parser.restore(state); @@ -818,8 +934,8 @@ const Parser = struct { return error.SyntaxError; }; - try parser.emitDiagnostic(location, .unexpected_token_no_context, .{ - .actual_type = token.type, + try parser.emitDiagnostic(location, opt.unexpected_token, .{ + .actual = token, }); return error.SyntaxError; } diff --git a/test/parser/accept/node-alias.ptk b/test/parser/accept/node-alias.ptk new file mode 100644 index 0000000..468dbc0 --- /dev/null +++ b/test/parser/accept/node-alias.ptk @@ -0,0 +1 @@ +node Alias = !OtherType; \ No newline at end of file diff --git a/test/parser/accept/node-custom.ptk b/test/parser/accept/node-custom.ptk new file mode 100644 index 0000000..da3a508 --- /dev/null +++ b/test/parser/accept/node-custom.ptk @@ -0,0 +1 @@ +node String = @StringIdentifier; \ No newline at end of file diff --git a/test/parser/accept/node-literal.ptk b/test/parser/accept/node-literal.ptk new file mode 100644 index 0000000..d2e3530 --- /dev/null +++ b/test/parser/accept/node-literal.ptk @@ -0,0 +1 @@ +node String = `[]const u8`; \ No newline at end of file diff --git a/test/parser/accept/node-record-f1.ptk b/test/parser/accept/node-record-f1.ptk new file mode 100644 index 0000000..8b8db7d --- /dev/null +++ b/test/parser/accept/node-record-f1.ptk @@ -0,0 +1 @@ +node Struct = record field: `bool`; \ No newline at end of file diff --git a/test/parser/accept/node-record-f4.ptk b/test/parser/accept/node-record-f4.ptk new file mode 100644 index 0000000..28b3356 
--- /dev/null +++ b/test/parser/accept/node-record-f4.ptk @@ -0,0 +1,6 @@ +node Struct = record + x: `i32`, + y: `i32`, + z: `i32`, + location: !Location +; \ No newline at end of file diff --git a/test/parser/accept/node-variant-f1.ptk b/test/parser/accept/node-variant-f1.ptk new file mode 100644 index 0000000..0f675d8 --- /dev/null +++ b/test/parser/accept/node-variant-f1.ptk @@ -0,0 +1 @@ +node Struct = variant field: `bool`; \ No newline at end of file diff --git a/test/parser/accept/node-variant-f4.ptk b/test/parser/accept/node-variant-f4.ptk new file mode 100644 index 0000000..e346aea --- /dev/null +++ b/test/parser/accept/node-variant-f4.ptk @@ -0,0 +1,6 @@ +node Struct = variant + x: `i32`, + y: `i32`, + z: `i32`, + location: !Location +; \ No newline at end of file diff --git a/test/parser/reject/bad-mapping-invalid-token.ptk b/test/parser/reject/bad-mapping-invalid-token.ptk index 5d783df..aada416 100644 --- a/test/parser/reject/bad-mapping-invalid-token.ptk +++ b/test/parser/reject/bad-mapping-invalid-token.ptk @@ -1,2 +1,2 @@ -# expected: E1109 +# expected: E1111 rule group = "value" => "bad" ; \ No newline at end of file From 808720823665735f6f4f19d460f9f95ddb73ddac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 6 Nov 2023 11:17:15 +0100 Subject: [PATCH 13/20] Adds more diagnostics and tests for those --- build.zig | 5 +++ src/ptkgen/Diagnostics.zig | 4 ++ src/ptkgen/intl/en.json | 6 ++- src/ptkgen/parser.zig | 41 +++++++++++++++++++-- test/parser/reject/bad-mapping-too-long.ptk | 2 +- test/parser/reject/node-no-type.ptk | 2 + test/parser/reject/rule-bad-prod.ptk | 2 + test/parser/reject/rule-no-type-no-prod.ptk | 2 + test/parser/reject/rule-no-type.ptk | 2 + 9 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 test/parser/reject/node-no-type.ptk create mode 100644 test/parser/reject/rule-bad-prod.ptk create mode 100644 test/parser/reject/rule-no-type-no-prod.ptk create mode 100644 test/parser/reject/rule-no-type.ptk diff --git a/build.zig b/build.zig index 562412c..d7e3bd2 100644 --- a/build.zig +++ b/build.zig @@ -200,4 +200,9 @@ const parser_reject_files = [_][]const u8{ "test/parser/reject/empty-mapping.ptk", "test/parser/reject/bad-mapping-invalid-token.ptk", "test/parser/reject/bad-mapping-too-long.ptk", + + "test/parser/reject/node-no-type.ptk", + "test/parser/reject/rule-no-type.ptk", + "test/parser/reject/rule-no-type-no-prod.ptk", + "test/parser/reject/rule-bad-prod.ptk", }; diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index ecccd61..014542a 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -31,6 +31,8 @@ pub const Code = enum(u16) { unexpected_token_no_context = 1109, unexpected_token_type_spec = 1110, unexpected_token_mapping = 1111, + unexpected_token_production_list = 1112, + unexpected_token_production = 1113, // recoverable syntax errors: illegal_empty_group = 1200, @@ -84,6 +86,8 @@ pub fn Data(comptime code: Code) type { .unexpected_token_no_context => UnexpectedTokenMessage, .unexpected_token_type_spec => UnexpectedTokenMessage, .unexpected_token_mapping => UnexpectedTokenMessage, + .unexpected_token_production_list => UnexpectedTokenMessage, + .unexpected_token_production => UnexpectedTokenMessage, .unexpected_eof => NoDiagnosticData, diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 0275d39..360b5c1 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -17,8 +17,10 @@ "unexpected_eof": "Unexpected end of file.", 
"unexpected_toplevel_token": "Expected a top level declaration ('start', 'rule', 'node' or 'token'), but found token {[actual]}", "unexpected_token_no_context": "Unexpected token '{[actual]}'.", - "unexpected_token_type_spec": "Expected a type specifier, but found token '{[actual]}'.", - "unexpected_token_mapping": "Expected an AST mapping, but found token '{[actual]}'." + "unexpected_token_type_spec": "Expected a type specifier, but found '{[actual]}'.", + "unexpected_token_mapping": "Expected an AST mapping, but found '{[actual]}'.", + "unexpected_token_production_list": "Expected ';' or '|', but found '{[actual]}'.", + "unexpected_token_production": "Expected a production, but found '{[actual]}'." }, "errors": { "Unexpected": "unexpected error encountered", diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index e33e692..0cf12fc 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -267,10 +267,20 @@ const Parser = struct { try parser.append(ast.MappedProduction, &list, production); - // TODO: Improve error reporting here + // if a semicolon follows, we're done if (try parser.tryAcceptLiteral(.@";")) { break; } + // if a pipe follows, we got more rules + else if (try parser.tryAcceptLiteral(.@"|")) { + continue; + } + // otherwise, it's a syntax error: + else { + return parser.emitUnexpectedToken(.{ + .unexpected_token = .unexpected_token_production_list, + }); + } try parser.acceptLiteral(.@"|", .fail); } @@ -309,11 +319,28 @@ const Parser = struct { var list: ast.List(ast.Production) = .{}; - while (true) { + sequence_loop: while (true) { if (parser.acceptProduction()) |prod| { try parser.append(ast.Production, &list, prod); } else |err| switch (err) { - error.UnexpectedTokenRecoverable => break, + error.UnexpectedTokenRecoverable => { + // we couldn't accept a production, so let's see if we're in a legal state here: + + const seekahead_reset = parser.save(); + + // all of the following might allow to terminate a list: + inline for (.{ .@")", .@";", .@"=>", .@"|" }) |legal_terminator| { + if (try parser.tryAcceptLiteral(legal_terminator)) { + // All of the above tokens + parser.restore(seekahead_reset); + break :sequence_loop; + } + } + + return parser.emitUnexpectedToken(.{ + .unexpected_token = .unexpected_token_production, + }); + }, error.OutOfMemory, error.InvalidSourceEncoding, error.SyntaxError => |e| return e, } } @@ -667,8 +694,14 @@ const Parser = struct { return .{ .variant = variant }; } else |err| try filterAcceptError(err); + const contiuation_pos = parser.save(); if (try parser.tryAcceptLiteral(.@";") or try parser.tryAcceptLiteral(.@"|") or try parser.tryAcceptLiteral(.@"=")) { try parser.emitDiagnostic(position, .empty_typespec, .{}); + + // restore the previous position, we just seeked a bit forward to make better + // errors here: + parser.restore(contiuation_pos); + return BAD_TYPE_SPEC; } @@ -909,7 +942,7 @@ const Parser = struct { } const UnexpectedTokenOptions = struct { - unexpected_token: Diagnostics.Code = .unexpected_token_no_context, + unexpected_token: Diagnostics.Code, }; fn emitUnexpectedToken(parser: *Parser, comptime opt: UnexpectedTokenOptions) AcceptError { if (Diagnostics.Data(opt.unexpected_token) != Diagnostics.Data(.unexpected_token_no_context)) { diff --git a/test/parser/reject/bad-mapping-too-long.ptk b/test/parser/reject/bad-mapping-too-long.ptk index 1ecf764..057dcd5 100644 --- a/test/parser/reject/bad-mapping-too-long.ptk +++ b/test/parser/reject/bad-mapping-too-long.ptk @@ -1,2 +1,2 @@ -# expected: E1102 +# expected: 
E1112 rule group = "value" => $0 whatever ; \ No newline at end of file diff --git a/test/parser/reject/node-no-type.ptk b/test/parser/reject/node-no-type.ptk new file mode 100644 index 0000000..9a6b774 --- /dev/null +++ b/test/parser/reject/node-no-type.ptk @@ -0,0 +1,2 @@ +# expected: E1203 +node foo = ; \ No newline at end of file diff --git a/test/parser/reject/rule-bad-prod.ptk b/test/parser/reject/rule-bad-prod.ptk new file mode 100644 index 0000000..f5bf832 --- /dev/null +++ b/test/parser/reject/rule-bad-prod.ptk @@ -0,0 +1,2 @@ +# expected: E1113 +rule foo = `illegal here`; \ No newline at end of file diff --git a/test/parser/reject/rule-no-type-no-prod.ptk b/test/parser/reject/rule-no-type-no-prod.ptk new file mode 100644 index 0000000..bbd4401 --- /dev/null +++ b/test/parser/reject/rule-no-type-no-prod.ptk @@ -0,0 +1,2 @@ +# expected: E1203, E1200 +rule foo : = ; \ No newline at end of file diff --git a/test/parser/reject/rule-no-type.ptk b/test/parser/reject/rule-no-type.ptk new file mode 100644 index 0000000..6ab328d --- /dev/null +++ b/test/parser/reject/rule-no-type.ptk @@ -0,0 +1,2 @@ +# expected: E1203 +rule foo : = "code"; \ No newline at end of file From a2ffdf4a6f07c65cffe161858273be4155bf24cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 6 Nov 2023 14:30:06 +0100 Subject: [PATCH 14/20] Implements some tests for string escaping. --- docs/grammar.md | 5 ++- examples/ptkgen/grammar.ptk | 2 +- src/ptkgen/Diagnostics.zig | 7 ++- src/ptkgen/parser.zig | 86 ++++++++++++++++++++++++++++++++++++- 4 files changed, 91 insertions(+), 9 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index 031d096..0d3d2b5 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -36,5 +36,6 @@ union # constructs a type for alternatives, here with two variants: - `\'` => single quote (0x27) - `\"` => double quote (0x22) - `\\` => back slash (0x5C) -- `\u????` => UTF-16 -- `\U????????` => UTF-32 +- `\u{????}` => UTF-8 encoded codepoint + + diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 7dff596..6cdf958 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -49,7 +49,7 @@ rule mapped_value = | $userval # @value ; -rule struct_ctor = +rule record_ctor = "{" ( "," )* "}" ; diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 014542a..4514708 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -12,13 +12,12 @@ pub const Code = enum(u16) { pub const first_note = 8000; pub const last_item = 10000; - // generic failures: + // generic failures (1000-1099): out_of_memory = 1000, file_limit_exceeded = 1001, io_error = 1002, - // non-recoverable syntax errors: - + // non-recoverable syntax errors (1100-1199): invalid_source_encoding = 1100, unexpected_token_eof = 1101, unexpected_token = 1102, @@ -34,7 +33,7 @@ pub const Code = enum(u16) { unexpected_token_production_list = 1112, unexpected_token_production = 1113, - // recoverable syntax errors: + // recoverable syntax errors (1200-1299): illegal_empty_group = 1200, empty_mapping = 1201, integer_overflow = 1202, diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 0cf12fc..9803b79 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -1022,8 +1022,7 @@ const Parser = struct { '\\' => "\\", 'x' => @panic("Implement hex escape \\x??"), - 'u' => @panic("Implement utf-16 \\u????"), - 'U' => @panic("Implement utf-32 \\U????????"), + 'u' => @panic("Implement unicode utf-8 escapes \\u{????}"), 
'0'...'3' => @panic("Implement octal escape \\???"), @@ -1351,3 +1350,86 @@ test matchCodeLiteral { "```hello, world!``", }); } + +test "parser string literal" { + const Test = struct { + pub fn run(expected: []const u8, code: []const u8) !void { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + + var diag = Diagnostics.init(std.testing.allocator); + defer diag.deinit(); + + var strings = try ptk.strings.Pool.init(std.testing.allocator); + defer strings.deinit(); + + var tokenizer = Tokenizer.init(code, "unittest"); + + var parser = Parser{ + .diagnostics = &diag, + .pool = &strings, + .core = ParserCore.init(&tokenizer), + .arena = arena.allocator(), + .trace_enabled = false, + }; + + const literal = try parser.acceptStringLiteral(.fail); + + const actual = strings.get(literal.value); + + try std.testing.expectEqualStrings(expected, actual); + } + }; + + // Empty string: + try Test.run("", + \\"" + ); + + // Regular string + try Test.run("hello, world!", + \\"hello, world!" + ); + + // Validate escape sequences: + try Test.run("\r", + \\"\r" + ); + try Test.run("\n", + \\"\n" + ); + try Test.run("\\", + \\"\\" + ); + try Test.run("\"", + \\"\"" + ); + try Test.run("\"hello, world!\"", + \\"\"hello, world!\"" + ); + try Test.run("A\'B", + \\"A\'B" + ); + // TODO: enable those tests for escape sequences! + // try Test.run("\x34", + // \\"\x34" + // ); + // try Test.run("A\xFFB", + // \\"A\xFFB" + // ); + // try Test.run("\x10\x22", + // \\"\x10\x22" + // ); + // try Test.run("A\x1BB", + // \\"A\033B" + // ); + // try Test.run("A\xFFB", + // \\"A\377B" + // ); + // try Test.run("A\x01B", + // \\"A\001B" + // ); + // try Test.run("[\u{1F4A9}]", + // \\"[\u{1F4A9}]" + // ); +} From 490c537fbb9b4b66f3cd5d52cc4d14646dab6e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Tue, 7 Nov 2023 14:17:30 +0100 Subject: [PATCH 15/20] Implements start rule analysis. 
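
This adds src/ptkgen/sema.zig and a first analysis pass over the parsed
grammar: top-level declarations are collected into lookup tables (rules,
nodes, patterns), duplicate identifiers are reported as E1300..E1302, and
the start declaration is resolved against the declared rules, reporting
E1303 for a reference to an undeclared rule, E1306 for multiple start
symbols, and the new warning W4000 when no start symbol is declared.

A condensed sketch of how the new pass is wired into compileFile() in
src/ptkgen/main.zig (error handling and the surrounding test-mode checks
are elided here; the full version is in the diff below):

    // parse first, then hand the AST to the semantic analyzer
    var grammar = try sema.analyze(
        allocator,
        diagnostics,
        string_pool,
        tree.top_level_declarations,
    );
    defer grammar.deinit();
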
--- build.zig | 28 +- examples/ptkgen/grammar.ptk | 4 +- src/ptkgen/Diagnostics.zig | 125 +++++- src/ptkgen/ast.zig | 29 +- src/ptkgen/ast_dump.zig | 12 +- src/ptkgen/intl.zig | 4 +- src/ptkgen/intl/en.json | 13 +- src/ptkgen/main.zig | 51 ++- src/ptkgen/parser.zig | 13 +- src/ptkgen/sema.zig | 368 ++++++++++++++++++ .../accept/expect-warn-missing-start.ptk | 1 + test/analysis/accept/start-decl.ptk | 7 + test/analysis/reject/duplicate-node.ptk | 3 + test/analysis/reject/duplicate-pattern.ptk | 3 + test/analysis/reject/duplicate-rule.ptk | 3 + test/analysis/reject/duplicate-start.ptk | 10 + .../reject/duplicate-undeclared-start.ptk | 8 + test/analysis/reject/undeclared-start.ptk | 2 + 18 files changed, 624 insertions(+), 60 deletions(-) create mode 100644 src/ptkgen/sema.zig create mode 100644 test/analysis/accept/expect-warn-missing-start.ptk create mode 100644 test/analysis/accept/start-decl.ptk create mode 100644 test/analysis/reject/duplicate-node.ptk create mode 100644 test/analysis/reject/duplicate-pattern.ptk create mode 100644 test/analysis/reject/duplicate-rule.ptk create mode 100644 test/analysis/reject/duplicate-start.ptk create mode 100644 test/analysis/reject/duplicate-undeclared-start.ptk create mode 100644 test/analysis/reject/undeclared-start.ptk diff --git a/build.zig b/build.zig index d7e3bd2..c8d2c43 100644 --- a/build.zig +++ b/build.zig @@ -68,16 +68,17 @@ pub fn build(b: *std.build.Builder) void { test_step.dependOn(&b.addRunArtifact(ptkgen_tests).step); // Integration tests for ptkgen: - for (parser_ok_files) |file| { + for (parser_accept_files ++ parser_reject_files) |file| { const run = b.addRunArtifact(ptkdef_exe); run.addArg("--test_mode=parse_only"); run.addFileArg(.{ .path = file }); test_step.dependOn(&run.step); } - for (parser_reject_files) |file| { + // Integration tests for ptkgen: + for (analyis_accept_files ++ analyis_reject_files) |file| { const run = b.addRunArtifact(ptkdef_exe); - run.addArg("--test_mode=parse_only"); + run.addArg("--test_mode=no_codegen"); run.addFileArg(.{ .path = file }); test_step.dependOn(&run.step); } @@ -99,9 +100,10 @@ pub fn build(b: *std.build.Builder) void { const example_files = [_][]const u8{ "/home/felix/projects/parser-toolkit/examples/ptkgen/grammar.ptk", + "examples/ptkgen/ast-with-unions.ptk", }; -const analyis_ok_files = [_][]const u8{ +const analyis_accept_files = [_][]const u8{ "test/analysis/accept/match-literal-rule.ptk", "test/analysis/accept/match-literal-sequence.ptk", "test/analysis/accept/match-literal-variants.ptk", @@ -127,10 +129,22 @@ const analyis_ok_files = [_][]const u8{ "test/analysis/accept/match-rep_one-many-sequence.ptk", "test/analysis/accept/match-rep_one-nested.ptk", - "examples/ptkgen/ast-with-unions.ptk", + "test/analysis/accept/start-decl.ptk", } ++ example_files; -const parser_ok_files = [_][]const u8{ +const analyis_reject_files = [_][]const u8{ + "test/analysis/reject/duplicate-node.ptk", + // "test/analysis/reject/duplicate-pattern.ptk", // TODO: Implement pattern support in parser + "test/analysis/reject/duplicate-rule.ptk", + + "test/analysis/accept/expect-warn-missing-start.ptk", + + "test/analysis/reject/undeclared-start.ptk", + "test/analysis/reject/duplicate-undeclared-start.ptk", + "test/analysis/reject/duplicate-start.ptk", +}; + +const parser_accept_files = [_][]const u8{ "test/parser/accept/empty.ptk", "test/parser/accept/empty-with-comment-linefeed.ptk", "test/parser/accept/empty-with-comment.ptk", @@ -186,7 +200,7 @@ const parser_ok_files = [_][]const u8{ 
"test/parser/accept/node-variant-f4.ptk", "test/parser/accept/node-variant-f1.ptk", -} ++ analyis_ok_files; +} ++ analyis_accept_files; const parser_reject_files = [_][]const u8{ "test/parser/reject/empty-rule.ptk", diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 6cdf958..bc098d1 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -5,14 +5,14 @@ rule document = ( )* ; rule top_level = - | + | | | ; rule start_decl = "start" $rule_ref ";" ; -rule token_decl = "token" $identifier "=" ";" ; +rule pattern_decl = "pattern" $identifier "=" ";" ; rule node_decl = "node" $identifier "=" ";" ; diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 4514708..7e37483 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -10,7 +10,7 @@ pub const Code = enum(u16) { pub const first_error = 1000; pub const first_warning = 4000; pub const first_note = 8000; - pub const last_item = 10000; + pub const last_item = 9999; // generic failures (1000-1099): out_of_memory = 1000, @@ -39,12 +39,56 @@ pub const Code = enum(u16) { integer_overflow = 1202, empty_typespec = 1203, + // semantic errors (1300-1399): + + duplicate_identifier_rule = 1300, + duplicate_identifier_node = 1301, + duplicate_identifier_pattern = 1302, + + reference_to_undeclared_rule = 1303, + reference_to_undeclared_node = 1304, + reference_to_undeclared_pattern = 1305, + + multiple_start_symbols = 1306, + + // semantic warnings (4000-4099): + + missing_start_symbol = 4000, + comptime { std.debug.assert(first_error < first_warning); std.debug.assert(first_warning < first_note); std.debug.assert(first_note < last_item); } + const max_item_len = blk: { + var len = 0; + for (@typeInfo(Code).Enum.fields) |fld| { + len = @max(len, fld.name); + } + break :blk len; + }; + + const code_strings = blk: { + @setEvalBranchQuota(10_000); + var map = std.EnumArray(Code, []const u8).initUndefined(); + + for (std.enums.values(Code)) |code| { + const tag = @tagName(code); + + // perform kebab conversion: + var buf: [tag.len]u8 = tag[0..tag.len].*; + for (&buf) |*c| { + if (c.* == '_') + c.* = '-'; + } + + map.set(code, &buf); + } + + break :blk map; + }; + pub fn isError(code: Code) bool { const int = @intFromEnum(code); return @intFromEnum(code) >= first_error and int < first_warning; @@ -59,6 +103,45 @@ pub const Code = enum(u16) { const int = @intFromEnum(code); return int >= first_note and int < last_item; } + + pub fn parse(string: []const u8) error{ + /// Format is not recognized + InvalidFormat, + /// Numeric error code is out of range. + OutOfRange, + /// Numeric error code does not exist. 
+ InvalidId, + }!Code { + if (string.len == 0 or (string[0] != 'E' and string[0] != 'W' and string[0] != 'D')) + return error.InvalidFormat; + const id = std.fmt.parseInt(u16, string[1..], 10) catch |err| switch (err) { + error.InvalidCharacter => return error.InvalidFormat, + error.Overflow => return error.OutOfRange, + }; + if (id > last_item) + return error.OutOfRange; + return std.meta.intToEnum(Diagnostics.Code, id) catch return error.InvalidId; + } + + pub fn format(code: Code, comptime fmt: []const u8, opt: std.fmt.FormatOptions, writer: anytype) !void { + _ = opt; + + if (comptime std.mem.eql(u8, fmt, "d")) { + const code_prefix = if (code.isError()) + "E" + else if (code.isWarning()) + "W" + else + "D"; + + try writer.print("{s}{d:0>4}", .{ code_prefix, @intFromEnum(code) }); + } else if (comptime std.mem.eql(u8, fmt, "s")) { + try writer.writeAll(code_strings.get(code)); + } else { + @compileError("Code fmt must be {s} (string variant) or {d} (numeric variant)!"); + } + // + } }; const NoDiagnosticData = struct {}; @@ -67,6 +150,14 @@ const UnexpectedTokenMessage = struct { actual: parser.Token, }; +const DuplicateIdentifier = struct { + identifier: []const u8, + previous_location: ptk.Location, +}; +const UndeclaredIdentifier = struct { + identifier: []const u8 +}; + pub fn Data(comptime code: Code) type { return switch (code) { .out_of_memory => NoDiagnosticData, @@ -108,6 +199,21 @@ pub fn Data(comptime code: Code) type { .empty_typespec => NoDiagnosticData, + .duplicate_identifier_rule => DuplicateIdentifier, + .duplicate_identifier_node => DuplicateIdentifier, + .duplicate_identifier_pattern => DuplicateIdentifier, + + .reference_to_undeclared_rule => UndeclaredIdentifier, + .reference_to_undeclared_node => UndeclaredIdentifier, + .reference_to_undeclared_pattern => UndeclaredIdentifier, + + .multiple_start_symbols => struct { + identifier: []const u8, + previous_location: ptk.Location, + }, + + .missing_start_symbol => NoDiagnosticData, + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; } @@ -204,6 +310,15 @@ fn Formatter(comptime T: type) type { } }, + ptk.Location => struct { + value: ptk.Location, + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + try writer.print("{}", .{item.value}); + } + }, + intl.FormattableError => struct { value: T, @@ -284,13 +399,7 @@ pub fn emit(diag: *Diagnostics, location: ptk.Location, comptime code: Code, par const message_text = try std.fmt.allocPrint(stack_fallback_allocator, fmt_string, formatted_params); defer stack_fallback_allocator.free(message_text); - const code_prefix = switch (level) { - .@"error" => "E", - .warning => "W", - .info => "D", - }; - - try diag.inner.emit(location, level, "{s}{d:0>4}: {s}", .{ code_prefix, @intFromEnum(code), message_text }); + try diag.inner.emit(location, level, "{d}: {s}", .{ code, message_text }); try diag.codes.append(code); } diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index e62d664..f269552 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -32,10 +32,10 @@ pub fn Iterator(comptime T: type) type { return struct { node: ?*List(T).Node, - pub fn next(iter: *@This()) ?T { + pub fn next(iter: *@This()) ?*T { const current = iter.node orelse return null; iter.node = current.next; - return current.data; + return ¤t.data; } }; } @@ -73,12 +73,13 @@ pub const TopLevelDeclaration = union(enum) { start: RuleRef, rule: Rule, node: Node, - token: 
Token, + pattern: Pattern, }; pub const NodeRef = Reference(Node); // !mynode pub const RuleRef = Reference(Rule); // -pub const TokenRef = Reference(Token); // $mytoken +pub const PatternRef = Reference(Pattern); // $mytoken + pub const ValueRef = struct { // $0 location: Location, index: u32, @@ -95,9 +96,16 @@ pub const Rule = struct { // rule ( : )? = ...; productions: List(MappedProduction), // all alternatives of the rule }; -pub const Token = struct { // token = ...; +pub const Pattern = struct { // token = ...; name: Identifier, - pattern: Pattern, + pattern: Data, + + pub const Data = union(enum) { + literal: StringLiteral, // literal "+" + word: StringLiteral, // word "while" + regex: StringLiteral, // regex "string" + external: CodeLiteral, // custom `matchMe` + }; }; pub const MappedProduction = struct { // ... => value @@ -107,7 +115,7 @@ pub const MappedProduction = struct { // ... => value pub const Production = union(enum) { literal: StringLiteral, // "text" - terminal: TokenRef, // $token + terminal: PatternRef, // $token recursion: RuleRef, // sequence: List(Production), // ... optional: List(Production), // ( ... )? @@ -145,13 +153,6 @@ pub const FieldAssignment = struct { value: *AstMapping, }; -pub const Pattern = union(enum) { - literal: StringLiteral, // literal "+" - word: StringLiteral, // word "while" - regex: StringLiteral, // regex "string" - external: CodeLiteral, // custom `matchMe` -}; - pub const TypeSpec = union(enum) { reference: NodeRef, // !type literal: CodeLiteral, // literal `bool` diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index eabfa11..468d789 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -22,7 +22,7 @@ const AstPrinter = struct { var iter = ast.iterate(decls); while (iter.next()) |decl| { - switch (decl) { + switch (decl.*) { .start => |item| print("start <{}>;\n", .{printer.fmtId(item.identifier)}), .rule => |rule| { @@ -44,7 +44,7 @@ const AstPrinter = struct { } else { print(" ", .{}); } - printer.dumpMappedProd(prod); + printer.dumpMappedProd(prod.*); } print("\n;\n", .{}); @@ -56,8 +56,8 @@ const AstPrinter = struct { print(";\n", .{}); }, - .token => |token| { - print("token {s}", .{printer.fmtId(token.name.value)}); + .pattern => |pattern| { + print("pattern {s}", .{printer.fmtId(pattern.name.value)}); print(";\n", .{}); }, } @@ -117,7 +117,7 @@ const AstPrinter = struct { var iter = ast.iterate(seq); while (iter.next()) |item| { print(" ", .{}); - printer.dumpProd(item); + printer.dumpProd(item.*); } print(" )", .{}); @@ -199,7 +199,7 @@ const AstPrinter = struct { } first = false; - printer.dumpMapping(arg); + printer.dumpMapping(arg.*); } } diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig index 70af58f..13ff049 100644 --- a/src/ptkgen/intl.zig +++ b/src/ptkgen/intl.zig @@ -57,6 +57,9 @@ pub const Localization = struct { errors: struct { Unexpected: []const u8, + SyntaxError: []const u8, + SemanticError: []const u8, + OutOfMemory: []const u8, InputOutput: []const u8, @@ -72,7 +75,6 @@ pub const Localization = struct { NetNameDeleted: []const u8, FileTooBig: []const u8, - SyntaxError: []const u8, InvalidSourceEncoding: []const u8, }, diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 360b5c1..95db0b7 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -20,9 +20,19 @@ "unexpected_token_type_spec": "Expected a type specifier, but found '{[actual]}'.", "unexpected_token_mapping": "Expected an AST mapping, but found '{[actual]}'.", 
"unexpected_token_production_list": "Expected ';' or '|', but found '{[actual]}'.", - "unexpected_token_production": "Expected a production, but found '{[actual]}'." + "unexpected_token_production": "Expected a production, but found '{[actual]}'.", + "duplicate_identifier_rule": "Rule {[identifier]} already defined here: {[previous_location]}", + "duplicate_identifier_node": "Node {[identifier]} already defined here: {[previous_location]}", + "duplicate_identifier_pattern": "Pattern {[identifier]} already defined here: {[previous_location]}", + "reference_to_undeclared_rule": "Reference to undeclared rule {[identifier]}.", + "reference_to_undeclared_node": "Reference to undeclared node {[identifier]}.", + "reference_to_undeclared_pattern": "Reference to undeclared pattern {[identifier]}.", + "missing_start_symbol": "Grammar file has no start symbol declared.", + "multiple_start_symbols": "Another start rule ({[identifier]}) was already declared here: {[previous_location]}" }, "errors": { + "SyntaxError": "syntax error", + "SemanticError": "semantic error", "Unexpected": "unexpected error encountered", "OutOfMemory": "out of memory", "InputOutput": "input output", @@ -37,7 +47,6 @@ "NotOpenForReading": "not open for reading", "NetNameDeleted": "net name deleted", "FileTooBig": "Input file exceeds resources", - "SyntaxError": "syntax error", "InvalidSourceEncoding": "invalid source encoding" } } \ No newline at end of file diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index bfb273c..4384d44 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -7,6 +7,7 @@ const args_parser = @import("args"); const ptk = @import("parser-toolkit"); const ast = @import("ast.zig"); +const sema = @import("sema.zig"); const intl = @import("intl.zig"); const parser = @import("parser.zig"); const ast_dump = @import("ast_dump.zig"); @@ -52,6 +53,7 @@ pub const CliOptions = struct { const TestMode = enum { none, parse_only, + no_codegen, }; const AppError = error{OutOfMemory} || std.fs.File.WriteError; @@ -126,10 +128,10 @@ pub fn main() AppError!u8 { if (std.mem.startsWith(u8, line, prefix)) { var items = std.mem.tokenize(u8, line[prefix.len..], " \t,"); while (items.next()) |error_code| { - if (error_code.len == 0 or (error_code[0] != 'E' and error_code[0] != 'W' and error_code[0] != 'D')) - @panic("invalid error code!"); - const id = std.fmt.parseInt(u16, error_code[1..], 10) catch @panic("bad integer"); - const code = std.meta.intToEnum(Diagnostics.Code, id) catch @panic("bad diagnostic code"); + const code = Diagnostics.Code.parse( + error_code, + ) catch @panic("invalid error code!"); + try expectations.append(.{ .code = code }); } } @@ -142,8 +144,7 @@ pub fn main() AppError!u8 { &string_pool, source_code, file_name, - cli.options.test_mode, - cli.options.trace, + cli.options, ) catch |err| { try convertErrorToDiagnostics(&diagnostics, file_name, err); break :process_file false; @@ -176,7 +177,7 @@ pub fn main() AppError!u8 { fn convertErrorToDiagnostics(diagnostics: *Diagnostics, file_name: []const u8, err: intl.FormattableError) error{OutOfMemory}!void { switch (err) { // syntax errors must produce diagnostics: - error.SyntaxError, error.InvalidSourceEncoding => std.debug.assert(diagnostics.hasErrors()), + error.SyntaxError, error.SemanticError, error.InvalidSourceEncoding => std.debug.assert(diagnostics.hasErrors()), error.OutOfMemory => { try diagnostics.emit(.{ @@ -250,13 +251,26 @@ fn validateDiagnostics(allocator: std.mem.Allocator, diagnostics: Diagnostics, e } } + // Remove all 
non-errors from available, we do match on them with "-W4000" instead of forcing a expected W4000 into all files without start rules (or similar) + { + var i: usize = 0; + while (i < available.items.len) { + const code = available.items[i]; + if (!code.isError()) { + _ = available.swapRemove(i); + } else { + i += 1; + } + } + } + const ok = (available.items.len == 0) and (expected.items.len == 0); for (available.items) |code| { - std.log.err("unexpected diagnostic: {0}", .{code}); + std.log.err("unexpected diagnostic: {s} ({d})", .{ code, code }); } for (expected.items) |code| { - std.log.err("unmatched diagnostic: {0}", .{code}); + std.log.err("unmatched diagnostic: {s} ({d})", .{ code, code }); } if (!ok) @@ -269,8 +283,7 @@ fn compileFile( string_pool: *ptk.strings.Pool, source_code: []const u8, file_name: []const u8, - mode: TestMode, - trace_enabled: bool, + options: CliOptions, ) !void { var tree = try parser.parse( .{ @@ -279,16 +292,26 @@ fn compileFile( .string_pool = string_pool, .file_name = file_name, .source_code = source_code, - .trace_enabled = trace_enabled, + .trace_enabled = options.trace, }, ); defer tree.deinit(); - // TODO: Implement sema + if (options.test_mode == .parse_only) { + return; + } + + var grammar = try sema.analyze( + allocator, + diagnostics, + string_pool, + tree.top_level_declarations, + ); + defer grammar.deinit(); // TODO: Implement parsergen / tablegen / highlightergen - if (mode == .none) { + if (options.test_mode == .none) { ast_dump.dump(string_pool, tree); } } diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index 9803b79..cb2a445 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -85,13 +85,14 @@ pub fn parse(opt: struct { pub const TokenType = enum { // keywords + start, node, + rule, + pattern, + record, variant, optional, - start, - rule, - token, custom, regex, @@ -808,13 +809,13 @@ const Parser = struct { }; } - fn acceptTokenReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.TokenRef { + fn acceptTokenReference(parser: *Parser, accept_mode: AcceptMode) AcceptError!ast.PatternRef { parser.traceEnterRule(@src()); defer parser.popTrace(); const token = try parser.acceptToken(.token_ref, accept_mode); std.debug.assert(std.mem.startsWith(u8, token.text, "$")); - return ast.TokenRef{ + return ast.PatternRef{ .location = token.location, .identifier = try parser.unwrapIdentifierString(token.location, token.text[1..]), }; @@ -1101,7 +1102,7 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.optional, match.word("optional")), Pattern.create(.start, match.word("start")), Pattern.create(.rule, match.word("rule")), - Pattern.create(.token, match.word("token")), + Pattern.create(.pattern, match.word("pattern")), Pattern.create(.custom, match.word("custom")), Pattern.create(.regex, match.word("regex")), Pattern.create(.skip, match.word("skip")), diff --git a/src/ptkgen/sema.zig b/src/ptkgen/sema.zig new file mode 100644 index 0000000..cfa7562 --- /dev/null +++ b/src/ptkgen/sema.zig @@ -0,0 +1,368 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const ast = @import("ast.zig"); +const Diagnostics = @import("Diagnostics.zig"); + +pub const AnalyzeError = error{ OutOfMemory, SemanticError }; + +const String = ptk.strings.String; + +pub fn StringHashMap(comptime T: type) type { + return std.AutoArrayHashMap(String, T); +} + +pub const Grammar = struct { + arena: std.heap.ArenaAllocator, + + start: ?StartDeclaration, + + rules: StringHashMap(*Rule), + nodes: StringHashMap(*Node), 
+ patterns: StringHashMap(*Pattern), + + pub fn deinit(grammar: *Grammar) void { + grammar.rules.deinit(); + grammar.nodes.deinit(); + grammar.patterns.deinit(); + grammar.arena.deinit(); + grammar.* = undefined; + } +}; + +pub const StartDeclaration = struct { + rule: *Rule, + location: ptk.Location, +}; + +pub const Rule = struct { + location: ptk.Location, + name: String, + + type: ?*Type, + production: *Production, +}; + +pub const Production = union(enum) { + terminal: *Pattern, // literal and terminal ast nodes are wrapped to this + recursion: *Rule, // + sequence: []Production, // ... + optional: *Production, // ( ... )? + repetition_zero: *Production, // [ ... ]* + repetition_one: *Production, // [ ... ]+ +}; + +pub const Node = struct { + location: ptk.Location, + name: String, + + type: *Type, +}; + +pub const Pattern = struct { + location: ptk.Location, + name: String, + + data: Data, + + pub const Data = union(enum) { + literal_match: String, + word: String, + regex: String, + external: String, + }; +}; + +pub const Type = union(enum) { + code_literal: String, + user_value: String, + + optional: *Type, + record: *CompoundType, + variant: *CompoundType, +}; + +pub const CompoundType = struct { + fields: StringHashMap(Field), +}; + +pub const Field = struct { + name: String, + type: *Type, +}; + +pub fn analyze(allocator: std.mem.Allocator, diagnostics: *Diagnostics, strings: *const ptk.strings.Pool, document: ast.Document) AnalyzeError!Grammar { + std.debug.assert(diagnostics.hasErrors() == false); + errdefer |err| if (err == error.SemanticError) + std.debug.assert(diagnostics.hasErrors()); + + var grammar = Grammar{ + .arena = std.heap.ArenaAllocator.init(allocator), + + .rules = StringHashMap(*Rule).init(allocator), + .nodes = StringHashMap(*Node).init(allocator), + .patterns = StringHashMap(*Pattern).init(allocator), + + .start = null, + }; + errdefer grammar.deinit(); + + var analyzer = Analyzer{ + .arena = grammar.arena.allocator(), + .diagnostics = diagnostics, + .strings = strings, + + .rule_to_ast = std.AutoHashMap(*Rule, *ast.Rule).init(allocator), + .node_to_ast = std.AutoHashMap(*Node, *ast.Node).init(allocator), + .pattern_to_ast = std.AutoHashMap(*Pattern, *ast.Pattern).init(allocator), + + .document = document, + + .target = &grammar, + }; + defer analyzer.deinit(); + + try innerAnalysis(&analyzer); + + if (grammar.start == null) { + try analyzer.emitDiagnostic(ptk.Location{ + .line = 0, + .column = 0, + .source = null, + }, .missing_start_symbol, .{}); + } + + return grammar; +} + +fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { + // Phase 0: Validate productions on legality (coarse error checking) + // - Generates errors for badly constructed elements + try analyzer.validateAstRulesCoarse(); + + // Phase 1: Create all global declarations + // - Populates the declaration lookups + // - Generates errors for duplicate identifiers + try analyzer.createDeclarations(); + + // Phase 2: Instantiate all node types and patterns, determine start symbol + + try analyzer.iterateOn(.start, Analyzer.instantiateStartSymbol); + try analyzer.iterateOn(.node, Analyzer.instantiatePatterns); + try analyzer.iterateOn(.node, Analyzer.instantiateNodeTypes); + + // Phase 3: Validate generated types + + // Phase 4: Instantiate AST productions + + // Phase 5: Instantiate and validate AST mappings + +} + +const Analyzer = struct { + arena: std.mem.Allocator, + diagnostics: *Diagnostics, + strings: *const ptk.strings.Pool, + target: *Grammar, + + document: ast.Document, + + 
rule_to_ast: std.AutoHashMap(*Rule, *ast.Rule), + node_to_ast: std.AutoHashMap(*Node, *ast.Node), + pattern_to_ast: std.AutoHashMap(*Pattern, *ast.Pattern), + + fn deinit(analyzer: *Analyzer) void { + analyzer.rule_to_ast.deinit(); + analyzer.node_to_ast.deinit(); + analyzer.pattern_to_ast.deinit(); + analyzer.* = undefined; + } + + const IterativeAnalysisError = error{RecoverableSemanticError} || AnalyzeError; + + fn iterateOn( + analyzer: *Analyzer, + comptime node_type: std.meta.FieldEnum(ast.TopLevelDeclaration), + comptime functor: fn (*Analyzer, *std.meta.FieldType(ast.TopLevelDeclaration, node_type)) IterativeAnalysisError!void, + ) AnalyzeError!void { + var iter = ast.iterate(analyzer.document); + while (iter.next()) |item| { + switch (item.*) { + @field(std.meta.Tag(ast.TopLevelDeclaration), @tagName(node_type)) => |*node| { + functor(analyzer, node) catch |err| switch (err) { + error.RecoverableSemanticError => {}, + else => |e| return e, + }; + }, + else => {}, + } + } + } + + fn validateAstRulesCoarse(analyzer: *Analyzer) !void { + var iter = ast.iterate(analyzer.document); + while (iter.next()) |item| { + switch (item.*) { + .start => |start| { + _ = start; + }, + + .rule => |rule| { + _ = rule; + }, + + .node => |node| { + _ = node; + }, + + .pattern => |pattern| { + _ = pattern; + }, + } + } + } + + fn createDeclarations(analyzer: *Analyzer) !void { + var iter = ast.iterate(analyzer.document); + while (iter.next()) |item| { + switch (item.*) { + .start => {}, + + .rule => |*rule| { + const instance = try analyzer.declareElement( + Rule, + ast.Rule, + &analyzer.target.rules, + &analyzer.rule_to_ast, + rule, + rule.name, + .duplicate_identifier_rule, + ); + instance.* = .{ + .location = rule.name.location, + .name = rule.name.value, + + .type = undefined, // created in phase 4 + .production = undefined, // created in phase 5 + }; + }, + + .node => |*node| { + const instance = try analyzer.declareElement( + Node, + ast.Node, + &analyzer.target.nodes, + &analyzer.node_to_ast, + node, + node.name, + .duplicate_identifier_node, + ); + instance.* = .{ + .location = node.name.location, + .name = node.name.value, + + .type = undefined, // created in phase 2 + }; + }, + + .pattern => |*pattern| { + const instance = try analyzer.declareElement( + Pattern, + ast.Pattern, + &analyzer.target.patterns, + &analyzer.pattern_to_ast, + pattern, + pattern.name, + .duplicate_identifier_pattern, + ); + instance.* = .{ + .location = pattern.name.location, + .name = pattern.name.value, + + .data = undefined, // created in phase 2 + }; + }, + } + } + } + + fn instantiateStartSymbol(analyzer: *Analyzer, start: *ast.RuleRef) !void { + if (analyzer.target.start) |old_start| { + try analyzer.emitDiagnostic(start.location, .multiple_start_symbols, .{ + .identifier = analyzer.strings.get(old_start.rule.name), + .previous_location = old_start.location, + }); + // error return is further down below so we can also catch the undefined reference error + } + + const rule = analyzer.target.rules.get(start.identifier) orelse { + try analyzer.emitDiagnostic(start.location, .reference_to_undeclared_rule, .{ + .identifier = analyzer.strings.get(start.identifier), + }); + return error.RecoverableSemanticError; + }; + + if (analyzer.target.start != null) { + // return for the first if block + return error.RecoverableSemanticError; + } + + analyzer.target.start = .{ + .rule = rule, + .location = start.location, + }; + } + + fn instantiatePatterns(analyzer: *Analyzer, node: *ast.Node) !void { + _ = analyzer; + _ = node; 
+ // + } + + fn instantiateNodeTypes(analyzer: *Analyzer, node: *ast.Node) !void { + _ = analyzer; + _ = node; + // + } + + const DeclarationError = error{ + OutOfMemory, + SemanticError, + }; + fn declareElement( + analyzer: *Analyzer, + comptime Element: type, + comptime AstNode: type, + set: *StringHashMap(*Element), + ast_map: *std.AutoHashMap(*Element, *AstNode), + ast_node: *AstNode, + name: ast.Identifier, + comptime diagnostic: Diagnostics.Code, + ) DeclarationError!*Element { + const gop = try set.getOrPut(name.value); + if (gop.found_existing) { + // emit diagnostic here + try analyzer.emitDiagnostic(name.location, diagnostic, .{ + .identifier = analyzer.strings.get(name.value), + .previous_location = gop.value_ptr.*.*.location, + }); + return error.SemanticError; + } + errdefer _ = set.swapRemove(name.value); + + const item = try analyzer.arena.create(Element); + errdefer analyzer.arena.destroy(item); + + item.* = undefined; + + gop.value_ptr.* = item; + + try ast_map.putNoClobber(item, ast_node); + + return item; + } + + fn emitDiagnostic(analyzer: *Analyzer, location: ptk.Location, comptime code: Diagnostics.Code, params: Diagnostics.Data(code)) !void { + try analyzer.diagnostics.emit(location, code, params); + } +}; diff --git a/test/analysis/accept/expect-warn-missing-start.ptk b/test/analysis/accept/expect-warn-missing-start.ptk new file mode 100644 index 0000000..f31365e --- /dev/null +++ b/test/analysis/accept/expect-warn-missing-start.ptk @@ -0,0 +1 @@ +# expected: W4000 diff --git a/test/analysis/accept/start-decl.ptk b/test/analysis/accept/start-decl.ptk new file mode 100644 index 0000000..404f545 --- /dev/null +++ b/test/analysis/accept/start-decl.ptk @@ -0,0 +1,7 @@ + + + +start ; + +rule magic = "magic"; + diff --git a/test/analysis/reject/duplicate-node.ptk b/test/analysis/reject/duplicate-node.ptk new file mode 100644 index 0000000..0f67291 --- /dev/null +++ b/test/analysis/reject/duplicate-node.ptk @@ -0,0 +1,3 @@ +# expected: E1301 +node foo = `bool`; +node foo = `bool`; \ No newline at end of file diff --git a/test/analysis/reject/duplicate-pattern.ptk b/test/analysis/reject/duplicate-pattern.ptk new file mode 100644 index 0000000..4302396 --- /dev/null +++ b/test/analysis/reject/duplicate-pattern.ptk @@ -0,0 +1,3 @@ +# expected: E1302 +pattern foo = literal "bla"; +pattern foo = literal "bla"; \ No newline at end of file diff --git a/test/analysis/reject/duplicate-rule.ptk b/test/analysis/reject/duplicate-rule.ptk new file mode 100644 index 0000000..eff3ee6 --- /dev/null +++ b/test/analysis/reject/duplicate-rule.ptk @@ -0,0 +1,3 @@ +# expected: E1300 +rule foo = ""; +rule foo = ""; \ No newline at end of file diff --git a/test/analysis/reject/duplicate-start.ptk b/test/analysis/reject/duplicate-start.ptk new file mode 100644 index 0000000..52c55cf --- /dev/null +++ b/test/analysis/reject/duplicate-start.ptk @@ -0,0 +1,10 @@ +# expected: E1306 + +start ; + +rule magic = "magic"; + +rule disco = "disco"; + +start ; + diff --git a/test/analysis/reject/duplicate-undeclared-start.ptk b/test/analysis/reject/duplicate-undeclared-start.ptk new file mode 100644 index 0000000..8b53833 --- /dev/null +++ b/test/analysis/reject/duplicate-undeclared-start.ptk @@ -0,0 +1,8 @@ +# expected: E1303, E1306 + +start ; + +rule magic = "magic"; + +start ; + diff --git a/test/analysis/reject/undeclared-start.ptk b/test/analysis/reject/undeclared-start.ptk new file mode 100644 index 0000000..5a97c96 --- /dev/null +++ b/test/analysis/reject/undeclared-start.ptk @@ -0,0 +1,2 @@ +# 
expected: E1303, W4000 +start ; \ No newline at end of file From 05c58f0c0552f99b595ad2c56acba575e7e3a99f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Wed, 15 Nov 2023 15:01:24 +0100 Subject: [PATCH 16/20] More semantic analysis --- build.zig | 12 ++ examples/ptkgen/ast-with-unions.ptk | 4 + examples/ptkgen/grammar.ptk | 9 +- src/ptkgen/Diagnostics.zig | 6 +- src/ptkgen/ast.zig | 5 +- src/ptkgen/ast_dump.zig | 13 +- src/ptkgen/intl/en.json | 11 +- src/ptkgen/parser.zig | 63 ++++++- src/ptkgen/sema.zig | 173 +++++++++++++++++- test/analysis/accept/pattern-custom-skip.ptk | 4 + test/analysis/accept/pattern-custom.ptk | 4 + test/analysis/accept/pattern-literal-skip.ptk | 4 + test/analysis/accept/pattern-literal.ptk | 4 + test/analysis/accept/pattern-regex-skip.ptk | 4 + test/analysis/accept/pattern-regex.ptk | 4 + test/analysis/accept/pattern-word-skip.ptk | 4 + test/analysis/accept/pattern-word.ptk | 4 + .../reject/pattern-unexpected-token.ptk | 4 + 18 files changed, 308 insertions(+), 24 deletions(-) create mode 100644 test/analysis/accept/pattern-custom-skip.ptk create mode 100644 test/analysis/accept/pattern-custom.ptk create mode 100644 test/analysis/accept/pattern-literal-skip.ptk create mode 100644 test/analysis/accept/pattern-literal.ptk create mode 100644 test/analysis/accept/pattern-regex-skip.ptk create mode 100644 test/analysis/accept/pattern-regex.ptk create mode 100644 test/analysis/accept/pattern-word-skip.ptk create mode 100644 test/analysis/accept/pattern-word.ptk create mode 100644 test/parser/reject/pattern-unexpected-token.ptk diff --git a/build.zig b/build.zig index c8d2c43..dcade1e 100644 --- a/build.zig +++ b/build.zig @@ -130,6 +130,16 @@ const analyis_accept_files = [_][]const u8{ "test/analysis/accept/match-rep_one-nested.ptk", "test/analysis/accept/start-decl.ptk", + + "test/analysis/accept/pattern-custom.ptk", + "test/analysis/accept/pattern-literal.ptk", + "test/analysis/accept/pattern-regex.ptk", + "test/analysis/accept/pattern-word.ptk", + + "test/analysis/accept/pattern-word-skip.ptk", + "test/analysis/accept/pattern-regex-skip.ptk", + "test/analysis/accept/pattern-literal-skip.ptk", + "test/analysis/accept/pattern-custom-skip.ptk", } ++ example_files; const analyis_reject_files = [_][]const u8{ @@ -219,4 +229,6 @@ const parser_reject_files = [_][]const u8{ "test/parser/reject/rule-no-type.ptk", "test/parser/reject/rule-no-type-no-prod.ptk", "test/parser/reject/rule-bad-prod.ptk", + + "test/parser/reject/pattern-unexpected-token.ptk", }; diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index 9435a51..6da295d 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -55,6 +55,10 @@ node TLDeclaration = variant module : !module ; +node namespace = @extern; +node interface = @extern; +node module = @extern; + rule toplevel-decl : !TLDeclaration = => ns: $0 # this is syntax for a union field selector as unions are not compounds | => interface: $0 diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index bc098d1..d788e27 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -12,7 +12,14 @@ rule top_level = rule start_decl = "start" $rule_ref ";" ; -rule pattern_decl = "pattern" $identifier "=" ";" ; +rule pattern_decl = "pattern" $identifier "=" ( "skip" )? 
";" ; + +rule pattern_spec = + "literal" $string_literal + | "word" $string_literal + | "regex" $string_literal + | $userval +; rule node_decl = "node" $identifier "=" ";" ; diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 7e37483..56aeea2 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -32,6 +32,7 @@ pub const Code = enum(u16) { unexpected_token_mapping = 1111, unexpected_token_production_list = 1112, unexpected_token_production = 1113, + unexpected_token_pattern = 1114, // recoverable syntax errors (1200-1299): illegal_empty_group = 1200, @@ -154,9 +155,7 @@ const DuplicateIdentifier = struct { identifier: []const u8, previous_location: ptk.Location, }; -const UndeclaredIdentifier = struct { - identifier: []const u8 -}; +const UndeclaredIdentifier = struct { identifier: []const u8 }; pub fn Data(comptime code: Code) type { return switch (code) { @@ -178,6 +177,7 @@ pub fn Data(comptime code: Code) type { .unexpected_token_mapping => UnexpectedTokenMessage, .unexpected_token_production_list => UnexpectedTokenMessage, .unexpected_token_production => UnexpectedTokenMessage, + .unexpected_token_pattern => UnexpectedTokenMessage, .unexpected_eof => NoDiagnosticData, diff --git a/src/ptkgen/ast.zig b/src/ptkgen/ast.zig index f269552..a650c35 100644 --- a/src/ptkgen/ast.zig +++ b/src/ptkgen/ast.zig @@ -98,13 +98,14 @@ pub const Rule = struct { // rule ( : )? = ...; pub const Pattern = struct { // token = ...; name: Identifier, - pattern: Data, + data: Data, + invisible: bool, pub const Data = union(enum) { literal: StringLiteral, // literal "+" word: StringLiteral, // word "while" regex: StringLiteral, // regex "string" - external: CodeLiteral, // custom `matchMe` + external: UserDefinedIdentifier, // @matchMe }; }; diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/ast_dump.zig index 468d789..226b324 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/ast_dump.zig @@ -57,7 +57,18 @@ const AstPrinter = struct { }, .pattern => |pattern| { - print("pattern {s}", .{printer.fmtId(pattern.name.value)}); + print("pattern {s} = ", .{printer.fmtId(pattern.name.value)}); + + switch (pattern.data) { + .literal => |value| print("literal \"{}\"", .{printer.fmtString(value.value)}), + .word => |value| print("word \"{}\"", .{printer.fmtString(value.value)}), + .regex => |value| print("regex \"{}\"", .{printer.fmtString(value.value)}), + .external => |value| print("@{}", .{printer.fmtId(value.value)}), + } + + if (pattern.invisible) { + print(" skip", .{}); + } print(";\n", .{}); }, } diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index 95db0b7..c98d0c3 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -15,20 +15,21 @@ "unexpected_token": "Expected a token of type '{[expected_type]}', but found token {[actual]}.", "unexpected_character": "Unexpected character '{[character]}' found.", "unexpected_eof": "Unexpected end of file.", - "unexpected_toplevel_token": "Expected a top level declaration ('start', 'rule', 'node' or 'token'), but found token {[actual]}", + "unexpected_toplevel_token": "Expected a top level declaration ('start', 'rule', 'node' or 'pattern'), but found token {[actual]}", "unexpected_token_no_context": "Unexpected token '{[actual]}'.", "unexpected_token_type_spec": "Expected a type specifier, but found '{[actual]}'.", "unexpected_token_mapping": "Expected an AST mapping, but found '{[actual]}'.", "unexpected_token_production_list": "Expected ';' or '|', but found '{[actual]}'.", 
"unexpected_token_production": "Expected a production, but found '{[actual]}'.", + "unexpected_token_pattern": "Expected a pattern definition, but found '{[actual]}'.", "duplicate_identifier_rule": "Rule {[identifier]} already defined here: {[previous_location]}", "duplicate_identifier_node": "Node {[identifier]} already defined here: {[previous_location]}", "duplicate_identifier_pattern": "Pattern {[identifier]} already defined here: {[previous_location]}", - "reference_to_undeclared_rule": "Reference to undeclared rule {[identifier]}.", - "reference_to_undeclared_node": "Reference to undeclared node {[identifier]}.", - "reference_to_undeclared_pattern": "Reference to undeclared pattern {[identifier]}.", + "reference_to_undeclared_rule": "Reference to undeclared rule '{[identifier]}'.", + "reference_to_undeclared_node": "Reference to undeclared node '{[identifier]}'.", + "reference_to_undeclared_pattern": "Reference to undeclared pattern '{[identifier]}'.", "missing_start_symbol": "Grammar file has no start symbol declared.", - "multiple_start_symbols": "Another start rule ({[identifier]}) was already declared here: {[previous_location]}" + "multiple_start_symbols": "Another start rule '({[identifier]})' was already declared here: {[previous_location]}" }, "errors": { "SyntaxError": "syntax error", diff --git a/src/ptkgen/parser.zig b/src/ptkgen/parser.zig index cb2a445..7d26a61 100644 --- a/src/ptkgen/parser.zig +++ b/src/ptkgen/parser.zig @@ -94,7 +94,8 @@ pub const TokenType = enum { variant, optional, - custom, + literal, + word, regex, skip, @@ -187,6 +188,10 @@ const Parser = struct { return .{ .node = node }; } else |err| try filterAcceptError(err); + if (parser.acceptPatternDefinition()) |pattern| { + return .{ .pattern = pattern }; + } else |err| try filterAcceptError(err); + // Detect any excess tokens on the top level: if (parser.core.nextToken()) |maybe_token| { if (maybe_token) |token| { @@ -220,6 +225,59 @@ const Parser = struct { return init_rule; } + fn acceptPatternDefinition(parser: *Parser) AcceptError!ast.Pattern { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + try parser.acceptLiteral(.pattern, .recover); + + const name = try parser.acceptIdentifier(.fail); + try parser.acceptLiteral(.@"=", .fail); + + const data = try parser.acceptPatternSpec(); + + const invisible = try parser.tryAcceptLiteral(.skip); + + try parser.acceptLiteral(.@";", .fail); + + return .{ + .name = name, + .data = data, + .invisible = invisible, + }; + } + + fn acceptPatternSpec(parser: *Parser) AcceptError!ast.Pattern.Data { + parser.traceEnterRule(@src()); + defer parser.popTrace(); + + var state = parser.save(); + errdefer parser.restore(state); + + if (try parser.tryAcceptLiteral(.literal)) { + const string = try parser.acceptStringLiteral(.fail); + return .{ .literal = string }; + } + + if (try parser.tryAcceptLiteral(.word)) { + const string = try parser.acceptStringLiteral(.fail); + return .{ .word = string }; + } + + if (try parser.tryAcceptLiteral(.regex)) { + const string = try parser.acceptStringLiteral(.fail); + return .{ .regex = string }; + } + + if (parser.acceptUserReference()) |ref| { + return .{ .external = ref }; + } else |err| try filterAcceptError(err); + + return parser.emitUnexpectedToken(.{ + .unexpected_token = .unexpected_token_pattern, + }); + } + fn acceptNode(parser: *Parser) AcceptError!ast.Node { parser.traceEnterRule(@src()); defer parser.popTrace(); @@ -1103,7 +1161,8 @@ const Tokenizer = ptk.Tokenizer(TokenType, &.{ Pattern.create(.start, 
match.word("start")), Pattern.create(.rule, match.word("rule")), Pattern.create(.pattern, match.word("pattern")), - Pattern.create(.custom, match.word("custom")), + Pattern.create(.literal, match.word("literal")), + Pattern.create(.word, match.word("word")), Pattern.create(.regex, match.word("regex")), Pattern.create(.skip, match.word("skip")), diff --git a/src/ptkgen/sema.zig b/src/ptkgen/sema.zig index cfa7562..fd60ca1 100644 --- a/src/ptkgen/sema.zig +++ b/src/ptkgen/sema.zig @@ -1,6 +1,8 @@ const std = @import("std"); const ptk = @import("parser-toolkit"); +const logger = std.log.scoped(.ptk_sema); + const ast = @import("ast.zig"); const Diagnostics = @import("Diagnostics.zig"); @@ -74,20 +76,31 @@ pub const Pattern = struct { }; pub const Type = union(enum) { + // trivial types: code_literal: String, - user_value: String, + user_type: String, + // anonymous compound types: optional: *Type, record: *CompoundType, variant: *CompoundType, + + // ast nodes are basically "named types" and must be handled as such + named: *Node, + + pub fn id(t: *const Type) TypeId { + return @as(TypeId, t.*); + } }; +pub const TypeId: type = std.meta.Tag(Type); + pub const CompoundType = struct { fields: StringHashMap(Field), }; pub const Field = struct { - name: String, + // name: String, type: *Type, }; @@ -116,6 +129,8 @@ pub fn analyze(allocator: std.mem.Allocator, diagnostics: *Diagnostics, strings: .node_to_ast = std.AutoHashMap(*Node, *ast.Node).init(allocator), .pattern_to_ast = std.AutoHashMap(*Pattern, *ast.Pattern).init(allocator), + .type_stash = Analyzer.TypeStash.init(allocator), + .document = document, .target = &grammar, @@ -135,6 +150,10 @@ pub fn analyze(allocator: std.mem.Allocator, diagnostics: *Diagnostics, strings: return grammar; } +var BAD_TYPE_SENTINEL: Type = undefined; +var BAD_NODE_SENTINEL: Node = undefined; +var BAD_RULE_SENTINEL: Rule = undefined; + fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { // Phase 0: Validate productions on legality (coarse error checking) // - Generates errors for badly constructed elements @@ -148,10 +167,11 @@ fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { // Phase 2: Instantiate all node types and patterns, determine start symbol try analyzer.iterateOn(.start, Analyzer.instantiateStartSymbol); - try analyzer.iterateOn(.node, Analyzer.instantiatePatterns); + try analyzer.iterateOn(.pattern, Analyzer.instantiatePatterns); try analyzer.iterateOn(.node, Analyzer.instantiateNodeTypes); // Phase 3: Validate generated types + try analyzer.iterateOn(.node, Analyzer.validateNodes); // Phase 4: Instantiate AST productions @@ -160,6 +180,8 @@ fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { } const Analyzer = struct { + const TypeStash = std.HashMap(*Type, void, TypeContext, std.hash_map.default_max_load_percentage); + arena: std.mem.Allocator, diagnostics: *Diagnostics, strings: *const ptk.strings.Pool, @@ -171,10 +193,15 @@ const Analyzer = struct { node_to_ast: std.AutoHashMap(*Node, *ast.Node), pattern_to_ast: std.AutoHashMap(*Pattern, *ast.Pattern), + type_stash: TypeStash, + + deduplicated_type_count: usize = 0, + fn deinit(analyzer: *Analyzer) void { analyzer.rule_to_ast.deinit(); analyzer.node_to_ast.deinit(); analyzer.pattern_to_ast.deinit(); + analyzer.type_stash.deinit(); analyzer.* = undefined; } @@ -313,16 +340,112 @@ const Analyzer = struct { }; } - fn instantiatePatterns(analyzer: *Analyzer, node: *ast.Node) !void { - _ = analyzer; - _ = node; - // + fn instantiatePatterns(analyzer: *Analyzer, ast_pattern: 
*ast.Pattern) !void { + const sema_pattern = analyzer.target.patterns.get(ast_pattern.name.value).?; + + sema_pattern.data = switch (ast_pattern.data) { + .literal => |value| .{ .literal_match = value.value }, + .word => |value| .{ .word = value.value }, + .regex => |value| .{ .regex = value.value }, + .external => |value| .{ .external = value.value }, + }; + + // TODO: Implement regex validation here! + } + + fn instantiateNodeTypes(analyzer: *Analyzer, ast_node: *ast.Node) !void { + const sema_node = analyzer.target.nodes.get(ast_node.name.value).?; + + sema_node.type = try analyzer.resolveType(&ast_node.value); + } + + fn validateNodes(analyzer: *Analyzer, ast_node: *ast.Node) !void { + const sema_node = analyzer.target.nodes.get(ast_node.name.value).?; + + try analyzer.validateType(sema_node.type); } - fn instantiateNodeTypes(analyzer: *Analyzer, node: *ast.Node) !void { + fn validateType(analyzer: *Analyzer, type_node: *Type) !void { _ = analyzer; - _ = node; - // + if (type_node == &BAD_TYPE_SENTINEL) { + @panic("bad sentinel"); + } + } + + fn createCompoundType(analyzer: *Analyzer, def: ast.CompoundType) !*CompoundType { + const ct = try analyzer.target.arena.allocator().create(CompoundType); + errdefer analyzer.target.arena.allocator().destroy(ct); + + ct.* = CompoundType{ + .fields = StringHashMap(Field).init(analyzer.target.arena.allocator()), + }; + errdefer ct.fields.deinit(); + + try ct.fields.ensureTotalCapacity(def.fields.len()); + + var iter = ast.iterate(def.fields); + while (iter.next()) |field_def| { + const field_type = try analyzer.resolveType(&field_def.type); + ct.fields.putAssumeCapacityNoClobber(field_def.name.value, .{ + .type = field_type, + }); + } + + return ct; + } + + fn destroyCompoundType(analyzer: *Analyzer, ct: *CompoundType) void { + ct.fields.deinit(); + analyzer.target.arena.allocator().destroy(ct); + ct.* = undefined; + } + + fn resolveType(analyzer: *Analyzer, type_node: *ast.TypeSpec) error{OutOfMemory}!*Type { + var compound_type: ?*CompoundType = null; + var proto_type: Type = switch (type_node.*) { + .reference => |def| .{ + .named = analyzer.target.nodes.get(def.identifier) orelse blk: { + try analyzer.emitDiagnostic(def.location, .reference_to_undeclared_node, .{ + .identifier = analyzer.strings.get(def.identifier), + }); + break :blk &BAD_NODE_SENTINEL; + }, + }, + .literal => |def| Type{ .code_literal = def.value }, + .custom => |def| Type{ .user_type = def.value }, + .record => |def| blk: { + compound_type = try analyzer.createCompoundType(def); + break :blk .{ .record = compound_type.? }; + }, + .variant => |def| blk: { + compound_type = try analyzer.createCompoundType(def); + break :blk .{ .record = compound_type.? 
}; + }, + }; + errdefer if (compound_type) |ct| + analyzer.destroyCompoundType(ct); + + if (analyzer.getUniqueTypeHandle(&proto_type)) |resolved_type| { + analyzer.deduplicated_type_count += 1; + // logger.debug("deduplicated a {s}", .{@tagName(resolved_type.*)}); + return resolved_type; + } + + const new_type = try analyzer.target.arena.allocator().create(Type); + errdefer analyzer.target.arena.allocator().destroy(new_type); + + new_type.* = proto_type; + + try analyzer.type_stash.putNoClobber(new_type, {}); + + return new_type; + } + + fn getUniqueTypeHandle(analyzer: Analyzer, proto_type: *Type) ?*Type { + if (analyzer.type_stash.getKey(proto_type)) |key| { + return key; + } + return null; } const DeclarationError = error{ @@ -366,3 +489,33 @@ const Analyzer = struct { try analyzer.diagnostics.emit(location, code, params); } }; + +const TypeContext = struct { + const HashFn = std.hash.Fnv1a_64; + + pub fn eql(ctx: TypeContext, lhs: *Type, rhs: *Type) bool { + _ = ctx; + if (lhs == rhs) + return true; + if (lhs.id() != rhs.id()) + return false; + switch (lhs.*) { + inline .code_literal, .user_type, .optional, .named => |val, tag| return val == @field(rhs, @tagName(tag)), + .record, .variant => return false, // they are same-by-identitiy + } + } + + pub fn hash(ctx: TypeContext, t: *Type) u64 { + _ = ctx; + var hasher = HashFn.init(); + hasher.update(@tagName(t.*)); + switch (t.*) { + .code_literal => |lit| hasher.update(&std.mem.toBytes(@intFromEnum(lit))), + .user_type => |lit| hasher.update(&std.mem.toBytes(@intFromEnum(lit))), + .optional => |child| hasher.update(&std.mem.toBytes(child)), + .named => |node| hasher.update(&std.mem.toBytes(node)), + .record, .variant => hasher.update(&std.mem.toBytes(t)), + } + return hasher.final(); + } +}; diff --git a/test/analysis/accept/pattern-custom-skip.ptk b/test/analysis/accept/pattern-custom-skip.ptk new file mode 100644 index 0000000..83f23c7 --- /dev/null +++ b/test/analysis/accept/pattern-custom-skip.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = @externalFunction; + diff --git a/test/analysis/accept/pattern-custom.ptk b/test/analysis/accept/pattern-custom.ptk new file mode 100644 index 0000000..83f23c7 --- /dev/null +++ b/test/analysis/accept/pattern-custom.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = @externalFunction; + diff --git a/test/analysis/accept/pattern-literal-skip.ptk b/test/analysis/accept/pattern-literal-skip.ptk new file mode 100644 index 0000000..a5efb6c --- /dev/null +++ b/test/analysis/accept/pattern-literal-skip.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = literal "a-word" skip; + diff --git a/test/analysis/accept/pattern-literal.ptk b/test/analysis/accept/pattern-literal.ptk new file mode 100644 index 0000000..4964d2c --- /dev/null +++ b/test/analysis/accept/pattern-literal.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = literal "a-word"; + diff --git a/test/analysis/accept/pattern-regex-skip.ptk b/test/analysis/accept/pattern-regex-skip.ptk new file mode 100644 index 0000000..b9e45ec --- /dev/null +++ b/test/analysis/accept/pattern-regex-skip.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = regex "a-word" skip; + diff --git a/test/analysis/accept/pattern-regex.ptk b/test/analysis/accept/pattern-regex.ptk new file mode 100644 index 0000000..4ec3715 --- /dev/null +++ b/test/analysis/accept/pattern-regex.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = regex "a-word"; + diff --git a/test/analysis/accept/pattern-word-skip.ptk b/test/analysis/accept/pattern-word-skip.ptk new file mode 100644 index 0000000..07a0e07 --- /dev/null +++ 
b/test/analysis/accept/pattern-word-skip.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = word "a-word" skip; + diff --git a/test/analysis/accept/pattern-word.ptk b/test/analysis/accept/pattern-word.ptk new file mode 100644 index 0000000..07a0e07 --- /dev/null +++ b/test/analysis/accept/pattern-word.ptk @@ -0,0 +1,4 @@ + + +pattern a_word = word "a-word" skip; + diff --git a/test/parser/reject/pattern-unexpected-token.ptk b/test/parser/reject/pattern-unexpected-token.ptk new file mode 100644 index 0000000..158522d --- /dev/null +++ b/test/parser/reject/pattern-unexpected-token.ptk @@ -0,0 +1,4 @@ +# expected: E1114 + +pattern a_word = `illegal`; + From 466a1a8148171a77323cfd73f6e9ebba7470f0c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 1 Dec 2023 10:01:54 +0100 Subject: [PATCH 17/20] Implements production instantiation --- build.zig | 3 + examples/ptkgen/ast-with-unions.ptk | 4 + examples/ptkgen/grammar.ptk | 14 ++ src/ptkgen/Diagnostics.zig | 7 + src/ptkgen/intl/en.json | 3 +- src/ptkgen/sema.zig | 172 +++++++++++++++++- .../reject/duplicate-field-record.ptk | 7 + .../reject/duplicate-field-variant.ptk | 7 + 8 files changed, 209 insertions(+), 8 deletions(-) create mode 100644 test/analysis/reject/duplicate-field-record.ptk create mode 100644 test/analysis/reject/duplicate-field-variant.ptk diff --git a/build.zig b/build.zig index dcade1e..7bcc743 100644 --- a/build.zig +++ b/build.zig @@ -152,6 +152,9 @@ const analyis_reject_files = [_][]const u8{ "test/analysis/reject/undeclared-start.ptk", "test/analysis/reject/duplicate-undeclared-start.ptk", "test/analysis/reject/duplicate-start.ptk", + + "test/analysis/reject/duplicate-field-record.ptk", + "test/analysis/reject/duplicate-field-variant.ptk", }; const parser_accept_files = [_][]const u8{ diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index 6da295d..0133fcd 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -64,3 +64,7 @@ rule toplevel-decl : !TLDeclaration = | => interface: $0 | => module: $0 ; + +rule namespace-group = "to be done"; +rule interface-decl = "to be done"; +rule module-decl = "to be done"; diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index d788e27..11c29f5 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -69,3 +69,17 @@ rule list_ctor = "{" ( )? 
"}"; rule value_list = ( "," )* ; + + +# TODO: + +pattern rule_ref = literal ""; +pattern identifier = literal ""; +pattern string_literal = literal ""; +pattern userval = literal ""; +pattern token_ref = literal ""; +pattern code_literal = literal ""; +pattern value_ref = literal ""; +pattern mapped_value = literal ""; + +rule type = "empty"; \ No newline at end of file diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 56aeea2..16677f5 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -52,6 +52,8 @@ pub const Code = enum(u16) { multiple_start_symbols = 1306, + duplicate_compound_field = 1307, + // semantic warnings (4000-4099): missing_start_symbol = 4000, @@ -214,6 +216,11 @@ pub fn Data(comptime code: Code) type { .missing_start_symbol => NoDiagnosticData, + .duplicate_compound_field => struct { + identifier: []const u8, + previous_location: ptk.Location, + }, + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; } diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index c98d0c3..c0f5eb8 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -29,7 +29,8 @@ "reference_to_undeclared_node": "Reference to undeclared node '{[identifier]}'.", "reference_to_undeclared_pattern": "Reference to undeclared pattern '{[identifier]}'.", "missing_start_symbol": "Grammar file has no start symbol declared.", - "multiple_start_symbols": "Another start rule '({[identifier]})' was already declared here: {[previous_location]}" + "multiple_start_symbols": "Another start rule '({[identifier]})' was already declared here: {[previous_location]}", + "duplicate_compound_field": "Another field named '{s}' was already declared here: {[previous_location]}" }, "errors": { "SyntaxError": "syntax error", diff --git a/src/ptkgen/sema.zig b/src/ptkgen/sema.zig index fd60ca1..837b36b 100644 --- a/src/ptkgen/sema.zig +++ b/src/ptkgen/sema.zig @@ -22,11 +22,13 @@ pub const Grammar = struct { rules: StringHashMap(*Rule), nodes: StringHashMap(*Node), patterns: StringHashMap(*Pattern), + literal_patterns: StringHashMap(*Pattern), pub fn deinit(grammar: *Grammar) void { grammar.rules.deinit(); grammar.nodes.deinit(); grammar.patterns.deinit(); + grammar.literal_patterns.deinit(); grammar.arena.deinit(); grammar.* = undefined; } @@ -42,7 +44,14 @@ pub const Rule = struct { name: String, type: ?*Type, - production: *Production, + productions: []MappedProduction, +}; + +/// A production of a rule that is able to map the parsed structure +/// into an AST node. +pub const MappedProduction = struct { + production: Production, + mapping: ?Mapping, }; pub const Production = union(enum) { @@ -54,6 +63,10 @@ pub const Production = union(enum) { repetition_one: *Production, // [ ... 
]+ }; +pub const Mapping = struct { + // +}; + pub const Node = struct { location: ptk.Location, name: String, @@ -100,6 +113,7 @@ pub const CompoundType = struct { }; pub const Field = struct { + location: ptk.Location, // name: String, type: *Type, }; @@ -115,6 +129,7 @@ pub fn analyze(allocator: std.mem.Allocator, diagnostics: *Diagnostics, strings: .rules = StringHashMap(*Rule).init(allocator), .nodes = StringHashMap(*Node).init(allocator), .patterns = StringHashMap(*Pattern).init(allocator), + .literal_patterns = StringHashMap(*Pattern).init(allocator), .start = null, }; @@ -153,6 +168,7 @@ pub fn analyze(allocator: std.mem.Allocator, diagnostics: *Diagnostics, strings: var BAD_TYPE_SENTINEL: Type = undefined; var BAD_NODE_SENTINEL: Node = undefined; var BAD_RULE_SENTINEL: Rule = undefined; +var BAD_PATTERN_SENTINEL: Pattern = undefined; fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { // Phase 0: Validate productions on legality (coarse error checking) @@ -174,6 +190,7 @@ fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { try analyzer.iterateOn(.node, Analyzer.validateNodes); // Phase 4: Instantiate AST productions + try analyzer.iterateOn(.rule, Analyzer.instantiateRules); // Phase 5: Instantiate and validate AST mappings @@ -249,6 +266,8 @@ const Analyzer = struct { } } + /// Creates declarations in the target Grammar and makes sure all declared objects are reachable. + /// Emits diagnostics for duplicate declarations. fn createDeclarations(analyzer: *Analyzer) !void { var iter = ast.iterate(analyzer.document); while (iter.next()) |item| { @@ -270,7 +289,7 @@ const Analyzer = struct { .name = rule.name.value, .type = undefined, // created in phase 4 - .production = undefined, // created in phase 5 + .productions = &.{}, // created in phase 5 }; }, @@ -313,6 +332,8 @@ const Analyzer = struct { } } + /// Searches all start symbol declarations and stores a reference to the initial rule. + /// Will emit diagnostics for duplicate start symbol decls and invalid references. fn instantiateStartSymbol(analyzer: *Analyzer, start: *ast.RuleRef) !void { if (analyzer.target.start) |old_start| { try analyzer.emitDiagnostic(start.location, .multiple_start_symbols, .{ @@ -340,6 +361,7 @@ const Analyzer = struct { }; } + /// Fully populate all content of the pattern declarations. Emits diagnostics for invalid patterns. fn instantiatePatterns(analyzer: *Analyzer, ast_pattern: *ast.Pattern) !void { const sema_pattern = analyzer.target.patterns.get(ast_pattern.name.value).?; @@ -353,6 +375,8 @@ const Analyzer = struct { // TODO: Implement regex validation here! } + /// Instantiates and validates all node declarations. + /// Emits diagnostics for bad type declarations. 
fn instantiateNodeTypes(analyzer: *Analyzer, ast_node: *ast.Node) !void { const sema_node = analyzer.target.nodes.get(ast_node.name.value).?; @@ -365,13 +389,136 @@ const Analyzer = struct { try analyzer.validateType(sema_node.type); } - fn validateType(analyzer: *Analyzer, type_node: *Type) !void { - _ = analyzer; + fn instantiateRules(analyzer: *Analyzer, ast_rule: *ast.Rule) !void { + const sema_rule = analyzer.target.rules.get(ast_rule.name.value).?; + + sema_rule.type = if (ast_rule.ast_type) |ast_type| + try analyzer.resolveType(&ast_type) + else + null; + + sema_rule.productions = try analyzer.target.arena.allocator().alloc(MappedProduction, ast_rule.productions.len()); + errdefer { + analyzer.target.arena.allocator().free(sema_rule.productions); + sema_rule.productions = &.{}; + } + + if (sema_rule.productions.len == 0) { + @panic("empty sema rule!"); + } + + var iter = ast.iterate(ast_rule.productions); + var index: usize = 0; + while (iter.next()) |ast_production| : (index += 1) { + const sema_production = &sema_rule.productions[index]; + + sema_production.* = MappedProduction{ + .production = try analyzer.translateProduction(ast_production.production), + .mapping = null, // Will be instantiated later + }; + } + } + + fn translateProduction(analyzer: *Analyzer, ast_prod: ast.Production) error{OutOfMemory}!Production { + switch (ast_prod) { + .literal => |literal| { + const gop = try analyzer.target.literal_patterns.getOrPut(literal.value); + if (!gop.found_existing) { + gop.value_ptr.* = try analyzer.target.arena.allocator().create(Pattern); + gop.value_ptr.*.* = .{ + .location = literal.location, // place of first use + .name = literal.value, + .data = .{ .literal_match = literal.value }, + }; + } + return Production{ .terminal = gop.value_ptr.* }; + }, + .terminal => |terminal| { + if (analyzer.target.patterns.get(terminal.identifier)) |pattern| { + return Production{ .terminal = pattern }; + } else { + try analyzer.emitDiagnostic(terminal.location, .reference_to_undeclared_pattern, .{ + .identifier = analyzer.strings.get(terminal.identifier), + }); + return Production{ .terminal = &BAD_PATTERN_SENTINEL }; + } + }, + .recursion => |recursion| { + if (analyzer.target.rules.get(recursion.identifier)) |rule| { + return Production{ .recursion = rule }; + } else { + try analyzer.emitDiagnostic(recursion.location, .reference_to_undeclared_rule, .{ + .identifier = analyzer.strings.get(recursion.identifier), + }); + return Production{ .recursion = &BAD_RULE_SENTINEL }; + } + }, + .sequence => |sequence| { + if (sequence.len() == 0) + @panic("bad sequence: empty"); + + var seq = std.ArrayList(Production).init(analyzer.target.arena.allocator()); + defer seq.deinit(); + + try seq.ensureTotalCapacityPrecise(sequence.len()); + + var iter = ast.iterate(sequence); + while (iter.next()) |inner_prod| { + const inner_sema = try analyzer.translateProduction(inner_prod.*); + seq.appendAssumeCapacity(inner_sema); + } + + return Production{ + .sequence = seq.toOwnedSlice() catch @panic("bad capacity"), + }; + }, + .optional => |optional| { + const nested = try analyzer.target.arena.allocator().create(Production); + errdefer analyzer.target.arena.allocator().destroy(nested); + nested.* = try analyzer.translateProduction(.{ .sequence = optional }); + return .{ .optional = nested }; + }, + .repetition_zero => |repetition| { + const nested = try analyzer.target.arena.allocator().create(Production); + errdefer analyzer.target.arena.allocator().destroy(nested); + nested.* = try 
analyzer.translateProduction(.{ .sequence = repetition }); + return .{ .repetition_zero = nested }; + }, + .repetition_one => |repetition| { + const nested = try analyzer.target.arena.allocator().create(Production); + errdefer analyzer.target.arena.allocator().destroy(nested); + nested.* = try analyzer.translateProduction(.{ .sequence = repetition }); + return .{ .repetition_one = nested }; + }, + } + } + + /// Checks if the given type is semantically ok or emits compiler errors if not. + fn validateType(analyzer: *Analyzer, type_node: *Type) error{OutOfMemory}!void { if (type_node == &BAD_TYPE_SENTINEL) { @panic("bad sentinel"); } + + switch (type_node.*) { + .code_literal, .user_type => {}, // always fine + .optional => |child_type| try analyzer.validateType(child_type), + .record, .variant => |compound_type| { + var fields = compound_type.fields.iterator(); + while (fields.next()) |kv| { + const field_type = kv.value_ptr.type; + try analyzer.validateType(field_type); + } + }, + .named => |node| { + if (node == &BAD_NODE_SENTINEL) { + @panic("bad node!"); + } + }, + } } + /// Constructs a new compound type from the given AST declaration. Will emit diagnostics + /// on error and returns an incomplete type if errors happened. fn createCompoundType(analyzer: *Analyzer, def: ast.CompoundType) !*CompoundType { const ct = try analyzer.target.arena.allocator().create(CompoundType); errdefer analyzer.target.arena.allocator().destroy(ct); @@ -386,9 +533,20 @@ const Analyzer = struct { var iter = ast.iterate(def.fields); while (iter.next()) |field_def| { const field_type = try analyzer.resolveType(&field_def.type); - ct.fields.putAssumeCapacityNoClobber(field_def.name.value, .{ + const gop_result = ct.fields.getOrPutAssumeCapacity(field_def.name.value); + + if (gop_result.found_existing) { + try analyzer.emitDiagnostic(field_def.location, .duplicate_compound_field, .{ + .previous_location = gop_result.value_ptr.location, + .identifier = analyzer.strings.get(field_def.name.value), + }); + continue; + } + + gop_result.value_ptr.* = .{ .type = field_type, - }); + .location = field_def.location, + }; } return ct; @@ -400,7 +558,7 @@ const Analyzer = struct { ct.* = undefined; } - fn resolveType(analyzer: *Analyzer, type_node: *ast.TypeSpec) error{OutOfMemory}!*Type { + fn resolveType(analyzer: *Analyzer, type_node: *const ast.TypeSpec) error{OutOfMemory}!*Type { var compound_type: ?*CompoundType = null; var proto_type: Type = switch (type_node.*) { .reference => |def| .{ diff --git a/test/analysis/reject/duplicate-field-record.ptk b/test/analysis/reject/duplicate-field-record.ptk new file mode 100644 index 0000000..3a64f2a --- /dev/null +++ b/test/analysis/reject/duplicate-field-record.ptk @@ -0,0 +1,7 @@ +# expected: E1307 + +node bad = record + x: `bool`, + y: `bool`, + x: `bool` +; \ No newline at end of file diff --git a/test/analysis/reject/duplicate-field-variant.ptk b/test/analysis/reject/duplicate-field-variant.ptk new file mode 100644 index 0000000..377a38a --- /dev/null +++ b/test/analysis/reject/duplicate-field-variant.ptk @@ -0,0 +1,7 @@ +# expected: E1307 + +node bad = variant + x: `bool`, + y: `bool`, + x: `bool` +; \ No newline at end of file From a5a50422250e03b917edda4db129fcd98f558a63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 1 Dec 2023 14:39:23 +0100 Subject: [PATCH 18/20] Implements sema grammar dumper --- build.zig | 5 +- examples/ptkgen/ast-with-unions.ptk | 6 +- examples/ptkgen/grammar.ptk | 6 +- src/ptkgen/main.zig | 5 + 
src/ptkgen/sema.zig | 5 +- src/ptkgen/sema_dump.zig | 173 ++++++++++++++++++ .../production-undeclared-pattern-ref.ptk | 3 + .../reject/production-undeclared-rule-ref.ptk | 3 + 8 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 src/ptkgen/sema_dump.zig create mode 100644 test/analysis/reject/production-undeclared-pattern-ref.ptk create mode 100644 test/analysis/reject/production-undeclared-rule-ref.ptk diff --git a/build.zig b/build.zig index 7bcc743..1b9b0ce 100644 --- a/build.zig +++ b/build.zig @@ -144,7 +144,7 @@ const analyis_accept_files = [_][]const u8{ const analyis_reject_files = [_][]const u8{ "test/analysis/reject/duplicate-node.ptk", - // "test/analysis/reject/duplicate-pattern.ptk", // TODO: Implement pattern support in parser + "test/analysis/reject/duplicate-pattern.ptk", "test/analysis/reject/duplicate-rule.ptk", "test/analysis/accept/expect-warn-missing-start.ptk", @@ -155,6 +155,9 @@ const analyis_reject_files = [_][]const u8{ "test/analysis/reject/duplicate-field-record.ptk", "test/analysis/reject/duplicate-field-variant.ptk", + + "test/analysis/reject/production-undeclared-pattern-ref.ptk", + "test/analysis/reject/production-undeclared-rule-ref.ptk", }; const parser_accept_files = [_][]const u8{ diff --git a/examples/ptkgen/ast-with-unions.ptk b/examples/ptkgen/ast-with-unions.ptk index 0133fcd..369c9c9 100644 --- a/examples/ptkgen/ast-with-unions.ptk +++ b/examples/ptkgen/ast-with-unions.ptk @@ -55,9 +55,9 @@ node TLDeclaration = variant module : !module ; -node namespace = @extern; -node interface = @extern; -node module = @extern; +node namespace = @Namespace; +node interface = @Interface; +node module = @Module; rule toplevel-decl : !TLDeclaration = => ns: $0 # this is syntax for a union field selector as unions are not compounds diff --git a/examples/ptkgen/grammar.ptk b/examples/ptkgen/grammar.ptk index 11c29f5..6a6d95a 100644 --- a/examples/ptkgen/grammar.ptk +++ b/examples/ptkgen/grammar.ptk @@ -70,11 +70,12 @@ rule value_list = ( "," )* ; - # TODO: +rule type = "empty"; + pattern rule_ref = literal ""; -pattern identifier = literal ""; +pattern identifier = regex "[A-Za-z_][A-Za-z0-9_]*"; pattern string_literal = literal ""; pattern userval = literal ""; pattern token_ref = literal ""; @@ -82,4 +83,3 @@ pattern code_literal = literal ""; pattern value_ref = literal ""; pattern mapped_value = literal ""; -rule type = "empty"; \ No newline at end of file diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 4384d44..42ba2c0 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -11,6 +11,7 @@ const sema = @import("sema.zig"); const intl = @import("intl.zig"); const parser = @import("parser.zig"); const ast_dump = @import("ast_dump.zig"); +const sema_dump = @import("sema_dump.zig"); const Diagnostics = @import("Diagnostics.zig"); @@ -312,6 +313,10 @@ fn compileFile( // TODO: Implement parsergen / tablegen / highlightergen if (options.test_mode == .none) { + std.debug.print("ast dump:\n", .{}); ast_dump.dump(string_pool, tree); + + std.debug.print("\n\nsema dump:\n", .{}); + sema_dump.dump(string_pool, grammar); } } diff --git a/src/ptkgen/sema.zig b/src/ptkgen/sema.zig index 837b36b..ba18328 100644 --- a/src/ptkgen/sema.zig +++ b/src/ptkgen/sema.zig @@ -77,7 +77,7 @@ pub const Node = struct { pub const Pattern = struct { location: ptk.Location, name: String, - + is_literal: bool, data: Data, pub const Data = union(enum) { @@ -324,7 +324,7 @@ const Analyzer = struct { instance.* = .{ .location = pattern.name.location, .name = 
pattern.name.value, - + .is_literal = false, .data = undefined, // created in phase 2 }; }, @@ -429,6 +429,7 @@ const Analyzer = struct { .location = literal.location, // place of first use .name = literal.value, .data = .{ .literal_match = literal.value }, + .is_literal = true, }; } return Production{ .terminal = gop.value_ptr.* }; diff --git a/src/ptkgen/sema_dump.zig b/src/ptkgen/sema_dump.zig new file mode 100644 index 0000000..3fe684f --- /dev/null +++ b/src/ptkgen/sema_dump.zig @@ -0,0 +1,173 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const sema = @import("sema.zig"); +const parser = @import("parser.zig"); + +pub fn dump(strings: *const ptk.strings.Pool, grammar: sema.Grammar) void { + var printer = SemaPrinter{ + .strings = strings, + }; + + SemaPrinter.print("literal patterns:\n", .{}); + printer.dumpPatterns(grammar.literal_patterns); + + SemaPrinter.print("\nuser patterns:\n", .{}); + printer.dumpPatterns(grammar.patterns); + + SemaPrinter.print("\nstart rule: ", .{}); + if (grammar.start) |start| { + SemaPrinter.print("<{}>\n", .{printer.fmtId(start.rule.name)}); + } else { + SemaPrinter.print("-none-\n", .{}); + } + + SemaPrinter.print("\nast nodes:\n", .{}); + printer.dumpNodes(grammar.nodes); + + SemaPrinter.print("\nrules:\n", .{}); + printer.dumpRules(grammar.rules); +} + +const SemaPrinter = struct { + const print = std.debug.print; + + strings: *const ptk.strings.Pool, + + fn dumpPatterns(printer: SemaPrinter, patterns: sema.StringHashMap(*sema.Pattern)) void { + for (patterns.values()) |pattern| { + print("pattern {} = ", .{printer.fmtId(pattern.name)}); + + switch (pattern.data) { + inline else => |value, tag| print("{s} \"{}\"", .{ @tagName(tag), printer.fmtString(value) }), + } + + print(";\n", .{}); + } + } + + fn dumpNodes(printer: SemaPrinter, nodes: sema.StringHashMap(*sema.Node)) void { + for (nodes.values()) |node| { + print("node {} = ", .{printer.fmtId(node.name)}); + + printer.dumpType(node.type); + + print(";\n", .{}); + } + } + + fn dumpRules(printer: SemaPrinter, rules: sema.StringHashMap(*sema.Rule)) void { + for (rules.values()) |rule| { + print("rule {}", .{printer.fmtId(rule.name)}); + + if (rule.type) |rule_type| { + print(": ", .{}); + printer.dumpType(rule_type); + } + + print(" = ", .{}); + + for (rule.productions, 0..) |production, i| { + if (i > 0) print("\n | ", .{}); + printer.dumpMappedProduction(production); + } + + print(";\n", .{}); + } + } + + fn dumpMappedProduction(printer: SemaPrinter, mapped_prod: sema.MappedProduction) void { + printer.dumpProduction(mapped_prod.production); + + if (mapped_prod.mapping) |mapping| { + print(" -> ", .{}); + printer.dumpMapping(mapping); + } + } + + fn dumpProduction(printer: SemaPrinter, production: sema.Production) void { + switch (production) { + .terminal => |terminal| { + if (terminal.is_literal) { + print("\"{}\"", .{printer.fmtString(terminal.data.literal_match)}); + } else { + print("${}", .{printer.fmtId(terminal.name)}); + } + }, + .recursion => |recursion| print("<{}>", .{printer.fmtId(recursion.name)}), + .sequence => |sequence| { + for (sequence, 0..) 
|item, i| { + if (i > 0) + print(" ", .{}); + printer.dumpProduction(item); + } + }, + .optional => |optional| { + print("(", .{}); + printer.dumpProduction(optional.*); + print(")?", .{}); + }, + .repetition_zero => |repetition_zero| { + print("(", .{}); + printer.dumpProduction(repetition_zero.*); + print(")*", .{}); + }, + .repetition_one => |repetition_one| { + print("(", .{}); + printer.dumpProduction(repetition_one.*); + print(")+", .{}); + }, + } + } + + fn dumpMapping(printer: SemaPrinter, mapping: sema.Mapping) void { + _ = mapping; + _ = printer; + } + + fn dumpType(printer: SemaPrinter, stype: *sema.Type) void { + switch (stype.*) { + .code_literal => |literal| print("`{}`", .{printer.fmtString(literal)}), + .user_type => |literal| print("@{}", .{printer.fmtId(literal)}), + .optional => |inner| { + print("optional ", .{}); + printer.dumpType(inner); + }, + inline .record, .variant => |compound, tag| { + print("{s} ", .{@tagName(tag)}); + for (compound.fields.keys(), compound.fields.values(), 0..) |name, field, i| { + if (i > 0) + print(", ", .{}); + print("{}: ", .{printer.fmtId(name)}); + printer.dumpType(field.type); + } + }, + .named => |other| print("!{}", .{printer.fmtId(other.name)}), + } + } + + fn fmtString(printer: SemaPrinter, str: ptk.strings.String) StringPrinter { + return StringPrinter{ .printer = printer, .str = str, .mode = .text }; + } + + fn fmtId(printer: SemaPrinter, str: ptk.strings.String) StringPrinter { + return StringPrinter{ .printer = printer, .str = str, .mode = .id }; + } + + const StringPrinter = struct { + printer: SemaPrinter, + str: ptk.strings.String, + mode: enum { id, text }, + + pub fn format(strpr: StringPrinter, fmt: []const u8, opt: std.fmt.FormatOptions, writer: anytype) !void { + _ = opt; + _ = fmt; + + const text = strpr.printer.strings.get(strpr.str); + switch (strpr.mode) { + .id => try writer.print("{}", .{std.zig.fmtId(text)}), + .text => try writer.print("{}", .{std.zig.fmtEscapes(text)}), + } + } + }; +}; diff --git a/test/analysis/reject/production-undeclared-pattern-ref.ptk b/test/analysis/reject/production-undeclared-pattern-ref.ptk new file mode 100644 index 0000000..10e66f0 --- /dev/null +++ b/test/analysis/reject/production-undeclared-pattern-ref.ptk @@ -0,0 +1,3 @@ +# expected: E1305 + +rule foo = $pat; \ No newline at end of file diff --git a/test/analysis/reject/production-undeclared-rule-ref.ptk b/test/analysis/reject/production-undeclared-rule-ref.ptk new file mode 100644 index 0000000..a5525cc --- /dev/null +++ b/test/analysis/reject/production-undeclared-rule-ref.ptk @@ -0,0 +1,3 @@ +# expected: E1303 + +rule foo = ; \ No newline at end of file From 768cd58603801fbaf4f3fc7603e3ebe8fd8b11e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Fri, 1 Dec 2023 16:07:28 +0100 Subject: [PATCH 19/20] Adds basic json output of parsed grammar. 
--- src/ptkgen/{ast_dump.zig => dump/ast.zig} | 4 +- src/ptkgen/dump/json.zig | 231 ++++++++++++++++++++ src/ptkgen/{sema_dump.zig => dump/sema.zig} | 4 +- src/ptkgen/intl.zig | 30 ++- src/ptkgen/intl/en.json | 27 ++- src/ptkgen/main.zig | 80 ++++++- 6 files changed, 367 insertions(+), 9 deletions(-) rename src/ptkgen/{ast_dump.zig => dump/ast.zig} (99%) create mode 100644 src/ptkgen/dump/json.zig rename src/ptkgen/{sema_dump.zig => dump/sema.zig} (98%) diff --git a/src/ptkgen/ast_dump.zig b/src/ptkgen/dump/ast.zig similarity index 99% rename from src/ptkgen/ast_dump.zig rename to src/ptkgen/dump/ast.zig index 226b324..9c5d675 100644 --- a/src/ptkgen/ast_dump.zig +++ b/src/ptkgen/dump/ast.zig @@ -1,8 +1,8 @@ const std = @import("std"); const ptk = @import("parser-toolkit"); -const ast = @import("ast.zig"); -const parser = @import("parser.zig"); +const ast = @import("../ast.zig"); +const parser = @import("../parser.zig"); pub fn dump(strings: *const ptk.strings.Pool, decls: parser.Document) void { var printer = AstPrinter{ diff --git a/src/ptkgen/dump/json.zig b/src/ptkgen/dump/json.zig new file mode 100644 index 0000000..f777011 --- /dev/null +++ b/src/ptkgen/dump/json.zig @@ -0,0 +1,231 @@ +const std = @import("std"); +const ptk = @import("parser-toolkit"); + +const sema = @import("../sema.zig"); +const parser = @import("../parser.zig"); + +pub fn createJsonValue( + arena: *std.heap.ArenaAllocator, + strings: *const ptk.strings.Pool, + grammar: sema.Grammar, +) !std.json.Value { + const allocator = arena.allocator(); + + var mapper = JsonMapper{ + .allocator = allocator, + .strings = strings, + }; + + var root = std.json.ObjectMap.init(allocator); + errdefer root.deinit(); + + if (grammar.start) |start| { + try root.put("start", mapper.jsonString(start.rule.name)); + } else { + try root.put("start", .null); + } + + { + var list = mapper.newArray(); + errdefer list.deinit(); + + var iter = grammar.literal_patterns.iterator(); + while (iter.next()) |kvp| { + try list.append(mapper.jsonString(kvp.value_ptr.*.data.literal_match)); + } + + try root.put("literal_patterns", .{ .array = list }); + } + + { + var patterns = std.json.ObjectMap.init(allocator); + errdefer patterns.deinit(); + + var iter = grammar.patterns.iterator(); + while (iter.next()) |kvp| { + const spattern: *sema.Pattern = kvp.value_ptr.*; + + var jpattern = std.json.ObjectMap.init(allocator); + errdefer jpattern.deinit(); + + // try jpattern.put("name", .{ .string = strings.get(spattern.name) }); + try jpattern.put("kind", .{ .string = @tagName(spattern.data) }); + switch (spattern.data) { + inline else => |val| try jpattern.put("data", mapper.jsonString(val)), + } + + try patterns.putNoClobber( + strings.get(kvp.key_ptr.*), + .{ .object = jpattern }, + ); + } + + try root.put("patterns", .{ .object = patterns }); + } + + { + var nodes = std.json.ObjectMap.init(allocator); + errdefer nodes.deinit(); + + var iter = grammar.nodes.iterator(); + while (iter.next()) |kvp| { + const snode: *sema.Node = kvp.value_ptr.*; + + var jtype = try mapper.convertType(snode.type); + + try nodes.putNoClobber( + strings.get(kvp.key_ptr.*), + jtype, + ); + } + + try root.put("ast_nodes", .{ .object = nodes }); + } + + { + var rules = std.json.ObjectMap.init(allocator); + errdefer rules.deinit(); + + var iter = grammar.rules.iterator(); + while (iter.next()) |kvp| { + const srule: *sema.Rule = kvp.value_ptr.*; + + var jrule = mapper.newObject(); + errdefer jrule.deinit(); + + if (srule.type) |rule_type| { + var jtype = try 
mapper.convertType(rule_type); + try jrule.putNoClobber("type", jtype); + } else { + try jrule.putNoClobber("type", .null); + } + + { + var jprods = mapper.newArray(); + errdefer jprods.deinit(); + + try jprods.resize(srule.productions.len); + + for (jprods.items, srule.productions) |*jmprod_val, mapped_production| { + var jmprod = mapper.newObject(); + errdefer jmprod.deinit(); + + var jprod = try mapper.convertProduction(mapped_production.production); + + try jmprod.putNoClobber("production", jprod); + + if (mapped_production.mapping) |mapping| { + var jmap = try mapper.convertMapping(mapping); + try jmprod.putNoClobber("mapping", jmap); + } else { + try jmprod.putNoClobber("mapping", .null); + } + + jmprod_val.* = .{ .object = jmprod }; + } + + try jrule.putNoClobber("mapped_productions", .{ .array = jprods }); + } + + try rules.putNoClobber( + strings.get(kvp.key_ptr.*), + .{ .object = jrule }, + ); + } + + try root.put("rules", .{ .object = rules }); + } + + return std.json.Value{ .object = root }; +} + +const JsonMapper = struct { + allocator: std.mem.Allocator, + strings: *const ptk.strings.Pool, + + fn convertProduction(mapper: JsonMapper, production: sema.Production) error{OutOfMemory}!std.json.Value { + var jtype = mapper.newObject(); + errdefer jtype.deinit(); + + try jtype.putNoClobber("kind", .{ .string = @tagName(production) }); + + const data: std.json.Value = switch (production) { + .terminal => |terminal| blk: { + if (terminal.is_literal) { + try jtype.put("kind", .{ .string = "literal-terminal" }); + } + break :blk mapper.jsonString(terminal.name); + }, + .recursion => |recursion| mapper.jsonString(recursion.name), + + .sequence => |sequence| blk: { + var list = mapper.newArray(); + errdefer list.deinit(); + + try list.resize(sequence.len); + + for (list.items, sequence) |*dst, src| { + dst.* = try mapper.convertProduction(src); + } + + break :blk .{ .array = list }; + }, + + .optional, .repetition_zero, .repetition_one => |optional| try mapper.convertProduction(optional.*), + }; + try jtype.putNoClobber("data", data); + + return .{ .object = jtype }; + } + + fn convertMapping(mapper: JsonMapper, mapping: sema.Mapping) error{OutOfMemory}!std.json.Value { + _ = mapping; + _ = mapper; + + @panic("implement generation of mappings"); + } + + fn convertType(mapper: JsonMapper, stype: *sema.Type) error{OutOfMemory}!std.json.Value { + const data: std.json.Value = switch (stype.*) { + .code_literal, .user_type => |literal| mapper.jsonString(literal), + .named => |named| mapper.jsonString(named.name), + + .optional => |inner| try mapper.convertType(inner), + + .record, .variant => |compound| blk: { + var fields = mapper.newObject(); + errdefer fields.deinit(); + + for (compound.fields.keys(), compound.fields.values()) |name, field| { + var field_type = try mapper.convertType(field.type); + try fields.putNoClobber( + mapper.strings.get(name), + field_type, + ); + } + + break :blk .{ .object = fields }; + }, + }; + + var jtype = mapper.newObject(); + errdefer jtype.deinit(); + + try jtype.putNoClobber("kind", .{ .string = @tagName(stype.*) }); + try jtype.putNoClobber("data", data); + + return .{ .object = jtype }; + } + + fn jsonString(mapper: JsonMapper, string: ptk.strings.String) std.json.Value { + return .{ .string = mapper.strings.get(string) }; + } + + fn newObject(mapper: JsonMapper) std.json.ObjectMap { + return std.json.ObjectMap.init(mapper.allocator); + } + + fn newArray(mapper: JsonMapper) std.json.Array { + return std.json.Array.init(mapper.allocator); + } +}; diff 
--git a/src/ptkgen/sema_dump.zig b/src/ptkgen/dump/sema.zig similarity index 98% rename from src/ptkgen/sema_dump.zig rename to src/ptkgen/dump/sema.zig index 3fe684f..f338341 100644 --- a/src/ptkgen/sema_dump.zig +++ b/src/ptkgen/dump/sema.zig @@ -1,8 +1,8 @@ const std = @import("std"); const ptk = @import("parser-toolkit"); -const sema = @import("sema.zig"); -const parser = @import("parser.zig"); +const sema = @import("../sema.zig"); +const parser = @import("../parser.zig"); pub fn dump(strings: *const ptk.strings.Pool, grammar: sema.Grammar) void { var printer = SemaPrinter{ diff --git a/src/ptkgen/intl.zig b/src/ptkgen/intl.zig index 13ff049..fa0e3d5 100644 --- a/src/ptkgen/intl.zig +++ b/src/ptkgen/intl.zig @@ -76,14 +76,40 @@ pub const Localization = struct { FileTooBig: []const u8, InvalidSourceEncoding: []const u8, + + DiskQuota: []const u8, + NoSpaceLeft: []const u8, + DeviceBusy: []const u8, + InvalidArgument: []const u8, + NotOpenForWriting: []const u8, + LockViolation: []const u8, + ProcessFdQuotaExceeded: []const u8, + SystemFdQuotaExceeded: []const u8, + SharingViolation: []const u8, + PathAlreadyExists: []const u8, + FileNotFound: []const u8, + PipeBusy: []const u8, + NameTooLong: []const u8, + InvalidUtf8: []const u8, + BadPathName: []const u8, + NetworkNotFound: []const u8, + InvalidHandle: []const u8, + SymLinkLoop: []const u8, + NoDevice: []const u8, + NotDir: []const u8, + FileLocksNotSupported: []const u8, + FileBusy: []const u8, + LinkQuotaExceeded: []const u8, + ReadOnlyFileSystem: []const u8, + RenameAcrossMountPoints: []const u8, }, pub fn generate(comptime buffer: []const u8) Localization { @setEvalBranchQuota(1_000_000); - var alloc_buf: [buffer.len]u8 = undefined; + var alloc_buf: [4 * buffer.len]u8 = undefined; var fba = std.heap.FixedBufferAllocator.init(&alloc_buf); - return std.json.parseFromSliceLeaky(Localization, fba.allocator(), buffer, .{}) catch @compileError("failed to parse json"); + return std.json.parseFromSliceLeaky(Localization, fba.allocator(), buffer, .{}) catch |err| @compileError(std.fmt.comptimePrint("failed to parse json: {}", .{err})); } }; diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index c0f5eb8..fe62e4d 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -49,6 +49,31 @@ "NotOpenForReading": "not open for reading", "NetNameDeleted": "net name deleted", "FileTooBig": "Input file exceeds resources", - "InvalidSourceEncoding": "invalid source encoding" + "InvalidSourceEncoding": "invalid source encoding", + "DiskQuota": "disk quota", + "NoSpaceLeft": "no space left", + "DeviceBusy": "device busy", + "InvalidArgument": "invalid argument", + "NotOpenForWriting": "not open for writing", + "LockViolation": "lock violation", + "ProcessFdQuotaExceeded": "process fd quota exceeded", + "SystemFdQuotaExceeded": "system fd quota exceeded", + "SharingViolation": "sharing violation", + "PathAlreadyExists": "path already exists", + "FileNotFound": "file not found", + "PipeBusy": "pipe busy", + "NameTooLong": "name too long", + "InvalidUtf8": "invalid utf8", + "BadPathName": "bad path name", + "NetworkNotFound": "network not found", + "InvalidHandle": "invalid handle", + "SymLinkLoop": "sym link loop", + "NoDevice": "no device", + "NotDir": "not dir", + "FileLocksNotSupported": "file locks not supported", + "FileBusy": "file busy", + "LinkQuotaExceeded": "link quota exceeded", + "ReadOnlyFileSystem": "read only file system", + "RenameAcrossMountPoints": "rename across mount points" } } \ No newline at end of file diff 
--git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 42ba2c0..2983bd1 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -10,8 +10,9 @@ const ast = @import("ast.zig"); const sema = @import("sema.zig"); const intl = @import("intl.zig"); const parser = @import("parser.zig"); -const ast_dump = @import("ast_dump.zig"); -const sema_dump = @import("sema_dump.zig"); +const ast_dump = @import("dump/ast.zig"); +const sema_dump = @import("dump/sema.zig"); +const json_dump = @import("dump/json.zig"); const Diagnostics = @import("Diagnostics.zig"); @@ -20,11 +21,17 @@ comptime { _ = parser; } +pub const Format = enum { + json, + // zig, +}; + pub const CliOptions = struct { help: bool = false, output: ?[]const u8 = null, test_mode: TestMode = .none, trace: bool = false, + format: Format = .json, @"max-file-size": u32 = 4 * 1024, // 4 MB of source code is a lot! @@ -47,6 +54,8 @@ pub const CliOptions = struct { .@"max-file-size" = "Maximum input file size in KiB (default: 4096)", .trace = "Prints a parse trace", + + .format = "Selects the output format of the grammar. Can be one of [ json, zig ]", }, }; }; @@ -196,6 +205,7 @@ fn convertErrorToDiagnostics(diagnostics: *Diagnostics, file_name: []const u8, e }, .file_limit_exceeded, .{}); }, + // input errors: error.InputOutput, error.AccessDenied, error.BrokenPipe, @@ -208,6 +218,33 @@ fn convertErrorToDiagnostics(diagnostics: *Diagnostics, file_name: []const u8, e error.ConnectionTimedOut, error.NotOpenForReading, error.NetNameDeleted, + + // output errors: + error.DiskQuota, + error.NoSpaceLeft, + error.DeviceBusy, + error.InvalidArgument, + error.NotOpenForWriting, + error.LockViolation, + error.ProcessFdQuotaExceeded, + error.SystemFdQuotaExceeded, + error.SharingViolation, + error.PathAlreadyExists, + error.FileNotFound, + error.PipeBusy, + error.NameTooLong, + error.InvalidUtf8, + error.BadPathName, + error.NetworkNotFound, + error.InvalidHandle, + error.SymLinkLoop, + error.NoDevice, + error.NotDir, + error.FileLocksNotSupported, + error.FileBusy, + error.LinkQuotaExceeded, + error.ReadOnlyFileSystem, + error.RenameAcrossMountPoints, => |e| { try diagnostics.emit(.{ .source = file_name, @@ -319,4 +356,43 @@ fn compileFile( std.debug.print("\n\nsema dump:\n", .{}); sema_dump.dump(string_pool, grammar); } + + if (options.test_mode != .none) + return; + + // Output generation: + { + const use_stdout = (options.output == null) or std.mem.eql(u8, options.output.?, "-"); + + var atomic_output_file: std.fs.AtomicFile = undefined; + if (!use_stdout) { + atomic_output_file = try std.fs.cwd().atomicFile(options.output.?, .{}); + } + defer if (!use_stdout) + atomic_output_file.deinit(); + + var output_file = if (use_stdout) + std.io.getStdOut() + else + atomic_output_file.file; + + // write to output_file here: + switch (options.format) { + .json => { + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + var json_repr: std.json.Value = try json_dump.createJsonValue( + &arena, + string_pool, + grammar, + ); + + try std.json.stringify(json_repr, .{}, output_file.writer()); + }, + } + + if (!use_stdout) + try atomic_output_file.finish(); + } } From ee986c1c5daab34c3bf790476d475fa6b4f46d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20=22xq=22=20Quei=C3=9Fner?= Date: Mon, 3 Jun 2024 20:11:39 +0200 Subject: [PATCH 20/20] Backup --- docs/semantics.md | 23 ++ src/ptkgen/Diagnostics.zig | 53 +++ src/ptkgen/dump/json.zig | 69 +++- src/ptkgen/intl/en.json | 9 +- src/ptkgen/main.zig | 7 +- src/ptkgen/sema.zig | 385 
+++++++++++++++++- .../accept/map-simple-builtin-fncall-0.ptk | 1 + .../accept/map-simple-builtin-fncall-1.ptk | 1 + .../accept/map-simple-builtin-fncall-4.ptk | 1 + .../accept/map-simple-code-literal.ptk | 1 + test/analysis/accept/map-simple-list-0.ptk | 1 + test/analysis/accept/map-simple-list-1.ptk | 1 + test/analysis/accept/map-simple-list-4.ptk | 1 + test/analysis/accept/map-simple-record-0.ptk | 1 + test/analysis/accept/map-simple-record-1.ptk | 1 + test/analysis/accept/map-simple-record-4.ptk | 1 + test/analysis/accept/map-simple-ruleref.ptk | 1 + .../accept/map-simple-user-fncall-0.ptk | 1 + .../accept/map-simple-user-fncall-1.ptk | 1 + .../accept/map-simple-user-fncall-4.ptk | 1 + .../accept/map-simple-user-literal.ptk | 1 + test/analysis/accept/map-simple-variant.ptk | 1 + test/analysis/reject/map-ruleref-oob.ptk | 2 + 23 files changed, 555 insertions(+), 9 deletions(-) create mode 100644 docs/semantics.md create mode 100644 test/analysis/accept/map-simple-builtin-fncall-0.ptk create mode 100644 test/analysis/accept/map-simple-builtin-fncall-1.ptk create mode 100644 test/analysis/accept/map-simple-builtin-fncall-4.ptk create mode 100644 test/analysis/accept/map-simple-code-literal.ptk create mode 100644 test/analysis/accept/map-simple-list-0.ptk create mode 100644 test/analysis/accept/map-simple-list-1.ptk create mode 100644 test/analysis/accept/map-simple-list-4.ptk create mode 100644 test/analysis/accept/map-simple-record-0.ptk create mode 100644 test/analysis/accept/map-simple-record-1.ptk create mode 100644 test/analysis/accept/map-simple-record-4.ptk create mode 100644 test/analysis/accept/map-simple-ruleref.ptk create mode 100644 test/analysis/accept/map-simple-user-fncall-0.ptk create mode 100644 test/analysis/accept/map-simple-user-fncall-1.ptk create mode 100644 test/analysis/accept/map-simple-user-fncall-4.ptk create mode 100644 test/analysis/accept/map-simple-user-literal.ptk create mode 100644 test/analysis/accept/map-simple-variant.ptk create mode 100644 test/analysis/reject/map-ruleref-oob.ptk diff --git a/docs/semantics.md b/docs/semantics.md new file mode 100644 index 0000000..7d23443 --- /dev/null +++ b/docs/semantics.md @@ -0,0 +1,23 @@ +# PtkGen Semantics + +## Context References + +tl;dr: `$n` can access the elements of the top-level productions of a rule. + +```rb +rule r = "hello" "world" => $0; # access "hello" +rule r = "hello" "world" => $1; # access "world" +``` + +### Index Resolution + +1. Flatten hierarchy +2. Use index in flattened list + +```rb +rule r = a b c d e f g h; # [ a b c d e f g h ] => flat sequence +rule r = a b ( c d e )? f g h; # [ a b c? d? e? 
f g h ] => `c`, `d`, `e` get promoted to optional) +rule r = a b ( c d e )* f g h; # [ a b [[c d e]] f g h ] => `c d e` get promoted to list of lists ([[c d e], [c d e], ...]) +rule r = a b ( c d e )+ f g h; # [ a b [[c d e]] f g h ] => `c d e` get promoted to list of lists ([[c d e], [c d e], ...]) +rule r = a b ( c d e ) f g h; # [ a b c d e f g h ] => `c d e` gets flattened into the master list +``` diff --git a/src/ptkgen/Diagnostics.zig b/src/ptkgen/Diagnostics.zig index 16677f5..6559663 100644 --- a/src/ptkgen/Diagnostics.zig +++ b/src/ptkgen/Diagnostics.zig @@ -54,6 +54,18 @@ pub const Code = enum(u16) { duplicate_compound_field = 1307, + context_reference_out_of_bounds = 1308, + + variant_does_not_exist = 1309, + + record_field_does_not_exist = 1310, + record_field_already_initialized = 1311, + record_field_not_initialized = 1312, + + mapping_requires_typed_rule = 1313, + + invalid_builtin_function = 1314, + // semantic warnings (4000-4099): missing_start_symbol = 4000, @@ -221,6 +233,35 @@ pub fn Data(comptime code: Code) type { previous_location: ptk.Location, }, + .context_reference_out_of_bounds => struct { + index: u32, + limit: u32, + }, + + .variant_does_not_exist => struct { + field: []const u8, + type_location: ptk.Location, + }, + + .record_field_does_not_exist => struct { + field: []const u8, + type_location: ptk.Location, + }, + .record_field_already_initialized => struct { + field: []const u8, + prev_init: ptk.Location, + }, + .record_field_not_initialized => struct { + field: []const u8, + field_location: ptk.Location, + }, + + .mapping_requires_typed_rule => NoDiagnosticData, + + .invalid_builtin_function => struct { + name: []const u8, + }, + // else => @compileError(std.fmt.comptimePrint("Code {} has no diagnostic type associated!", .{code})), }; } @@ -342,6 +383,18 @@ fn Formatter(comptime T: type) type { } }, + // integers: + + u32 => struct { + value: T, + pub fn format(item: @This(), fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void { + _ = options; + _ = fmt; + + try writer.print("{}", .{item.value}); + } + }, + else => @compileError(std.fmt.comptimePrint("{s} is not a supported diagnostic type!", .{@typeName(T)})), }; } diff --git a/src/ptkgen/dump/json.zig b/src/ptkgen/dump/json.zig index f777011..0da58ee 100644 --- a/src/ptkgen/dump/json.zig +++ b/src/ptkgen/dump/json.zig @@ -179,10 +179,71 @@ const JsonMapper = struct { } fn convertMapping(mapper: JsonMapper, mapping: sema.Mapping) error{OutOfMemory}!std.json.Value { - _ = mapping; - _ = mapper; + var jtype = mapper.newObject(); + errdefer jtype.deinit(); + + try jtype.putNoClobber("kind", .{ .string = @tagName(mapping) }); + + switch (mapping) { + .record_initializer => |record_initializer| { + var list = mapper.newArray(); + errdefer list.deinit(); + + try list.resize(record_initializer.fields.len); + + for (list.items, record_initializer.fields) |*dst, src| { + var jfield = mapper.newObject(); + errdefer jfield.deinit(); + + try jfield.putNoClobber("field", mapper.jsonString(src.field.name)); + try jfield.putNoClobber("value", try mapper.convertMapping(src.value)); + + dst.* = .{ .object = jfield }; + } + + try jtype.putNoClobber("fields", .{ .array = list }); + }, + .list_initializer => |list_initializer| { + var list = mapper.newArray(); + errdefer list.deinit(); + + try list.resize(list_initializer.items.len); + + for (list.items, list_initializer.items) |*dst, src| { + dst.* = try mapper.convertMapping(src); + } - @panic("implement generation of mappings"); + try 
jtype.putNoClobber("items", .{ .array = list }); + }, + .variant_initializer => |variant_initializer| { + try jtype.putNoClobber("field", mapper.jsonString(variant_initializer.field.name)); + try jtype.putNoClobber("value", try mapper.convertMapping(variant_initializer.value.*)); + }, + .user_function_call, .builtin_function_call => |function_call| { + var list = mapper.newArray(); + errdefer list.deinit(); + + try list.resize(function_call.arguments.len); + + for (list.items, function_call.arguments) |*dst, src| { + dst.* = try mapper.convertMapping(src); + } + + try jtype.putNoClobber("arguments", .{ .array = list }); + + try jtype.putNoClobber("function", mapper.jsonString(function_call.function)); + }, + + .code_literal, .user_literal => |literal| { + try jtype.putNoClobber("literal", mapper.jsonString(literal)); + }, + + .context_reference => |context_reference| { + try jtype.putNoClobber("index", .{ .integer = context_reference.index }); + }, + } + + return .{ .object = jtype }; } fn convertType(mapper: JsonMapper, stype: *sema.Type) error{OutOfMemory}!std.json.Value { @@ -206,6 +267,8 @@ const JsonMapper = struct { break :blk .{ .object = fields }; }, + + .token => .null, }; var jtype = mapper.newObject(); diff --git a/src/ptkgen/intl/en.json b/src/ptkgen/intl/en.json index fe62e4d..00ec0ea 100644 --- a/src/ptkgen/intl/en.json +++ b/src/ptkgen/intl/en.json @@ -30,7 +30,14 @@ "reference_to_undeclared_pattern": "Reference to undeclared pattern '{[identifier]}'.", "missing_start_symbol": "Grammar file has no start symbol declared.", "multiple_start_symbols": "Another start rule '({[identifier]})' was already declared here: {[previous_location]}", - "duplicate_compound_field": "Another field named '{s}' was already declared here: {[previous_location]}" + "duplicate_compound_field": "Another field named '{[identifier]s}' was already declared here: {[previous_location]}", + "context_reference_out_of_bounds": "Context reference index out of bounds. {[index]} was given, but the highest possible index is {[limit]}.", + "variant_does_not_exist": "The variant field {[identifier]s} does not exist. The variant type is declared here: {[type_location]}", + "record_field_does_not_exist": "The record field {[field]s} does not exist. The record type is declared here: {[type_location]}", + "record_field_already_initialized": "The record field {[field]s} is already initialized. Previous initialization: {[prev_init]}", + "record_field_not_initialized": "The record field {[field]s} was not initialized. Field declared here: {[field_location]}", + "mapping_requires_typed_rule": "The use of a rule mapping requires that the rule has an explicitly declared type.", + "invalid_builtin_function": "The builtin function {[name]s} does not exist!" }, "errors": { "SyntaxError": "syntax error", diff --git a/src/ptkgen/main.zig b/src/ptkgen/main.zig index 2983bd1..699b991 100644 --- a/src/ptkgen/main.zig +++ b/src/ptkgen/main.zig @@ -35,9 +35,12 @@ pub const CliOptions = struct { @"max-file-size": u32 = 4 * 1024, // 4 MB of source code is a lot! + dump: bool = false, + pub const shorthands = .{ .h = "help", .o = "output", + .D = "dump", }; pub const meta = .{ @@ -56,6 +59,8 @@ pub const CliOptions = struct { .trace = "Prints a parse trace", .format = "Selects the output format of the grammar. 
Can be one of [ json, zig ]", + + .dump = "Dumps results from parser and sema to stderr.", }, }; }; @@ -349,7 +354,7 @@ fn compileFile( // TODO: Implement parsergen / tablegen / highlightergen - if (options.test_mode == .none) { + if (options.dump) { std.debug.print("ast dump:\n", .{}); ast_dump.dump(string_pool, tree); diff --git a/src/ptkgen/sema.zig b/src/ptkgen/sema.zig index ba18328..bd64364 100644 --- a/src/ptkgen/sema.zig +++ b/src/ptkgen/sema.zig @@ -54,6 +54,8 @@ pub const MappedProduction = struct { mapping: ?Mapping, }; +/// A production is a part of a grammar. Productions consume +/// tokens and generate structure from this. pub const Production = union(enum) { terminal: *Pattern, // literal and terminal ast nodes are wrapped to this recursion: *Rule, // @@ -63,8 +65,51 @@ pub const Production = union(enum) { repetition_one: *Production, // [ ... ]+ }; -pub const Mapping = struct { - // +pub const Mapping = union(enum) { + record_initializer: RecordInitializer, // { a = b, c = d, ... } + list_initializer: ListInitializer, // [ a, b, c, ... ] + variant_initializer: VariantInitializer, // field: ... + + user_function_call: FunctionCall, // @builtin(a,b,c) + builtin_function_call: FunctionCall, // identifier(a,b,c) + + code_literal: String, // `code` + user_literal: String, // @user_data + + context_reference: ContextReference, // $0 +}; + +pub const ContextReference = struct { + index: u32, + production: *Production, + type: *Type, +}; + +const RecordInitializer = struct { + type: *Type, + fields: []FieldInitializer, +}; + +const FieldInitializer = struct { + field: *Field, + value: Mapping, +}; + +const ListInitializer = struct { + type: *Type, + items: []Mapping, +}; + +const VariantInitializer = struct { + type: *Type, + field: *Field, + value: *Mapping, +}; + +const FunctionCall = struct { + return_type: ?*Type, + function: String, + arguments: []Mapping, }; pub const Node = struct { @@ -101,6 +146,9 @@ pub const Type = union(enum) { // ast nodes are basically "named types" and must be handled as such named: *Node, + // builtin types: + token, // points to a PTK token + pub fn id(t: *const Type) TypeId { return @as(TypeId, t.*); } @@ -114,7 +162,7 @@ pub const CompoundType = struct { pub const Field = struct { location: ptk.Location, - // name: String, + name: String, type: *Type, }; @@ -169,6 +217,8 @@ var BAD_TYPE_SENTINEL: Type = undefined; var BAD_NODE_SENTINEL: Node = undefined; var BAD_RULE_SENTINEL: Rule = undefined; var BAD_PATTERN_SENTINEL: Pattern = undefined; +var BAD_PRODUCTION_SENTINEL: Production = undefined; +var BAD_FIELD_SENTINEL: Field = undefined; fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { // Phase 0: Validate productions on legality (coarse error checking) @@ -193,7 +243,8 @@ fn innerAnalysis(analyzer: *Analyzer) AnalyzeError!void { try analyzer.iterateOn(.rule, Analyzer.instantiateRules); // Phase 5: Instantiate and validate AST mappings - + try analyzer.iterateOn(.rule, Analyzer.instantiateMappings); // Create data structures + try analyzer.iterateOn(.rule, Analyzer.linkAndValidateMappedProductions); // Validate if data tr } const Analyzer = struct { @@ -494,6 +545,313 @@ const Analyzer = struct { } } + fn instantiateMappings(analyzer: *Analyzer, ast_rule: *ast.Rule) !void { + const sem_rule: *Rule = analyzer.target.rules.get(ast_rule.name.value).?; + + var iter = ast.iterate(ast_rule.productions); + + for (sem_rule.productions) |*sem_prod| { + const ast_prod = iter.next().?; + sem_prod.mapping = if (ast_prod.mapping) |src_mapping| + try 
analyzer.translateMapping(src_mapping) + else + null; + } + std.debug.assert(iter.next() == null); + } + + fn translateMapping(analyzer: *Analyzer, ast_mapping: ast.AstMapping) error{OutOfMemory}!Mapping { + switch (ast_mapping) { + .literal => |ref| return Mapping{ .code_literal = ref.value }, + .user_reference => |ref| return Mapping{ .code_literal = ref.value }, + + .context_reference => |ast_context_reference| { + return Mapping{ + .context_reference = .{ + .index = ast_context_reference.index, + .production = &BAD_PRODUCTION_SENTINEL, + .type = &BAD_TYPE_SENTINEL, + }, + }; + }, + + inline .user_function_call, .function_call => |function_call| { + const function_name = function_call.function.value; + + var args = try analyzer.target.arena.allocator().alloc(Mapping, function_call.arguments.len()); + errdefer analyzer.target.arena.allocator().free(args); + + var iter = ast.iterate(function_call.arguments); + for (args) |*item| { + const src = iter.next().?; + item.* = try analyzer.translateMapping(src.*); + } + std.debug.assert(iter.next() == null); + + const fncall = FunctionCall{ + .arguments = args, + .function = function_name, + .return_type = null, + }; + + return switch (ast_mapping) { + .user_function_call => Mapping{ .user_function_call = fncall }, + .function_call => Mapping{ .builtin_function_call = fncall }, + else => unreachable, + }; + }, + + .variant => |ast_variant| { + const init_expr = try analyzer.translateMapping(ast_variant.value.*); + + // ast_variant.field.value + return Mapping{ + .variant_initializer = .{ + .type = &BAD_TYPE_SENTINEL, + .field = &BAD_FIELD_SENTINEL, + .value = try moveToHeap(&analyzer.target.arena, Mapping, init_expr), + }, + }; + }, + + .list => |ast_list| { + var items = try analyzer.target.arena.allocator().alloc(Mapping, ast_list.len()); + errdefer analyzer.target.arena.allocator().free(items); + + var iter = ast.iterate(ast_list); + for (items) |*item| { + const src = iter.next().?; + item.* = try analyzer.translateMapping(src.*); + } + std.debug.assert(iter.next() == null); + + return Mapping{ + .list_initializer = .{ + .items = items, + .type = &BAD_TYPE_SENTINEL, + }, + }; + }, + + .record => |ast_record| { + var fields = try analyzer.target.arena.allocator().alloc(FieldInitializer, ast_record.len()); + errdefer analyzer.target.arena.allocator().free(fields); + + var iter = ast.iterate(ast_record); + for (fields) |*item| { + const src = iter.next().?; + const field_name = src.field.value; + _ = field_name; + item.* = .{ + .field = &BAD_FIELD_SENTINEL, + .value = try analyzer.translateMapping(src.value.*), + }; + } + std.debug.assert(iter.next() == null); + + return Mapping{ + .record_initializer = .{ + .fields = fields, + .type = &BAD_TYPE_SENTINEL, + }, + }; + }, + } + } + + const TypeTransform = struct { + optional: bool = false, + sequence: bool = false, + + pub fn add(tt: TypeTransform, comptime field: enum { optional, sequence }) TypeTransform { + var copy = tt; + @field(copy, @tagName(field)) = true; + return copy; + } + + pub fn format(tt: TypeTransform, fmt: []const u8, opt: std.fmt.FormatOptions, writer: anytype) !void { + _ = fmt; + _ = opt; + var list = std.BoundedArray([]const u8, 2){}; + + if (tt.optional) list.appendAssumeCapacity("opt"); + if (tt.sequence) list.appendAssumeCapacity("seq"); + + try writer.writeAll("TypeTransform("); + + if (list.len == 0) { + try writer.writeAll("none"); + } else { + for (list.slice(), 0..) 
|item, i| { + if (i > 0) + try writer.writeAll(","); + try writer.writeAll(item); + } + } + + try writer.writeAll(")"); + } + }; + + const IndexedProd = struct { + transform: TypeTransform, + production: *Production, + }; + + const ProductionIndex = std.ArrayList(IndexedProd); + + fn linkAndValidateMappedProductions(analyzer: *Analyzer, ast_rule: *ast.Rule) !void { + const sem_rule: *Rule = analyzer.target.rules.get(ast_rule.name.value).?; + + const has_any_mapping = for (sem_rule.productions) |prod| { + if (prod.mapping != null) + break true; + } else false; + + if (has_any_mapping and sem_rule.type == null) { + try analyzer.emitDiagnostic(sem_rule.location, .mapping_requires_typed_rule, .{}); + return; + } + + if (!has_any_mapping) { + // We're done here, nothing to link and validate. + return; + } + + const rule_type = sem_rule.type.?; + + var iter = ast.iterate(ast_rule.productions); + + var prod_index = ProductionIndex.init(analyzer.arena); + defer prod_index.deinit(); + + for (sem_rule.productions) |*sem_prod| { + const ast_prod = iter.next().?; + + if (ast_prod.mapping) |src_mapping| { + const dst_mapping = &sem_prod.mapping.?; + + // Rebuild index: + prod_index.shrinkRetainingCapacity(0); + try analyzer.rebuildProductionIndex(&prod_index, &sem_prod.production, .{}); + + std.debug.print("index:\n", .{}); + for (0.., prod_index.items) |index, item| { + std.debug.print("[{}]: {} {s}\n", .{ index, item.transform, @tagName(item.production.*) }); + } + + try analyzer.linkAndValidateMapping( + rule_type, + dst_mapping, + src_mapping, + prod_index.items, + ); + } else { + std.debug.assert(sem_prod.mapping == null); + } + } + + std.debug.assert(iter.next() == null); + } + + fn rebuildProductionIndex(analyzer: *Analyzer, prod_index: *ProductionIndex, production: *Production, transform: TypeTransform) error{OutOfMemory}!void { + switch (production.*) { + // Those are terminals and will be appended as-is: + .terminal => try prod_index.append(.{ .production = production, .transform = transform }), + .recursion => try prod_index.append(.{ .production = production, .transform = transform }), + + // Sequences are unwrapped: + .sequence => |list| for (list) |*inner_prod| { + try analyzer.rebuildProductionIndex(prod_index, inner_prod, transform); + }, + + // They just "recurse" into their inner workings, but annotate type changes: + .optional => |inner_prod| { + try analyzer.rebuildProductionIndex(prod_index, inner_prod, transform.add(.optional)); + }, + + .repetition_zero => |inner_prod| { + try analyzer.rebuildProductionIndex(prod_index, inner_prod, transform.add(.sequence)); + }, + + .repetition_one => |inner_prod| { + try analyzer.rebuildProductionIndex(prod_index, inner_prod, transform.add(.sequence)); + }, + } + } + + fn linkAndValidateMapping(analyzer: *Analyzer, type_context: *Type, sem_map: *Mapping, ast_map: ast.AstMapping, production_index: []const IndexedProd) !void { + _ = type_context; + + switch (sem_map.*) { + // Always fine, and terminate recursion: + .code_literal, .user_literal => {}, + + // Rule refs: + + .context_reference => |*context_reference| { + if (context_reference.index >= production_index.len) { + context_reference.production = &BAD_PRODUCTION_SENTINEL; + try analyzer.emitDiagnostic(ast_map.context_reference.location, .context_reference_out_of_bounds, .{ + .index = context_reference.index, + .limit = @as(u32, @truncate(production_index.len - 1)), // should never underflow as empty rules are illegal + }); + return; + } + + context_reference.production = 
production_index[context_reference.index].production;

                const base_type: *Type = switch (context_reference.production.*) {
                    // Terminals always map to the canonical token type:
                    .terminal => blk: {
                        var proto: Type = .token;
                        const canon = try analyzer.getCanonicalType(&proto);
                        std.debug.assert(canon != &proto);
                        break :blk canon;
                    },

                    // Invocations of other rules take on the type of the referenced rule:
                    .recursion => |rule| rule.type,

                    .sequence,
                    .optional,
                    .repetition_zero,
                    .repetition_one,
                    => unreachable, // we should not be able to reach those

                };

                // TODO: Transform type for context reference

                context_reference.type = base_type;
            },

            // Calls:

            .user_function_call => |*user_function_call| {
                _ = user_function_call;
            },

            .builtin_function_call => |*builtin_function_call| {
                _ = builtin_function_call;
            },

            // Compounds:

            .record_initializer => |*record_initializer| {
                _ = record_initializer;
            },

            .list_initializer => |*list_initializer| {
                _ = list_initializer;
            },

            .variant_initializer => |*variant_initializer| {
                _ = variant_initializer;
            },
        }
    }

    /// Checks if the given type is semantically ok or emits compiler errors if not.
    fn validateType(analyzer: *Analyzer, type_node: *Type) error{OutOfMemory}!void {
        if (type_node == &BAD_TYPE_SENTINEL) {
@@ -547,6 +905,7 @@ const Analyzer = struct {
             gop_result.value_ptr.* = .{
                 .type = field_type,
                 .location = field_def.location,
+                .name = field_def.name.value,
             };
         }
 
@@ -584,6 +943,10 @@
 
         errdefer if (compound_type) |ct| analyzer.destroyCompoundType(ct);
 
+        return try analyzer.getCanonicalType(&proto_type);
+    }
+
+    fn getCanonicalType(analyzer: *Analyzer, proto_type: *Type) error{OutOfMemory}!*Type {
         if (analyzer.getUniqueTypeHandle(&proto_type)) |resolved_type| {
             analyzer.deduplicated_type_count += 1;
             // logger.debug("deduplicated a {s}", .{@tagName(resolved_type.*)});
@@ -678,3 +1041,17 @@ const TypeContext = struct {
         return hasher.final();
     }
 };
+
+fn moveToHeap(arena: *std.heap.ArenaAllocator, comptime T: type, template: T) error{OutOfMemory}!*T {
+    const dupe = try arena.allocator().create(T);
+    dupe.* = template;
+    return dupe;
+}
+
+pub const BuiltinFunction = struct {
+    name: []const u8,
+};
+
+pub const builtins = struct {
+    pub const foo = BuiltinFunction{ .name = "foo" };
+};
diff --git a/test/analysis/accept/map-simple-builtin-fncall-0.ptk b/test/analysis/accept/map-simple-builtin-fncall-0.ptk
new file mode 100644
index 0000000..b4d4eec
--- /dev/null
+++ b/test/analysis/accept/map-simple-builtin-fncall-0.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => builtin();
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-builtin-fncall-1.ptk b/test/analysis/accept/map-simple-builtin-fncall-1.ptk
new file mode 100644
index 0000000..21ebc7f
--- /dev/null
+++ b/test/analysis/accept/map-simple-builtin-fncall-1.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => builtin(`1`);
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-builtin-fncall-4.ptk b/test/analysis/accept/map-simple-builtin-fncall-4.ptk
new file mode 100644
index 0000000..09e4372
--- /dev/null
+++ b/test/analysis/accept/map-simple-builtin-fncall-4.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => builtin(`1`, `2`, `3`, `4`);
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-code-literal.ptk b/test/analysis/accept/map-simple-code-literal.ptk
new file mode 100644
index 0000000..475f0a4
--- /dev/null
+++ b/test/analysis/accept/map-simple-code-literal.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => `code`;
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-list-0.ptk b/test/analysis/accept/map-simple-list-0.ptk
new file mode 100644
index 0000000..dffe97f
--- /dev/null
+++ b/test/analysis/accept/map-simple-list-0.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { };
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-list-1.ptk b/test/analysis/accept/map-simple-list-1.ptk
new file mode 100644
index 0000000..ab4e2c2
--- /dev/null
+++ b/test/analysis/accept/map-simple-list-1.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { `1` };
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-list-4.ptk b/test/analysis/accept/map-simple-list-4.ptk
new file mode 100644
index 0000000..3f970b9
--- /dev/null
+++ b/test/analysis/accept/map-simple-list-4.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { `1`, `2`, `3`, `4` };
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-record-0.ptk b/test/analysis/accept/map-simple-record-0.ptk
new file mode 100644
index 0000000..8f1a98c
--- /dev/null
+++ b/test/analysis/accept/map-simple-record-0.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { };
diff --git a/test/analysis/accept/map-simple-record-1.ptk b/test/analysis/accept/map-simple-record-1.ptk
new file mode 100644
index 0000000..4cf6bfd
--- /dev/null
+++ b/test/analysis/accept/map-simple-record-1.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { field = `1` };
diff --git a/test/analysis/accept/map-simple-record-4.ptk b/test/analysis/accept/map-simple-record-4.ptk
new file mode 100644
index 0000000..5f03773
--- /dev/null
+++ b/test/analysis/accept/map-simple-record-4.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => { x = `1`, y = `2`, z = `3`, w = `4` };
diff --git a/test/analysis/accept/map-simple-ruleref.ptk b/test/analysis/accept/map-simple-ruleref.ptk
new file mode 100644
index 0000000..4e0bc07
--- /dev/null
+++ b/test/analysis/accept/map-simple-ruleref.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => $0;
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-user-fncall-0.ptk b/test/analysis/accept/map-simple-user-fncall-0.ptk
new file mode 100644
index 0000000..82eb16e
--- /dev/null
+++ b/test/analysis/accept/map-simple-user-fncall-0.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => @userFn();
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-user-fncall-1.ptk b/test/analysis/accept/map-simple-user-fncall-1.ptk
new file mode 100644
index 0000000..b6b55fe
--- /dev/null
+++ b/test/analysis/accept/map-simple-user-fncall-1.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => @userFn(`1`);
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-user-fncall-4.ptk b/test/analysis/accept/map-simple-user-fncall-4.ptk
new file mode 100644
index 0000000..ab0bcb2
--- /dev/null
+++ b/test/analysis/accept/map-simple-user-fncall-4.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => @userFn(`1`, `2`, `3`, `4`);
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-user-literal.ptk b/test/analysis/accept/map-simple-user-literal.ptk
new file mode 100644
index 0000000..afef9ad
--- /dev/null
+++ b/test/analysis/accept/map-simple-user-literal.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => @externalThingy;
\ No newline at end of file
diff --git a/test/analysis/accept/map-simple-variant.ptk b/test/analysis/accept/map-simple-variant.ptk
new file mode 100644
index 0000000..229b3cb
--- /dev/null
+++ b/test/analysis/accept/map-simple-variant.ptk
@@ -0,0 +1 @@
+rule basic = "hello" => field: `code`;
\ No newline at end of file
diff --git a/test/analysis/reject/map-ruleref-oob.ptk b/test/analysis/reject/map-ruleref-oob.ptk
new file mode 100644
index 0000000..8af2ba4
--- /dev/null
+++ b/test/analysis/reject/map-ruleref-oob.ptk
@@ -0,0 +1,2 @@
+# expected: E1308
+rule basic = "hello" => $1;
\ No newline at end of file
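
Side note on the mapping analysis above: the standalone Zig sketch below is illustrative only and is not taken from the patch; the names Prod and flatten are made up stand-ins for the real Production union and rebuildProductionIndex. It shows the idea behind the production index that backs `$n` context references: terminals and rule invocations are appended as indexable items, sequences are unwrapped, and optionals/repetitions are recursed into, so a rule that matches a single terminal only exposes `$0`. That is presumably why the reject test above expects E1308 for `$1`.

const std = @import("std");

// Illustrative stand-ins; the real ptkgen `Production` union carries more data.
const Prod = union(enum) {
    terminal: []const u8,
    rule_ref: []const u8,
    sequence: []const Prod,
    optional: *const Prod,
    repetition_zero: *const Prod,
    repetition_one: *const Prod,
};

// Flattens a production into the list of items a `$n` reference can point at:
// terminals and rule invocations are appended, sequences are unwrapped,
// optionals and repetitions are recursed into.
fn flatten(prod: *const Prod, out: *std.ArrayList(*const Prod)) error{OutOfMemory}!void {
    switch (prod.*) {
        .terminal, .rule_ref => try out.append(prod),
        .sequence => |items| for (items) |*item| {
            try flatten(item, out);
        },
        .optional, .repetition_zero, .repetition_one => |inner| try flatten(inner, out),
    }
}

test "a rule matching a single terminal only exposes $0" {
    const body = Prod{ .terminal = "hello" };

    var index = std.ArrayList(*const Prod).init(std.testing.allocator);
    defer index.deinit();
    try flatten(&body, &index);

    try std.testing.expectEqual(@as(usize, 1), index.items.len);

    const bad_ref: usize = 1; // `$1`, as used in the reject test
    try std.testing.expect(bad_ref >= index.items.len); // out of bounds -> diagnostic
}

Saved to any file name (e.g. flatten_sketch.zig), this can be checked with `zig test flatten_sketch.zig`.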