diff --git a/src/compile.zig b/src/compile.zig
index 1ffdb8b..ec6dc04 100644
--- a/src/compile.zig
+++ b/src/compile.zig
@@ -187,6 +187,8 @@ pub const Compiler = struct {
 
     fn nextCaptureIndex(c: *Compiler) usize {
         const s = c.capture_index;
+        // Each capture occupies two slots (start and end position), so the
+        // index advances by two per capture group.
         c.capture_index += 2;
         return s;
     }
@@ -350,13 +352,18 @@ pub const Compiler = struct {
                 // 3: restore 1, 4
                 // ...
 
+                if (!subexpr.capturing) {
+                    // Non-capturing group: compile the sub-expression only; no Save instructions, no capture slots.
+                    return c.compileInternal(subexpr.expr);
+                }
+
                 // Create a partial instruction with a hole outgoing at the current location.
                 const entry = c.insts.items.len;
 
                 const index = c.nextCaptureIndex();
 
                 try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index }));
-                const p = try c.compileInternal(subexpr);
+                const p = try c.compileInternal(subexpr.expr);
                 c.fillToNext(p.hole);
 
                 const h = try c.pushHole(InstHole{ .Save = index + 1 });
diff --git a/src/debug.zig b/src/debug.zig
index 842595a..feafcf3 100644
--- a/src/debug.zig
+++ b/src/debug.zig
@@ -56,7 +56,7 @@ fn dumpExprIndent(e: Expr, indent: usize) void {
         },
         Expr.Capture => |subexpr| {
             debug.print("{s}\n", .{@tagName(e)});
-            dumpExprIndent(subexpr.*, indent + 1);
+            dumpExprIndent(subexpr.expr.*, indent + 1);
         },
         Expr.Repeat => |repeat| {
             debug.print("{s}(min={d}, max={?d}, greedy={any})\n", .{ @tagName(e), repeat.min, repeat.max, repeat.greedy });
diff --git a/src/parse.zig b/src/parse.zig
index c1052b7..1d722d9 100644
--- a/src/parse.zig
+++ b/src/parse.zig
@@ -48,6 +48,11 @@ pub const Assertion = enum {
     NotWordBoundaryAscii,
 };
 
+/// Attributes recorded on `PseudoLeftParen` when a group is opened.
+pub const GroupAttributes = struct {
+    capturing: bool,
+};
+
 /// A single node of an expression tree.
 pub const Expr = union(enum) {
     // Empty match (\w assertion)
@@ -57,7 +62,7 @@ pub const Expr = union(enum) {
     // . character
     AnyCharNotNL,
     // Capture group
-    Capture: *Expr,
+    Capture: *Group,
     // *, +, ?
     Repeat: Repeater,
     // Character class [a-z0-9]
@@ -67,7 +72,7 @@ pub const Expr = union(enum) {
     // |
     Alternate: ArrayList(*Expr),
     // Pseudo stack operator to define start of a capture
-    PseudoLeftParen,
+    PseudoLeftParen: GroupAttributes,
 
     pub fn isByteClass(re: *const Expr) bool {
         switch (re.*) {
@@ -95,6 +100,13 @@ pub const Expr = union(enum) {
     }
 };
 
+/// Payload of `Expr.Capture`: a parenthesized sub-expression together with
+/// its group modifiers (currently only whether the group captures).
+pub const Group = struct {
+    expr: *Expr,
+    capturing: bool,
+};
+
 // Private in fmt.
 fn charToDigit(c: u8, radix: u8) !u8 {
     const value = switch (c) {
@@ -243,6 +255,7 @@ pub const ParseError = error{
     InvalidHexDigit,
     InvalidOctalDigit,
     UnrecognizedEscapeCode,
+    UnimplementedModifier,
 };
 
 pub const ParserOptions = struct {
@@ -326,6 +339,10 @@ pub const Parser = struct {
         return try p.arena.allocator().create(Expr);
     }
 
+    fn createGroup(p: *Parser) !*Group {
+        return try p.arena.allocator().create(Group);
+    }
+
     pub fn parse(p: *Parser, re: []const u8) !*Expr {
         p.it = StringIterator.init(re);
         // Shorter alias
@@ -394,8 +411,24 @@ pub const Parser = struct {
                 // Don't handle alternation just yet, parentheses group together arguments into
                 // a sub-expression only.
                 '(' => {
+                    var capturing = true;
+                    if (it.peekIs('?')) {
+                        // Advance and discard
+                        _ = it.next();
+                        if (it.peekIs(':')) {
+                            // Advance and discard
+                            _ = it.next();
+                            capturing = false;
+                        } else {
+                            // NOTE: Other modifiers are considered not implemented
+                            return error.UnimplementedModifier;
+                        }
+                    }
+
                     const r = try p.createExpr();
-                    r.* = Expr{ .PseudoLeftParen = undefined };
+                    r.* = Expr{ .PseudoLeftParen = .{
+                        .capturing = capturing,
+                    } };
                     try p.stack.append(r);
                 },
                 ')' => {
@@ -435,16 +468,25 @@ pub const Parser = struct {
                                     return error.UnopenedParentheses;
                                 }
 
                                 // pop the left parentheses that must now exist
-                                debug.assert(p.stack.pop().* == Expr.PseudoLeftParen);
+                                const capturing = switch (p.stack.pop().*) {
+                                    .PseudoLeftParen => |e_paren| e_paren.capturing,
+                                    else => unreachable,
+                                };
+
+                                const group = try p.createGroup();
+                                group.* = Group{
+                                    .expr = e,
+                                    .capturing = capturing,
+                                };
 
                                 const r = try p.createExpr();
-                                r.* = Expr{ .Capture = e };
+                                r.* = Expr{ .Capture = group };
                                 try p.stack.append(r);
                                 break;
                             },
                             // Existing parentheses, push new alternation
-                            .PseudoLeftParen => {
+                            .PseudoLeftParen => |e_paren| {
                                 mem.reverse(*Expr, concat.items);
 
                                 const ra = try p.createExpr();
@@ -458,8 +500,14 @@ pub const Parser = struct {
                                     ra.* = Expr{ .Concat = concat };
                                 }
 
+                                const group = try p.createGroup();
+                                group.* = Group{
+                                    .expr = ra,
+                                    .capturing = e_paren.capturing,
+                                };
+
                                 const r = try p.createExpr();
-                                r.* = Expr{ .Capture = ra };
+                                r.* = Expr{ .Capture = group };
                                 try p.stack.append(r);
                                 break;
                             },
diff --git a/src/parse_test.zig b/src/parse_test.zig
index fbd79fc..1883476 100644
--- a/src/parse_test.zig
+++ b/src/parse_test.zig
@@ -84,7 +84,7 @@ fn reprIndent(out: *StaticWriter, e: *Expr, indent: usize) anyerror!void {
         },
         Expr.Capture => |subexpr| {
             try out.writer().print("cap\n", .{});
-            try reprIndent(out, subexpr, indent + 1);
+            try reprIndent(out, subexpr.expr, indent + 1);
         },
         Expr.Repeat => |repeat| {
             try out.writer().print("rep(", .{});
diff --git a/src/regex_test.zig b/src/regex_test.zig
index ae09c47..dfa9450 100644
--- a/src/regex_test.zig
+++ b/src/regex_test.zig
@@ -141,6 +141,26 @@ test "regex captures" {
 
     debug.assert(mem.eql(u8, "ab0123", caps.sliceAt(0).?));
     debug.assert(mem.eql(u8, "0123", caps.sliceAt(1).?));
+
+    var r_non_capturing_1 = try Regex.compile(std.testing.allocator, "ab(?:\\d+)");
+    defer r_non_capturing_1.deinit();
+
+    debug.assert(try r_non_capturing_1.partialMatch("xxxxab0123a"));
+
+    var caps_non_capturing_1 = (try r_non_capturing_1.captures("xxxxab0123a")).?;
+    defer caps_non_capturing_1.deinit();
+
+    debug.assert(mem.eql(u8, "ab0123", caps_non_capturing_1.sliceAt(0).?));
+    debug.assert(caps_non_capturing_1.slots.len == 2);
+
+    var r_non_capturing_2 = try Regex.compile(std.testing.allocator, "(?:ab(cd))");
+    defer r_non_capturing_2.deinit();
+
+    var caps_non_capturing_2 = (try r_non_capturing_2.captures("xabcdx")).?;
+    defer caps_non_capturing_2.deinit();
+
+    debug.assert(mem.eql(u8, "abcd", caps_non_capturing_2.sliceAt(0).?));
+    debug.assert(mem.eql(u8, "cd", caps_non_capturing_2.sliceAt(1).?));
 }
 
 test "regex memory leaks" {