From 7c81fa4f70d4c17b2dd8bbb6128ac9b1b90b062e Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Tue, 6 Dec 2022 16:05:37 +0000 Subject: [PATCH 1/3] fix: ecma ranges with set terminator Fix ECMAScript un-escaped literal '-' when followed by predefined character sets. Also: * Fixed missing error check on parseProperty() call. * Use addChar(ch) helper instead of addRange(ch, ch). Fixes #54 --- go.mod | 2 ++ go.sum | 17 +++++++++++++++ regexp_ecma_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++++ syntax/parser.go | 33 +++++++++++++++++++++++------- 4 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 go.sum create mode 100644 regexp_ecma_test.go diff --git a/go.mod b/go.mod index 9f7f391..d5db1e4 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/dlclark/regexp2 go 1.13 + +require github.com/stretchr/testify v1.8.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..2ec90f7 --- /dev/null +++ b/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go new file mode 100644 index 0000000..11f8827 --- /dev/null +++ b/regexp_ecma_test.go @@ -0,0 +1,50 @@ +package regexp2_test + +import ( + "testing" + + "github.com/dlclark/regexp2" + "github.com/stretchr/testify/require" +) + +func TestECMA_basic(t *testing.T) { + tests := map[string]struct { + expr string + data string + want []string + }{ + "charset": { + expr: `[a-c]`, + data: "abcd", + want: []string{"a", "b", "c"}, + }, + "charset-set": { + expr: `[a-\s]`, + data: "a-b cd", + want: []string{"a", "-", " "}, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + re, err := regexp2.Compile(tt.expr, regexp2.ECMAScript) + require.NoError(t, err) + + match, err := re.FindStringMatch(tt.data) + require.NoError(t, err) + + var res []string + for match != nil { + for _, g := range match.Groups() { + for _, c := range g.Captures { + res = append(res, c.String()) + } + } + + match, err = re.FindNextMatch(match) + require.NoError(t, err) + } + require.Equal(t, tt.want, res) + }) + } +} diff --git a/syntax/parser.go b/syntax/parser.go index 9dc6e31..4e1d4ad 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -1427,7 +1427,7 @@ func (p *parser) scanCapname() string { return string(p.pattern[startpos:p.textpos()]) } -//Scans contents of [] (not including []'s), and converts to a set. +// Scans contents of [] (not including []'s), and converts to a set. func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { ch := '\x00' chPrev := '\x00' @@ -1467,7 +1467,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'D', 'd': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) } @@ -1476,7 +1480,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'S', 's': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') } @@ -1485,7 +1493,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'W', 'w': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') @@ -1495,7 +1507,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'p', 'P': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } prop, err := p.parseProperty() if err != nil { @@ -1503,14 +1519,17 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } else { - p.parseProperty() + _, err := p.parseProperty() + if err != nil { + return nil, err + } } continue case '-': if !scanOnly { - cc.addRange(ch, ch) + cc.addChar(ch) } continue From bc04c8669f1b1e38a185a20fe8b86ba1079350ae Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Sun, 18 Dec 2022 22:12:38 +0000 Subject: [PATCH 2/3] chore: ignore vscode files Ignore the files created by VSCode. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index fb844c3..5e4c21e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ _testmain.go *.out .DS_Store + +# Ignore vscode status. +.vscode/ From 58fb3a184c9027703d7c219de058459efc6a0c84 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Sun, 18 Dec 2022 22:17:40 +0000 Subject: [PATCH 3/3] fix: handle ecma unicode code point Handle Unicode code points in ECMA mode with both the flag enabled and disabled. This removes four tests which were incorrectly passing due to missing detection of invalid ranges, aligning with the behaviour of .NET and PCRE2 as opposed to PCRE. --- regexp_ecma_test.go | 75 +++++++++++++++++++++++++++++++++++---- regexp_mono_test.go | 2 -- syntax/parser.go | 85 +++++++++++++++++++++++++++++++++------------ testoutput1 | 18 ---------- 4 files changed, 130 insertions(+), 50 deletions(-) diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go index 11f8827..6d5b5d3 100644 --- a/regexp_ecma_test.go +++ b/regexp_ecma_test.go @@ -7,27 +7,88 @@ import ( "github.com/stretchr/testify/require" ) -func TestECMA_basic(t *testing.T) { +func TestECMA_charset(t *testing.T) { tests := map[string]struct { - expr string - data string - want []string + expr string + data string + opt regexp2.RegexOptions + want []string + wantErr string }{ - "charset": { + "basic": { expr: `[a-c]`, data: "abcd", want: []string{"a", "b", "c"}, }, - "charset-set": { + "in-range": { + expr: `[a-\s\b]`, + data: "a-b cd", + want: []string{"a", "-", " "}, + }, + "space": { expr: `[a-\s]`, data: "a-b cd", want: []string{"a", "-", " "}, }, + "word": { + expr: `[a-\w]`, + data: "a-b cd", + want: []string{"a", "-", "b", "c", "d"}, + }, + "digit": { + expr: `[a-\d]`, + data: "a-b1 cd", + want: []string{"a", "-", "1"}, + }, + "slash-p": { + expr: `[a-\p]`, + data: "a-bq cd", + want: []string{"a", "b", "c", "d"}, + }, + "slash-p-literal": { + expr: `[a-\p{x}]`, + data: "a-bq cdx", + want: []string{"a", "b", "c", "d", "x"}, + }, + "invalid-unicode": { + expr: `[a-\p]`, + opt: regexp2.Unicode, + wantErr: "error parsing regexp: incomplete \\p{X} character escape in `[a-\\p]`", + }, + "invalid-unicode-letter": { + expr: `[a-\p{L}]`, + opt: regexp2.Unicode, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[a-\\p{L}]`", + }, + "invalid-slash-P": { + expr: `[a-\P]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\P in `[a-\\P]`", + }, + "invalid-space": { + expr: `[\s-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\s in `[\\s-z]`", + }, + "invalid-word": { + expr: `[\w-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\w in `[\\w-z]`", + }, + "invalid-digit": { + expr: `[\d-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\d in `[\\d-z]`", + }, + "invalid-point": { + expr: `[\p-z]`, + wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[\\p-z]`", + }, } for name, tt := range tests { t.Run(name, func(t *testing.T) { - re, err := regexp2.Compile(tt.expr, regexp2.ECMAScript) + re, err := regexp2.Compile(tt.expr, tt.opt|regexp2.ECMAScript) + if tt.wantErr != "" { + require.EqualError(t, err, tt.wantErr) + return + } require.NoError(t, err) match, err := re.FindStringMatch(tt.data) diff --git a/regexp_mono_test.go b/regexp_mono_test.go index 372e81c..aa3cf7d 100644 --- a/regexp_mono_test.go +++ b/regexp_mono_test.go @@ -710,8 +710,6 @@ func TestMono_Basics(t *testing.T) { runRegexTrial(t, `.[X](.+)+[X][X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.") runRegexTrial(t, `.[X][X](.+)+[X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.") runRegexTrial(t, `tt+$`, 0, "xxxtt", "Pass. Group[0]=(3,2)") - runRegexTrial(t, `([\d-z]+)`, 0, "a0-za", "Pass. Group[0]=(1,3) Group[1]=(1,3)") - runRegexTrial(t, `([\d-\s]+)`, 0, "a0- z", "Pass. Group[0]=(1,3) Group[1]=(1,3)") runRegexTrial(t, `\GX.*X`, 0, "aaaXbX", "Fail.") runRegexTrial(t, `(\d+\.\d+)`, 0, "3.1415926", "Pass. Group[0]=(0,9) Group[1]=(0,9)") runRegexTrial(t, `(\ba.{0,10}br)`, 0, "have a web browser", "Pass. Group[0]=(5,8) Group[1]=(5,8)") diff --git a/syntax/parser.go b/syntax/parser.go index 4e1d4ad..77d6c95 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -75,6 +75,7 @@ const ( // Parser errors ErrUnterminatedComment = "unterminated comment" ErrInvalidCharRange = "invalid character class range" + ErrShorthandCharRange = "cannot create range with shorthand escape sequence \\%c" ErrInvalidRepeatSize = "invalid repeat count" ErrInvalidUTF8 = "invalid UTF-8" ErrCaptureGroupOutOfRange = "capture group number out of range" @@ -1170,14 +1171,19 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { case 'p', 'P': p.moveRight(1) - prop, err := p.parseProperty() + prop, literal, err := p.parseProperty() if err != nil { return nil, err } + cc := &CharSet{} - cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) - if p.useOptionI() { - cc.addLowercase() + if literal { + cc.addChar(ch) + } else { + cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) + if p.useOptionI() { + cc.addLowercase() + } } return newRegexNodeSet(ntSet, p.options, cc), nil @@ -1310,13 +1316,20 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { } // Scans X for \p{X} or \P{X} -func (p *parser) parseProperty() (string, error) { +func (p *parser) parseProperty() (string, bool, error) { + if p.useOptionE() && !p.useOptionU() { + // Unless unicode is enabled ECMA \p is just a literal p + // and \P is invalid depending on the use case, which the + // caller must handle. + return "", true, nil + } + if p.charsRight() < 3 { - return "", p.getErr(ErrIncompleteSlashP) + return "", false, p.getErr(ErrIncompleteSlashP) } ch := p.moveRightGetChar() if ch != '{' { - return "", p.getErr(ErrMalformedSlashP) + return "", false, p.getErr(ErrMalformedSlashP) } startpos := p.textpos() @@ -1330,14 +1343,14 @@ func (p *parser) parseProperty() (string, error) { capname := string(p.pattern[startpos:p.textpos()]) if p.charsRight() == 0 || p.moveRightGetChar() != '}' { - return "", p.getErr(ErrIncompleteSlashP) + return "", false, p.getErr(ErrIncompleteSlashP) } if !isValidUnicodeCat(capname) { - return "", p.getErr(ErrUnknownSlashP, capname) + return "", false, p.getErr(ErrUnknownSlashP, capname) } - return capname, nil + return capname, false, nil } // Returns ReNode type for zero-length assertions with a \ code. @@ -1461,7 +1474,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { closed = true break } - } else if ch == '\\' && p.charsRight() > 0 { switch ch = p.moveRightGetChar(); ch { case 'D', 'd': @@ -1472,6 +1484,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } cc.addChar('-') cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) } @@ -1485,6 +1500,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } cc.addChar('-') cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') } @@ -1498,6 +1516,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } cc.addChar('-') cc.addChar(chPrev) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) } cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') @@ -1506,20 +1527,29 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'p', 'P': if !scanOnly { + prop, literal, err := p.parseProperty() + if err != nil { + return nil, err + } + if inRange { - if !p.useOptionE() { + if p.useOptionE() { + if ch == 'P' || !literal { + return nil, p.getErr(ErrShorthandCharRange, ch) + } + } else if !literal { return nil, p.getErr(ErrBadClassInCharRange, ch) } - cc.addChar('-') - cc.addChar(chPrev) - } - prop, err := p.parseProperty() - if err != nil { - return nil, err + + cc.addRange(chPrev, ch) + inRange = false + } else if p.rangeStart() { + return nil, p.getErr(ErrShorthandCharRange, ch) + } else if !literal { + cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } - cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } else { - _, err := p.parseProperty() + _, _, err := p.parseProperty() if err != nil { return nil, err } @@ -1541,7 +1571,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { return nil, err } fTranslatedChar = true - break // this break will only break out of the switch } } else if ch == '[' { // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. @@ -1598,7 +1627,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { cc.addRange(chPrev, ch) } } - } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { + } else if p.rangeStart() { // this could be the start of a range chPrev = ch inRange = true @@ -1619,7 +1648,10 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } } else { p.moveRight(1) - p.scanCharSet(caseInsensitive, true) + _, err := p.scanCharSet(caseInsensitive, true) + if err != nil { + return nil, err + } } } else { if !scanOnly { @@ -1938,6 +1970,13 @@ func (p *parser) rightMost() bool { return p.currentPos == len(p.pattern) } +// rangeStart returns true if this might be a possible range start, false. +func (p *parser) rangeStart() bool { + return p.charsRight() > 1 && + p.rightChar(0) == '-' && + p.rightChar(1) != ']' +} + // Looks up the slot number for a given name func (p *parser) captureSlotFromName(capname string) int { return p.capnames[capname] diff --git a/testoutput1 b/testoutput1 index fbf63fd..14c61c4 100644 --- a/testoutput1 +++ b/testoutput1 @@ -1995,13 +1995,6 @@ No match aaa No match -/[\d-z]+/ - 12-34z - 0: 12-34z -\= Expect no match - aaa -No match - /\x5c/ \\ 0: \ @@ -5685,17 +5678,6 @@ No match ZABCDEFG 0: 1: - -/^[\d-a]/ - abcde - 0: a - -things - 0: - - 0digit - 0: 0 -\= Expect no match - bcdef -No match /[\s]+/ > \x09\x0a\x0c\x0d\x0b<