From 7c81fa4f70d4c17b2dd8bbb6128ac9b1b90b062e Mon Sep 17 00:00:00 2001
From: Steven Hartland <steven.hartland@multiplay.co.uk>
Date: Tue, 6 Dec 2022 16:05:37 +0000
Subject: [PATCH 1/3] fix: ecma ranges with set terminator

Fix ECMAScript un-escaped literal '-' when followed by predefined
character sets.

Also:
* Fixed missing error check on parseProperty() call.
* Use addChar(ch) helper instead of addRange(ch, ch).

Fixes #54
---
 go.mod              |  2 ++
 go.sum              | 17 +++++++++++++++
 regexp_ecma_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++++
 syntax/parser.go    | 33 +++++++++++++++++++++++-------
 4 files changed, 95 insertions(+), 7 deletions(-)
 create mode 100644 go.sum
 create mode 100644 regexp_ecma_test.go

diff --git a/go.mod b/go.mod
index 9f7f391..d5db1e4 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module github.com/dlclark/regexp2
 
 go 1.13
+
+require github.com/stretchr/testify v1.8.1
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..2ec90f7
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,17 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go
new file mode 100644
index 0000000..11f8827
--- /dev/null
+++ b/regexp_ecma_test.go
@@ -0,0 +1,50 @@
+package regexp2_test
+
+import (
+	"testing"
+
+	"github.com/dlclark/regexp2"
+	"github.com/stretchr/testify/require"
+)
+
+func TestECMA_basic(t *testing.T) {
+	tests := map[string]struct {
+		expr string
+		data string
+		want []string
+	}{
+		"charset": {
+			expr: `[a-c]`,
+			data: "abcd",
+			want: []string{"a", "b", "c"},
+		},
+		"charset-set": {
+			expr: `[a-\s]`,
+			data: "a-b cd",
+			want: []string{"a", "-", " "},
+		},
+	}
+
+	for name, tt := range tests {
+		t.Run(name, func(t *testing.T) {
+			re, err := regexp2.Compile(tt.expr, regexp2.ECMAScript)
+			require.NoError(t, err)
+
+			match, err := re.FindStringMatch(tt.data)
+			require.NoError(t, err)
+
+			var res []string
+			for match != nil {
+				for _, g := range match.Groups() {
+					for _, c := range g.Captures {
+						res = append(res, c.String())
+					}
+				}
+
+				match, err = re.FindNextMatch(match)
+				require.NoError(t, err)
+			}
+			require.Equal(t, tt.want, res)
+		})
+	}
+}
diff --git a/syntax/parser.go b/syntax/parser.go
index 9dc6e31..4e1d4ad 100644
--- a/syntax/parser.go
+++ b/syntax/parser.go
@@ -1427,7 +1427,7 @@ func (p *parser) scanCapname() string {
 	return string(p.pattern[startpos:p.textpos()])
 }
 
-//Scans contents of [] (not including []'s), and converts to a set.
+// Scans contents of [] (not including []'s), and converts to a set.
 func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 	ch := '\x00'
 	chPrev := '\x00'
@@ -1467,7 +1467,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 			case 'D', 'd':
 				if !scanOnly {
 					if inRange {
-						return nil, p.getErr(ErrBadClassInCharRange, ch)
+						if !p.useOptionE() {
+							return nil, p.getErr(ErrBadClassInCharRange, ch)
+						}
+						cc.addChar('-')
+						cc.addChar(chPrev)
 					}
 					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
 				}
@@ -1476,7 +1480,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 			case 'S', 's':
 				if !scanOnly {
 					if inRange {
-						return nil, p.getErr(ErrBadClassInCharRange, ch)
+						if !p.useOptionE() {
+							return nil, p.getErr(ErrBadClassInCharRange, ch)
+						}
+						cc.addChar('-')
+						cc.addChar(chPrev)
 					}
 					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
 				}
@@ -1485,7 +1493,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 			case 'W', 'w':
 				if !scanOnly {
 					if inRange {
-						return nil, p.getErr(ErrBadClassInCharRange, ch)
+						if !p.useOptionE() {
+							return nil, p.getErr(ErrBadClassInCharRange, ch)
+						}
+						cc.addChar('-')
+						cc.addChar(chPrev)
 					}
 
 					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
@@ -1495,7 +1507,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 			case 'p', 'P':
 				if !scanOnly {
 					if inRange {
-						return nil, p.getErr(ErrBadClassInCharRange, ch)
+						if !p.useOptionE() {
+							return nil, p.getErr(ErrBadClassInCharRange, ch)
+						}
+						cc.addChar('-')
+						cc.addChar(chPrev)
 					}
 					prop, err := p.parseProperty()
 					if err != nil {
@@ -1503,14 +1519,17 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					}
 					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
 				} else {
-					p.parseProperty()
+					_, err := p.parseProperty()
+					if err != nil {
+						return nil, err
+					}
 				}
 
 				continue
 
 			case '-':
 				if !scanOnly {
-					cc.addRange(ch, ch)
+					cc.addChar(ch)
 				}
 				continue
 

From bc04c8669f1b1e38a185a20fe8b86ba1079350ae Mon Sep 17 00:00:00 2001
From: Steven Hartland <steven.hartland@multiplay.co.uk>
Date: Sun, 18 Dec 2022 22:12:38 +0000
Subject: [PATCH 2/3] chore: ignore vscode files

Ignore the files created by VSCode.
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index fb844c3..5e4c21e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,6 @@ _testmain.go
 *.out
 
 .DS_Store
+
+# Ignore vscode status.
+.vscode/

From 58fb3a184c9027703d7c219de058459efc6a0c84 Mon Sep 17 00:00:00 2001
From: Steven Hartland <steven.hartland@multiplay.co.uk>
Date: Sun, 18 Dec 2022 22:17:40 +0000
Subject: [PATCH 3/3] fix: handle ecma unicode code point

Handle Unicode code points in ECMA mode with both the flag enabled and
disabled.

This removes four tests which were incorrectly passing due to missing
detection of invalid ranges, aligning with the behaviour of .NET and
PCRE2 as opposed to PCRE.
---
 regexp_ecma_test.go | 75 +++++++++++++++++++++++++++++++++++----
 regexp_mono_test.go |  2 --
 syntax/parser.go    | 85 +++++++++++++++++++++++++++++++++------------
 testoutput1         | 18 ----------
 4 files changed, 130 insertions(+), 50 deletions(-)

diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go
index 11f8827..6d5b5d3 100644
--- a/regexp_ecma_test.go
+++ b/regexp_ecma_test.go
@@ -7,27 +7,88 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-func TestECMA_basic(t *testing.T) {
+func TestECMA_charset(t *testing.T) {
 	tests := map[string]struct {
-		expr string
-		data string
-		want []string
+		expr    string
+		data    string
+		opt     regexp2.RegexOptions
+		want    []string
+		wantErr string
 	}{
-		"charset": {
+		"basic": {
 			expr: `[a-c]`,
 			data: "abcd",
 			want: []string{"a", "b", "c"},
 		},
-		"charset-set": {
+		"in-range": {
+			expr: `[a-\s\b]`,
+			data: "a-b cd",
+			want: []string{"a", "-", " "},
+		},
+		"space": {
 			expr: `[a-\s]`,
 			data: "a-b cd",
 			want: []string{"a", "-", " "},
 		},
+		"word": {
+			expr: `[a-\w]`,
+			data: "a-b cd",
+			want: []string{"a", "-", "b", "c", "d"},
+		},
+		"digit": {
+			expr: `[a-\d]`,
+			data: "a-b1 cd",
+			want: []string{"a", "-", "1"},
+		},
+		"slash-p": {
+			expr: `[a-\p]`,
+			data: "a-bq cd",
+			want: []string{"a", "b", "c", "d"},
+		},
+		"slash-p-literal": {
+			expr: `[a-\p{x}]`,
+			data: "a-bq cdx",
+			want: []string{"a", "b", "c", "d", "x"},
+		},
+		"invalid-unicode": {
+			expr:    `[a-\p]`,
+			opt:     regexp2.Unicode,
+			wantErr: "error parsing regexp: incomplete \\p{X} character escape in `[a-\\p]`",
+		},
+		"invalid-unicode-letter": {
+			expr:    `[a-\p{L}]`,
+			opt:     regexp2.Unicode,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[a-\\p{L}]`",
+		},
+		"invalid-slash-P": {
+			expr:    `[a-\P]`,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\P in `[a-\\P]`",
+		},
+		"invalid-space": {
+			expr:    `[\s-z]`,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\s in `[\\s-z]`",
+		},
+		"invalid-word": {
+			expr:    `[\w-z]`,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\w in `[\\w-z]`",
+		},
+		"invalid-digit": {
+			expr:    `[\d-z]`,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\d in `[\\d-z]`",
+		},
+		"invalid-point": {
+			expr:    `[\p-z]`,
+			wantErr: "error parsing regexp: cannot create range with shorthand escape sequence \\p in `[\\p-z]`",
+		},
 	}
 
 	for name, tt := range tests {
 		t.Run(name, func(t *testing.T) {
-			re, err := regexp2.Compile(tt.expr, regexp2.ECMAScript)
+			re, err := regexp2.Compile(tt.expr, tt.opt|regexp2.ECMAScript)
+			if tt.wantErr != "" {
+				require.EqualError(t, err, tt.wantErr)
+				return
+			}
 			require.NoError(t, err)
 
 			match, err := re.FindStringMatch(tt.data)
diff --git a/regexp_mono_test.go b/regexp_mono_test.go
index 372e81c..aa3cf7d 100644
--- a/regexp_mono_test.go
+++ b/regexp_mono_test.go
@@ -710,8 +710,6 @@ func TestMono_Basics(t *testing.T) {
 	runRegexTrial(t, `.[X](.+)+[X][X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.")
 	runRegexTrial(t, `.[X][X](.+)+[X]`, 0, "bbbbXXXaaaaaaaaa", "Fail.")
 	runRegexTrial(t, `tt+$`, 0, "xxxtt", "Pass. Group[0]=(3,2)")
-	runRegexTrial(t, `([\d-z]+)`, 0, "a0-za", "Pass. Group[0]=(1,3) Group[1]=(1,3)")
-	runRegexTrial(t, `([\d-\s]+)`, 0, "a0- z", "Pass. Group[0]=(1,3) Group[1]=(1,3)")
 	runRegexTrial(t, `\GX.*X`, 0, "aaaXbX", "Fail.")
 	runRegexTrial(t, `(\d+\.\d+)`, 0, "3.1415926", "Pass. Group[0]=(0,9) Group[1]=(0,9)")
 	runRegexTrial(t, `(\ba.{0,10}br)`, 0, "have a web browser", "Pass. Group[0]=(5,8) Group[1]=(5,8)")
diff --git a/syntax/parser.go b/syntax/parser.go
index 4e1d4ad..77d6c95 100644
--- a/syntax/parser.go
+++ b/syntax/parser.go
@@ -75,6 +75,7 @@ const (
 	// Parser errors
 	ErrUnterminatedComment        = "unterminated comment"
 	ErrInvalidCharRange           = "invalid character class range"
+	ErrShorthandCharRange         = "cannot create range with shorthand escape sequence \\%c"
 	ErrInvalidRepeatSize          = "invalid repeat count"
 	ErrInvalidUTF8                = "invalid UTF-8"
 	ErrCaptureGroupOutOfRange     = "capture group number out of range"
@@ -1170,14 +1171,19 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
 
 	case 'p', 'P':
 		p.moveRight(1)
-		prop, err := p.parseProperty()
+		prop, literal, err := p.parseProperty()
 		if err != nil {
 			return nil, err
 		}
+
 		cc := &CharSet{}
-		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
-		if p.useOptionI() {
-			cc.addLowercase()
+		if literal {
+			cc.addChar(ch)
+		} else {
+			cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
+			if p.useOptionI() {
+				cc.addLowercase()
+			}
 		}
 
 		return newRegexNodeSet(ntSet, p.options, cc), nil
@@ -1310,13 +1316,20 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
 }
 
 // Scans X for \p{X} or \P{X}
-func (p *parser) parseProperty() (string, error) {
+func (p *parser) parseProperty() (string, bool, error) {
+	if p.useOptionE() && !p.useOptionU() {
+		// Unless unicode is enabled ECMA \p is just a literal p
+		// and \P is invalid depending on the use case, which the
+		// caller must handle.
+		return "", true, nil
+	}
+
 	if p.charsRight() < 3 {
-		return "", p.getErr(ErrIncompleteSlashP)
+		return "", false, p.getErr(ErrIncompleteSlashP)
 	}
 	ch := p.moveRightGetChar()
 	if ch != '{' {
-		return "", p.getErr(ErrMalformedSlashP)
+		return "", false, p.getErr(ErrMalformedSlashP)
 	}
 
 	startpos := p.textpos()
@@ -1330,14 +1343,14 @@ func (p *parser) parseProperty() (string, error) {
 	capname := string(p.pattern[startpos:p.textpos()])
 
 	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
-		return "", p.getErr(ErrIncompleteSlashP)
+		return "", false, p.getErr(ErrIncompleteSlashP)
 	}
 
 	if !isValidUnicodeCat(capname) {
-		return "", p.getErr(ErrUnknownSlashP, capname)
+		return "", false, p.getErr(ErrUnknownSlashP, capname)
 	}
 
-	return capname, nil
+	return capname, false, nil
 }
 
 // Returns ReNode type for zero-length assertions with a \ code.
@@ -1461,7 +1474,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 				closed = true
 				break
 			}
-
 		} else if ch == '\\' && p.charsRight() > 0 {
 			switch ch = p.moveRightGetChar(); ch {
 			case 'D', 'd':
@@ -1472,6 +1484,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 						}
 						cc.addChar('-')
 						cc.addChar(chPrev)
+						inRange = false
+					} else if p.rangeStart() {
+						return nil, p.getErr(ErrShorthandCharRange, ch)
 					}
 					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
 				}
@@ -1485,6 +1500,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 						}
 						cc.addChar('-')
 						cc.addChar(chPrev)
+						inRange = false
+					} else if p.rangeStart() {
+						return nil, p.getErr(ErrShorthandCharRange, ch)
 					}
 					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
 				}
@@ -1498,6 +1516,9 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 						}
 						cc.addChar('-')
 						cc.addChar(chPrev)
+						inRange = false
+					} else if p.rangeStart() {
+						return nil, p.getErr(ErrShorthandCharRange, ch)
 					}
 
 					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
@@ -1506,20 +1527,29 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 
 			case 'p', 'P':
 				if !scanOnly {
+					prop, literal, err := p.parseProperty()
+					if err != nil {
+						return nil, err
+					}
+
 					if inRange {
-						if !p.useOptionE() {
+						if p.useOptionE() {
+							if ch == 'P' || !literal {
+								return nil, p.getErr(ErrShorthandCharRange, ch)
+							}
+						} else if !literal {
 							return nil, p.getErr(ErrBadClassInCharRange, ch)
 						}
-						cc.addChar('-')
-						cc.addChar(chPrev)
-					}
-					prop, err := p.parseProperty()
-					if err != nil {
-						return nil, err
+
+						cc.addRange(chPrev, ch)
+						inRange = false
+					} else if p.rangeStart() {
+						return nil, p.getErr(ErrShorthandCharRange, ch)
+					} else if !literal {
+						cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
 					}
-					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
 				} else {
-					_, err := p.parseProperty()
+					_, _, err := p.parseProperty()
 					if err != nil {
 						return nil, err
 					}
@@ -1541,7 +1571,6 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					return nil, err
 				}
 				fTranslatedChar = true
-				break // this break will only break out of the switch
 			}
 		} else if ch == '[' {
 			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
@@ -1598,7 +1627,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					cc.addRange(chPrev, ch)
 				}
 			}
-		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
+		} else if p.rangeStart() {
 			// this could be the start of a range
 			chPrev = ch
 			inRange = true
@@ -1619,7 +1648,10 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 				}
 			} else {
 				p.moveRight(1)
-				p.scanCharSet(caseInsensitive, true)
+				_, err := p.scanCharSet(caseInsensitive, true)
+				if err != nil {
+					return nil, err
+				}
 			}
 		} else {
 			if !scanOnly {
@@ -1938,6 +1970,13 @@ func (p *parser) rightMost() bool {
 	return p.currentPos == len(p.pattern)
 }
 
+// rangeStart returns true if this might be a possible range start, false.
+func (p *parser) rangeStart() bool {
+	return p.charsRight() > 1 &&
+		p.rightChar(0) == '-' &&
+		p.rightChar(1) != ']'
+}
+
 // Looks up the slot number for a given name
 func (p *parser) captureSlotFromName(capname string) int {
 	return p.capnames[capname]
diff --git a/testoutput1 b/testoutput1
index fbf63fd..14c61c4 100644
--- a/testoutput1
+++ b/testoutput1
@@ -1995,13 +1995,6 @@ No match
     aaa
 No match
 
-/[\d-z]+/
-    12-34z
- 0: 12-34z
-\= Expect no match
-    aaa
-No match
-
 /\x5c/
     \\
  0: \
@@ -5685,17 +5678,6 @@ No match
     ZABCDEFG
  0: 
  1: 
-
-/^[\d-a]/
-    abcde
- 0: a
-    -things
- 0: -
-    0digit
- 0: 0
-\= Expect no match
-    bcdef    
-No match
     
 /[\s]+/
     > \x09\x0a\x0c\x0d\x0b<