Commit 5ccaf025, authored by Paul Wankadia, committed by Russ Cox

regexp/syntax: fix factoring of common prefixes in alternations

In the past, `a.*?c|a.*?b` was factored to `a.*?[bc]`. Thus, given
"abc" as its input string, the automaton would consume "ab" and
then stop (when unanchored) whereas it should consume all of "abc"
as per leftmost semantics.

Fixes #13812.

Change-Id: I67ac0a353d7793b3d0c9c4aaf22d157621dfe784
Reviewed-on: https://go-review.googlesource.com/18357
Reviewed-by: Russ Cox <rsc@golang.org>
parent 8ae584f2
...@@ -470,9 +470,14 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { ...@@ -470,9 +470,14 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
} }
sub = out sub = out
// Round 2: Factor out common complex prefixes, // Round 2: Factor out common simple prefixes,
// just the first piece of each concatenation, // just the first piece of each concatenation.
// whatever it is. This is good enough a lot of the time. // This will be good enough a lot of the time.
//
// Complex subexpressions (e.g. involving quantifiers)
// are not safe to factor because that collapses their
// distinct paths through the automaton, which affects
// correctness in some cases.
start = 0 start = 0
out = sub[:0] out = sub[:0]
var first *Regexp var first *Regexp
...@@ -485,7 +490,9 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { ...@@ -485,7 +490,9 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
var ifirst *Regexp var ifirst *Regexp
if i < len(sub) { if i < len(sub) {
ifirst = p.leadingRegexp(sub[i]) ifirst = p.leadingRegexp(sub[i])
if first != nil && first.Equal(ifirst) { if first != nil && first.Equal(ifirst) &&
// first must be a character class OR a fixed repeat of a character class.
(isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
continue continue
} }
} }
......
...@@ -172,7 +172,7 @@ var parseTests = []parseTest{ ...@@ -172,7 +172,7 @@ var parseTests = []parseTest{
// Factoring. // Factoring.
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
// Bug fixes. // Bug fixes.
{`(?:.)`, `dot{}`}, {`(?:.)`, `dot{}`},
...@@ -195,12 +195,13 @@ var parseTests = []parseTest{ ...@@ -195,12 +195,13 @@ var parseTests = []parseTest{
{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
{`(?:xx|yy)c|(?:xx|yy)d`, {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
`cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`},
{`x{2}|x{2}[0-9]`, {`x{2}|x{2}[0-9]`,
`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
{`x{2}y|x{2}[0-9]y`, {`x{2}y|x{2}[0-9]y`,
`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
{`a.*?c|a.*?b`,
`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
// Valid repetitions. // Valid repetitions.
{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
......
...@@ -3665,3 +3665,8 @@ regexps ...@@ -3665,3 +3665,8 @@ regexps
"(?:a\\C*|ba\\C)$" "(?:a\\C*|ba\\C)$"
-;-;-;- -;-;-;-
-;1-4;-;1-4 -;1-4;-;1-4
strings
"abc"
regexps
"a.*?c|a.*?b"
0-3;0-3;0-3;0-3
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment