Commit c62069cc authored by Rob Pike's avatar Rob Pike

add a match arena to regexp to avoid generating garbage.

simple regexps run 20x faster.
the regex-dna benchmark goes 3x faster.

R=rsc
CC=golang-dev
https://golang.org/cl/156108
parent b30f753d
...@@ -60,6 +60,7 @@ type tester struct { ...@@ -60,6 +60,7 @@ type tester struct {
} }
var matches = []tester{ var matches = []tester{
tester{`^abcdefg`, "abcdefg", vec{0, 7}},
tester{`a+`, "baaab", vec{1, 4}}, tester{`a+`, "baaab", vec{1, 4}},
tester{"abcd..", "abcdef", vec{0, 6}}, tester{"abcd..", "abcdef", vec{0, 6}},
tester{``, "", vec{0, 0}}, tester{``, "", vec{0, 0}},
...@@ -450,3 +451,29 @@ func TestAllMatches(t *testing.T) { ...@@ -450,3 +451,29 @@ func TestAllMatches(t *testing.T) {
} }
} }
} }
func BenchmarkLiteral(b *testing.B) {
x := strings.Repeat("x", 50);
b.StopTimer();
re, _ := Compile(x);
b.StartTimer();
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!");
break;
}
}
}
func BenchmarkNotLiteral(b *testing.B) {
x := strings.Repeat("x", 49);
b.StopTimer();
re, _ := Compile("^" + x);
b.StartTimer();
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!");
break;
}
}
}
...@@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error { ...@@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error {
return p.error; return p.error;
} }
// return regular text at the beginning of str // Extract regular text from the beginning of the pattern.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() { func (re *Regexp) setPrefix() {
var b []byte; var b []byte;
var utf = make([]byte, utf8.UTFMax); var utf = make([]byte, utf8.UTFMax);
...@@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp { ...@@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp {
return regexp; return regexp;
} }
// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
head *matchVec;
len int; // length of match vector
}
type matchVec struct {
m []int; // pairs of bracketing submatches. 0th is start,end
ref int;
next *matchVec;
}
func (a *matchArena) new() *matchVec {
if a.head == nil {
const N = 10;
block := make([]matchVec, N);
for i := 0; i < N; i++ {
b := &block[i];
b.next = a.head;
a.head = b;
}
}
m := a.head;
a.head = m.next;
m.ref = 0;
if m.m == nil {
m.m = make([]int, a.len)
}
return m;
}
func (a *matchArena) free(m *matchVec) {
m.ref--;
if m.ref == 0 {
m.next = a.head;
a.head = m;
}
}
func (a *matchArena) copy(m *matchVec) *matchVec {
m1 := a.new();
copy(m1.m, m.m);
return m1;
}
func (a *matchArena) noMatch() *matchVec {
m := a.new();
for i := range m.m {
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
m.ref = 1;
return m;
}
type state struct { type state struct {
inst instr; // next instruction to execute inst instr; // next instruction to execute
match []int; // pairs of bracketing submatches. 0th is start,end match *matchVec;
} }
// Append new state to to-do list. Leftmost-longest wins so avoid // Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. // adding a state that's already active. The matchVec will be inc-ref'ed
func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state { // if it is assigned to a state.
func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state {
switch inst.kind() { switch inst.kind() {
case _BOT: case _BOT:
if pos == 0 { if pos == 0 {
s = re.addState(s, inst.next(), match, pos, end) s = a.addState(s, inst.next(), match, pos, end)
} }
return s; return s;
case _EOT: case _EOT:
if pos == end { if pos == end {
s = re.addState(s, inst.next(), match, pos, end) s = a.addState(s, inst.next(), match, pos, end)
} }
return s; return s;
case _BRA: case _BRA:
n := inst.(*_Bra).n; n := inst.(*_Bra).n;
match[2*n] = pos; match.m[2*n] = pos;
s = re.addState(s, inst.next(), match, pos, end); s = a.addState(s, inst.next(), match, pos, end);
return s; return s;
case _EBRA: case _EBRA:
n := inst.(*_Ebra).n; n := inst.(*_Ebra).n;
match[2*n+1] = pos; match.m[2*n+1] = pos;
s = re.addState(s, inst.next(), match, pos, end); s = a.addState(s, inst.next(), match, pos, end);
return s; return s;
} }
index := inst.index(); index := inst.index();
l := len(s); l := len(s);
begin := match[0]; begin := match.m[0];
// TODO: Once the state is a vector and we can do insert, have inputs always // TODO: If the state were a vector and we could do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary, // go in order correctly and this "earlier" test is not necessary,
for i := 0; i < l; i++ { for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction if s[i].inst.index() == index && // same instruction
s[i].match[0] <= begin { // earlier match already going; lefmost wins s[i].match.m[0] <= begin { // earlier match already going; lefmost wins
return s return s
} }
} }
...@@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s ...@@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
s = s[0 : l+1]; s = s[0 : l+1];
s[l].inst = inst; s[l].inst = inst;
s[l].match = match; s[l].match = match;
match.ref++;
if inst.kind() == _ALT { if inst.kind() == _ALT {
s1 := make([]int, 2*(re.nbra+1)); s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end);
copy(s1, match);
s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
// give other branch a copy of this match vector // give other branch a copy of this match vector
s1 = make([]int, 2*(re.nbra+1)); s = a.addState(s, inst.next(), a.copy(match), pos, end);
copy(s1, match);
s = re.addState(s, inst.next(), s1, pos, end);
} }
return s; return s;
} }
func noMatch(nbra int) []int {
match := make([]int, 2*(nbra+1));
for i := range match {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
return match;
}
// Accepts either string or bytes - the logic is identical either way. // Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str. // If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements var s [2][]state;
s[0] = make([]state, 10)[0:0]; s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0]; s[1] = make([]state, 10)[0:0];
in, out := 0, 1; in, out := 0, 1;
...@@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { ...@@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
} }
pos += advance + len(re.prefix); pos += advance + len(re.prefix);
} }
arena := &matchArena{nil, 2 * (re.nbra + 1)};
for pos <= end { for pos <= end {
if !found { if !found {
// prime the pump if we haven't seen a match yet // prime the pump if we haven't seen a match yet
match := noMatch(re.nbra); match := arena.noMatch();
match[0] = pos; match.m[0] = pos;
s[out] = re.addState(s[out], re.start.next(), match, pos, end); s[out] = arena.addState(s[out], re.start.next(), match, pos, end);
arena.free(match); // if addState saved it, ref was incremented
} }
in, out = out, in; // old out state is new in state in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state // clear out old state
old := s[out];
for _, state := range old {
arena.free(state.match)
}
s[out] = old[0:0]; // truncate state vector
if found && len(s[in]) == 0 { if found && len(s[in]) == 0 {
// machine has completed // machine has completed
break break
...@@ -791,26 +845,25 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { ...@@ -791,26 +845,25 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
} }
} }
pos += charwidth; pos += charwidth;
for i := 0; i < len(s[in]); i++ { for _, st := range s[in] {
st := s[in][i]; switch st.inst.kind() {
switch s[in][i].inst.kind() {
case _BOT: case _BOT:
case _EOT: case _EOT:
case _CHAR: case _CHAR:
if c == st.inst.(*_Char).char { if c == st.inst.(*_Char).char {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
} }
case _CHARCLASS: case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) { if st.inst.(*_CharClass).matches(c) {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
} }
case _ANY: case _ANY:
if c != endOfFile { if c != endOfFile {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
} }
case _NOTNL: case _NOTNL:
if c != endOfFile && c != '\n' { if c != endOfFile && c != '\n' {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
} }
case _BRA: case _BRA:
case _EBRA: case _EBRA:
...@@ -818,10 +871,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { ...@@ -818,10 +871,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
case _END: case _END:
// choose leftmost longest // choose leftmost longest
if !found || // first if !found || // first
st.match[0] < final.match[0] || // leftmost st.match.m[0] < final.match.m[0] || // leftmost
(st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final = st; final = st;
final.match[1] = pos - charwidth; final.match.ref++;
final.match.m[1] = pos - charwidth;
} }
found = true; found = true;
default: default:
...@@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { ...@@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
} }
} }
} }
if final.match == nil {
return nil
}
// if match found, back up start of match by width of prefix. // if match found, back up start of match by width of prefix.
if re.prefix != "" && len(final.match) > 0 { if re.prefix != "" && len(final.match.m) > 0 {
final.match[0] -= len(re.prefix) final.match.m[0] -= len(re.prefix)
} }
return final.match; return final.match.m;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment