Commit 2662b99f authored by Robert Griesemer's avatar Robert Griesemer

suffixarray: implememted FindAllIndex regexp search

Implementation uses fast suffixarray lookup to find
initial matches if the regular expression starts with
a suitable prefix without meta characters.

R=r, rsc
CC=golang-dev
https://golang.org/cl/3720042
parent 8f68b23b
...@@ -18,7 +18,7 @@ package suffixarray ...@@ -18,7 +18,7 @@ package suffixarray
import ( import (
"bytes" "bytes"
"container/vector" "regexp"
"sort" "sort"
) )
...@@ -73,22 +73,124 @@ func (x *Index) search(s []byte) int { ...@@ -73,22 +73,124 @@ func (x *Index) search(s []byte) int {
// Lookup time is O((log(N) + len(result))*len(s)) where N is the // Lookup time is O((log(N) + len(result))*len(s)) where N is the
// size of the indexed data. // size of the indexed data.
// //
func (x *Index) Lookup(s []byte, n int) []int { func (x *Index) Lookup(s []byte, n int) (result []int) {
var res vector.IntVector
if len(s) > 0 && n != 0 { if len(s) > 0 && n != 0 {
// find matching suffix index i // find matching suffix index i
i := x.search(s) i := x.search(s)
// x.at(i-1) < s <= x.at(i) // x.at(i-1) < s <= x.at(i)
// collect the following suffixes with matching prefixes // collect the following suffixes with matching prefixes
for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) { for (n < 0 || len(result) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) {
res.Push(x.sa[i]) result = append(result, x.sa[i])
i++ i++
} }
} }
return
}
// FindAllIndex returns a sorted list of non-overlapping matches of the
// regular expression r, where a match is a pair of indices specifying
// the matched slice of x.Bytes(). If n < 0, all matches are returned
// in successive order. Otherwise, at most n matches are returned and
// they may not be successive. The result is nil if there are no matches,
// or if n == 0.
//
func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
// a non-empty literal prefix is used to determine possible
// match start indices with Lookup
prefix, complete := r.LiteralPrefix()
lit := []byte(prefix)
// worst-case scenario: no literal prefix
if prefix == "" {
return r.FindAllIndex(x.data, n)
}
// if regexp is a literal just use Lookup and convert its
// result into match pairs
if complete {
// Lookup returns indices that may belong to overlapping matches.
// After eliminating them, we may end up with fewer than n matches.
// If we don't have enough at the end, redo the search with an
// increased value n1, but only if Lookup returned all the requested
// indices in the first place (if it returned fewer than that then
// there cannot be more).
for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
indices := x.Lookup(lit, n1)
if len(indices) == 0 {
return
}
sort.SortInts(indices)
pairs := make([]int, 2*len(indices))
result = make([][]int, len(indices))
count := 0
prev := 0
for _, i := range indices {
if count == n {
break
}
// ignore indices leading to overlapping matches
if prev <= i {
j := 2 * count
pairs[j+0] = i
pairs[j+1] = i + len(lit)
result[count] = pairs[j : j+2]
count++
prev = i + len(lit)
}
}
result = result[0:count]
if len(result) >= n || len(indices) != n1 {
// found all matches or there's no chance to find more
// (n and n1 can be negative)
break
}
}
if len(result) == 0 {
result = nil
}
return
}
return res // regexp has a non-empty literal prefix; Lookup(lit) computes
// the indices of possible complete matches; use these as starting
// points for anchored searches
// (regexp "^" matches beginning of input, not beginning of line)
r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
// same comment about Lookup applies here as in the loop above
for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
indices := x.Lookup(lit, n1)
if len(indices) == 0 {
return
}
sort.SortInts(indices)
result = result[0:0]
prev := 0
for _, i := range indices {
if len(result) == n {
break
}
m := r.FindIndex(x.data[i:]) // anchored search - will not run off
// ignore indices leading to overlapping matches
if m != nil && prev <= i {
m[0] = i // correct m
m[1] += i
result = append(result, m)
prev = m[1]
}
}
if len(result) >= n || len(indices) != n1 {
// found all matches or there's no chance to find more
// (n and n1 can be negative)
break
}
}
if len(result) == 0 {
result = nil
}
return
} }
......
...@@ -6,6 +6,7 @@ package suffixarray ...@@ -6,6 +6,7 @@ package suffixarray
import ( import (
"container/vector" "container/vector"
"regexp"
"sort" "sort"
"strings" "strings"
"testing" "testing"
...@@ -15,7 +16,7 @@ import ( ...@@ -15,7 +16,7 @@ import (
type testCase struct { type testCase struct {
name string // name of test case name string // name of test case
source string // source to index source string // source to index
lookups []string // strings to lookup patterns []string // patterns to lookup
} }
...@@ -26,6 +27,9 @@ var testCases = []testCase{ ...@@ -26,6 +27,9 @@ var testCases = []testCase{
[]string{ []string{
"", "",
"foo", "foo",
"(foo)",
".*",
"a*",
}, },
}, },
...@@ -45,6 +49,12 @@ var testCases = []testCase{ ...@@ -45,6 +49,12 @@ var testCases = []testCase{
"aaaaaaaaa", "aaaaaaaaa",
"aaaaaaaaaa", "aaaaaaaaaa",
"aaaaaaaaaaa", // 11 a's "aaaaaaaaaaa", // 11 a's
".",
".*",
"a+",
"aa+",
"aaaa[b]?",
"aaa*",
}, },
}, },
...@@ -58,6 +68,9 @@ var testCases = []testCase{ ...@@ -58,6 +68,9 @@ var testCases = []testCase{
"ab", "ab",
"bc", "bc",
"abc", "abc",
"a.c",
"a(b|c)",
"abc?",
}, },
}, },
...@@ -70,6 +83,7 @@ var testCases = []testCase{ ...@@ -70,6 +83,7 @@ var testCases = []testCase{
"rab", "rab",
"arab", "arab",
"barbar", "barbar",
"bara?bar",
}, },
}, },
...@@ -81,6 +95,7 @@ var testCases = []testCase{ ...@@ -81,6 +95,7 @@ var testCases = []testCase{
"the time", "the time",
"to come the aid", "to come the aid",
"is the time for all good men to come to the aid of their", "is the time for all good men to come to the aid of their",
"to (come|the)?",
}, },
}, },
} }
...@@ -104,8 +119,7 @@ func find(src, s string, n int) []int { ...@@ -104,8 +119,7 @@ func find(src, s string, n int) []int {
} }
func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) { func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
for _, s := range tc.lookups {
res := x.Lookup([]byte(s), n) res := x.Lookup([]byte(s), n)
exp := find(tc.source, s, n) exp := find(tc.source, s, n)
...@@ -119,20 +133,51 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) { ...@@ -119,20 +133,51 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) {
// Index may not find the results in the same order as find) => in general // Index may not find the results in the same order as find) => in general
// we cannot simply check that the res and exp lists are equal // we cannot simply check that the res and exp lists are equal
// check that there are no duplicates // check that each result is in fact a correct match and there are no duplicates
sort.SortInts(res) sort.SortInts(res)
for i, r := range res { for i, r := range res {
if r < 0 || len(tc.source) <= r {
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
} else if !strings.HasPrefix(tc.source[r:], s) {
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
}
if i > 0 && res[i-1] == r { if i > 0 && res[i-1] == r {
t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r) t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
} }
} }
// check that each result is in fact a correct match if n < 0 {
// all results computed - sorted res and exp must be equal
for i, r := range res { for i, r := range res {
if r < 0 || len(src) <= r { e := exp[i]
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(src)) if r != e {
} else if !strings.HasPrefix(src[r:], s) { t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r) }
}
}
}
func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
res := x.FindAllIndex(rx, n)
exp := rx.FindAllStringIndex(tc.source, n)
// check that the lengths match
if len(res) != len(exp) {
t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
}
// if n >= 0 the number of results is limited --- unless n >= all results,
// we may obtain different positions from the Index and from regexp (because
// Index may not find the results in the same order as regexp) => in general
// we cannot simply check that the res and exp lists are equal
// check that each result is in fact a correct match and the result is sorted
for i, r := range res {
if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
} else if !rx.MatchString(tc.source[r[0]:r[1]]) {
t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
} }
} }
...@@ -140,11 +185,20 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) { ...@@ -140,11 +185,20 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) {
// all results computed - sorted res and exp must be equal // all results computed - sorted res and exp must be equal
for i, r := range res { for i, r := range res {
e := exp[i] e := exp[i]
if r != e { if r[0] != e[0] || r[1] != e[1] {
t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r) t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
continue tc.name, rx, i, e[0], e[1], r[0], r[1])
}
} }
} }
}
func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
for _, pat := range tc.patterns {
testLookup(t, tc, x, pat, n)
if rx, err := regexp.Compile(pat); err == nil {
testFindAllIndex(t, tc, x, rx, n)
} }
} }
} }
...@@ -153,9 +207,10 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) { ...@@ -153,9 +207,10 @@ func testLookups(t *testing.T, src string, x *Index, tc *testCase, n int) {
func TestIndex(t *testing.T) { func TestIndex(t *testing.T) {
for _, tc := range testCases { for _, tc := range testCases {
x := New([]byte(tc.source)) x := New([]byte(tc.source))
testLookups(t, tc.source, x, &tc, 0) testLookups(t, &tc, x, 0)
testLookups(t, tc.source, x, &tc, 1) testLookups(t, &tc, x, 1)
testLookups(t, tc.source, x, &tc, 10) testLookups(t, &tc, x, 10)
testLookups(t, tc.source, x, &tc, -1) testLookups(t, &tc, x, 2e9)
testLookups(t, &tc, x, -1)
} }
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment