Commit 23093f86 authored by Rémy Oudompheng's avatar Rémy Oudompheng

strings: better mean complexity for Count and Index.

The O(n+m) complexity is obtained probabilistically
by using Rabin-Karp algorithm, which provides the needed complexity
unless exceptional collisions occur, without memory allocation.

benchmark                 old ns/op    new ns/op    delta
BenchmarkIndexHard1         6532331      4045886  -38.06%
BenchmarkIndexHard2         8178173      4038975  -50.61%
BenchmarkIndexHard3         6973687      4042591  -42.03%
BenchmarkCountHard1         6270864      4071090  -35.08%
BenchmarkCountHard2         7838039      4072853  -48.04%
BenchmarkCountHard3         6697828      4071964  -39.20%
BenchmarkIndexTorture       2730546        28934  -98.94%
BenchmarkCountTorture       2729622        29064  -98.94%

Fixes #4600.

R=rsc, donovanhide, remyoudompheng
CC=golang-dev
https://golang.org/cl/7314095
parent f1037f0e
......@@ -36,15 +36,35 @@ func explode(s string, n int) []string {
return a
}
// primeRK is the prime base used in Rabin-Karp algorithm.
const primeRK = 16777619
// hashstr returns the hash and the appropriate multiplicative
// factor for use in Rabin-Karp algorithm.
func hashstr(sep string) (uint32, uint32) {
hash := uint32(0)
for i := 0; i < len(sep); i++ {
hash = hash*primeRK + uint32(sep[i])
}
var pow, sq uint32 = 1, primeRK
for i := len(sep); i > 0; i >>= 1 {
if i&1 != 0 {
pow *= sq
}
sq *= sq
}
return hash, pow
}
// Count counts the number of non-overlapping instances of sep in s.
func Count(s, sep string) int {
if sep == "" {
return utf8.RuneCountInString(s) + 1
}
c := sep[0]
l := len(sep)
n := 0
if l == 1 {
if len(sep) == 1 {
// special case worth making fast
for i := 0; i < len(s); i++ {
if s[i] == c {
......@@ -53,11 +73,26 @@ func Count(s, sep string) int {
}
return n
}
for i := 0; i+l <= len(s); i++ {
if s[i] == c && s[i:i+l] == sep {
if len(sep) > len(s) {
return 0
}
hashsep, pow := hashstr(sep)
h := uint32(0)
for i := 0; i < len(sep); i++ {
h = h*primeRK + uint32(s[i])
}
lastmatch := 0
for i := len(sep); ; i++ {
// Invariant: h = hash(s[i-l : i])
if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
n++
i += l - 1
lastmatch = i
}
if i >= len(s) {
break
}
h = h*primeRK + uint32(s[i])
h -= pow * uint32(s[i-len(sep)])
}
return n
}
......@@ -94,10 +129,25 @@ func Index(s, sep string) int {
return -1
}
// n > 1
for i := 0; i+n <= len(s); i++ {
if s[i] == c && s[i:i+n] == sep {
return i
if n > len(s) {
return -1
}
// Hash sep.
hashsep, pow := hashstr(sep)
var h uint32
for i := 0; i < n; i++ {
h = h*primeRK + uint32(s[i])
}
for i := n; ; i++ {
// Invariant: h = hash(s[i-n : i])
if h == hashsep && s[i-n:i] == sep {
return i - n
}
if i >= len(s) {
break
}
h = h*primeRK + uint32(s[i])
h -= pow * uint32(s[i-n])
}
return -1
}
......
......@@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) {
}
}
func makeBenchInputHard() string {
tokens := [...]string{
"<a>", "<p>", "<b>", "<strong>",
"</a>", "</p>", "</b>", "</strong>",
"hello", "world",
}
x := make([]byte, 0, 1<<20)
for len(x) < 1<<20 {
i := rand.Intn(len(tokens))
x = append(x, tokens[i]...)
}
return string(x)
}
var benchInputHard = makeBenchInputHard()
func benchmarkIndexHard(b *testing.B, sep string) {
for i := 0; i < b.N; i++ {
Index(benchInputHard, sep)
}
}
func benchmarkCountHard(b *testing.B, sep string) {
for i := 0; i < b.N; i++ {
Count(benchInputHard, sep)
}
}
func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") }
func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "</pre>") }
func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "<b>hello world</b>") }
func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") }
func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "</pre>") }
func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "<b>hello world</b>") }
var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10)
var benchNeedleTorture = Repeat("ABC", 1<<10+1)
func BenchmarkIndexTorture(b *testing.B) {
for i := 0; i < b.N; i++ {
Index(benchInputTorture, benchNeedleTorture)
}
}
func BenchmarkCountTorture(b *testing.B) {
for i := 0; i < b.N; i++ {
Count(benchInputTorture, benchNeedleTorture)
}
}
var makeFieldsInput = func() string {
x := make([]byte, 1<<20)
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment