Commit 6ca324f2 authored by Russ Cox's avatar Russ Cox

index/suffixarray: index 3-10X faster in half the memory

This CL changes the index/suffixarray construction algorithm from QSufSort to SAIS.

For an N-byte input, QSufSort runs in O(N log N) time and requires
an N-int temporary work space in addition to the N-int output.

In contrast, SAIS runs in O(N) time and, for essentially all real inputs,
is able to use the N-int output buffer as its temporary work space.
(In pathological cases, SAIS must allocate a temporary work space of
at most N/2 ints. There exist more complex variants that guarantee
to avoid the work space in all cases, but they hardly seem worth the cost
given how rare these pathological cases are.)

The SAIS code therefore uses 50% of the memory across the board.
It also runs 3-10X faster on real input text.

This CL also adds more extensive algorithmic tests, including an
exhaustive test over small inputs to catch corner case problems.

name                                   old speed      new speed        delta
New/text=opticks/size=100K/bits=32-12  6.15MB/s ± 1%   26.79MB/s ± 1%   +335.89%  (p=0.008 n=5+5)
New/text=opticks/size=100K/bits=64-12  5.90MB/s ± 2%   27.29MB/s ± 2%   +362.23%  (p=0.008 n=5+5)
New/text=opticks/size=500K/bits=32-12  4.99MB/s ± 3%   25.37MB/s ± 2%   +408.01%  (p=0.008 n=5+5)
New/text=opticks/size=500K/bits=64-12  4.88MB/s ± 1%   25.66MB/s ± 4%   +425.52%  (p=0.008 n=5+5)
New/text=go/size=100K/bits=32-12       5.81MB/s ± 1%   26.49MB/s ± 2%   +355.85%  (p=0.008 n=5+5)
New/text=go/size=100K/bits=64-12       5.76MB/s ± 2%   26.65MB/s ± 3%   +362.60%  (p=0.008 n=5+5)
New/text=go/size=500K/bits=32-12       4.91MB/s ± 1%   25.12MB/s ± 2%   +411.86%  (p=0.008 n=5+5)
New/text=go/size=500K/bits=64-12       4.83MB/s ± 2%   25.79MB/s ± 2%   +434.44%  (p=0.008 n=5+5)
New/text=go/size=1M/bits=32-12         4.62MB/s ± 2%   24.87MB/s ± 2%   +438.78%  (p=0.008 n=5+5)
New/text=go/size=1M/bits=64-12         4.39MB/s ± 2%   24.61MB/s ± 2%   +460.68%  (p=0.008 n=5+5)
New/text=go/size=5M/bits=32-12         2.85MB/s ± 2%   24.78MB/s ± 7%   +768.33%  (p=0.008 n=5+5)
New/text=go/size=5M/bits=64-12         2.28MB/s ± 1%   18.70MB/s ± 7%   +719.63%  (p=0.008 n=5+5)
New/text=go/size=10M/bits=32-12        2.08MB/s ± 1%   21.04MB/s ± 6%   +909.60%  (p=0.008 n=5+5)
New/text=go/size=10M/bits=64-12        1.83MB/s ± 1%   16.64MB/s ± 2%   +809.18%  (p=0.008 n=5+5)
New/text=go/size=50M/bits=32-12        1.51MB/s ± 0%   10.58MB/s ± 1%   +602.52%  (p=0.008 n=5+5)
New/text=go/size=50M/bits=64-12        1.34MB/s ± 4%    9.00MB/s ± 1%   +569.35%  (p=0.008 n=5+5)
New/text=zero/size=100K/bits=32-12     4.17MB/s ± 0%  157.56MB/s ± 1%  +3678.42%  (p=0.016 n=4+5)
New/text=zero/size=100K/bits=64-12     4.19MB/s ± 2%  162.72MB/s ± 2%  +3783.63%  (p=0.008 n=5+5)
New/text=zero/size=500K/bits=32-12     3.72MB/s ± 5%  159.17MB/s ± 1%  +4176.57%  (p=0.008 n=5+5)
New/text=zero/size=500K/bits=64-12     3.77MB/s ± 3%  164.95MB/s ± 4%  +4277.60%  (p=0.008 n=5+5)
New/text=zero/size=1M/bits=32-12       3.46MB/s ± 3%  158.42MB/s ± 1%  +4476.08%  (p=0.008 n=5+5)
New/text=zero/size=1M/bits=64-12       3.41MB/s ± 4%  163.70MB/s ± 2%  +4700.65%  (p=0.008 n=5+5)
New/text=zero/size=5M/bits=32-12       3.12MB/s ± 2%  151.92MB/s ± 4%  +4775.48%  (p=0.008 n=5+5)
New/text=zero/size=5M/bits=64-12       3.09MB/s ± 2%  166.19MB/s ± 2%  +5274.84%  (p=0.008 n=5+5)
New/text=zero/size=10M/bits=32-12      2.97MB/s ± 1%  157.75MB/s ± 1%  +5211.38%  (p=0.008 n=5+5)
New/text=zero/size=10M/bits=64-12      2.92MB/s ± 1%  162.75MB/s ± 2%  +5473.77%  (p=0.008 n=5+5)
New/text=zero/size=50M/bits=32-12      2.67MB/s ± 1%  144.43MB/s ± 5%  +5305.39%  (p=0.008 n=5+5)
New/text=zero/size=50M/bits=64-12      2.61MB/s ± 1%  125.19MB/s ± 2%  +4700.33%  (p=0.016 n=5+4)
New/text=rand/size=100K/bits=32-12     8.69MB/s ± 6%   27.60MB/s ± 1%   +217.73%  (p=0.008 n=5+5)
New/text=rand/size=100K/bits=64-12     8.92MB/s ± 1%   26.37MB/s ± 4%   +195.50%  (p=0.008 n=5+5)
New/text=rand/size=500K/bits=32-12     7.11MB/s ± 2%   25.23MB/s ± 2%   +254.78%  (p=0.008 n=5+5)
New/text=rand/size=500K/bits=64-12     7.08MB/s ± 1%   25.45MB/s ± 2%   +259.56%  (p=0.008 n=5+5)
New/text=rand/size=1M/bits=32-12       6.45MB/s ± 2%   24.47MB/s ± 3%   +279.11%  (p=0.008 n=5+5)
New/text=rand/size=1M/bits=64-12       6.09MB/s ± 4%   23.00MB/s ± 4%   +277.85%  (p=0.008 n=5+5)
New/text=rand/size=5M/bits=32-12       3.68MB/s ± 3%   10.34MB/s ± 5%   +181.08%  (p=0.008 n=5+5)
New/text=rand/size=5M/bits=64-12       3.25MB/s ± 1%    6.23MB/s ± 1%    +91.93%  (p=0.008 n=5+5)
New/text=rand/size=10M/bits=32-12      3.03MB/s ± 1%    5.61MB/s ± 2%    +85.28%  (p=0.008 n=5+5)
New/text=rand/size=10M/bits=64-12      2.80MB/s ± 1%    4.29MB/s ± 2%    +53.40%  (p=0.008 n=5+5)
New/text=rand/size=50M/bits=32-12      2.11MB/s ± 0%    2.45MB/s ± 1%    +16.23%  (p=0.029 n=4+4)
New/text=rand/size=50M/bits=64-12      2.04MB/s ± 1%    2.24MB/s ± 1%    +10.03%  (p=0.016 n=5+4)
SaveRestore/bits=32-12                  327MB/s ± 5%     319MB/s ± 2%       ~     (p=0.310 n=5+5)
SaveRestore/bits=64-12                  306MB/s ± 3%     306MB/s ± 2%       ~     (p=0.841 n=5+5)

name                                   old alloc/op   new alloc/op     delta
New/text=opticks/size=100K/bits=32-12     811kB ± 0%       401kB ± 0%    -50.51%  (p=0.008 n=5+5)
New/text=opticks/size=100K/bits=64-12    1.62MB ± 0%      0.80MB ± 0%    -50.51%  (p=0.008 n=5+5)
New/text=opticks/size=500K/bits=32-12    4.04MB ± 0%      2.01MB ± 0%    -50.37%  (p=0.008 n=5+5)
New/text=opticks/size=500K/bits=64-12    8.07MB ± 0%      4.01MB ± 0%    -50.36%  (p=0.016 n=4+5)
New/text=go/size=100K/bits=32-12          811kB ± 0%       401kB ± 0%       ~     (p=0.079 n=4+5)
New/text=go/size=100K/bits=64-12         1.62MB ± 0%      0.80MB ± 0%    -50.50%  (p=0.008 n=5+5)
New/text=go/size=500K/bits=32-12         4.04MB ± 0%      2.01MB ± 0%       ~     (p=0.079 n=4+5)
New/text=go/size=500K/bits=64-12         8.07MB ± 0%      4.01MB ± 0%    -50.36%  (p=0.000 n=4+5)
New/text=go/size=1M/bits=32-12           8.07MB ± 0%      4.01MB ± 0%    -50.36%  (p=0.008 n=5+5)
New/text=go/size=1M/bits=64-12           16.1MB ± 0%       8.0MB ± 0%    -50.36%  (p=0.008 n=5+5)
New/text=go/size=5M/bits=32-12           40.2MB ± 0%      20.0MB ± 0%    -50.18%  (p=0.008 n=5+5)
New/text=go/size=5M/bits=64-12           80.3MB ± 0%      40.0MB ± 0%    -50.18%  (p=0.008 n=5+5)
New/text=go/size=10M/bits=32-12          80.2MB ± 0%      40.0MB ± 0%    -50.09%  (p=0.000 n=5+4)
New/text=go/size=10M/bits=64-12           160MB ± 0%        80MB ± 0%    -50.09%  (p=0.000 n=5+4)
New/text=go/size=50M/bits=32-12           402MB ± 0%       200MB ± 0%    -50.29%  (p=0.000 n=5+4)
New/text=go/size=50M/bits=64-12           805MB ± 0%       400MB ± 0%    -50.29%  (p=0.000 n=5+4)
New/text=zero/size=100K/bits=32-12       1.46MB ± 0%      0.40MB ± 0%    -72.46%  (p=0.008 n=5+5)
New/text=zero/size=100K/bits=64-12       3.02MB ± 0%      0.80MB ± 0%    -73.45%  (p=0.008 n=5+5)
New/text=zero/size=500K/bits=32-12       8.66MB ± 0%      2.01MB ± 0%       ~     (p=0.079 n=4+5)
New/text=zero/size=500K/bits=64-12       19.7MB ± 0%       4.0MB ± 0%    -79.63%  (p=0.008 n=5+5)
New/text=zero/size=1M/bits=32-12         19.7MB ± 0%       4.0MB ± 0%       ~     (p=0.079 n=4+5)
New/text=zero/size=1M/bits=64-12         39.0MB ± 0%       8.0MB ± 0%    -79.48%  (p=0.000 n=5+4)
New/text=zero/size=5M/bits=32-12         85.2MB ± 0%      20.0MB ± 0%    -76.52%  (p=0.008 n=5+5)
New/text=zero/size=5M/bits=64-12          169MB ± 0%        40MB ± 0%    -76.27%  (p=0.008 n=5+5)
New/text=zero/size=10M/bits=32-12         169MB ± 0%        40MB ± 0%    -76.26%  (p=0.000 n=5+4)
New/text=zero/size=10M/bits=64-12         333MB ± 0%        80MB ± 0%    -75.99%  (p=0.008 n=5+5)
New/text=zero/size=50M/bits=32-12         739MB ± 0%       200MB ± 0%    -72.93%  (p=0.000 n=4+5)
New/text=zero/size=50M/bits=64-12        1.63GB ± 0%      0.40GB ± 0%    -75.42%  (p=0.008 n=5+5)
New/text=rand/size=100K/bits=32-12        807kB ± 0%       401kB ± 0%    -50.25%  (p=0.008 n=5+5)
New/text=rand/size=100K/bits=64-12       1.61MB ± 0%      0.80MB ± 0%    -50.25%  (p=0.008 n=5+5)
New/text=rand/size=500K/bits=32-12       4.04MB ± 0%      2.01MB ± 0%       ~     (p=0.079 n=4+5)
New/text=rand/size=500K/bits=64-12       8.07MB ± 0%      4.01MB ± 0%       ~     (p=0.079 n=4+5)
New/text=rand/size=1M/bits=32-12         8.07MB ± 0%      4.01MB ± 0%    -50.36%  (p=0.000 n=5+4)
New/text=rand/size=1M/bits=64-12         16.1MB ± 0%       8.0MB ± 0%    -50.36%  (p=0.008 n=5+5)
New/text=rand/size=5M/bits=32-12         40.3MB ± 0%      20.0MB ± 0%    -50.35%  (p=0.029 n=4+4)
New/text=rand/size=5M/bits=64-12         80.7MB ± 0%      40.0MB ± 0%       ~     (p=0.079 n=4+5)
New/text=rand/size=10M/bits=32-12        80.7MB ± 0%      40.0MB ± 0%    -50.41%  (p=0.008 n=5+5)
New/text=rand/size=10M/bits=64-12         161MB ± 0%        80MB ± 0%    -50.44%  (p=0.029 n=4+4)
New/text=rand/size=50M/bits=32-12         403MB ± 0%       200MB ± 0%    -50.36%  (p=0.000 n=5+4)
New/text=rand/size=50M/bits=64-12         806MB ± 0%       400MB ± 0%       ~     (p=0.079 n=4+5)
SaveRestore/bits=32-12                   5.28MB ± 0%      5.28MB ± 0%       ~     (p=1.000 n=5+5)
SaveRestore/bits=64-12                   9.47MB ± 0%      9.47MB ± 0%       ~     (p=0.286 n=5+5)

https://perf.golang.org/search?q=upload:20190426.1

Fixes #15480.

Change-Id: I0790f6edf67f5a9c02b4462632b4942e0c37988b
Reviewed-on: https://go-review.googlesource.com/c/go/+/174100
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarEric Roshan-Eisner <edre@google.com>
Reviewed-by: default avatarBrad Fitzpatrick <bradfitz@golang.org>
parent 7a43f8a5
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Gen generates sais2.go by duplicating functions in sais.go
// using different input types.
// See the comment at the top of sais.go for details.
package main
import (
"bytes"
"io/ioutil"
"log"
"strings"
)
func main() {
log.SetPrefix("gen: ")
log.SetFlags(0)
data, err := ioutil.ReadFile("sais.go")
if err != nil {
log.Fatal(err)
}
x := bytes.Index(data, []byte("\n\n"))
if x < 0 {
log.Fatal("cannot find blank line after copyright comment")
}
var buf bytes.Buffer
buf.Write(data[:x])
buf.WriteString("\n\n// Code generated by go generate; DO NOT EDIT.\n\npackage suffixarray\n")
for {
x := bytes.Index(data, []byte("\nfunc "))
if x < 0 {
break
}
data = data[x:]
p := bytes.IndexByte(data, '(')
if p < 0 {
p = len(data)
}
name := string(data[len("\nfunc "):p])
x = bytes.Index(data, []byte("\n}\n"))
if x < 0 {
log.Fatalf("cannot find end of func %s", name)
}
fn := string(data[:x+len("\n}\n")])
data = data[x+len("\n}"):]
if strings.HasSuffix(name, "_32") {
buf.WriteString(fix32.Replace(fn))
}
if strings.HasSuffix(name, "_8_32") {
// x_8_32 -> x_8_64 done above
fn = fix8_32.Replace(stripByteOnly(fn))
buf.WriteString(fn)
buf.WriteString(fix32.Replace(fn))
}
}
if err := ioutil.WriteFile("sais2.go", buf.Bytes(), 0666); err != nil {
log.Fatal(err)
}
}
var fix32 = strings.NewReplacer(
"32", "64",
"int32", "int64",
)
var fix8_32 = strings.NewReplacer(
"_8_32", "_32",
"byte", "int32",
)
func stripByteOnly(s string) string {
lines := strings.SplitAfter(s, "\n")
w := 0
for _, line := range lines {
if !strings.Contains(line, "256") && !strings.Contains(line, "byte-only") {
lines[w] = line
w++
}
}
return strings.Join(lines[:w], "")
}
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Gen64 generates qsufsort64.go from qsufsort.go by s/32/64/g.
package main
import (
"bytes"
"io/ioutil"
"log"
)
func main() {
log.SetPrefix("gen64: ")
log.SetFlags(0)
data, err := ioutil.ReadFile("qsufsort.go")
if err != nil {
log.Fatal(err)
}
data = bytes.Replace(data, []byte("\n\n"), []byte("\n\n// Code generated by gen64.go; DO NOT EDIT.\n//go:generate go run gen64.go\n\n"), 1)
data = bytes.Replace(data, []byte("32"), []byte("64"), -1)
if err := ioutil.WriteFile("qsufsort64.go", data, 0666); err != nil {
log.Fatal(err)
}
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This algorithm is based on "Faster Suffix Sorting"
// by N. Jesper Larsson and Kunihiko Sadakane
// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
// code: http://www.larsson.dogma.net/qsufsort.c
// This algorithm computes the suffix array sa by computing its inverse.
// Consecutive groups of suffixes in sa are labeled as sorted groups or
// unsorted groups. For a given pass of the sorter, all suffixes are ordered
// up to their first h characters, and sa is h-ordered. Suffixes in their
// final positions and unambiguously sorted in h-order are in a sorted group.
// Consecutive groups of suffixes with identical first h characters are an
// unsorted group. In each pass of the algorithm, unsorted groups are sorted
// according to the group number of their following suffix.
// In the implementation, if sa[i] is negative, it indicates that i is
// the first element of a sorted group of length -sa[i], and can be skipped.
// An unsorted group sa[i:k] is given the group number of the index of its
// last element, k-1. The group numbers are stored in the inverse slice (inv),
// and when all groups are sorted, this slice is the inverse suffix array.
package suffixarray
import (
"sort"
)
func qsufsort32(data []byte) []int32 {
// initial sorting by first byte of suffix
sa := sortedByFirstByte32(data)
if len(sa) < 2 {
return sa
}
// initialize the group lookup table
// this becomes the inverse of the suffix array when all groups are sorted
inv := initGroups32(sa, data)
// the index starts 1-ordered
sufSortable := &suffixSortable32{sa: sa, inv: inv, h: 1}
for sa[0] > -int32(len(sa)) { // until all suffixes are one big sorted group
// The suffixes are h-ordered, make them 2*h-ordered
pi := int32(0) // pi is first position of first group
sl := int32(0) // sl is negated length of sorted groups
for pi < int32(len(sa)) {
if s := sa[pi]; s < 0 { // if pi starts sorted group
pi -= s // skip over sorted group
sl += s // add negated length to sl
} else { // if pi starts unsorted group
if sl != 0 {
sa[pi+sl] = sl // combine sorted groups before pi
sl = 0
}
pk := inv[s] + 1 // pk-1 is last position of unsorted group
sufSortable.sa = sa[pi:pk]
sort.Sort(sufSortable)
sufSortable.updateGroups(pi)
pi = pk // next group
}
}
if sl != 0 { // if the array ends with a sorted group
sa[pi+sl] = sl // combine sorted groups at end of sa
}
sufSortable.h *= 2 // double sorted depth
}
for i := range sa { // reconstruct suffix array from inverse
sa[inv[i]] = int32(i)
}
return sa
}
func sortedByFirstByte32(data []byte) []int32 {
// total byte counts
var count [256]int
for _, b := range data {
count[b]++
}
// make count[b] equal index of first occurrence of b in sorted array
sum := 0
for b := range count {
count[b], sum = sum, count[b]+sum
}
// iterate through bytes, placing index into the correct spot in sa
sa := make([]int32, len(data))
for i, b := range data {
sa[count[b]] = int32(i)
count[b]++
}
return sa
}
func initGroups32(sa []int32, data []byte) []int32 {
// label contiguous same-letter groups with the same group number
inv := make([]int32, len(data))
prevGroup := int32(len(sa)) - 1
groupByte := data[sa[prevGroup]]
for i := int32(len(sa)) - 1; i >= 0; i-- {
if b := data[sa[i]]; b < groupByte {
if prevGroup == i+1 {
sa[i+1] = -1
}
groupByte = b
prevGroup = i
}
inv[sa[i]] = prevGroup
if prevGroup == 0 {
sa[0] = -1
}
}
// Separate out the final suffix to the start of its group.
// This is necessary to ensure the suffix "a" is before "aba"
// when using a potentially unstable sort.
lastByte := data[len(data)-1]
s := int32(-1)
for i := range sa {
if sa[i] >= 0 {
if data[sa[i]] == lastByte && s == -1 {
s = int32(i)
}
if sa[i] == int32(len(sa))-1 {
sa[i], sa[s] = sa[s], sa[i]
inv[sa[s]] = s
sa[s] = -1 // mark it as an isolated sorted group
break
}
}
}
return inv
}
type suffixSortable32 struct {
sa []int32
inv []int32
h int32
buf []int32 // common scratch space
}
func (x *suffixSortable32) Len() int { return len(x.sa) }
func (x *suffixSortable32) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
func (x *suffixSortable32) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
func (x *suffixSortable32) updateGroups(offset int32) {
bounds := x.buf[0:0]
group := x.inv[x.sa[0]+x.h]
for i := 1; i < len(x.sa); i++ {
if g := x.inv[x.sa[i]+x.h]; g > group {
bounds = append(bounds, int32(i))
group = g
}
}
bounds = append(bounds, int32(len(x.sa)))
x.buf = bounds
// update the group numberings after all new groups are determined
prev := int32(0)
for _, b := range bounds {
for i := prev; i < b; i++ {
x.inv[x.sa[i]] = offset + b - 1
}
if b-prev == 1 {
x.sa[prev] = -1
}
prev = b
}
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by gen64.go; DO NOT EDIT.
//go:generate go run gen64.go
// This algorithm is based on "Faster Suffix Sorting"
// by N. Jesper Larsson and Kunihiko Sadakane
// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
// code: http://www.larsson.dogma.net/qsufsort.c
// This algorithm computes the suffix array sa by computing its inverse.
// Consecutive groups of suffixes in sa are labeled as sorted groups or
// unsorted groups. For a given pass of the sorter, all suffixes are ordered
// up to their first h characters, and sa is h-ordered. Suffixes in their
// final positions and unambiguously sorted in h-order are in a sorted group.
// Consecutive groups of suffixes with identical first h characters are an
// unsorted group. In each pass of the algorithm, unsorted groups are sorted
// according to the group number of their following suffix.
// In the implementation, if sa[i] is negative, it indicates that i is
// the first element of a sorted group of length -sa[i], and can be skipped.
// An unsorted group sa[i:k] is given the group number of the index of its
// last element, k-1. The group numbers are stored in the inverse slice (inv),
// and when all groups are sorted, this slice is the inverse suffix array.
package suffixarray
import (
"sort"
)
func qsufsort64(data []byte) []int64 {
// initial sorting by first byte of suffix
sa := sortedByFirstByte64(data)
if len(sa) < 2 {
return sa
}
// initialize the group lookup table
// this becomes the inverse of the suffix array when all groups are sorted
inv := initGroups64(sa, data)
// the index starts 1-ordered
sufSortable := &suffixSortable64{sa: sa, inv: inv, h: 1}
for sa[0] > -int64(len(sa)) { // until all suffixes are one big sorted group
// The suffixes are h-ordered, make them 2*h-ordered
pi := int64(0) // pi is first position of first group
sl := int64(0) // sl is negated length of sorted groups
for pi < int64(len(sa)) {
if s := sa[pi]; s < 0 { // if pi starts sorted group
pi -= s // skip over sorted group
sl += s // add negated length to sl
} else { // if pi starts unsorted group
if sl != 0 {
sa[pi+sl] = sl // combine sorted groups before pi
sl = 0
}
pk := inv[s] + 1 // pk-1 is last position of unsorted group
sufSortable.sa = sa[pi:pk]
sort.Sort(sufSortable)
sufSortable.updateGroups(pi)
pi = pk // next group
}
}
if sl != 0 { // if the array ends with a sorted group
sa[pi+sl] = sl // combine sorted groups at end of sa
}
sufSortable.h *= 2 // double sorted depth
}
for i := range sa { // reconstruct suffix array from inverse
sa[inv[i]] = int64(i)
}
return sa
}
func sortedByFirstByte64(data []byte) []int64 {
// total byte counts
var count [256]int
for _, b := range data {
count[b]++
}
// make count[b] equal index of first occurrence of b in sorted array
sum := 0
for b := range count {
count[b], sum = sum, count[b]+sum
}
// iterate through bytes, placing index into the correct spot in sa
sa := make([]int64, len(data))
for i, b := range data {
sa[count[b]] = int64(i)
count[b]++
}
return sa
}
func initGroups64(sa []int64, data []byte) []int64 {
// label contiguous same-letter groups with the same group number
inv := make([]int64, len(data))
prevGroup := int64(len(sa)) - 1
groupByte := data[sa[prevGroup]]
for i := int64(len(sa)) - 1; i >= 0; i-- {
if b := data[sa[i]]; b < groupByte {
if prevGroup == i+1 {
sa[i+1] = -1
}
groupByte = b
prevGroup = i
}
inv[sa[i]] = prevGroup
if prevGroup == 0 {
sa[0] = -1
}
}
// Separate out the final suffix to the start of its group.
// This is necessary to ensure the suffix "a" is before "aba"
// when using a potentially unstable sort.
lastByte := data[len(data)-1]
s := int64(-1)
for i := range sa {
if sa[i] >= 0 {
if data[sa[i]] == lastByte && s == -1 {
s = int64(i)
}
if sa[i] == int64(len(sa))-1 {
sa[i], sa[s] = sa[s], sa[i]
inv[sa[s]] = s
sa[s] = -1 // mark it as an isolated sorted group
break
}
}
}
return inv
}
type suffixSortable64 struct {
sa []int64
inv []int64
h int64
buf []int64 // common scratch space
}
func (x *suffixSortable64) Len() int { return len(x.sa) }
func (x *suffixSortable64) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
func (x *suffixSortable64) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
func (x *suffixSortable64) updateGroups(offset int64) {
bounds := x.buf[0:0]
group := x.inv[x.sa[0]+x.h]
for i := 1; i < len(x.sa); i++ {
if g := x.inv[x.sa[i]+x.h]; g > group {
bounds = append(bounds, int64(i))
group = g
}
}
bounds = append(bounds, int64(len(x.sa)))
x.buf = bounds
// update the group numberings after all new groups are determined
prev := int64(0)
for _, b := range bounds {
for i := prev; i < b; i++ {
x.inv[x.sa[i]] = offset + b - 1
}
if b-prev == 1 {
x.sa[prev] = -1
}
prev = b
}
}
This diff is collapsed.
This diff is collapsed.
...@@ -72,13 +72,15 @@ func (a *ints) slice(i, j int) ints { ...@@ -72,13 +72,15 @@ func (a *ints) slice(i, j int) ints {
} }
// New creates a new Index for data. // New creates a new Index for data.
// Index creation time is O(N*log(N)) for N = len(data). // Index creation time is O(N) for N = len(data).
func New(data []byte) *Index { func New(data []byte) *Index {
ix := &Index{data: data} ix := &Index{data: data}
if len(data) <= maxData32 { if len(data) <= maxData32 {
ix.sa.int32 = qsufsort32(data) ix.sa.int32 = make([]int32, len(data))
text_32(data, ix.sa.int32)
} else { } else {
ix.sa.int64 = qsufsort64(data) ix.sa.int64 = make([]int64, len(data))
text_64(data, ix.sa.int64)
} }
return ix return ix
} }
......
...@@ -314,6 +314,158 @@ func TestIndex64(t *testing.T) { ...@@ -314,6 +314,158 @@ func TestIndex64(t *testing.T) {
testIndex(t) testIndex(t)
} }
func TestNew32(t *testing.T) {
test(t, func(x []byte) []int {
sa := make([]int32, len(x))
text_32(x, sa)
out := make([]int, len(sa))
for i, v := range sa {
out[i] = int(v)
}
return out
})
}
func TestNew64(t *testing.T) {
test(t, func(x []byte) []int {
sa := make([]int64, len(x))
text_64(x, sa)
out := make([]int, len(sa))
for i, v := range sa {
out[i] = int(v)
}
return out
})
}
// test tests an arbitrary suffix array construction function.
// Generates many inputs, builds and checks suffix arrays.
func test(t *testing.T, build func([]byte) []int) {
t.Run("ababab...", func(t *testing.T) {
// Very repetitive input has numLMS = len(x)/2-1
// at top level, the largest it can be.
// But maxID is only two (aba and ab$).
size := 100000
if testing.Short() {
size = 10000
}
x := make([]byte, size)
for i := range x {
x[i] = "ab"[i%2]
}
testSA(t, x, build)
})
t.Run("forcealloc", func(t *testing.T) {
// Construct a pathological input that forces
// recurse_32 to allocate a new temporary buffer.
// The input must have more than N/3 LMS-substrings,
// which we arrange by repeating an SLSLSLSLSLSL pattern
// like ababab... above, but then we must also arrange
// for a large number of distinct LMS-substrings.
// We use this pattern:
// 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ...
// This gives approximately 2¹⁵ distinct LMS-substrings.
// We need to repeat at least one substring, though,
// or else the recursion can be bypassed entirely.
x := make([]byte, 100000, 100001)
lo := byte(1)
hi := byte(255)
for i := range x {
if i%2 == 0 {
x[i] = lo
} else {
x[i] = hi
hi--
if hi <= lo {
lo++
if lo == 0 {
lo = 1
}
hi = 255
}
}
}
x[:cap(x)][len(x)] = 0 // for sais.New
testSA(t, x, build)
})
t.Run("exhaustive2", func(t *testing.T) {
// All inputs over {0,1} up to length 21.
// Runs in about 10 seconds on my laptop.
x := make([]byte, 30)
numFail := 0
for n := 0; n <= 21; n++ {
if n > 12 && testing.Short() {
break
}
x[n] = 0 // for sais.New
testRec(t, x[:n], 0, 2, &numFail, build)
}
})
t.Run("exhaustive3", func(t *testing.T) {
// All inputs over {0,1,2} up to length 14.
// Runs in about 10 seconds on my laptop.
x := make([]byte, 30)
numFail := 0
for n := 0; n <= 14; n++ {
if n > 8 && testing.Short() {
break
}
x[n] = 0 // for sais.New
testRec(t, x[:n], 0, 3, &numFail, build)
}
})
}
// testRec fills x[i:] with all possible combinations of values in [1,max]
// and then calls testSA(t, x, build) for each one.
func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
if i < len(x) {
for x[i] = 1; x[i] <= byte(max); x[i]++ {
testRec(t, x, i+1, max, numFail, build)
}
return
}
if !testSA(t, x, build) {
*numFail++
if *numFail >= 10 {
t.Errorf("stopping after %d failures", *numFail)
t.FailNow()
}
}
}
// testSA tests the suffix array build function on the input x.
// It constructs the suffix array and then checks that it is correct.
func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
defer func() {
if e := recover(); e != nil {
t.Logf("build %v", x)
panic(e)
}
}()
sa := build(x)
if len(sa) != len(x) {
t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
return false
}
for i := 0; i+1 < len(sa); i++ {
if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) {
t.Errorf("build %s: sa out of range: %v\n", x, sa)
return false
}
if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
return false
}
}
return true
}
var ( var (
benchdata = make([]byte, 1e6) benchdata = make([]byte, 1e6)
benchrand = make([]byte, 1e6) benchrand = make([]byte, 1e6)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment