index/suffixarray: index 3-10X faster in half the memory

This CL changes the index/suffixarray construction algorithm from QSufSort to SAIS. For an N-byte input, QSufSort runs in O(N log N) time and requires an N-int temporary work space in addition to the N-int output. In contrast, SAIS runs in O(N) time and, for essentially all real inputs, is able to use the N-int output buffer as its temporary work space. (In pathological cases, SAIS must allocate a temporary work space of at most N/2 ints. There exist more complex variants that guarantee to avoid the work space in all cases, but they hardly seem worth the cost given how rare these pathological cases are.) The SAIS code therefore uses 50% of the memory across the board. It also runs 3-10X faster on real input text. This CL also adds more extensive algorithmic tests, including an exhaustive test over small inputs to catch corner case problems. name old speed new speed delta New/text=opticks/size=100K/bits=32-12 6.15MB/s ± 1% 26.79MB/s ± 1% +335.89% (p=0.008 n=5+5) New/text=opticks/size=100K/bits=64-12 5.90MB/s ± 2% 27.29MB/s ± 2% +362.23% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=32-12 4.99MB/s ± 3% 25.37MB/s ± 2% +408.01% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=64-12 4.88MB/s ± 1% 25.66MB/s ± 4% +425.52% (p=0.008 n=5+5) New/text=go/size=100K/bits=32-12 5.81MB/s ± 1% 26.49MB/s ± 2% +355.85% (p=0.008 n=5+5) New/text=go/size=100K/bits=64-12 5.76MB/s ± 2% 26.65MB/s ± 3% +362.60% (p=0.008 n=5+5) New/text=go/size=500K/bits=32-12 4.91MB/s ± 1% 25.12MB/s ± 2% +411.86% (p=0.008 n=5+5) New/text=go/size=500K/bits=64-12 4.83MB/s ± 2% 25.79MB/s ± 2% +434.44% (p=0.008 n=5+5) New/text=go/size=1M/bits=32-12 4.62MB/s ± 2% 24.87MB/s ± 2% +438.78% (p=0.008 n=5+5) New/text=go/size=1M/bits=64-12 4.39MB/s ± 2% 24.61MB/s ± 2% +460.68% (p=0.008 n=5+5) New/text=go/size=5M/bits=32-12 2.85MB/s ± 2% 24.78MB/s ± 7% +768.33% (p=0.008 n=5+5) New/text=go/size=5M/bits=64-12 2.28MB/s ± 1% 18.70MB/s ± 7% +719.63% (p=0.008 n=5+5) New/text=go/size=10M/bits=32-12 2.08MB/s ± 1% 21.04MB/s ± 
6% +909.60% (p=0.008 n=5+5) New/text=go/size=10M/bits=64-12 1.83MB/s ± 1% 16.64MB/s ± 2% +809.18% (p=0.008 n=5+5) New/text=go/size=50M/bits=32-12 1.51MB/s ± 0% 10.58MB/s ± 1% +602.52% (p=0.008 n=5+5) New/text=go/size=50M/bits=64-12 1.34MB/s ± 4% 9.00MB/s ± 1% +569.35% (p=0.008 n=5+5) New/text=zero/size=100K/bits=32-12 4.17MB/s ± 0% 157.56MB/s ± 1% +3678.42% (p=0.016 n=4+5) New/text=zero/size=100K/bits=64-12 4.19MB/s ± 2% 162.72MB/s ± 2% +3783.63% (p=0.008 n=5+5) New/text=zero/size=500K/bits=32-12 3.72MB/s ± 5% 159.17MB/s ± 1% +4176.57% (p=0.008 n=5+5) New/text=zero/size=500K/bits=64-12 3.77MB/s ± 3% 164.95MB/s ± 4% +4277.60% (p=0.008 n=5+5) New/text=zero/size=1M/bits=32-12 3.46MB/s ± 3% 158.42MB/s ± 1% +4476.08% (p=0.008 n=5+5) New/text=zero/size=1M/bits=64-12 3.41MB/s ± 4% 163.70MB/s ± 2% +4700.65% (p=0.008 n=5+5) New/text=zero/size=5M/bits=32-12 3.12MB/s ± 2% 151.92MB/s ± 4% +4775.48% (p=0.008 n=5+5) New/text=zero/size=5M/bits=64-12 3.09MB/s ± 2% 166.19MB/s ± 2% +5274.84% (p=0.008 n=5+5) New/text=zero/size=10M/bits=32-12 2.97MB/s ± 1% 157.75MB/s ± 1% +5211.38% (p=0.008 n=5+5) New/text=zero/size=10M/bits=64-12 2.92MB/s ± 1% 162.75MB/s ± 2% +5473.77% (p=0.008 n=5+5) New/text=zero/size=50M/bits=32-12 2.67MB/s ± 1% 144.43MB/s ± 5% +5305.39% (p=0.008 n=5+5) New/text=zero/size=50M/bits=64-12 2.61MB/s ± 1% 125.19MB/s ± 2% +4700.33% (p=0.016 n=5+4) New/text=rand/size=100K/bits=32-12 8.69MB/s ± 6% 27.60MB/s ± 1% +217.73% (p=0.008 n=5+5) New/text=rand/size=100K/bits=64-12 8.92MB/s ± 1% 26.37MB/s ± 4% +195.50% (p=0.008 n=5+5) New/text=rand/size=500K/bits=32-12 7.11MB/s ± 2% 25.23MB/s ± 2% +254.78% (p=0.008 n=5+5) New/text=rand/size=500K/bits=64-12 7.08MB/s ± 1% 25.45MB/s ± 2% +259.56% (p=0.008 n=5+5) New/text=rand/size=1M/bits=32-12 6.45MB/s ± 2% 24.47MB/s ± 3% +279.11% (p=0.008 n=5+5) New/text=rand/size=1M/bits=64-12 6.09MB/s ± 4% 23.00MB/s ± 4% +277.85% (p=0.008 n=5+5) New/text=rand/size=5M/bits=32-12 3.68MB/s ± 3% 10.34MB/s ± 5% +181.08% (p=0.008 n=5+5) 
New/text=rand/size=5M/bits=64-12 3.25MB/s ± 1% 6.23MB/s ± 1% +91.93% (p=0.008 n=5+5) New/text=rand/size=10M/bits=32-12 3.03MB/s ± 1% 5.61MB/s ± 2% +85.28% (p=0.008 n=5+5) New/text=rand/size=10M/bits=64-12 2.80MB/s ± 1% 4.29MB/s ± 2% +53.40% (p=0.008 n=5+5) New/text=rand/size=50M/bits=32-12 2.11MB/s ± 0% 2.45MB/s ± 1% +16.23% (p=0.029 n=4+4) New/text=rand/size=50M/bits=64-12 2.04MB/s ± 1% 2.24MB/s ± 1% +10.03% (p=0.016 n=5+4) SaveRestore/bits=32-12 327MB/s ± 5% 319MB/s ± 2% ~ (p=0.310 n=5+5) SaveRestore/bits=64-12 306MB/s ± 3% 306MB/s ± 2% ~ (p=0.841 n=5+5) name old alloc/op new alloc/op delta New/text=opticks/size=100K/bits=32-12 811kB ± 0% 401kB ± 0% -50.51% (p=0.008 n=5+5) New/text=opticks/size=100K/bits=64-12 1.62MB ± 0% 0.80MB ± 0% -50.51% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% -50.37% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.016 n=4+5) New/text=go/size=100K/bits=32-12 811kB ± 0% 401kB ± 0% ~ (p=0.079 n=4+5) New/text=go/size=100K/bits=64-12 1.62MB ± 0% 0.80MB ± 0% -50.50% (p=0.008 n=5+5) New/text=go/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=go/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.000 n=4+5) New/text=go/size=1M/bits=32-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.008 n=5+5) New/text=go/size=1M/bits=64-12 16.1MB ± 0% 8.0MB ± 0% -50.36% (p=0.008 n=5+5) New/text=go/size=5M/bits=32-12 40.2MB ± 0% 20.0MB ± 0% -50.18% (p=0.008 n=5+5) New/text=go/size=5M/bits=64-12 80.3MB ± 0% 40.0MB ± 0% -50.18% (p=0.008 n=5+5) New/text=go/size=10M/bits=32-12 80.2MB ± 0% 40.0MB ± 0% -50.09% (p=0.000 n=5+4) New/text=go/size=10M/bits=64-12 160MB ± 0% 80MB ± 0% -50.09% (p=0.000 n=5+4) New/text=go/size=50M/bits=32-12 402MB ± 0% 200MB ± 0% -50.29% (p=0.000 n=5+4) New/text=go/size=50M/bits=64-12 805MB ± 0% 400MB ± 0% -50.29% (p=0.000 n=5+4) New/text=zero/size=100K/bits=32-12 1.46MB ± 0% 0.40MB ± 0% -72.46% (p=0.008 n=5+5) New/text=zero/size=100K/bits=64-12 
3.02MB ± 0% 0.80MB ± 0% -73.45% (p=0.008 n=5+5) New/text=zero/size=500K/bits=32-12 8.66MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=zero/size=500K/bits=64-12 19.7MB ± 0% 4.0MB ± 0% -79.63% (p=0.008 n=5+5) New/text=zero/size=1M/bits=32-12 19.7MB ± 0% 4.0MB ± 0% ~ (p=0.079 n=4+5) New/text=zero/size=1M/bits=64-12 39.0MB ± 0% 8.0MB ± 0% -79.48% (p=0.000 n=5+4) New/text=zero/size=5M/bits=32-12 85.2MB ± 0% 20.0MB ± 0% -76.52% (p=0.008 n=5+5) New/text=zero/size=5M/bits=64-12 169MB ± 0% 40MB ± 0% -76.27% (p=0.008 n=5+5) New/text=zero/size=10M/bits=32-12 169MB ± 0% 40MB ± 0% -76.26% (p=0.000 n=5+4) New/text=zero/size=10M/bits=64-12 333MB ± 0% 80MB ± 0% -75.99% (p=0.008 n=5+5) New/text=zero/size=50M/bits=32-12 739MB ± 0% 200MB ± 0% -72.93% (p=0.000 n=4+5) New/text=zero/size=50M/bits=64-12 1.63GB ± 0% 0.40GB ± 0% -75.42% (p=0.008 n=5+5) New/text=rand/size=100K/bits=32-12 807kB ± 0% 401kB ± 0% -50.25% (p=0.008 n=5+5) New/text=rand/size=100K/bits=64-12 1.61MB ± 0% 0.80MB ± 0% -50.25% (p=0.008 n=5+5) New/text=rand/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=1M/bits=32-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.000 n=5+4) New/text=rand/size=1M/bits=64-12 16.1MB ± 0% 8.0MB ± 0% -50.36% (p=0.008 n=5+5) New/text=rand/size=5M/bits=32-12 40.3MB ± 0% 20.0MB ± 0% -50.35% (p=0.029 n=4+4) New/text=rand/size=5M/bits=64-12 80.7MB ± 0% 40.0MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=10M/bits=32-12 80.7MB ± 0% 40.0MB ± 0% -50.41% (p=0.008 n=5+5) New/text=rand/size=10M/bits=64-12 161MB ± 0% 80MB ± 0% -50.44% (p=0.029 n=4+4) New/text=rand/size=50M/bits=32-12 403MB ± 0% 200MB ± 0% -50.36% (p=0.000 n=5+4) New/text=rand/size=50M/bits=64-12 806MB ± 0% 400MB ± 0% ~ (p=0.079 n=4+5) SaveRestore/bits=32-12 5.28MB ± 0% 5.28MB ± 0% ~ (p=1.000 n=5+5) SaveRestore/bits=64-12 9.47MB ± 0% 9.47MB ± 0% ~ (p=0.286 n=5+5) https://perf.golang.org/search?q=upload:20190426.1 Fixes #15480. 
Change-Id: I0790f6edf67f5a9c02b4462632b4942e0c37988b Reviewed-on: https://go-review.googlesource.com/c/go/+/174100 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Eric Roshan-Eisner <edre@google.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

index/suffixarray: index 3-10X faster in half the memory
This CL changes the index/suffixarray construction algorithm from QSufSort to SAIS. For an N-byte input, QSufSort runs in O(N log N) time and requires an N-int temporary work space in addition to the N-int output. In contrast, SAIS runs in O(N) time and, for essentially all real inputs, is able to use the N-int output buffer as its temporary work space. (In pathological cases, SAIS must allocate a temporary work space of at most N/2 ints. There exist more complex variants that guarantee to avoid the work space in all cases, but they hardly seem worth the cost given how rare these pathological cases are.) The SAIS code therefore uses 50% of the memory across the board. It also runs 3-10X faster on real input text. This CL also adds more extensive algorithmic tests, including an exhaustive test over small inputs to catch corner case problems. name old speed new speed delta New/text=opticks/size=100K/bits=32-12 6.15MB/s ± 1% 26.79MB/s ± 1% +335.89% (p=0.008 n=5+5) New/text=opticks/size=100K/bits=64-12 5.90MB/s ± 2% 27.29MB/s ± 2% +362.23% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=32-12 4.99MB/s ± 3% 25.37MB/s ± 2% +408.01% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=64-12 4.88MB/s ± 1% 25.66MB/s ± 4% +425.52% (p=0.008 n=5+5) New/text=go/size=100K/bits=32-12 5.81MB/s ± 1% 26.49MB/s ± 2% +355.85% (p=0.008 n=5+5) New/text=go/size=100K/bits=64-12 5.76MB/s ± 2% 26.65MB/s ± 3% +362.60% (p=0.008 n=5+5) New/text=go/size=500K/bits=32-12 4.91MB/s ± 1% 25.12MB/s ± 2% +411.86% (p=0.008 n=5+5) New/text=go/size=500K/bits=64-12 4.83MB/s ± 2% 25.79MB/s ± 2% +434.44% (p=0.008 n=5+5) New/text=go/size=1M/bits=32-12 4.62MB/s ± 2% 24.87MB/s ± 2% +438.78% (p=0.008 n=5+5) New/text=go/size=1M/bits=64-12 4.39MB/s ± 2% 24.61MB/s ± 2% +460.68% (p=0.008 n=5+5) New/text=go/size=5M/bits=32-12 2.85MB/s ± 2% 24.78MB/s ± 7% +768.33% (p=0.008 n=5+5) New/text=go/size=5M/bits=64-12 2.28MB/s ± 1% 18.70MB/s ± 7% +719.63% (p=0.008 n=5+5) New/text=go/size=10M/bits=32-12 2.08MB/s ± 1% 21.04MB/s ± 
6% +909.60% (p=0.008 n=5+5) New/text=go/size=10M/bits=64-12 1.83MB/s ± 1% 16.64MB/s ± 2% +809.18% (p=0.008 n=5+5) New/text=go/size=50M/bits=32-12 1.51MB/s ± 0% 10.58MB/s ± 1% +602.52% (p=0.008 n=5+5) New/text=go/size=50M/bits=64-12 1.34MB/s ± 4% 9.00MB/s ± 1% +569.35% (p=0.008 n=5+5) New/text=zero/size=100K/bits=32-12 4.17MB/s ± 0% 157.56MB/s ± 1% +3678.42% (p=0.016 n=4+5) New/text=zero/size=100K/bits=64-12 4.19MB/s ± 2% 162.72MB/s ± 2% +3783.63% (p=0.008 n=5+5) New/text=zero/size=500K/bits=32-12 3.72MB/s ± 5% 159.17MB/s ± 1% +4176.57% (p=0.008 n=5+5) New/text=zero/size=500K/bits=64-12 3.77MB/s ± 3% 164.95MB/s ± 4% +4277.60% (p=0.008 n=5+5) New/text=zero/size=1M/bits=32-12 3.46MB/s ± 3% 158.42MB/s ± 1% +4476.08% (p=0.008 n=5+5) New/text=zero/size=1M/bits=64-12 3.41MB/s ± 4% 163.70MB/s ± 2% +4700.65% (p=0.008 n=5+5) New/text=zero/size=5M/bits=32-12 3.12MB/s ± 2% 151.92MB/s ± 4% +4775.48% (p=0.008 n=5+5) New/text=zero/size=5M/bits=64-12 3.09MB/s ± 2% 166.19MB/s ± 2% +5274.84% (p=0.008 n=5+5) New/text=zero/size=10M/bits=32-12 2.97MB/s ± 1% 157.75MB/s ± 1% +5211.38% (p=0.008 n=5+5) New/text=zero/size=10M/bits=64-12 2.92MB/s ± 1% 162.75MB/s ± 2% +5473.77% (p=0.008 n=5+5) New/text=zero/size=50M/bits=32-12 2.67MB/s ± 1% 144.43MB/s ± 5% +5305.39% (p=0.008 n=5+5) New/text=zero/size=50M/bits=64-12 2.61MB/s ± 1% 125.19MB/s ± 2% +4700.33% (p=0.016 n=5+4) New/text=rand/size=100K/bits=32-12 8.69MB/s ± 6% 27.60MB/s ± 1% +217.73% (p=0.008 n=5+5) New/text=rand/size=100K/bits=64-12 8.92MB/s ± 1% 26.37MB/s ± 4% +195.50% (p=0.008 n=5+5) New/text=rand/size=500K/bits=32-12 7.11MB/s ± 2% 25.23MB/s ± 2% +254.78% (p=0.008 n=5+5) New/text=rand/size=500K/bits=64-12 7.08MB/s ± 1% 25.45MB/s ± 2% +259.56% (p=0.008 n=5+5) New/text=rand/size=1M/bits=32-12 6.45MB/s ± 2% 24.47MB/s ± 3% +279.11% (p=0.008 n=5+5) New/text=rand/size=1M/bits=64-12 6.09MB/s ± 4% 23.00MB/s ± 4% +277.85% (p=0.008 n=5+5) New/text=rand/size=5M/bits=32-12 3.68MB/s ± 3% 10.34MB/s ± 5% +181.08% (p=0.008 n=5+5) 
New/text=rand/size=5M/bits=64-12 3.25MB/s ± 1% 6.23MB/s ± 1% +91.93% (p=0.008 n=5+5) New/text=rand/size=10M/bits=32-12 3.03MB/s ± 1% 5.61MB/s ± 2% +85.28% (p=0.008 n=5+5) New/text=rand/size=10M/bits=64-12 2.80MB/s ± 1% 4.29MB/s ± 2% +53.40% (p=0.008 n=5+5) New/text=rand/size=50M/bits=32-12 2.11MB/s ± 0% 2.45MB/s ± 1% +16.23% (p=0.029 n=4+4) New/text=rand/size=50M/bits=64-12 2.04MB/s ± 1% 2.24MB/s ± 1% +10.03% (p=0.016 n=5+4) SaveRestore/bits=32-12 327MB/s ± 5% 319MB/s ± 2% ~ (p=0.310 n=5+5) SaveRestore/bits=64-12 306MB/s ± 3% 306MB/s ± 2% ~ (p=0.841 n=5+5) name old alloc/op new alloc/op delta New/text=opticks/size=100K/bits=32-12 811kB ± 0% 401kB ± 0% -50.51% (p=0.008 n=5+5) New/text=opticks/size=100K/bits=64-12 1.62MB ± 0% 0.80MB ± 0% -50.51% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% -50.37% (p=0.008 n=5+5) New/text=opticks/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.016 n=4+5) New/text=go/size=100K/bits=32-12 811kB ± 0% 401kB ± 0% ~ (p=0.079 n=4+5) New/text=go/size=100K/bits=64-12 1.62MB ± 0% 0.80MB ± 0% -50.50% (p=0.008 n=5+5) New/text=go/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=go/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.000 n=4+5) New/text=go/size=1M/bits=32-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.008 n=5+5) New/text=go/size=1M/bits=64-12 16.1MB ± 0% 8.0MB ± 0% -50.36% (p=0.008 n=5+5) New/text=go/size=5M/bits=32-12 40.2MB ± 0% 20.0MB ± 0% -50.18% (p=0.008 n=5+5) New/text=go/size=5M/bits=64-12 80.3MB ± 0% 40.0MB ± 0% -50.18% (p=0.008 n=5+5) New/text=go/size=10M/bits=32-12 80.2MB ± 0% 40.0MB ± 0% -50.09% (p=0.000 n=5+4) New/text=go/size=10M/bits=64-12 160MB ± 0% 80MB ± 0% -50.09% (p=0.000 n=5+4) New/text=go/size=50M/bits=32-12 402MB ± 0% 200MB ± 0% -50.29% (p=0.000 n=5+4) New/text=go/size=50M/bits=64-12 805MB ± 0% 400MB ± 0% -50.29% (p=0.000 n=5+4) New/text=zero/size=100K/bits=32-12 1.46MB ± 0% 0.40MB ± 0% -72.46% (p=0.008 n=5+5) New/text=zero/size=100K/bits=64-12 
3.02MB ± 0% 0.80MB ± 0% -73.45% (p=0.008 n=5+5) New/text=zero/size=500K/bits=32-12 8.66MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=zero/size=500K/bits=64-12 19.7MB ± 0% 4.0MB ± 0% -79.63% (p=0.008 n=5+5) New/text=zero/size=1M/bits=32-12 19.7MB ± 0% 4.0MB ± 0% ~ (p=0.079 n=4+5) New/text=zero/size=1M/bits=64-12 39.0MB ± 0% 8.0MB ± 0% -79.48% (p=0.000 n=5+4) New/text=zero/size=5M/bits=32-12 85.2MB ± 0% 20.0MB ± 0% -76.52% (p=0.008 n=5+5) New/text=zero/size=5M/bits=64-12 169MB ± 0% 40MB ± 0% -76.27% (p=0.008 n=5+5) New/text=zero/size=10M/bits=32-12 169MB ± 0% 40MB ± 0% -76.26% (p=0.000 n=5+4) New/text=zero/size=10M/bits=64-12 333MB ± 0% 80MB ± 0% -75.99% (p=0.008 n=5+5) New/text=zero/size=50M/bits=32-12 739MB ± 0% 200MB ± 0% -72.93% (p=0.000 n=4+5) New/text=zero/size=50M/bits=64-12 1.63GB ± 0% 0.40GB ± 0% -75.42% (p=0.008 n=5+5) New/text=rand/size=100K/bits=32-12 807kB ± 0% 401kB ± 0% -50.25% (p=0.008 n=5+5) New/text=rand/size=100K/bits=64-12 1.61MB ± 0% 0.80MB ± 0% -50.25% (p=0.008 n=5+5) New/text=rand/size=500K/bits=32-12 4.04MB ± 0% 2.01MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=500K/bits=64-12 8.07MB ± 0% 4.01MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=1M/bits=32-12 8.07MB ± 0% 4.01MB ± 0% -50.36% (p=0.000 n=5+4) New/text=rand/size=1M/bits=64-12 16.1MB ± 0% 8.0MB ± 0% -50.36% (p=0.008 n=5+5) New/text=rand/size=5M/bits=32-12 40.3MB ± 0% 20.0MB ± 0% -50.35% (p=0.029 n=4+4) New/text=rand/size=5M/bits=64-12 80.7MB ± 0% 40.0MB ± 0% ~ (p=0.079 n=4+5) New/text=rand/size=10M/bits=32-12 80.7MB ± 0% 40.0MB ± 0% -50.41% (p=0.008 n=5+5) New/text=rand/size=10M/bits=64-12 161MB ± 0% 80MB ± 0% -50.44% (p=0.029 n=4+4) New/text=rand/size=50M/bits=32-12 403MB ± 0% 200MB ± 0% -50.36% (p=0.000 n=5+4) New/text=rand/size=50M/bits=64-12 806MB ± 0% 400MB ± 0% ~ (p=0.079 n=4+5) SaveRestore/bits=32-12 5.28MB ± 0% 5.28MB ± 0% ~ (p=1.000 n=5+5) SaveRestore/bits=64-12 9.47MB ± 0% 9.47MB ± 0% ~ (p=0.286 n=5+5) https://perf.golang.org/search?q=upload:20190426.1 Fixes #15480. 
Change-Id: I0790f6edf67f5a9c02b4462632b4942e0c37988b Reviewed-on: https://go-review.googlesource.com/c/go/+/174100 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Eric Roshan-Eisner <edre@google.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
6ca324f2 · Russ Cox · 7a43f8a5 · 6ca324f2 · 7a43f8a5 · 7a43f8a5
Commit 6ca324f2 authored Apr 26, 2019 by Russ Cox
8 changed files
--- a/src/index/suffixarray/gen.go
+++ b/src/index/suffixarray/gen.go
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// Gen generates sais2.go by duplicating functions in sais.go
+// using different input types.
+// See the comment at the top of sais.go for details.
+package main
+
+import (
+	"bytes"
+	"io/ioutil"
+	"log"
+	"strings"
+)
+
+// main reads sais.go and writes sais2.go, which holds generated copies
+// of selected functions from sais.go:
+//   - every function whose name ends in _32 is copied with fix32 applied;
+//   - every function whose name ends in _8_32 is additionally stripped of
+//     its byte-only lines, converted with fix8_32, and written twice:
+//     once as converted and once more with fix32 applied on top.
+//
+// Any failure to read, parse, or write is fatal.
+func main() {
+	log.SetPrefix("gen: ")
+	log.SetFlags(0)
+
+	src, err := ioutil.ReadFile("sais.go")
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Everything before the first blank line is copied verbatim
+	// as the header of the generated file.
+	headerEnd := bytes.Index(src, []byte("\n\n"))
+	if headerEnd < 0 {
+		log.Fatal("cannot find blank line after copyright comment")
+	}
+
+	var out bytes.Buffer
+	out.Write(src[:headerEnd])
+	out.WriteString("\n\n// Code generated by go generate; DO NOT EDIT.\n\npackage suffixarray\n")
+
+	rest := src
+	for {
+		start := bytes.Index(rest, []byte("\nfunc "))
+		if start < 0 {
+			break
+		}
+		rest = rest[start:]
+
+		// The name is whatever sits between "func " and the first '('.
+		nameEnd := bytes.IndexByte(rest, '(')
+		if nameEnd < 0 {
+			nameEnd = len(rest)
+		}
+		name := string(rest[len("\nfunc "):nameEnd])
+
+		// A function ends at the first closing brace in column 0.
+		bodyEnd := bytes.Index(rest, []byte("\n}\n"))
+		if bodyEnd < 0 {
+			log.Fatalf("cannot find end of func %s", name)
+		}
+		fn := string(rest[:bodyEnd+len("\n}\n")])
+		// Leave the final newline in place so that a function starting
+		// on the very next line still matches "\nfunc ".
+		rest = rest[bodyEnd+len("\n}"):]
+
+		if strings.HasSuffix(name, "_32") {
+			out.WriteString(fix32.Replace(fn))
+		}
+		if strings.HasSuffix(name, "_8_32") {
+			// The x_8_32 -> x_8_64 copy was already written by the
+			// _32 case above; here we add x_32 and x_64.
+			fn32 := fix8_32.Replace(stripByteOnly(fn))
+			out.WriteString(fn32)
+			out.WriteString(fix32.Replace(fn32))
+		}
+	}
+
+	if err := ioutil.WriteFile("sais2.go", out.Bytes(), 0666); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// fix32 rewrites the text of a 32-bit function into its 64-bit
+// counterpart by turning every "32" into "64", which covers both the
+// _32 name suffix and the int32 type.
+// NOTE(review): the explicit "int32" pair looks redundant, since the
+// "32" pair alone already turns int32 into int64 — confirm before removing.
+var fix32 = strings.NewReplacer(
+	"32", "64",
+	"int32", "int64",
+)
+
+// fix8_32 rewrites the text of a function whose name ends in _8_32 into
+// the corresponding _32 function: the name suffix _8_32 becomes _32 and
+// every occurrence of byte becomes int32.
+var fix8_32 = strings.NewReplacer(
+	"_8_32", "_32",
+	"byte", "int32",
+)
+
+func stripByteOnly(s string) string {
+	lines := strings.SplitAfter(s, "\n")
+	w := 0
+	for _, line := range lines {
+		if !strings.Contains(line, "256") && !strings.Contains(line, "byte-only") {
+			lines[w] = line
+			w++
+		}
+	}
+	return strings.Join(lines[:w], "")
+}
--- a/src/index/suffixarray/gen64.go
+++ b/src/index/suffixarray/gen64.go
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ignore
-
-// Gen64 generates qsufsort64.go from qsufsort.go by s/32/64/g.
-package main
-
-import (
-	"bytes"
-	"io/ioutil"
-	"log"
-)
-
-func main() {
-	log.SetPrefix("gen64: ")
-	log.SetFlags(0)
-
-	data, err := ioutil.ReadFile("qsufsort.go")
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	data = bytes.Replace(data, []byte("\n\n"), []byte("\n\n// Code generated by gen64.go; DO NOT EDIT.\n//go:generate go run gen64.go\n\n"), 1)
-	data = bytes.Replace(data, []byte("32"), []byte("64"), -1)
-
-	if err := ioutil.WriteFile("qsufsort64.go", data, 0666); err != nil {
-		log.Fatal(err)
-	}
-}
--- a/src/index/suffixarray/qsufsort.go
+++ b/src/index/suffixarray/qsufsort.go
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This algorithm is based on "Faster Suffix Sorting"
-//   by N. Jesper Larsson and Kunihiko Sadakane
-// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
-// code:  http://www.larsson.dogma.net/qsufsort.c
-
-// This algorithm computes the suffix array sa by computing its inverse.
-// Consecutive groups of suffixes in sa are labeled as sorted groups or
-// unsorted groups. For a given pass of the sorter, all suffixes are ordered
-// up to their first h characters, and sa is h-ordered. Suffixes in their
-// final positions and unambiguously sorted in h-order are in a sorted group.
-// Consecutive groups of suffixes with identical first h characters are an
-// unsorted group. In each pass of the algorithm, unsorted groups are sorted
-// according to the group number of their following suffix.
-
-// In the implementation, if sa[i] is negative, it indicates that i is
-// the first element of a sorted group of length -sa[i], and can be skipped.
-// An unsorted group sa[i:k] is given the group number of the index of its
-// last element, k-1. The group numbers are stored in the inverse slice (inv),
-// and when all groups are sorted, this slice is the inverse suffix array.
-
-package suffixarray
-
-import (
-	"sort"
-)
-
-func qsufsort32(data []byte) []int32 {
-	// initial sorting by first byte of suffix
-	sa := sortedByFirstByte32(data)
-	if len(sa) < 2 {
-		return sa
-	}
-	// initialize the group lookup table
-	// this becomes the inverse of the suffix array when all groups are sorted
-	inv := initGroups32(sa, data)
-
-	// the index starts 1-ordered
-	sufSortable := &suffixSortable32{sa: sa, inv: inv, h: 1}
-
-	for sa[0] > -int32(len(sa)) { // until all suffixes are one big sorted group
-		// The suffixes are h-ordered, make them 2*h-ordered
-		pi := int32(0) // pi is first position of first group
-		sl := int32(0) // sl is negated length of sorted groups
-		for pi < int32(len(sa)) {
-			if s := sa[pi]; s < 0 { // if pi starts sorted group
-				pi -= s // skip over sorted group
-				sl += s // add negated length to sl
-			} else { // if pi starts unsorted group
-				if sl != 0 {
-					sa[pi+sl] = sl // combine sorted groups before pi
-					sl = 0
-				}
-				pk := inv[s] + 1 // pk-1 is last position of unsorted group
-				sufSortable.sa = sa[pi:pk]
-				sort.Sort(sufSortable)
-				sufSortable.updateGroups(pi)
-				pi = pk // next group
-			}
-		}
-		if sl != 0 { // if the array ends with a sorted group
-			sa[pi+sl] = sl // combine sorted groups at end of sa
-		}
-
-		sufSortable.h *= 2 // double sorted depth
-	}
-
-	for i := range sa { // reconstruct suffix array from inverse
-		sa[inv[i]] = int32(i)
-	}
-	return sa
-}
-
-func sortedByFirstByte32(data []byte) []int32 {
-	// total byte counts
-	var count [256]int
-	for _, b := range data {
-		count[b]++
-	}
-	// make count[b] equal index of first occurrence of b in sorted array
-	sum := 0
-	for b := range count {
-		count[b], sum = sum, count[b]+sum
-	}
-	// iterate through bytes, placing index into the correct spot in sa
-	sa := make([]int32, len(data))
-	for i, b := range data {
-		sa[count[b]] = int32(i)
-		count[b]++
-	}
-	return sa
-}
-
-func initGroups32(sa []int32, data []byte) []int32 {
-	// label contiguous same-letter groups with the same group number
-	inv := make([]int32, len(data))
-	prevGroup := int32(len(sa)) - 1
-	groupByte := data[sa[prevGroup]]
-	for i := int32(len(sa)) - 1; i >= 0; i-- {
-		if b := data[sa[i]]; b < groupByte {
-			if prevGroup == i+1 {
-				sa[i+1] = -1
-			}
-			groupByte = b
-			prevGroup = i
-		}
-		inv[sa[i]] = prevGroup
-		if prevGroup == 0 {
-			sa[0] = -1
-		}
-	}
-	// Separate out the final suffix to the start of its group.
-	// This is necessary to ensure the suffix "a" is before "aba"
-	// when using a potentially unstable sort.
-	lastByte := data[len(data)-1]
-	s := int32(-1)
-	for i := range sa {
-		if sa[i] >= 0 {
-			if data[sa[i]] == lastByte && s == -1 {
-				s = int32(i)
-			}
-			if sa[i] == int32(len(sa))-1 {
-				sa[i], sa[s] = sa[s], sa[i]
-				inv[sa[s]] = s
-				sa[s] = -1 // mark it as an isolated sorted group
-				break
-			}
-		}
-	}
-	return inv
-}
-
-type suffixSortable32 struct {
-	sa  []int32
-	inv []int32
-	h   int32
-	buf []int32 // common scratch space
-}
-
-func (x *suffixSortable32) Len() int           { return len(x.sa) }
-func (x *suffixSortable32) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
-func (x *suffixSortable32) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-
-func (x *suffixSortable32) updateGroups(offset int32) {
-	bounds := x.buf[0:0]
-	group := x.inv[x.sa[0]+x.h]
-	for i := 1; i < len(x.sa); i++ {
-		if g := x.inv[x.sa[i]+x.h]; g > group {
-			bounds = append(bounds, int32(i))
-			group = g
-		}
-	}
-	bounds = append(bounds, int32(len(x.sa)))
-	x.buf = bounds
-
-	// update the group numberings after all new groups are determined
-	prev := int32(0)
-	for _, b := range bounds {
-		for i := prev; i < b; i++ {
-			x.inv[x.sa[i]] = offset + b - 1
-		}
-		if b-prev == 1 {
-			x.sa[prev] = -1
-		}
-		prev = b
-	}
-}
--- a/src/index/suffixarray/qsufsort64.go
+++ b/src/index/suffixarray/qsufsort64.go
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Code generated by gen64.go; DO NOT EDIT.
-//go:generate go run gen64.go
-
-// This algorithm is based on "Faster Suffix Sorting"
-//   by N. Jesper Larsson and Kunihiko Sadakane
-// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
-// code:  http://www.larsson.dogma.net/qsufsort.c
-
-// This algorithm computes the suffix array sa by computing its inverse.
-// Consecutive groups of suffixes in sa are labeled as sorted groups or
-// unsorted groups. For a given pass of the sorter, all suffixes are ordered
-// up to their first h characters, and sa is h-ordered. Suffixes in their
-// final positions and unambiguously sorted in h-order are in a sorted group.
-// Consecutive groups of suffixes with identical first h characters are an
-// unsorted group. In each pass of the algorithm, unsorted groups are sorted
-// according to the group number of their following suffix.
-
-// In the implementation, if sa[i] is negative, it indicates that i is
-// the first element of a sorted group of length -sa[i], and can be skipped.
-// An unsorted group sa[i:k] is given the group number of the index of its
-// last element, k-1. The group numbers are stored in the inverse slice (inv),
-// and when all groups are sorted, this slice is the inverse suffix array.
-
-package suffixarray
-
-import (
-	"sort"
-)
-
-func qsufsort64(data []byte) []int64 {
-	// initial sorting by first byte of suffix
-	sa := sortedByFirstByte64(data)
-	if len(sa) < 2 {
-		return sa
-	}
-	// initialize the group lookup table
-	// this becomes the inverse of the suffix array when all groups are sorted
-	inv := initGroups64(sa, data)
-
-	// the index starts 1-ordered
-	sufSortable := &suffixSortable64{sa: sa, inv: inv, h: 1}
-
-	for sa[0] > -int64(len(sa)) { // until all suffixes are one big sorted group
-		// The suffixes are h-ordered, make them 2*h-ordered
-		pi := int64(0) // pi is first position of first group
-		sl := int64(0) // sl is negated length of sorted groups
-		for pi < int64(len(sa)) {
-			if s := sa[pi]; s < 0 { // if pi starts sorted group
-				pi -= s // skip over sorted group
-				sl += s // add negated length to sl
-			} else { // if pi starts unsorted group
-				if sl != 0 {
-					sa[pi+sl] = sl // combine sorted groups before pi
-					sl = 0
-				}
-				pk := inv[s] + 1 // pk-1 is last position of unsorted group
-				sufSortable.sa = sa[pi:pk]
-				sort.Sort(sufSortable)
-				sufSortable.updateGroups(pi)
-				pi = pk // next group
-			}
-		}
-		if sl != 0 { // if the array ends with a sorted group
-			sa[pi+sl] = sl // combine sorted groups at end of sa
-		}
-
-		sufSortable.h *= 2 // double sorted depth
-	}
-
-	for i := range sa { // reconstruct suffix array from inverse
-		sa[inv[i]] = int64(i)
-	}
-	return sa
-}
-
-func sortedByFirstByte64(data []byte) []int64 {
-	// total byte counts
-	var count [256]int
-	for _, b := range data {
-		count[b]++
-	}
-	// make count[b] equal index of first occurrence of b in sorted array
-	sum := 0
-	for b := range count {
-		count[b], sum = sum, count[b]+sum
-	}
-	// iterate through bytes, placing index into the correct spot in sa
-	sa := make([]int64, len(data))
-	for i, b := range data {
-		sa[count[b]] = int64(i)
-		count[b]++
-	}
-	return sa
-}
-
-func initGroups64(sa []int64, data []byte) []int64 {
-	// label contiguous same-letter groups with the same group number
-	inv := make([]int64, len(data))
-	prevGroup := int64(len(sa)) - 1
-	groupByte := data[sa[prevGroup]]
-	for i := int64(len(sa)) - 1; i >= 0; i-- {
-		if b := data[sa[i]]; b < groupByte {
-			if prevGroup == i+1 {
-				sa[i+1] = -1
-			}
-			groupByte = b
-			prevGroup = i
-		}
-		inv[sa[i]] = prevGroup
-		if prevGroup == 0 {
-			sa[0] = -1
-		}
-	}
-	// Separate out the final suffix to the start of its group.
-	// This is necessary to ensure the suffix "a" is before "aba"
-	// when using a potentially unstable sort.
-	lastByte := data[len(data)-1]
-	s := int64(-1)
-	for i := range sa {
-		if sa[i] >= 0 {
-			if data[sa[i]] == lastByte && s == -1 {
-				s = int64(i)
-			}
-			if sa[i] == int64(len(sa))-1 {
-				sa[i], sa[s] = sa[s], sa[i]
-				inv[sa[s]] = s
-				sa[s] = -1 // mark it as an isolated sorted group
-				break
-			}
-		}
-	}
-	return inv
-}
-
-type suffixSortable64 struct {
-	sa  []int64
-	inv []int64
-	h   int64
-	buf []int64 // common scratch space
-}
-
-func (x *suffixSortable64) Len() int           { return len(x.sa) }
-func (x *suffixSortable64) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
-func (x *suffixSortable64) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-
-func (x *suffixSortable64) updateGroups(offset int64) {
-	bounds := x.buf[0:0]
-	group := x.inv[x.sa[0]+x.h]
-	for i := 1; i < len(x.sa); i++ {
-		if g := x.inv[x.sa[i]+x.h]; g > group {
-			bounds = append(bounds, int64(i))
-			group = g
-		}
-	}
-	bounds = append(bounds, int64(len(x.sa)))
-	x.buf = bounds
-
-	// update the group numberings after all new groups are determined
-	prev := int64(0)
-	for _, b := range bounds {
-		for i := prev; i < b; i++ {
-			x.inv[x.sa[i]] = offset + b - 1
-		}
-		if b-prev == 1 {
-			x.sa[prev] = -1
-		}
-		prev = b
-	}
-}
--- a/src/index/suffixarray/sais.go
+++ b/src/index/suffixarray/sais.go
--- a/src/index/suffixarray/sais2.go
+++ b/src/index/suffixarray/sais2.go
--- a/src/index/suffixarray/suffixarray.go
+++ b/src/index/suffixarray/suffixarray.go
@@ -72,13 +72,15 @@ func (a *ints) slice(i, j int) ints {
 }

 // New creates a new Index for data.
-// Index creation time is O(N*log(N)) for N = len(data).
+// Index creation time is O(N) for N = len(data).
 func New(data []byte) *Index {
 	ix := &Index{data: data}
 	if len(data) <= maxData32 {
-		ix.sa.int32 = qsufsort32(data)
+		ix.sa.int32 = make([]int32, len(data))
+		text_32(data, ix.sa.int32)
 	} else {
-		ix.sa.int64 = qsufsort64(data)
+		ix.sa.int64 = make([]int64, len(data))
+		text_64(data, ix.sa.int64)
 	}
 	return ix
 }

--- a/src/index/suffixarray/suffixarray_test.go
+++ b/src/index/suffixarray/suffixarray_test.go
@@ -314,6 +314,158 @@ func TestIndex64(t *testing.T) {
 	testIndex(t)
 }

+// TestNew32 runs the shared construction tests against text_32,
+// widening its int32 result to []int for checking.
+func TestNew32(t *testing.T) {
+	build := func(text []byte) []int {
+		sa32 := make([]int32, len(text))
+		text_32(text, sa32)
+		widened := make([]int, 0, len(sa32))
+		for _, off := range sa32 {
+			widened = append(widened, int(off))
+		}
+		return widened
+	}
+	test(t, build)
+}
+
+// TestNew64 runs the shared construction tests against text_64,
+// converting its int64 result to []int for checking.
+func TestNew64(t *testing.T) {
+	build := func(text []byte) []int {
+		sa64 := make([]int64, len(text))
+		text_64(text, sa64)
+		converted := make([]int, 0, len(sa64))
+		for _, off := range sa64 {
+			converted = append(converted, int(off))
+		}
+		return converted
+	}
+	test(t, build)
+}
+
+// test exercises an arbitrary suffix array construction function.
+// It generates a range of inputs, builds a suffix array for each one,
+// and verifies the result with testSA.
+func test(t *testing.T, build func([]byte) []int) {
+	t.Run("ababab...", func(t *testing.T) {
+		// Very repetitive input has numLMS = len(x)/2-1
+		// at top level, the largest it can be.
+		// But maxID is only two (aba and ab$).
+		n := 100000
+		if testing.Short() {
+			n = 10000
+		}
+		text := make([]byte, n)
+		for i := range text {
+			text[i] = "ab"[i%2]
+		}
+		testSA(t, text, build)
+	})
+
+	t.Run("forcealloc", func(t *testing.T) {
+		// Construct a pathological input that forces
+		// recurse_32 to allocate a new temporary buffer.
+		// The input must have more than N/3 LMS-substrings,
+		// which we arrange by repeating an SLSLSLSLSLSL pattern
+		// like ababab... above, but then we must also arrange
+		// for a large number of distinct LMS-substrings.
+		// We use this pattern:
+		// 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ...
+		// This gives approximately 2¹⁵ distinct LMS-substrings.
+		// We need to repeat at least one substring, though,
+		// or else the recursion can be bypassed entirely.
+		text := make([]byte, 100000, 100001)
+		low, high := byte(1), byte(255)
+		for i := range text {
+			if i%2 == 0 {
+				text[i] = low
+				continue
+			}
+			text[i] = high
+			high--
+			if high > low {
+				continue
+			}
+			// The high values for this low value are used up:
+			// move to the next low value (skipping 0 on wraparound)
+			// and start the high values over.
+			low++
+			if low == 0 {
+				low = 1
+			}
+			high = 255
+		}
+		// Zero the spare byte just past the input
+		// (original note: "for sais.New").
+		text[:cap(text)][len(text)] = 0
+		testSA(t, text, build)
+	})
+
+	t.Run("exhaustive2", func(t *testing.T) {
+		// All inputs over a two-symbol alphabet (testRec generates
+		// the byte values 1 and 2) up to length 21, or up to
+		// length 12 in short mode.
+		// The original author reports about 10 seconds on a laptop.
+		maxLen := 21
+		if testing.Short() {
+			maxLen = 12
+		}
+		buf := make([]byte, 30)
+		failures := 0
+		for n := 0; n <= maxLen; n++ {
+			buf[n] = 0 // byte just past the input (original note: "for sais.New")
+			testRec(t, buf[:n], 0, 2, &failures, build)
+		}
+	})
+
+	t.Run("exhaustive3", func(t *testing.T) {
+		// All inputs over a three-symbol alphabet (testRec generates
+		// the byte values 1, 2 and 3) up to length 14, or up to
+		// length 8 in short mode.
+		// The original author reports about 10 seconds on a laptop.
+		maxLen := 14
+		if testing.Short() {
+			maxLen = 8
+		}
+		buf := make([]byte, 30)
+		failures := 0
+		for n := 0; n <= maxLen; n++ {
+			buf[n] = 0 // byte just past the input (original note: "for sais.New")
+			testRec(t, buf[:n], 0, 3, &failures, build)
+		}
+	})
+}
+
+// testRec enumerates every assignment of values in [1,max] to x[i:]
+// and calls testSA(t, x, build) on each completed input.
+// *numFail counts the inputs that failed so far; once it reaches 10
+// the test is stopped.
+func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
+	if i >= len(x) {
+		// Every position is assigned: check this input.
+		if testSA(t, x, build) {
+			return
+		}
+		*numFail++
+		if *numFail >= 10 {
+			t.Errorf("stopping after %d failures", *numFail)
+			t.FailNow()
+		}
+		return
+	}
+
+	// Try each value at position i and recurse on the remainder.
+	limit := byte(max)
+	for x[i] = 1; x[i] <= limit; x[i]++ {
+		testRec(t, x, i+1, max, numFail, build)
+	}
+}
+
+// testSA tests the suffix array build function on the input x.
+// It builds the suffix array and reports whether it is correct:
+// it must have one entry per byte of x, every entry must be a valid
+// offset into x, and the suffixes must appear in strictly increasing
+// order. Each violation is reported through t.Errorf.
+//
+// If build panics, the input is logged and the panic is re-raised so
+// the failing input is visible in the test output.
+func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
+	defer func() {
+		if e := recover(); e != nil {
+			t.Logf("build %v", x)
+			panic(e)
+		}
+	}()
+	sa := build(x)
+	if len(sa) != len(x) {
+		t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
+		return false
+	}
+	// Validate every entry before comparing suffixes, so that a
+	// one-element result is range-checked too and the slicing below
+	// can never go out of bounds.
+	for _, off := range sa {
+		if off < 0 || off >= len(x) {
+			t.Errorf("build %v: sa out of range: %v", x, sa)
+			return false
+		}
+	}
+	// Strictly increasing adjacent suffixes also rule out duplicates.
+	for i := 0; i+1 < len(sa); i++ {
+		if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
+			t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
+			return false
+		}
+	}
+
+	return true
+}
+
 var (
 	benchdata = make([]byte, 1e6)
 	benchrand = make([]byte, 1e6)