Commit 5ee7ef90 authored by Robert Griesemer's avatar Robert Griesemer

suffixarray: improved serialization code

Use gobs to serialize indexes instead of encoding/binary.

Even with gobs, serialize data in slices instead of
applying gob to the entire data structure at once,
to reduce the amount of extra buffer memory needed
inside gob.

7x faster Write/Read for new BenchmarkSaveRestore
compared to old code; possibly because encoding/binary
is more expensive for int32 slice elements (interface
call to get little/big endian encoding), while gob's
encoding is fixed (unconfirmed).

new (using gobs):
suffixarray.BenchmarkSaveRestore	       1	2153604000 ns/op

old (using encoding/binary):
suffixarray.BenchmarkSaveRestore	       1	15118322000 ns/op

The actual serialized data is slightly larger then using
the old code for very large indices because full 32bit indices
require 5bytes using gobs instead of 4bytes (encoding/binary)
in serialized form.

R=r
CC=golang-dev
https://golang.org/cl/5087041
parent 86e65bac
......@@ -18,8 +18,8 @@ package suffixarray
import (
"bytes"
"encoding/binary"
"exp/regexp"
"gob"
"io"
"os"
"sort"
......@@ -37,13 +37,18 @@ func New(data []byte) *Index {
return &Index{data, qsufsort(data)}
}
// Read and Write slice the data into successive portions of length gobN,
// so gob can allocate smaller buffers for its I/O.
const gobN = 1 << 16 // slightly better than say 1 << 20 (BenchmarkSaveRestore)
// Read reads the index from r into x; x must not be nil.
func (x *Index) Read(r io.Reader) os.Error {
var n int32
if err := binary.Read(r, binary.LittleEndian, &n); err != nil {
d := gob.NewDecoder(r)
var n int
if err := d.Decode(&n); err != nil {
return err
}
if 2*n < int32(cap(x.data)) || int32(cap(x.data)) < n {
if 2*n < cap(x.data) || cap(x.data) < n {
// new data is significantly smaller or larger then
// existing buffers - allocate new ones
x.data = make([]byte, n)
......@@ -53,28 +58,51 @@ func (x *Index) Read(r io.Reader) os.Error {
x.data = x.data[0:n]
x.sa = x.sa[0:n]
}
if err := binary.Read(r, binary.LittleEndian, x.data); err != nil {
return err
}
if err := binary.Read(r, binary.LittleEndian, x.sa); err != nil {
return err
for i := 0; i < n; {
j := i + gobN
if j > n {
j = n
}
// data holds next piece of x.data; its length is updated by Decode
data := x.data[i:j]
if err := d.Decode(&data); err != nil {
return err
}
if len(data) != j-i {
return os.NewError("suffixarray.Read: inconsistent data format")
}
// sa holds next piece of x.data; its length is updated by Decode
sa := x.sa[i:j]
if err := d.Decode(&sa); err != nil {
return err
}
if len(sa) != j-i {
return os.NewError("suffixarray.Read: inconsistent data format")
}
i = j
}
return nil
}
// Write writes the index x to w.
func (x *Index) Write(w io.Writer) os.Error {
n := int32(len(x.data))
if err := binary.Write(w, binary.LittleEndian, n); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, x.data); err != nil {
e := gob.NewEncoder(w)
n := len(x.data)
if err := e.Encode(n); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, x.sa); err != nil {
return err
for i := 0; i < n; {
j := i + gobN
if j > n {
j = n
}
if err := e.Encode(x.data[i:j]); err != nil {
return err
}
if err := e.Encode(x.sa[i:j]); err != nil {
return err
}
i = j
}
return nil
}
......
......@@ -7,6 +7,7 @@ package suffixarray
import (
"bytes"
"exp/regexp"
"rand"
"sort"
"strings"
"testing"
......@@ -255,3 +256,21 @@ func TestIndex(t *testing.T) {
testLookups(t, &tc, x, -1)
}
}
func BenchmarkSaveRestore(b *testing.B) {
b.StopTimer()
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
data := make([]byte, 10<<20) // 10MB index data
for i := range data {
data[i] = byte(r.Intn(256))
}
x := New(data)
testSaveRestore(nil, nil, x) // verify correctness
buf := bytes.NewBuffer(make([]byte, len(data))) // avoid frequent growing
b.StartTimer()
for i := 0; i < b.N; i++ {
x.Write(buf)
var y Index
y.Read(buf)
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment