Commit 8b7ea648 authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/locale/collate: changed implementation of Compare and CompareString to

compare incrementally. Also modified collation API to be more high-level
by removing the need for an explicit buffer to be passed as an argument.
This considerably speeds up Compare and CompareString.  This change also eliminates
the need to reinitialize the normalization buffer for each use of an iter. This
also significantly improves performance for Key and KeyString.

R=r, rsc
CC=golang-dev
https://golang.org/cl/6842050
parent 0d0eff71
......@@ -83,6 +83,13 @@ type Collator struct {
f norm.Form
t *table
_iter [2]iter
}
func (c *Collator) iter(i int) *iter {
// TODO: evaluate performance for making the second iterator optional.
return &c._iter[i]
}
// Locales returns the list of locales for which collating differs from its parent locale.
......@@ -100,11 +107,18 @@ func New(loc string) *Collator {
t = mainTable.indexedTable(idx)
}
}
return &Collator{
return newCollator(t)
}
func newCollator(t *table) *Collator {
c := &Collator{
Strength: Quaternary,
f: norm.NFD,
t: t,
}
c._iter[0].init(c)
c._iter[1].init(c)
return c
}
// SetVariableTop sets all runes with primary strength less than the primary
......@@ -113,63 +127,114 @@ func (c *Collator) SetVariableTop(r rune) {
// TODO: implement
}
// Buffer holds reusable buffers that can be used during collation.
// Reusing a Buffer for the various calls that accept it may avoid
// unnecessary memory allocations.
// Buffer holds keys generated by Key and KeyString.
type Buffer struct {
// TODO: try various parameters and techniques, such as using
// a chan of buffers for a pool.
ba [4096]byte
wa [512]colElem
buf [4096]byte
key []byte
ce []colElem
}
func (b *Buffer) init() {
if b.ce == nil {
b.ce = b.wa[:0]
b.key = b.ba[:0]
} else {
b.ce = b.ce[:0]
if b.key == nil {
b.key = b.buf[:0]
}
}
// ResetKeys clears the buffer used for generated keys. Calling ResetKeys
// invalidates keys previously obtained from Key or KeyFromString.
func (b *Buffer) ResetKeys() {
b.ce = b.ce[:0]
// Reset clears the buffer from previous results generated by Key and KeyString.
func (b *Buffer) Reset() {
b.key = b.key[:0]
}
// Compare returns an integer comparing the two byte slices.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// Compare calls ResetKeys, thereby invalidating keys
// previously generated using Key or KeyFromString using buf.
func (c *Collator) Compare(buf *Buffer, a, b []byte) int {
// TODO: for now we simply compute keys and compare. Once we
// have good benchmarks, move to an implementation that works
// incrementally for the majority of cases.
// - Benchmark with long strings that only vary in modifiers.
buf.ResetKeys()
ka := c.Key(buf, a)
kb := c.Key(buf, b)
defer buf.ResetKeys()
return bytes.Compare(ka, kb)
func (c *Collator) Compare(a, b []byte) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
c.iter(0).setInput(c, a)
c.iter(1).setInput(c, b)
if res := c.compare(); res != 0 {
return res
}
if Identity == c.Strength {
return bytes.Compare(a, b)
}
return 0
}
// CompareString returns an integer comparing the two strings.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// CompareString calls ResetKeys, thereby invalidating keys
// previously generated using Key or KeyFromString using buf.
func (c *Collator) CompareString(buf *Buffer, a, b string) int {
buf.ResetKeys()
ka := c.KeyFromString(buf, a)
kb := c.KeyFromString(buf, b)
defer buf.ResetKeys()
return bytes.Compare(ka, kb)
func (c *Collator) CompareString(a, b string) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
c.iter(0).setInputString(c, a)
c.iter(1).setInputString(c, b)
if res := c.compare(); res != 0 {
return res
}
if Identity == c.Strength {
if a < b {
return -1
} else if a > b {
return 1
}
}
return 0
}
func compareLevel(f func(i *iter) int, a, b *iter) int {
a.pce = 0
b.pce = 0
for {
va := f(a)
vb := f(b)
if va != vb {
if va < vb {
return -1
}
return 1
} else if va == 0 {
break
}
}
return 0
}
func (c *Collator) compare() int {
ia, ib := c.iter(0), c.iter(1)
// Process primary level
if c.Alternate != AltShifted {
// TODO: implement script reordering
// TODO: special hiragana handling
if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
return res
}
} else {
// TODO: handle shifted
}
if Secondary <= c.Strength {
f := (*iter).nextSecondary
if c.Backwards {
f = (*iter).prevSecondary
}
if res := compareLevel(f, ia, ib); res != 0 {
return res
}
}
// TODO: special case handling (Danish?)
if Tertiary <= c.Strength || c.CaseLevel {
if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
return res
}
// TODO: Not needed for the default value of AltNonIgnorable?
if Quaternary <= c.Strength {
if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
return res
}
}
}
return 0
}
func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
func (c *Collator) Prefix(s, prefix []byte) int {
// iterate over s, track bytes consumed.
return 0
}
......@@ -177,12 +242,11 @@ func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
// Key returns the collation key for str.
// Passing the buffer buf may avoid memory allocations.
// The returned slice will point to an allocation in Buffer and will remain
// valid until the next call to buf.ResetKeys().
// valid until the next call to buf.Reset().
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
c.getColElems(buf, str)
return c.key(buf, buf.ce)
return c.key(buf, c.getColElems(str))
}
// KeyFromString returns the collation key for str.
......@@ -192,8 +256,7 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte {
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
c.getColElemsString(buf, str)
return c.key(buf, buf.ce)
return c.key(buf, c.getColElemsString(str))
}
func (c *Collator) key(buf *Buffer, w []colElem) []byte {
......@@ -203,35 +266,63 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
return buf.key[kn:]
}
func (c *Collator) getColElems(buf *Buffer, str []byte) {
i := c.iter()
i.src.SetInput(c.f, str)
func (c *Collator) getColElems(str []byte) []colElem {
i := c.iter(0)
i.setInput(c, str)
for !i.done() {
buf.ce = i.next(buf.ce)
i.next()
}
return i.ce
}
func (c *Collator) getColElemsString(buf *Buffer, str string) {
i := c.iter()
i.src.SetInputString(c.f, str)
func (c *Collator) getColElemsString(str string) []colElem {
i := c.iter(0)
i.setInputString(c, str)
for !i.done() {
buf.ce = i.next(buf.ce)
i.next()
}
return i.ce
}
type iter struct {
src norm.Iter
ba [1024]byte
norm [1024]byte
buf []byte
t *table
p int
minBufSize int
wa [512]colElem
ce []colElem
pce int
t *table
_done, eof bool
}
func (c *Collator) iter() iter {
i := iter{t: c.t, minBufSize: c.t.maxContractLen}
i.buf = i.ba[:0]
func (i *iter) init(c *Collator) {
i.t = c.t
i.minBufSize = c.t.maxContractLen
i.ce = i.wa[:0]
i.buf = i.norm[:0]
}
func (i *iter) reset() {
i.ce = i.ce[:0]
i.buf = i.buf[:0]
i.p = 0
i.eof = i.src.Done()
i._done = i.eof
}
func (i *iter) setInput(c *Collator, s []byte) *iter {
i.src.SetInput(c.f, s)
i.reset()
return i
}
func (i *iter) setInputString(c *Collator, s string) *iter {
i.src.SetInputString(c.f, s)
i.reset()
return i
}
......@@ -239,7 +330,7 @@ func (i *iter) done() bool {
return i._done
}
func (i *iter) next(ce []colElem) []colElem {
func (i *iter) next() {
if !i.eof && len(i.buf)-i.p < i.minBufSize {
// replenish buffer
n := copy(i.buf, i.buf[i.p:])
......@@ -250,11 +341,67 @@ func (i *iter) next(ce []colElem) []colElem {
}
if i.p == len(i.buf) {
i._done = true
return ce
return
}
ce, sz := i.t.appendNext(ce, i.buf[i.p:])
sz := 0
i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
i.p += sz
return ce
}
func (i *iter) nextPrimary() int {
for {
for ; i.pce < len(i.ce); i.pce++ {
if v := i.ce[i.pce].primary(); v != 0 {
i.pce++
return v
}
}
if i.done() {
return 0
}
i.next()
}
panic("should not reach here")
}
func (i *iter) nextSecondary() int {
for ; i.pce < len(i.ce); i.pce++ {
if v := i.ce[i.pce].secondary(); v != 0 {
i.pce++
return v
}
}
return 0
}
func (i *iter) prevSecondary() int {
for ; i.pce < len(i.ce); i.pce++ {
if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 {
i.pce++
return v
}
}
return 0
}
func (i *iter) nextTertiary() int {
for ; i.pce < len(i.ce); i.pce++ {
if v := i.ce[i.pce].tertiary(); v != 0 {
i.pce++
return int(v)
}
}
return 0
}
func (i *iter) nextQuaternary() int {
for ; i.pce < len(i.ce); i.pce++ {
if v := i.ce[i.pce].quaternary(); v != 0 {
i.pce++
return v
}
}
return 0
}
func appendPrimary(key []byte, p int) []byte {
......
......@@ -300,7 +300,7 @@ var keyFromElemTests = []keyFromElemTest{
func TestKeyFromElems(t *testing.T) {
buf := collate.Buffer{}
for i, tt := range keyFromElemTests {
buf.ResetKeys()
buf.Reset()
ws := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in)
res := collate.KeyFromElems(tt.opt.collator(), &buf, ws)
if len(res) != len(tt.out) {
......@@ -325,7 +325,6 @@ func TestGetColElems(t *testing.T) {
// error is reported in TestAppendNext
continue
}
buf := collate.Buffer{}
// Create one large test per table
str := make([]byte, 0, 4000)
out := ColElems{}
......@@ -336,7 +335,7 @@ func TestGetColElems(t *testing.T) {
}
}
for j, chk := range append(tt.chk, check{string(str), len(str), out}) {
ws := collate.GetColElems(c, &buf, []byte(chk.in)[:chk.n])
ws := collate.GetColElems(c, []byte(chk.in)[:chk.n])
if len(ws) != len(chk.out) {
t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ws), len(chk.out))
continue
......@@ -404,19 +403,27 @@ type compareTest struct {
var compareTests = []compareTest{
{"a\u0301", "a", 1},
{"a\u0301b", "ab", 1},
{"a", "a\u0301", -1},
{"ab", "a\u0301b", -1},
{"bc", "a\u0301c", 1},
{"ab", "aB", -1},
{"a\u0301", "a\u0301", 0},
{"a", "a", 0},
// Only clip prefixes of whole runes.
{"\u302E", "\u302F", 1},
// Don't clip prefixes when last rune of prefix may be part of contraction.
{"a\u035E", "a\u0301\u035F", -1},
{"a\u0301\u035Fb", "a\u0301\u035F", -1},
}
func TestCompare(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in)
buf := collate.Buffer{}
for i, tt := range compareTests {
if res := c.Compare(&buf, []byte(tt.a), []byte(tt.b)); res != tt.res {
if res := c.Compare([]byte(tt.a), []byte(tt.b)); res != tt.res {
t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
if res := c.CompareString(&buf, tt.a, tt.b); res != tt.res {
if res := c.CompareString(tt.a, tt.b); res != tt.res {
t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
}
......
......@@ -4,8 +4,6 @@
package collate
import "exp/norm"
// Init is used by type Builder in exp/locale/collate/build/
// to create Collator instances. It is for internal use only.
func Init(data interface{}) *Collator {
......@@ -24,11 +22,7 @@ func Init(data interface{}) *Collator {
t.contractElem = init.ContractElems()
t.maxContractLen = init.MaxContractLen()
t.variableTop = init.VariableTop()
return &Collator{
Strength: Quaternary,
f: norm.NFD,
t: t,
}
return newCollator(t)
}
type tableInitializer interface {
......
......@@ -72,10 +72,9 @@ func SetTop(c *Collator, top int) {
c.t.variableTop = uint32(top)
}
func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights {
buf.ResetKeys()
c.getColElems(buf, str)
return convertToWeights(buf.ce)
func GetColElems(c *Collator, str []byte) []Weights {
ce := c.getColElems(str)
return convertToWeights(ce)
}
func ProcessWeights(h AlternateHandling, top int, w []Weights) []Weights {
......
......@@ -180,7 +180,7 @@ func skipAlt(a string) bool {
func failOnError(e error) {
if e != nil {
log.Fatal(e)
log.Panic(e)
}
}
......@@ -677,7 +677,7 @@ func testCollator(c *collate.Collator) {
if bytes.Compare(k0, k) != 0 {
failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
}
buf.ResetKeys()
buf.Reset()
}
fmt.Println("PASS")
}
......
......@@ -236,9 +236,9 @@ func doTest(t Test) {
if strings.Contains(t.name, "NON_IGNOR") {
c.Alternate = collate.AltNonIgnorable
}
prev := t.str[0]
for i := 1; i < len(t.str); i++ {
b.Reset()
s := t.str[i]
ka := c.Key(b, prev)
kb := c.Key(b, s)
......@@ -247,10 +247,10 @@ func doTest(t Test) {
prev = s
continue
}
if r := c.Compare(b, prev, s); r == 1 {
if r := c.Compare(prev, s); r == 1 {
fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r)
}
if r := c.Compare(b, s, prev); r == -1 {
if r := c.Compare(s, prev); r == -1 {
fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r)
}
prev = s
......
......@@ -141,7 +141,7 @@ var appendNextTests = []tableTest{
{"\u0316", [][]int{{0, 220}}},
{"\u0317", [][]int{{0, 220}, {0, 220}}},
{"\u302D", [][]int{{0, 222}}},
{"\u302E", [][]int{{0, 224}}}, // used as starter
{"\u302E", [][]int{{0, 225}}}, // used as starter
{"\u302F", [][]int{{0, 224}}}, // used as starter
{"\u18A9", [][]int{{0, 228}}},
{"\u0300", [][]int{{0, 230}}},
......@@ -169,7 +169,7 @@ var appendNextTests = []tableTest{
{"a\u035Db\u035D", [][]int{{117}}},
{"a\u0301\u035Db", [][]int{{120}}},
{"a\u0301\u035F", [][]int{{121}}},
{"a\u0301\u035Fb", [][]int{{122}}},
{"a\u0301\u035Fb", [][]int{{119}}},
{"\u03B1\u0345", [][]int{{901}, {902}}},
{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
{"\u302F\u18A9", [][]int{{0, 130}}},
......@@ -192,7 +192,7 @@ var appendNextTests = []tableTest{
// multiple gaps
{"a\u0301\u035Db", 6, ColElems{w(120)}},
{"a\u0301\u035F", 5, ColElems{w(121)}},
{"a\u0301\u035Fb", 6, ColElems{w(122)}},
{"a\u0301\u035Fb", 6, ColElems{w(119)}},
{"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
{"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},
......
......@@ -91,5 +91,5 @@ func (c *goCollator) Key(b Input) []byte {
}
func (c *goCollator) Compare(a, b Input) int {
return c.c.Compare(&c.buf, a.UTF8, b.UTF8)
return c.c.Compare(a.UTF8, b.UTF8)
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment