Commit adc19ac5 authored by Marcel van Lohuizen

exp/locale/collate: adjusted contraction trie to support Myanmar (Burmese),

which has a rather large contraction table. The value of the next state
offset now starts after the current block, instead of before.  This is
slightly less efficient (one extra addition per state change), but gives
some extra range for the offsets.
Also introduced constants for final (0) and noIndex (0xFF).
tables.go is updated in a separate CL.

R=r
CC=golang-dev
https://golang.org/cl/6346092
parent 656b192c
...@@ -41,6 +41,11 @@ import ( ...@@ -41,6 +41,11 @@ import (
// also includes the length and offset to the next sequence of entries // also includes the length and offset to the next sequence of entries
// to check in case of a match. // to check in case of a match.
const (
final = 0
noIndex = 0xFF
)
// ctEntry associates to a matching byte an offset and/or next sequence of // ctEntry associates to a matching byte an offset and/or next sequence of
// bytes to check. A ctEntry c is called final if a match means that the // bytes to check. A ctEntry c is called final if a match means that the
// longest suffix has been found. An entry c is final if c.n == 0. // longest suffix has been found. An entry c is final if c.n == 0.
...@@ -50,24 +55,24 @@ import ( ...@@ -50,24 +55,24 @@ import (
// Examples: // Examples:
// The suffix strings "ab" and "ac" can be represented as: // The suffix strings "ab" and "ac" can be represented as:
// []ctEntry{ // []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' by itself does not match, so i is 0xFF. // {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF.
// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
// } // }
// //
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
// []ctEntry{ // []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' must be followed by 'b'. // {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
// {'b', 2, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. // {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
// {'d', 'd', 0, 3}, // "abd" -> 3 // {'d', 'd', final, 3}, // "abd" -> 3
// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
// {'d', 'd', 0, 4}, // "abcd" -> 4 // {'d', 'd', final, 4}, // "abcd" -> 4
// } // }
// See genStateTests in contract_test.go for more examples. // See genStateTests in contract_test.go for more examples.
type ctEntry struct { type ctEntry struct {
l uint8 // non-final: byte value to match; final: lowest match in range. l uint8 // non-final: byte value to match; final: lowest match in range.
h uint8 // non-final: relative index to next block; final: highest match in range. h uint8 // non-final: relative index to next block; final: highest match in range.
n uint8 // non-final: length of next block; final: 0 n uint8 // non-final: length of next block; final: final
i uint8 // result offset. Will be 0xFF if more bytes are needed to complete. i uint8 // result offset. Will be noIndex if more bytes are needed to complete.
} }
// contractTrieSet holds a set of contraction tries. The tries are stored // contractTrieSet holds a set of contraction tries. The tries are stored
...@@ -124,7 +129,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { ...@@ -124,7 +129,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
} }
} }
if !added { if !added {
*ct = append(*ct, ctEntry{l: c, i: 0xFF}) *ct = append(*ct, ctEntry{l: c, i: noIndex})
} }
} else { } else {
for j := len(*ct) - 1; j >= start; j-- { for j := len(*ct) - 1; j >= start; j-- {
...@@ -140,7 +145,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { ...@@ -140,7 +145,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
} }
} }
if !added { if !added {
*ct = append(*ct, ctEntry{l: c, h: c, i: uint8(si.index)}) *ct = append(*ct, ctEntry{l: c, h: c, n: final, i: uint8(si.index)})
} }
} }
} }
...@@ -150,7 +155,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { ...@@ -150,7 +155,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
for i, end := start, len(*ct); i < end; i++ { for i, end := start, len(*ct); i < end; i++ {
fe := (*ct)[i] fe := (*ct)[i]
if fe.h == 0 { // uninitialized non-final if fe.h == 0 { // uninitialized non-final
ln := len(*ct) - start ln := len(*ct) - start - n
if ln > 0xFF { if ln > 0xFF {
return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
} }
...@@ -238,16 +243,16 @@ func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) { ...@@ -238,16 +243,16 @@ func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) {
if c >= e.l { if c >= e.l {
p++ p++
if e.l == c { if e.l == c {
if e.i != 0xFF { if e.i != noIndex {
index, ns = int(e.i), p index, ns = int(e.i), p
} }
if e.n != 0 { if e.n != final {
// set to new state // set to new state
i, states, n = 0, states[e.h:], int(e.n) i, states, n = 0, states[int(e.h)+n:], int(e.n)
} else { } else {
return return
} }
} else if e.n == 0 && c <= e.h { } else if e.n == final && c <= e.h {
return int(c-e.l) + int(e.i), p return int(c-e.l) + int(e.i), p
} }
} else { } else {
......
...@@ -111,9 +111,9 @@ var genStateTests = []GenStateTest{ ...@@ -111,9 +111,9 @@ var genStateTests = []GenStateTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 0xFF}, {'a', 0, 1, noIndex},
{'b', 1, 1, 0xFF}, {'b', 0, 1, noIndex},
{'c', 'c', 0, 1}, {'c', 'c', final, 1},
}, },
}, },
{[]stridx{ {[]stridx{
...@@ -123,9 +123,9 @@ var genStateTests = []GenStateTest{ ...@@ -123,9 +123,9 @@ var genStateTests = []GenStateTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 0xFF}, {'a', 0, 1, noIndex},
{'b', 1, 1, 0xFF}, {'b', 0, 1, noIndex},
{'c', 'e', 0, 1}, {'c', 'e', final, 1},
}, },
}, },
{[]stridx{ {[]stridx{
...@@ -135,9 +135,9 @@ var genStateTests = []GenStateTest{ ...@@ -135,9 +135,9 @@ var genStateTests = []GenStateTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 3}, {'a', 0, 1, 3},
{'b', 1, 1, 2}, {'b', 0, 1, 2},
{'c', 'c', 0, 1}, {'c', 'c', final, 1},
}, },
}, },
{[]stridx{ {[]stridx{
...@@ -150,11 +150,11 @@ var genStateTests = []GenStateTest{ ...@@ -150,11 +150,11 @@ var genStateTests = []GenStateTest{
}, },
2, 2,
contractTrieSet{ contractTrieSet{
{'b', 'b', 0, 6}, {'b', 'b', final, 6},
{'a', 2, 2, 5}, {'a', 0, 2, 5},
{'c', 'c', 0, 4}, {'c', 'c', final, 4},
{'b', 2, 1, 3}, {'b', 0, 1, 3},
{'c', 'd', 0, 1}, {'c', 'd', final, 1},
}, },
}, },
{[]stridx{ {[]stridx{
...@@ -168,14 +168,14 @@ var genStateTests = []GenStateTest{ ...@@ -168,14 +168,14 @@ var genStateTests = []GenStateTest{
}, },
2, 2,
contractTrieSet{ contractTrieSet{
{'b', 5, 1, 0xFF}, {'b', 3, 1, noIndex},
{'a', 2, 1, 0xFF}, {'a', 0, 1, noIndex},
{'b', 1, 1, 6}, {'b', 0, 1, 6},
{'c', 1, 1, 4}, {'c', 0, 1, 4},
{'d', 'd', 0, 1}, {'d', 'd', final, 1},
{'c', 1, 1, 7}, {'c', 0, 1, 7},
{'d', 1, 1, 5}, {'d', 0, 1, 5},
{'e', 'f', 0, 2}, {'e', 'f', final, 2},
}, },
}, },
} }
...@@ -251,13 +251,13 @@ func TestPrintContractionTrieSet(t *testing.T) { ...@@ -251,13 +251,13 @@ func TestPrintContractionTrieSet(t *testing.T) {
const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes
var testCTEntries = [8]struct{l,h,n,i uint8}{ var testCTEntries = [8]struct{l,h,n,i uint8}{
{0x62, 0x5, 1, 255}, {0x62, 0x3, 1, 255},
{0x61, 0x2, 1, 255}, {0x61, 0x0, 1, 255},
{0x62, 0x1, 1, 6}, {0x62, 0x0, 1, 6},
{0x63, 0x1, 1, 4}, {0x63, 0x0, 1, 4},
{0x64, 0x64, 0, 1}, {0x64, 0x64, 0, 1},
{0x63, 0x1, 1, 7}, {0x63, 0x0, 1, 7},
{0x64, 0x1, 1, 5}, {0x64, 0x0, 1, 5},
{0x65, 0x66, 0, 2}, {0x65, 0x66, 0, 2},
} }
var testContractTrieSet = contractTrieSet( testCTEntries[:] ) var testContractTrieSet = contractTrieSet( testCTEntries[:] )
......
...@@ -37,6 +37,11 @@ func (s *ctScanner) result() (i, p int) { ...@@ -37,6 +37,11 @@ func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex return s.index, s.pindex
} }
const (
final = 0
noIndex = 0xFF
)
// scan matches the longest suffix at the current location in the input // scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed. // and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int { func (s *ctScanner) scan(p int) int {
...@@ -53,12 +58,12 @@ func (s *ctScanner) scan(p int) int { ...@@ -53,12 +58,12 @@ func (s *ctScanner) scan(p int) int {
if c >= e.l { if c >= e.l {
if e.l == c { if e.l == c {
p++ p++
if e.i != 0xFF { if e.i != noIndex {
s.index = int(e.i) s.index = int(e.i)
s.pindex = p s.pindex = p
} }
if e.n != 0 { if e.n != final {
i, states, n = 0, states[e.h:], int(e.n) i, states, n = 0, states[int(e.h)+n:], int(e.n)
if p >= len(str) || utf8.RuneStart(str[p]) { if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p s.states, s.n, pr = states, n, p
} }
...@@ -67,7 +72,7 @@ func (s *ctScanner) scan(p int) int { ...@@ -67,7 +72,7 @@ func (s *ctScanner) scan(p int) int {
return p return p
} }
continue continue
} else if e.n == 0 && c <= e.h { } else if e.n == final && c <= e.h {
p++ p++
s.done = true s.done = true
s.index = int(c-e.l) + int(e.i) s.index = int(c-e.l) + int(e.i)
......
...@@ -30,8 +30,8 @@ var lookupTests = []LookupTest{ ...@@ -30,8 +30,8 @@ var lookupTests = []LookupTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 0xFF}, {'a', 0, 1, 0xFF},
{'b', 1, 1, 0xFF}, {'b', 0, 1, 0xFF},
{'c', 'c', 0, 1}, {'c', 'c', 0, 1},
}, },
}, },
...@@ -46,8 +46,8 @@ var lookupTests = []LookupTest{ ...@@ -46,8 +46,8 @@ var lookupTests = []LookupTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 0xFF}, {'a', 0, 1, 0xFF},
{'b', 1, 1, 0xFF}, {'b', 0, 1, 0xFF},
{'c', 'e', 0, 1}, {'c', 'e', 0, 1},
}, },
}, },
...@@ -60,8 +60,8 @@ var lookupTests = []LookupTest{ ...@@ -60,8 +60,8 @@ var lookupTests = []LookupTest{
}, },
1, 1,
contractTrieSet{ contractTrieSet{
{'a', 1, 1, 3}, {'a', 0, 1, 3},
{'b', 1, 1, 2}, {'b', 0, 1, 2},
{'c', 'c', 0, 1}, {'c', 'c', 0, 1},
}, },
}, },
...@@ -77,9 +77,9 @@ var lookupTests = []LookupTest{ ...@@ -77,9 +77,9 @@ var lookupTests = []LookupTest{
2, 2,
contractTrieSet{ contractTrieSet{
{'b', 'b', 0, 6}, {'b', 'b', 0, 6},
{'a', 2, 2, 5}, {'a', 0, 2, 5},
{'c', 'c', 0, 4}, {'c', 'c', 0, 4},
{'b', 2, 1, 3}, {'b', 0, 1, 3},
{'c', 'd', 0, 1}, {'c', 'd', 0, 1},
}, },
}, },
...@@ -94,13 +94,13 @@ var lookupTests = []LookupTest{ ...@@ -94,13 +94,13 @@ var lookupTests = []LookupTest{
}, },
2, 2,
contractTrieSet{ contractTrieSet{
{'b', 5, 1, 0xFF}, {'b', 3, 1, 0xFF},
{'a', 2, 1, 0xFF}, {'a', 0, 1, 0xFF},
{'b', 1, 1, 6}, {'b', 0, 1, 6},
{'c', 1, 1, 4}, {'c', 0, 1, 4},
{'d', 'd', 0, 1}, {'d', 'd', 0, 1},
{'c', 1, 1, 7}, {'c', 0, 1, 7},
{'d', 1, 1, 5}, {'d', 0, 1, 5},
{'e', 'f', 0, 2}, {'e', 'f', 0, 2},
}, },
}, },
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment