Commit 10838165 authored by Marcel van Lohuizen

exp/locale/collate: fixed two bugs uncovered by regression tests.

The first bug was that tertiary ignorables had the same colElem as
implicit colElems, yielding unexpected results. The current encoding
ensures that a non-implicit colElem is never 0.  This fix uncovered
another bug of the trie that indexed incorrectly into the null block.
This was caused by an unfinished optimization that would avoid the
need to mask out the most-significant bits of continuation bytes.
This bug was also present in the trie used in exp/norm and has been
fixed there as well. The appearance of the bug was rare, as the lower
blocks happened to be nearly nil.

R=r
CC=golang-dev
https://golang.org/cl/6127070
parent 81d96215
......@@ -25,11 +25,11 @@ const (
// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp tttttttt, where
// 000ppppp pppppppp pppppppp tttttttt, where
// - p* is primary collation value
// - t* is the tertiary collation value
// Collation elements with a secondary value are of the form
// 00000000 ssssssss ssssssss tttttttt, where
// 01000000 ssssssss ssssssss tttttttt, where
// - s* is the secondary collation value
// - t* is the tertiary collation value
const (
......@@ -37,7 +37,7 @@ const (
maxSecondaryBits = 16
maxTertiaryBits = 8
isPrimary = 0x40000000
isSecondary = 0x40000000
)
func makeCE(weights []int) (uint32, error) {
......@@ -57,10 +57,10 @@ func makeCE(weights []int) (uint32, error) {
return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
}
ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
ce |= isPrimary
} else {
// secondary weight form
ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
ce |= isSecondary
}
return ce, nil
}
......@@ -162,7 +162,6 @@ const (
// http://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
if r >= minUnified && r <= maxUnified {
// The most common case for CJK.
return int(r) + commonUnifiedOffset
......
......@@ -29,9 +29,9 @@ func decompCE(in []int) (ce uint32, err error) {
}
var ceTests = []ceTest{
{normalCE, []int{0, 0, 0}, 000},
{normalCE, []int{0, 30, 3}, 0x1E03},
{normalCE, []int{100, defaultSecondary, 3}, 0x40006403},
{normalCE, []int{0, 0, 0}, 0x40000000},
{normalCE, []int{0, 30, 3}, 0x40001E03},
{normalCE, []int{100, defaultSecondary, 3}, 0x6403},
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
{normalCE, []int{100, 1, 3}, 0xFFFF},
{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
......
......@@ -19,7 +19,10 @@ import (
"reflect"
)
const blockSize = 64
const (
// blockSize is the number of entries in one trie block (block offsets in
// the generated tables advance in steps of 0x40).
blockSize = 64
// blockOffset is subtracted from the block index that computeOffsets
// assigns to each new lookup or value block. The lookup code indexes a
// block as n<<6 + b using the unmasked continuation byte b, which is at
// least 0x80, i.e. 2*blockSize too high; storing indexes reduced by
// blockOffset cancels that out.
blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
)
type trie struct {
index []uint16
......@@ -102,7 +105,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int64 {
if n.isInternal() {
v, ok := index.lookupBlockIdx[h]
if !ok {
v = int64(len(index.lookupBlocks))
v = int64(len(index.lookupBlocks)) - blockOffset
index.lookupBlocks = append(index.lookupBlocks, n)
index.lookupBlockIdx[h] = v
}
......@@ -110,7 +113,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int64 {
} else {
v, ok := index.valueBlockIdx[h]
if !ok {
v = int64(len(index.valueBlocks))
v = int64(len(index.valueBlocks)) - blockOffset
index.valueBlocks = append(index.valueBlocks, n)
index.valueBlockIdx[h] = v
}
......
......@@ -79,24 +79,24 @@ var testLookup = [640]uint16 {
// Block 0x1, offset 0x40
// Block 0x2, offset 0x80
// Block 0x3, offset 0xc0
0x0c2:0x03, 0x0c4:0x04,
0x0c8:0x05,
0x0df:0x06,
0x0e0:0x04,
0x0ef:0x05,
0x0f0:0x07, 0x0f4:0x09,
0x0c2:0x01, 0x0c4:0x02,
0x0c8:0x03,
0x0df:0x04,
0x0e0:0x02,
0x0ef:0x03,
0x0f0:0x05, 0x0f4:0x07,
// Block 0x4, offset 0x100
0x120:0x07, 0x126:0x08,
0x120:0x05, 0x126:0x06,
// Block 0x5, offset 0x140
0x17f:0x09,
0x17f:0x07,
// Block 0x6, offset 0x180
0x180:0x0a, 0x184:0x0b,
0x180:0x08, 0x184:0x09,
// Block 0x7, offset 0x1c0
0x1d0:0x06,
0x1d0:0x04,
// Block 0x8, offset 0x200
0x23f:0x0c,
0x23f:0x0a,
// Block 0x9, offset 0x240
0x24f:0x08,
0x24f:0x06,
}
var testTrie = trie{ testLookup[:], testValues[:]}
......
......@@ -68,17 +68,18 @@ func (ce colElem) ctype() ceType {
// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp tttttttt, where
// 000ppppp pppppppp pppppppp tttttttt, where
// - p* is primary collation value
// - t* is the tertiary collation value
// Collation elements with a secondary value are of the form
// 00000000 ssssssss ssssssss tttttttt, where
// 01000000 ssssssss ssssssss tttttttt, where
// - s* is the secondary collation value
// - t* is the tertiary collation value
func splitCE(ce colElem) weights {
const secondaryMask = 0x40000000
w := weights{}
w.tertiary = uint8(ce)
if ce&0x40000000 != 0 {
if ce&secondaryMask == 0 {
// primary weight form
w.primary = uint32((ce >> 8) & 0x1FFFFF)
w.secondary = defaultSecondary
......
......@@ -20,14 +20,14 @@ func makeCE(weights []int) colElem {
maxPrimaryBits = 21
maxSecondaryBits = 16
maxTertiaryBits = 8
isPrimary = 0x40000000
isSecondary = 0x40000000
)
var ce colElem
if weights[0] != 0 {
ce = colElem(weights[0]<<maxTertiaryBits + weights[2])
ce |= isPrimary
} else {
ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
ce |= isSecondary
}
return ce
}
......
......@@ -27,15 +27,10 @@ const (
t5 = 0xF8 // 1111 1000
t6 = 0xFC // 1111 1100
te = 0xFE // 1111 1110
maskx = 0x3F // 0011 1111
mask2 = 0x1F // 0001 1111
mask3 = 0x0F // 0000 1111
mask4 = 0x07 // 0000 0111
)
func (t *trie) lookupValue(n uint16, b byte) colElem {
return colElem(t.values[int(n)<<6+int(b&maskx)])
return colElem(t.values[int(n)<<6+int(b)])
}
// lookup returns the trie value for the first UTF-8 encoding in s and
......@@ -67,7 +62,7 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)&maskx
o := int(i)<<6 + int(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
......@@ -83,13 +78,13 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)&maskx
o := int(i)<<6 + int(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = int(i)<<6 + int(c2)&maskx
o = int(i)<<6 + int(c2)
i = t.index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
......
......@@ -89,18 +89,18 @@ var testValues = [832]uint32{
}
var testLookup = [640]uint16{
0x0c2: 0x03, 0x0c4: 0x04,
0x0c8: 0x05,
0x0df: 0x06,
0x0e0: 0x04,
0x0ef: 0x05,
0x0f0: 0x07, 0x0f4: 0x09,
0x120: 0x07, 0x126: 0x08,
0x17f: 0x09,
0x180: 0x0a, 0x184: 0x0b,
0x1d0: 0x06,
0x23f: 0x0c,
0x24f: 0x08,
0x0c2: 0x01, 0x0c4: 0x02,
0x0c8: 0x03,
0x0df: 0x04,
0x0e0: 0x02,
0x0ef: 0x03,
0x0f0: 0x05, 0x0f4: 0x07,
0x120: 0x05, 0x126: 0x06,
0x17f: 0x07,
0x180: 0x08, 0x184: 0x09,
0x1d0: 0x04,
0x23f: 0x0a,
0x24f: 0x06,
}
var testTrie = trie{testLookup[:], testValues[:]}
This diff is collapsed.
......@@ -23,7 +23,7 @@ type trie struct {
// the value for b is by r.value + (b - r.lo) * stride.
func (t *trie) lookupValue(n uint8, b byte) uint16 {
if n < t.cutoff {
return t.values[uint16(n)<<6+uint16(b&maskx)]
return t.values[uint16(n)<<6+uint16(b)]
}
offset := t.sparseOffset[n-t.cutoff]
header := t.sparse[offset]
......@@ -53,11 +53,6 @@ const (
t5 = 0xF8 // 1111 1000
t6 = 0xFC // 1111 1100
te = 0xFE // 1111 1110
maskx = 0x3F // 0011 1111
mask2 = 0x1F // 0001 1111
mask3 = 0x0F // 0000 1111
mask4 = 0x07 // 0000 0111
)
// lookup returns the trie value for the first UTF-8 encoding in s and
......@@ -89,7 +84,7 @@ func (t *trie) lookup(s []byte) (v uint16, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := uint16(i)<<6 + uint16(c1)&maskx
o := uint16(i)<<6 + uint16(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
......@@ -105,13 +100,13 @@ func (t *trie) lookup(s []byte) (v uint16, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := uint16(i)<<6 + uint16(c1)&maskx
o := uint16(i)<<6 + uint16(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = uint16(i)<<6 + uint16(c2)&maskx
o = uint16(i)<<6 + uint16(c2)
i = t.index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
......@@ -152,7 +147,7 @@ func (t *trie) lookupString(s string) (v uint16, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := uint16(i)<<6 + uint16(c1)&maskx
o := uint16(i)<<6 + uint16(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
......@@ -168,13 +163,13 @@ func (t *trie) lookupString(s string) (v uint16, sz int) {
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := uint16(i)<<6 + uint16(c1)&maskx
o := uint16(i)<<6 + uint16(c1)
i = t.index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = uint16(i)<<6 + uint16(c2)&maskx
o = uint16(i)<<6 + uint16(c2)
i = t.index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
......@@ -200,11 +195,11 @@ func (t *trie) lookupUnsafe(s []byte) uint16 {
if c0 < t3 {
return t.lookupValue(i, s[1])
}
i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
i = t.index[uint16(i)<<6+uint16(s[1])]
if c0 < t4 {
return t.lookupValue(i, s[2])
}
i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
i = t.index[uint16(i)<<6+uint16(s[2])]
if c0 < t5 {
return t.lookupValue(i, s[3])
}
......@@ -225,11 +220,11 @@ func (t *trie) lookupStringUnsafe(s string) uint16 {
if c0 < t3 {
return t.lookupValue(i, s[1])
}
i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
i = t.index[uint16(i)<<6+uint16(s[1])]
if c0 < t4 {
return t.lookupValue(i, s[2])
}
i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
i = t.index[uint16(i)<<6+uint16(s[2])]
if c0 < t5 {
return t.lookupValue(i, s[3])
}
......
......@@ -96,13 +96,17 @@ func TestLookup(t *testing.T) {
}
for i, tt := range tests {
v, sz := testdata.lookup(tt.bytes)
if int(v) != 0 {
if v != 0 {
t.Errorf("lookup of illegal rune, case %d: found value %#x, expected 0", i, v)
}
if sz != tt.size {
t.Errorf("lookup of illegal rune, case %d: found size %d, expected %d", i, sz, tt.size)
}
}
// Verify defaults.
if v, _ := testdata.lookup([]byte{0xC1, 0x8C}); v != 0 {
t.Errorf("lookup of non-existing rune should be 0; found %X", v)
}
}
func TestLookupUnsafe(t *testing.T) {
......
......@@ -4,7 +4,7 @@
package norm
var testRunes = []rune{1, 12, 127, 128, 256, 2047, 2048, 2457, 65535, 65536, 65793, 1114111, 512, 513, 514, 528, 533}
var testRunes = []int32{1, 12, 127, 128, 256, 2047, 2048, 2457, 65535, 65536, 65793, 1114111, 512, 513, 514, 528, 533}
// testdataValues: 192 entries, 384 bytes
// Block 2 is the null block.
......@@ -62,24 +62,24 @@ var testdataLookup = [640]uint8{
// Block 0x1, offset 0x40
// Block 0x2, offset 0x80
// Block 0x3, offset 0xc0
0x0c2: 0x03, 0x0c4: 0x04,
0x0c8: 0x05,
0x0df: 0x06,
0x0e0: 0x04,
0x0ef: 0x05,
0x0f0: 0x07, 0x0f4: 0x09,
0x0c2: 0x01, 0x0c4: 0x02,
0x0c8: 0x03,
0x0df: 0x04,
0x0e0: 0x02,
0x0ef: 0x03,
0x0f0: 0x05, 0x0f4: 0x07,
// Block 0x4, offset 0x100
0x120: 0x07, 0x126: 0x08,
0x120: 0x05, 0x126: 0x06,
// Block 0x5, offset 0x140
0x17f: 0x09,
0x17f: 0x07,
// Block 0x6, offset 0x180
0x180: 0x0a, 0x184: 0x0b,
0x180: 0x08, 0x184: 0x09,
// Block 0x7, offset 0x1c0
0x1d0: 0x06,
0x1d0: 0x04,
// Block 0x8, offset 0x200
0x23f: 0x0c,
0x23f: 0x0a,
// Block 0x9, offset 0x240
0x24f: 0x08,
0x24f: 0x06,
}
var testdataTrie = trie{testdataLookup[:], testdataValues[:], testdataSparseValues[:], testdataSparseOffset[:], 3}
var testdataTrie = trie{testdataLookup[:], testdataValues[:], testdataSparseValues[:], testdataSparseOffset[:], 1}
......@@ -19,8 +19,11 @@ import (
"unicode/utf8"
)
const blockSize = 64
const maxSparseEntries = 16
const (
// blockSize is the number of entries in one trie block.
blockSize = 64
// blockOffset is subtracted from the block index that computeOffsets
// assigns to each new lookup or value block (and from the cutoff printed
// by printTables). The lookup code indexes a block as n<<6 + b using the
// unmasked continuation byte b, which is at least 0x80, i.e. 2*blockSize
// too high; storing indexes reduced by blockOffset cancels that out.
blockOffset = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes.
// maxSparseEntries is the threshold used by computeOffsets: a value block
// whose countSparseEntries() exceeds it is appended to index.valueBlocks.
// NOTE(review): presumably smaller blocks are stored in sparse form; the
// else branch is not visible here, so confirm against computeOffsets.
maxSparseEntries = 16
)
// Intermediate trie structure
type trieNode struct {
......@@ -157,7 +160,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int {
if n.isInternal() {
v, ok := index.lookupBlockIdx[h]
if !ok {
v = len(index.lookupBlocks)
v = len(index.lookupBlocks) - blockOffset
index.lookupBlocks = append(index.lookupBlocks, n)
index.lookupBlockIdx[h] = v
}
......@@ -166,7 +169,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int {
v, ok := index.valueBlockIdx[h]
if !ok {
if c := n.countSparseEntries(); c > maxSparseEntries {
v = len(index.valueBlocks)
v = len(index.valueBlocks) - blockOffset
index.valueBlocks = append(index.valueBlocks, n)
index.valueBlockIdx[h] = v
} else {
......@@ -295,7 +298,7 @@ func (t *trieNode) printTables(name string) int {
}
fmt.Print("\n}\n\n")
cutoff := len(index.valueBlocks)
cutoff := len(index.valueBlocks) - blockOffset
ni := len(index.lookupBlocks) * blockSize
fmt.Printf("// %sLookup: %d bytes\n", name, ni)
fmt.Printf("// Block 0 is the null block.\n")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment