Commit 4c1a6f84 authored by Marcel van Lohuizen

exp/locale/collate: removed weights struct to allow for faster and easier
incremental comparisons. Instead, processing is now done directly on colElems.
As a result, the size of the weights array is now reduced by 75%.
Details:
- Primary value of type 1 colElem is shifted by 1 bit so that primaries
  of all types can be compared without shifting.
- Quaternary values are now stored in the colElem itself. This is possible
  as quaternary values other than 0 or maxQuaternary are only needed when other
  values are ignored.
- Simplified processWeights by removing cases that are needed for ICU but not
  for us (our CJK primary values fit in a single value).

R=r
CC=golang-dev
https://golang.org/cl/6817054
parent bc0783db
...@@ -26,7 +26,7 @@ const ( ...@@ -26,7 +26,7 @@ const (
// For normal collation elements, we assume that a collation element either has // For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both. // a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form // Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp ssssssss // 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value // - p* is primary collation value
// - s* is the secondary collation value // - s* is the secondary collation value
// or // or
...@@ -67,7 +67,7 @@ func makeCE(weights []int) (uint32, error) { ...@@ -67,7 +67,7 @@ func makeCE(weights []int) (uint32, error) {
if weights[1] >= 1<<maxSecondaryCompactBits { if weights[1] >= 1<<maxSecondaryCompactBits {
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits) return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
} }
ce = uint32(weights[0]<<maxSecondaryCompactBits + weights[1]) ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary ce |= isPrimary
} else { } else {
d := weights[1] - defaultSecondary + 4 d := weights[1] - defaultSecondary + 4
......
...@@ -36,7 +36,7 @@ var ceTests = []ceTest{ ...@@ -36,7 +36,7 @@ var ceTests = []ceTest{
{normalCE, []int{0, 0x28, 3}, 0x80002803}, {normalCE, []int{0, 0x28, 3}, 0x80002803},
{normalCE, []int{100, defaultSecondary, 3}, 0x0000C883}, {normalCE, []int{100, defaultSecondary, 3}, 0x0000C883},
// non-ignorable primary with non-default secondary // non-ignorable primary with non-default secondary
{normalCE, []int{100, 0x28, defaultTertiary}, 0x40006428}, {normalCE, []int{100, 0x28, defaultTertiary}, 0x4000C828},
{normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983}, {normalCE, []int{100, defaultSecondary + 8, 3}, 0x0000C983},
{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary {normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-supported secondary
{normalCE, []int{100, 1, 3}, 0xFFFF}, {normalCE, []int{100, 1, 3}, 0xFFFF},
......
...@@ -8,16 +8,6 @@ import ( ...@@ -8,16 +8,6 @@ import (
"unicode" "unicode"
) )
// weights holds the decoded weights per collation level.
type weights struct {
primary uint32
secondary uint16
tertiary uint8
// TODO: compute quaternary on the fly or compress this value into 8 bits
// such that weights fit within 64bit.
quaternary uint32
}
const ( const (
defaultSecondary = 0x20 defaultSecondary = 0x20
defaultTertiary = 0x2 defaultTertiary = 0x2
...@@ -69,7 +59,7 @@ func (ce colElem) ctype() ceType { ...@@ -69,7 +59,7 @@ func (ce colElem) ctype() ceType {
// For normal collation elements, we assume that a collation element either has // For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both. // a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form // Collation elements with a primary value are of the form
// 010ppppp pppppppp pppppppp ssssssss // 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value // - p* is primary collation value
// - s* is the secondary collation value // - s* is the secondary collation value
// or // or
...@@ -82,25 +72,87 @@ func (ce colElem) ctype() ceType { ...@@ -82,25 +72,87 @@ func (ce colElem) ctype() ceType {
// - 16 BMP implicit -> weight // - 16 BMP implicit -> weight
// - 8 bit s // - 8 bit s
// - default tertiary // - default tertiary
func splitCE(ce colElem) weights { // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
const primaryMask = 0x40000000 // - q* quaternary value
const secondaryMask = 0x80000000 const (
w := weights{} ceTypeMask = 0xC0000000
if ce&primaryMask != 0 { ceType1 = 0x40000000
w.tertiary = defaultTertiary ceType2 = 0x00000000
w.secondary = uint16(uint8(ce)) ceType3 = 0x80000000
w.primary = uint32((ce >> 8) & 0x1FFFFF) ceTypeQ = 0xC0000000
} else if ce&secondaryMask == 0 { ceIgnore = ceType3
w.tertiary = uint8(ce & 0x1F) firstNonPrimary = 0x80000000
ce >>= 5 secondaryMask = 0x80000000
w.secondary = defaultSecondary + uint16(ce&0xF) - 4 hasTertiaryMask = 0x40000000
ce >>= 4 primaryValueMask = 0x3FFFFE00
w.primary = uint32(ce) primaryShift = 9
compactSecondaryShift = 5
minCompactSecondary = defaultSecondary - 4
)
// makeImplicitCE builds a type-1 collation element for the given primary
// weight: the primary is shifted into position by primaryShift, the ceType1
// tag is set in the top bits, and the low bits carry defaultSecondary.
// The tertiary weight is not stored in this form.
func makeImplicitCE(primary int) colElem {
	shifted := colElem(primary << primaryShift)
	return ceType1 | shifted | defaultSecondary
}
// makeQuaternary builds a quaternary collation element: the given value is
// shifted into position by primaryShift and tagged with ceTypeQ in the top
// bits. All remaining low bits are left zero.
func makeQuaternary(primary int) colElem {
	shifted := colElem(primary << primaryShift)
	return ceTypeQ | shifted
}
// primary returns the primary weight stored in ce. Elements whose value is
// at or above firstNonPrimary carry no primary weight and yield 0; for all
// others the bits selected by primaryValueMask are shifted down by
// primaryShift.
func (ce colElem) primary() int {
	if ce < firstNonPrimary {
		masked := ce & primaryValueMask
		return int(masked) >> primaryShift
	}
	return 0
}
// secondary returns the secondary weight of ce, decoded according to the
// element type held in the bits selected by ceTypeMask:
//   - ceType1: the low 8 bits.
//   - ceType2: a 4-bit compact value at compactSecondaryShift, offset by
//     minCompactSecondary.
//   - ceType3: the 16 bits above the low byte.
//   - ceTypeQ: always 0.
func (ce colElem) secondary() int {
	switch tag := ce & ceTypeMask; tag {
	case ceTypeQ:
		return 0
	case ceType3:
		return int(uint16(ce >> 8))
	case ceType2:
		compact := (ce >> compactSecondaryShift) & 0xF
		return minCompactSecondary + int(compact)
	case ceType1:
		return int(uint8(ce))
	}
	// The four cases above cover every value of the two type bits; the
	// panic only serves as the terminating statement.
	panic("should not reach here")
}
// tertiary returns the tertiary weight of ce, decoded according to the
// element type held in the bits selected by ceTypeMask:
//   - ceType1: no tertiary is stored; defaultTertiary is returned.
//   - ceType2: the low 5 bits.
//   - ceType3: the low 8 bits.
//   - otherwise ce is a quaternary value and the result is 0.
func (ce colElem) tertiary() uint8 {
	switch ce & ceTypeMask {
	case ceType1:
		return defaultTertiary
	case ceType2:
		return uint8(ce & 0x1F)
	case ceType3:
		return uint8(ce)
	}
	// ce is a quaternary value.
	return 0
}
func (ce colElem) updateTertiary(t uint8) colElem {
if ce&ceTypeMask == ceType1 {
nce := ce & primaryValueMask
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
ce = nce
} else { } else {
w.tertiary = uint8(ce) ce &= ^colElem(maxTertiary)
w.secondary = uint16(ce >> 8) }
return ce | colElem(t)
}
// quaternary returns the quaternary value if explicitly specified,
// 0 if ce == ceIgnore, or maxQuaternary otherwise.
// Quaternary values are used only for shifted variants.
func (ce colElem) quaternary() int {
if ce&ceTypeMask == ceTypeQ {
return int(ce&primaryValueMask) >> primaryShift
} else if ce == ceIgnore {
return 0
} }
return w return maxQuaternary
} }
// For contractions, collation elements are of the form // For contractions, collation elements are of the form
......
...@@ -29,7 +29,7 @@ func makeCE(weights []int) colElem { ...@@ -29,7 +29,7 @@ func makeCE(weights []int) colElem {
var ce colElem var ce colElem
if weights[0] != 0 { if weights[0] != 0 {
if weights[2] == defaultTertiary { if weights[2] == defaultTertiary {
ce = colElem(weights[0]<<maxSecondaryCompactBits + weights[1]) ce = colElem(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
ce |= isPrimary ce |= isPrimary
} else { } else {
d := weights[1] - defaultSecondary + 4 d := weights[1] - defaultSecondary + 4
...@@ -68,10 +68,10 @@ func makeDecompose(t1, t2 int) colElem { ...@@ -68,10 +68,10 @@ func makeDecompose(t1, t2 int) colElem {
} }
func normalCE(inout []int) (ce colElem, t ceType) { func normalCE(inout []int) (ce colElem, t ceType) {
w := splitCE(makeCE(inout)) w := makeCE(inout)
inout[0] = int(w.primary) inout[0] = w.primary()
inout[1] = int(w.secondary) inout[1] = w.secondary()
inout[2] = int(w.tertiary) inout[2] = int(w.tertiary())
return ce, ceNormal return ce, ceNormal
} }
...@@ -167,3 +167,20 @@ func TestImplicit(t *testing.T) { ...@@ -167,3 +167,20 @@ func TestImplicit(t *testing.T) {
} }
} }
} }
// TestUpdateTertiary checks that updateTertiary yields the expected
// collation element for each input element and replacement tertiary weight.
func TestUpdateTertiary(t *testing.T) {
	type updateCase struct {
		in       colElem
		want     colElem
		tertiary uint8
	}
	cases := []updateCase{
		{in: 0x4000FE20, want: 0x0000FE8A, tertiary: 0x0A},
		{in: 0x4000FE21, want: 0x0000FEAA, tertiary: 0x0A},
		{in: 0x0000FE8B, want: 0x0000FE83, tertiary: 0x03},
		{in: 0x8000CC02, want: 0x8000CC1B, tertiary: 0x1B},
	}
	for idx := range cases {
		tc := cases[idx]
		got := tc.in.updateTertiary(tc.tertiary)
		if got != tc.want {
			t.Errorf("%d: was %X; want %X", idx, got, tc.want)
		}
	}
}
...@@ -120,9 +120,9 @@ type Buffer struct { ...@@ -120,9 +120,9 @@ type Buffer struct {
// TODO: try various parameters and techniques, such as using // TODO: try various parameters and techniques, such as using
// a chan of buffers for a pool. // a chan of buffers for a pool.
ba [4096]byte ba [4096]byte
wa [512]weights wa [512]colElem
key []byte key []byte
ce []weights ce []colElem
} }
func (b *Buffer) init() { func (b *Buffer) init() {
...@@ -196,7 +196,7 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { ...@@ -196,7 +196,7 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
return c.key(buf, buf.ce) return c.key(buf, buf.ce)
} }
func (c *Collator) key(buf *Buffer, w []weights) []byte { func (c *Collator) key(buf *Buffer, w []colElem) []byte {
processWeights(c.Alternate, c.t.variableTop, w) processWeights(c.Alternate, c.t.variableTop, w)
kn := len(buf.key) kn := len(buf.key)
c.keyFromElems(buf, w) c.keyFromElems(buf, w)
...@@ -239,7 +239,7 @@ func (i *iter) done() bool { ...@@ -239,7 +239,7 @@ func (i *iter) done() bool {
return i._done return i._done
} }
func (i *iter) next(ce []weights) []weights { func (i *iter) next(ce []colElem) []colElem {
if !i.eof && len(i.buf)-i.p < i.minBufSize { if !i.eof && len(i.buf)-i.p < i.minBufSize {
// replenish buffer // replenish buffer
n := copy(i.buf, i.buf[i.p:]) n := copy(i.buf, i.buf[i.p:])
...@@ -257,7 +257,7 @@ func (i *iter) next(ce []weights) []weights { ...@@ -257,7 +257,7 @@ func (i *iter) next(ce []weights) []weights {
return ce return ce
} }
func appendPrimary(key []byte, p uint32) []byte { func appendPrimary(key []byte, p int) []byte {
// Convert to variable length encoding; supports up to 23 bits. // Convert to variable length encoding; supports up to 23 bits.
if p <= 0x7FFF { if p <= 0x7FFF {
key = append(key, uint8(p>>8), uint8(p)) key = append(key, uint8(p>>8), uint8(p))
...@@ -269,9 +269,9 @@ func appendPrimary(key []byte, p uint32) []byte { ...@@ -269,9 +269,9 @@ func appendPrimary(key []byte, p uint32) []byte {
// keyFromElems converts the weights ws to a compact sequence of bytes. // keyFromElems converts the weights ws to a compact sequence of bytes.
// The result will be appended to the byte buffer in buf. // The result will be appended to the byte buffer in buf.
func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
for _, v := range ws { for _, v := range ws {
if w := v.primary; w > 0 { if w := v.primary(); w > 0 {
buf.key = appendPrimary(buf.key, w) buf.key = appendPrimary(buf.key, w)
} }
} }
...@@ -280,13 +280,13 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { ...@@ -280,13 +280,13 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) {
// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
if !c.Backwards { if !c.Backwards {
for _, v := range ws { for _, v := range ws {
if w := v.secondary; w > 0 { if w := v.secondary(); w > 0 {
buf.key = append(buf.key, uint8(w>>8), uint8(w)) buf.key = append(buf.key, uint8(w>>8), uint8(w))
} }
} }
} else { } else {
for i := len(ws) - 1; i >= 0; i-- { for i := len(ws) - 1; i >= 0; i-- {
if w := ws[i].secondary; w > 0 { if w := ws[i].secondary(); w > 0 {
buf.key = append(buf.key, uint8(w>>8), uint8(w)) buf.key = append(buf.key, uint8(w>>8), uint8(w))
} }
} }
...@@ -297,20 +297,20 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { ...@@ -297,20 +297,20 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) {
if Tertiary <= c.Strength || c.CaseLevel { if Tertiary <= c.Strength || c.CaseLevel {
buf.key = append(buf.key, 0, 0) buf.key = append(buf.key, 0, 0)
for _, v := range ws { for _, v := range ws {
if w := v.tertiary; w > 0 { if w := v.tertiary(); w > 0 {
buf.key = append(buf.key, w) buf.key = append(buf.key, uint8(w))
} }
} }
// Derive the quaternary weights from the options and other levels. // Derive the quaternary weights from the options and other levels.
// Note that we represent maxQuaternary as 0xFF. The first byte of the // Note that we represent maxQuaternary as 0xFF. The first byte of the
// representation of a primary weight is always smaller than 0xFF, // representation of a primary weight is always smaller than 0xFF,
// so using this single byte value will compare correctly. // so using this single byte value will compare correctly.
if Quaternary <= c.Strength { if Quaternary <= c.Strength && c.Alternate >= AltShifted {
if c.Alternate == AltShiftTrimmed { if c.Alternate == AltShiftTrimmed {
lastNonFFFF := len(buf.key) lastNonFFFF := len(buf.key)
buf.key = append(buf.key, 0) buf.key = append(buf.key, 0)
for _, v := range ws { for _, v := range ws {
if w := v.quaternary; w == maxQuaternary { if w := v.quaternary(); w == maxQuaternary {
buf.key = append(buf.key, 0xFF) buf.key = append(buf.key, 0xFF)
} else if w > 0 { } else if w > 0 {
buf.key = appendPrimary(buf.key, w) buf.key = appendPrimary(buf.key, w)
...@@ -321,7 +321,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { ...@@ -321,7 +321,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) {
} else { } else {
buf.key = append(buf.key, 0) buf.key = append(buf.key, 0)
for _, v := range ws { for _, v := range ws {
if w := v.quaternary; w == maxQuaternary { if w := v.quaternary(); w == maxQuaternary {
buf.key = append(buf.key, 0xFF) buf.key = append(buf.key, 0xFF)
} else if w > 0 { } else if w > 0 {
buf.key = appendPrimary(buf.key, w) buf.key = appendPrimary(buf.key, w)
...@@ -332,29 +332,27 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { ...@@ -332,29 +332,27 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) {
} }
} }
func processWeights(vw AlternateHandling, top uint32, wa []weights) { func processWeights(vw AlternateHandling, top uint32, wa []colElem) {
ignore := false ignore := false
vtop := int(top)
switch vw { switch vw {
case AltShifted, AltShiftTrimmed: case AltShifted, AltShiftTrimmed:
for i := range wa { for i := range wa {
if p := wa[i].primary; p <= top && p != 0 { if p := wa[i].primary(); p <= vtop && p != 0 {
wa[i] = weights{quaternary: p} wa[i] = makeQuaternary(p)
ignore = true ignore = true
} else if p == 0 { } else if p == 0 {
if ignore { if ignore {
wa[i] = weights{} wa[i] = ceIgnore
} else if wa[i].tertiary != 0 {
wa[i].quaternary = maxQuaternary
} }
} else { } else {
wa[i].quaternary = maxQuaternary
ignore = false ignore = false
} }
} }
case AltBlanked: case AltBlanked:
for i := range wa { for i := range wa {
if p := wa[i].primary; p <= top && (ignore || p != 0) { if p := wa[i].primary(); p <= vtop && (ignore || p != 0) {
wa[i] = weights{} wa[i] = ceIgnore
ignore = true ignore = true
} else { } else {
ignore = false ignore = false
......
...@@ -24,6 +24,8 @@ func W(ce ...int) Weights { ...@@ -24,6 +24,8 @@ func W(ce ...int) Weights {
} }
if len(ce) > 3 { if len(ce) > 3 {
w.Quaternary = ce[3] w.Quaternary = ce[3]
} else if w.Tertiary != 0 {
w.Quaternary = maxQuaternary
} }
return w return w
} }
...@@ -33,25 +35,27 @@ func (w Weights) String() string { ...@@ -33,25 +35,27 @@ func (w Weights) String() string {
type Table struct { type Table struct {
t *table t *table
w []weights
} }
func GetTable(c *Collator) *Table { func GetTable(c *Collator) *Table {
return &Table{c.t, nil} return &Table{c.t}
} }
func convertToWeights(ws []weights) []Weights { func convertToWeights(ws []colElem) []Weights {
out := make([]Weights, len(ws)) out := make([]Weights, len(ws))
for i, w := range ws { for i, w := range ws {
out[i] = Weights{int(w.primary), int(w.secondary), int(w.tertiary), int(w.quaternary)} out[i] = Weights{int(w.primary()), int(w.secondary()), int(w.tertiary()), int(w.quaternary())}
} }
return out return out
} }
func convertFromWeights(ws []Weights) []weights { func convertFromWeights(ws []Weights) []colElem {
out := make([]weights, len(ws)) out := make([]colElem, len(ws))
for i, w := range ws { for i, w := range ws {
out[i] = weights{uint32(w.Primary), uint16(w.Secondary), uint8(w.Tertiary), uint32(w.Quaternary)} out[i] = makeCE([]int{w.Primary, w.Secondary, w.Tertiary})
if out[i] == ceIgnore && w.Quaternary > 0 {
out[i] = makeQuaternary(w.Quaternary)
}
} }
return out return out
} }
......
...@@ -42,12 +42,16 @@ func (t *table) indexedTable(idx tableIndex) *table { ...@@ -42,12 +42,16 @@ func (t *table) indexedTable(idx tableIndex) *table {
// sequence of runes, the weights for the interstitial runes are // sequence of runes, the weights for the interstitial runes are
// appended as well. It returns a new slice that includes the appended // appended as well. It returns a new slice that includes the appended
// weights and the number of bytes consumed from s. // weights and the number of bytes consumed from s.
func (t *table) appendNext(w []weights, s []byte) ([]weights, int) { func (t *table) appendNext(w []colElem, s []byte) ([]colElem, int) {
v, sz := t.index.lookup(s) v, sz := t.index.lookup(s)
ce := colElem(v) ce := colElem(v)
tp := ce.ctype() tp := ce.ctype()
if tp == ceNormal { if tp == ceNormal {
w = append(w, getWeights(ce, s)) if ce == 0 {
r, _ := utf8.DecodeRune(s)
ce = makeImplicitCE(implicitPrimary(r))
}
w = append(w, ce)
} else if tp == ceExpansionIndex { } else if tp == ceExpansionIndex {
w = t.appendExpansion(w, ce) w = t.appendExpansion(w, ce)
} else if tp == ceContractionIndex { } else if tp == ceContractionIndex {
...@@ -62,40 +66,28 @@ func (t *table) appendNext(w []weights, s []byte) ([]weights, int) { ...@@ -62,40 +66,28 @@ func (t *table) appendNext(w []weights, s []byte) ([]weights, int) {
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] { for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
w, p = t.appendNext(w, nfkd) w, p = t.appendNext(w, nfkd)
} }
w[i].tertiary = t1 w[i] = w[i].updateTertiary(t1)
if i++; i < len(w) { if i++; i < len(w) {
w[i].tertiary = t2 w[i] = w[i].updateTertiary(t2)
for i++; i < len(w); i++ { for i++; i < len(w); i++ {
w[i].tertiary = maxTertiary w[i] = w[i].updateTertiary(maxTertiary)
} }
} }
} }
return w, sz return w, sz
} }
func getWeights(ce colElem, s []byte) weights { func (t *table) appendExpansion(w []colElem, ce colElem) []colElem {
if ce == 0 { // implicit
r, _ := utf8.DecodeRune(s)
return weights{
primary: uint32(implicitPrimary(r)),
secondary: defaultSecondary,
tertiary: defaultTertiary,
}
}
return splitCE(ce)
}
func (t *table) appendExpansion(w []weights, ce colElem) []weights {
i := splitExpandIndex(ce) i := splitExpandIndex(ce)
n := int(t.expandElem[i]) n := int(t.expandElem[i])
i++ i++
for _, ce := range t.expandElem[i : i+n] { for _, ce := range t.expandElem[i : i+n] {
w = append(w, splitCE(colElem(ce))) w = append(w, colElem(ce))
} }
return w return w
} }
func (t *table) matchContraction(w []weights, ce colElem, suffix []byte) ([]weights, int) { func (t *table) matchContraction(w []colElem, ce colElem, suffix []byte) ([]colElem, int) {
index, n, offset := splitContractIndex(ce) index, n, offset := splitContractIndex(ce)
scan := t.contractTries.scanner(index, n, suffix) scan := t.contractTries.scanner(index, n, suffix)
...@@ -138,7 +130,7 @@ func (t *table) matchContraction(w []weights, ce colElem, suffix []byte) ([]weig ...@@ -138,7 +130,7 @@ func (t *table) matchContraction(w []weights, ce colElem, suffix []byte) ([]weig
i, n := scan.result() i, n := scan.result()
ce = colElem(t.contractElem[i+offset]) ce = colElem(t.contractElem[i+offset])
if ce.ctype() == ceNormal { if ce.ctype() == ceNormal {
w = append(w, splitCE(ce)) w = append(w, ce)
} else { } else {
w = t.appendExpansion(w, ce) w = t.appendExpansion(w, ce)
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment