Commit f0a31b5f authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/locale/collate/build: moved some of the code to the appropriate file, as

promised in CL 13985.

R=r
CC=golang-dev
https://golang.org/cl/6503071
parent 4bd8a386
...@@ -29,49 +29,6 @@ import ( ...@@ -29,49 +29,6 @@ import (
// could analyze and detect when using a context makes sense, there is no // could analyze and detect when using a context makes sense, there is no
// need to expose this construct in the API. // need to expose this construct in the API.
// entry is used to keep track of a single entry in the collation element table
// during building. Examples of entries can be found in the Default Unicode
// Collation Element Table.
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
type entry struct {
runes []rune
elems [][]int // the collation elements for runes
str string // same as string(runes)
// prev, next, and level are used to keep track of tailorings.
prev, next *entry
level collate.Level // next differs at this level
decompose bool // can use NFKD decomposition to generate elems
exclude bool // do not include in table
logical logicalAnchor
expansionIndex int // used to store index into expansion table
contractionHandle ctHandle
contractionIndex int // index into contraction elements
}
func (e *entry) String() string {
return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)",
e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
}
func (e *entry) skip() bool {
return e.contraction()
}
func (e *entry) expansion() bool {
return !e.decompose && len(e.elems) > 1
}
func (e *entry) contraction() bool {
return len(e.runes) > 1
}
func (e *entry) contractionStarter() bool {
return e.contractionHandle.n != 0
}
// A Builder builds a root collation table. The user must specify the // A Builder builds a root collation table. The user must specify the
// collation elements for each entry. A common use will be to base the weights // collation elements for each entry. A common use will be to base the weights
// on those specified in the allkeys* file as provided by the UCA or CLDR. // on those specified in the allkeys* file as provided by the UCA or CLDR.
...@@ -231,52 +188,6 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error { ...@@ -231,52 +188,6 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
return nil return nil
} }
func (b *Builder) baseColElem(e *entry) uint32 {
ce := uint32(0)
var err error
switch {
case e.expansion():
ce, err = makeExpandIndex(e.expansionIndex)
default:
if e.decompose {
log.Fatal("decompose should be handled elsewhere")
}
ce, err = makeCE(e.elems[0])
}
if err != nil {
b.error(fmt.Errorf("%s: %X -> %X", err, e.runes, e.elems))
}
return ce
}
func (b *Builder) colElem(e *entry) uint32 {
if e.skip() {
log.Fatal("cannot build colElem for entry that should be skipped")
}
ce := uint32(0)
var err error
switch {
case e.decompose:
t1 := e.elems[0][2]
t2 := 0
if len(e.elems) > 1 {
t2 = e.elems[1][2]
}
ce, err = makeDecompose(t1, t2)
case e.contractionStarter():
ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex)
default:
if len(e.runes) > 1 {
log.Fatal("colElem: contractions are handled in contraction trie")
}
ce = b.baseColElem(e)
}
if err != nil {
b.error(err)
}
return ce
}
func (b *Builder) error(e error) { func (b *Builder) error(e error) {
if e != nil { if e != nil {
b.err = e b.err = e
...@@ -352,30 +263,6 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { ...@@ -352,30 +263,6 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
return true return true
} }
func equalCE(a, b []int) bool {
if len(a) != len(b) {
return false
}
for i := 0; i < 3; i++ {
if b[i] != a[i] {
return false
}
}
return true
}
func equalCEArrays(a, b [][]int) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if !equalCE(a[i], b[i]) {
return false
}
}
return true
}
func (b *Builder) simplify() { func (b *Builder) simplify() {
// Runes that are a starter of a contraction should not be removed. // Runes that are a starter of a contraction should not be removed.
// (To date, there is only Kannada character 0CCA.) // (To date, there is only Kannada character 0CCA.)
...@@ -412,62 +299,6 @@ func (b *Builder) simplify() { ...@@ -412,62 +299,6 @@ func (b *Builder) simplify() {
} }
} }
// convertLargeWeights converts collation elements with large
// primaries (either double primaries or for illegal runes)
// to our own representation.
// A CJK character C is represented in the DUCET as
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
// We will rewrite these characters to a single CE.
// We assume the CJK values start at 0x8000.
// See http://unicode.org/reports/tr10/#Implicit_Weights
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
const (
cjkPrimaryStart = 0xFB40
rarePrimaryStart = 0xFB80
otherPrimaryStart = 0xFBC0
illegalPrimary = 0xFFFE
highBitsMask = 0x3F
lowBitsMask = 0x7FFF
lowBitsFlag = 0x8000
shiftBits = 15
)
for i := 0; i < len(elems); i++ {
ce := elems[i]
p := ce[0]
if p < cjkPrimaryStart {
continue
}
if p > 0xFFFF {
return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
}
if p >= illegalPrimary {
ce[0] = illegalOffset + p - illegalPrimary
} else {
if i+1 >= len(elems) {
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
}
if elems[i+1][0]&lowBitsFlag == 0 {
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
}
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
switch {
case p < rarePrimaryStart:
np += commonUnifiedOffset
case p < otherPrimaryStart:
np += rareUnifiedOffset
default:
p += otherOffset
}
ce[0] = np
for j := i + 1; j+1 < len(elems); j++ {
elems[j] = elems[j+1]
}
elems = elems[:len(elems)-1]
}
}
return elems, nil
}
// appendExpansion converts the given collation sequence to // appendExpansion converts the given collation sequence to
// collation elements and adds them to the expansion table. // collation elements and adds them to the expansion table.
// It returns an index to the expansion table. // It returns an index to the expansion table.
...@@ -586,7 +417,9 @@ func (b *Builder) processContractions() { ...@@ -586,7 +417,9 @@ func (b *Builder) processContractions() {
es[0].contractionHandle = handle es[0].contractionHandle = handle
// Add collation elements for contractions. // Add collation elements for contractions.
for _, e := range es { for _, e := range es {
t.contractElem = append(t.contractElem, b.baseColElem(e)) ce, err := e.encodeBase()
b.error(err)
t.contractElem = append(t.contractElem, ce)
} }
} }
} }
...@@ -596,7 +429,8 @@ func (b *Builder) buildTrie() { ...@@ -596,7 +429,8 @@ func (b *Builder) buildTrie() {
o := b.root o := b.root
for e := o.front(); e != nil; e, _ = e.nextIndexed() { for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if !e.skip() { if !e.skip() {
ce := b.colElem(e) ce, err := e.encode()
b.error(err)
t.insert(e.runes[0], ce) t.insert(e.runes[0], ce)
} }
} }
......
...@@ -199,6 +199,62 @@ func implicitPrimary(r rune) int { ...@@ -199,6 +199,62 @@ func implicitPrimary(r rune) int {
return int(r) + otherOffset return int(r) + otherOffset
} }
// convertLargeWeights converts collation elements with large
// primaries (either double primaries or for illegal runes)
// to our own representation.
// A CJK character C is represented in the DUCET as
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
// We will rewrite these characters to a single CE.
// We assume the CJK values start at 0x8000.
// See http://unicode.org/reports/tr10/#Implicit_Weights
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
const (
cjkPrimaryStart = 0xFB40
rarePrimaryStart = 0xFB80
otherPrimaryStart = 0xFBC0
illegalPrimary = 0xFFFE
highBitsMask = 0x3F
lowBitsMask = 0x7FFF
lowBitsFlag = 0x8000
shiftBits = 15
)
for i := 0; i < len(elems); i++ {
ce := elems[i]
p := ce[0]
if p < cjkPrimaryStart {
continue
}
if p > 0xFFFF {
return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
}
if p >= illegalPrimary {
ce[0] = illegalOffset + p - illegalPrimary
} else {
if i+1 >= len(elems) {
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
}
if elems[i+1][0]&lowBitsFlag == 0 {
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
}
np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask
switch {
case p < rarePrimaryStart:
np += commonUnifiedOffset
case p < otherPrimaryStart:
np += rareUnifiedOffset
default:
p += otherOffset
}
ce[0] = np
for j := i + 1; j+1 < len(elems); j++ {
elems[j] = elems[j+1]
}
elems = elems[:len(elems)-1]
}
}
return elems, nil
}
// nextWeight computes the first possible collation weights following elems // nextWeight computes the first possible collation weights following elems
// for the given level. // for the given level.
func nextWeight(level collate.Level, elems [][]int) [][]int { func nextWeight(level collate.Level, elems [][]int) [][]int {
...@@ -247,3 +303,27 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) { ...@@ -247,3 +303,27 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) {
} }
return 0, collate.Identity return 0, collate.Identity
} }
func equalCE(a, b []int) bool {
if len(a) != len(b) {
return false
}
for i := 0; i < 3; i++ {
if b[i] != a[i] {
return false
}
}
return true
}
func equalCEArrays(a, b [][]int) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if !equalCE(a[i], b[i]) {
return false
}
}
return true
}
...@@ -21,7 +21,48 @@ const ( ...@@ -21,7 +21,48 @@ const (
lastAnchor = 1 lastAnchor = 1
) )
// TODO: move type entry from builder.go to this file. // entry is used to keep track of a single entry in the collation element table
// during building. Examples of entries can be found in the Default Unicode
// Collation Element Table.
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
type entry struct {
runes []rune
elems [][]int // the collation elements for runes
str string // same as string(runes)
// prev, next, and level are used to keep track of tailorings.
prev, next *entry
level collate.Level // next differs at this level
decompose bool // can use NFKD decomposition to generate elems
exclude bool // do not include in table
logical logicalAnchor
expansionIndex int // used to store index into expansion table
contractionHandle ctHandle
contractionIndex int // index into contraction elements
}
func (e *entry) String() string {
return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)",
e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
}
func (e *entry) skip() bool {
return e.contraction()
}
func (e *entry) expansion() bool {
return !e.decompose && len(e.elems) > 1
}
func (e *entry) contraction() bool {
return len(e.runes) > 1
}
func (e *entry) contractionStarter() bool {
return e.contractionHandle.n != 0
}
// nextIndexed gets the next entry that needs to be stored in the table. // nextIndexed gets the next entry that needs to be stored in the table.
// It returns the entry and the collation level at which the next entry differs // It returns the entry and the collation level at which the next entry differs
...@@ -72,6 +113,42 @@ func (e *entry) insertAfter(n *entry) { ...@@ -72,6 +113,42 @@ func (e *entry) insertAfter(n *entry) {
e.next = n e.next = n
} }
func (e *entry) encodeBase() (ce uint32, err error) {
switch {
case e.expansion():
ce, err = makeExpandIndex(e.expansionIndex)
default:
if e.decompose {
log.Fatal("decompose should be handled elsewhere")
}
ce, err = makeCE(e.elems[0])
}
return
}
func (e *entry) encode() (ce uint32, err error) {
if e.skip() {
log.Fatal("cannot build colElem for entry that should be skipped")
}
switch {
case e.decompose:
t1 := e.elems[0][2]
t2 := 0
if len(e.elems) > 1 {
t2 = e.elems[1][2]
}
ce, err = makeDecompose(t1, t2)
case e.contractionStarter():
ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex)
default:
if len(e.runes) > 1 {
log.Fatal("colElem: contractions are handled in contraction trie")
}
ce, err = e.encodeBase()
}
return
}
// entryLess returns true if a sorts before b and false otherwise. // entryLess returns true if a sorts before b and false otherwise.
func entryLess(a, b *entry) bool { func entryLess(a, b *entry) bool {
if res, _ := compareWeights(a.elems, b.elems); res != 0 { if res, _ := compareWeights(a.elems, b.elems); res != 0 {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment