Commit 8ba20dbd authored by Marcel van Lohuizen

exp/norm: a few minor changes in preparation for a table format change:

 - Unified boundary conditions for NFC and NFD and removed some indirections.
   This enforces boundaries at the character level, which is typically what
   the user expects. (NFD allows a boundary between 'a' and '`', for example,
   which may give unexpected results for collation.) The current implementation
   is already stricter than the standard, so nothing much changes.  This change
   just formalizes it.
 - Moved methods of qcflags to runeInfo.
 - Swapped YesC and YesMaybe bits in qcFlags. This is to aid future changes.
 - runeInfo return values use named fields in preparation for a struct change.
 - Replaced some left-over uint32s with rune.

R=r
CC=golang-dev
https://golang.org/cl/5607050
parent d673c95d
......@@ -98,10 +98,10 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
if info.size == 3 {
if rune := src.hangul(i); rune != 0 {
return rb.decomposeHangul(uint32(rune))
return rb.decomposeHangul(rune)
}
}
if info.flags.hasDecomposition() {
if info.hasDecomposition() {
dcomp := rb.f.decompose(src, i)
rb.tmpBytes = inputBytes(dcomp)
for i := 0; i < len(dcomp); {
......@@ -126,26 +126,26 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
}
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
func (rb *reorderBuffer) appendRune(r uint32) {
func (rb *reorderBuffer) appendRune(r rune) {
bn := rb.nbyte
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.nbyte += utf8.UTFMax
rb.rune[rb.nrune] = runeInfo{bn, uint8(sz), 0, 0}
rb.rune[rb.nrune] = runeInfo{pos: bn, size: uint8(sz)}
rb.nrune++
}
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
func (rb *reorderBuffer) assignRune(pos int, r uint32) {
func (rb *reorderBuffer) assignRune(pos int, r rune) {
bn := rb.rune[pos].pos
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.rune[pos] = runeInfo{bn, uint8(sz), 0, 0}
rb.rune[pos] = runeInfo{pos: bn, size: uint8(sz)}
}
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
func (rb *reorderBuffer) runeAt(n int) uint32 {
func (rb *reorderBuffer) runeAt(n int) rune {
inf := rb.rune[n]
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
return uint32(r)
return r
}
// bytesAt returns the UTF-8 encoding of the rune at position n.
......@@ -237,7 +237,7 @@ func isHangulWithoutJamoT(b []byte) bool {
// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components.
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r uint32) bool {
func (rb *reorderBuffer) decomposeHangul(r rune) bool {
b := rb.rune[:]
n := rb.nrune
if n+3 > len(b) {
......@@ -319,7 +319,7 @@ func (rb *reorderBuffer) compose() {
// get the info for the combined character. This is more
// expensive than using the filter. Using combinesBackward()
// is safe.
if ii.flags.combinesBackward() {
if ii.combinesBackward() {
cccB := b[k-1].ccc
cccC := ii.ccc
blocked := false // b[i] blocked by starter or greater or equal CCC?
......
......@@ -14,7 +14,6 @@ type runeInfo struct {
}
// functions dispatchable per form
type boundaryFunc func(f *formInfo, info runeInfo) bool
type lookupFunc func(b input, i int) runeInfo
type decompFunc func(b input, i int) []byte
......@@ -24,10 +23,8 @@ type formInfo struct {
composing, compatibility bool // form type
decompose decompFunc
info lookupFunc
boundaryBefore boundaryFunc
boundaryAfter boundaryFunc
decompose decompFunc
info lookupFunc
}
var formTable []*formInfo
......@@ -49,27 +46,17 @@ func init() {
}
if Form(i) == NFC || Form(i) == NFKC {
f.composing = true
f.boundaryBefore = compBoundaryBefore
f.boundaryAfter = compBoundaryAfter
} else {
f.boundaryBefore = decompBoundary
f.boundaryAfter = decompBoundary
}
}
}
func decompBoundary(f *formInfo, info runeInfo) bool {
if info.ccc == 0 && info.flags.isYesD() { // Implies isHangul(b) == true
return true
}
// We assume that the CCC of the first character in a decomposition
// is always non-zero if different from info.ccc and that we can return
// false at this point. This is verified by maketables.
return false
}
func compBoundaryBefore(f *formInfo, info runeInfo) bool {
if info.ccc == 0 && !info.flags.combinesBackward() {
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, a might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
func (i runeInfo) boundaryBefore() bool {
if i.ccc == 0 && !i.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
......@@ -78,15 +65,13 @@ func compBoundaryBefore(f *formInfo, info runeInfo) bool {
return false
}
func compBoundaryAfter(f *formInfo, info runeInfo) bool {
// This misses values where the last char in a decomposition is a
// boundary such as Hangul with JamoT.
return info.isInert()
func (i runeInfo) boundaryAfter() bool {
return i.isInert()
}
// We pack quick check data in 4 bits:
// 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..2: NFC_QC Yes(00), No (01), or Maybe (11)
// 1..2: NFC_QC Yes(00), No (10), or Maybe (11)
// 3: Combines forward (0 == false, 1 == true)
//
// When all 4 bits are zero, the character is inert, meaning it is never
......@@ -95,15 +80,12 @@ func compBoundaryAfter(f *formInfo, info runeInfo) bool {
// We pack the bits for both NFC/D and NFKC/D in one byte.
type qcInfo uint8
func (i qcInfo) isYesC() bool { return i&0x2 == 0 }
func (i qcInfo) isNoC() bool { return i&0x6 == 0x2 }
func (i qcInfo) isMaybe() bool { return i&0x4 != 0 }
func (i qcInfo) isYesD() bool { return i&0x1 == 0 }
func (i qcInfo) isNoD() bool { return i&0x1 != 0 }
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 }
func (i qcInfo) combinesForward() bool { return i&0x8 != 0 }
func (i qcInfo) combinesBackward() bool { return i&0x4 != 0 } // == isMaybe
func (i qcInfo) hasDecomposition() bool { return i&0x1 != 0 } // == isNoD
func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 }
func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe
func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD
func (r runeInfo) isInert() bool {
return r.flags&0xf == 0 && r.ccc == 0
......@@ -137,7 +119,7 @@ func decomposeNFKC(s input, i int) []byte {
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
func combine(a, b uint32) uint32 {
func combine(a, b rune) rune {
key := uint32(uint16(a))<<16 + uint32(uint16(b))
return recompMap[key]
}
......@@ -148,10 +130,10 @@ func combine(a, b uint32) uint32 {
// 12..15 qcInfo for NFKC/NFKD
func lookupInfoNFC(b input, i int) runeInfo {
v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 8)}
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 8)}
}
func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 12)}
return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 12)}
}
......@@ -14,7 +14,7 @@ type input interface {
charinfo(p int) (uint16, int)
decomposeNFC(p int) uint16
decomposeNFKC(p int) uint16
hangul(p int) uint32
hangul(p int) rune
}
type inputString string
......@@ -54,12 +54,12 @@ func (s inputString) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
}
func (s inputString) hangul(p int) uint32 {
func (s inputString) hangul(p int) rune {
if !isHangulString(string(s[p:])) {
return 0
}
rune, _ := utf8.DecodeRuneInString(string(s[p:]))
return uint32(rune)
return rune
}
type inputBytes []byte
......@@ -96,10 +96,10 @@ func (s inputBytes) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupUnsafe(s[p:])
}
func (s inputBytes) hangul(p int) uint32 {
func (s inputBytes) hangul(p int) rune {
if !isHangul(s[p:]) {
return 0
}
rune, _ := utf8.DecodeRune(s[p:])
return uint32(rune)
return rune
}
......@@ -562,7 +562,7 @@ func makeEntry(f *FormInfo) uint16 {
switch f.quickCheck[MComposed] {
case QCYes:
case QCNo:
e |= 0x2
e |= 0x4
case QCMaybe:
e |= 0x6
default:
......@@ -718,7 +718,7 @@ func makeTables() {
sz := nrentries * 8
size += sz
fmt.Printf("// recompMap: %d bytes (entries only)\n", sz)
fmt.Println("var recompMap = map[uint32]uint32{")
fmt.Println("var recompMap = map[uint32]rune{")
for i, c := range chars {
f := c.forms[FCanonical]
d := f.decomp
......
......@@ -188,11 +188,11 @@ func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
var info runeInfo
if p < n {
info = fd.info(src, p)
if p == 0 && !fd.boundaryBefore(fd, info) {
if p == 0 && !info.boundaryBefore() {
out = decomposeToLastBoundary(rb, out)
}
}
if info.size == 0 || fd.boundaryBefore(fd, info) {
if info.size == 0 || info.boundaryBefore() {
if fd.composing {
rb.compose()
}
......@@ -257,11 +257,11 @@ func quickSpan(rb *reorderBuffer, i int) int {
}
cc := info.ccc
if rb.f.composing {
if !info.flags.isYesC() {
if !info.isYesC() {
break
}
} else {
if !info.flags.isYesD() {
if !info.isYesD() {
break
}
}
......@@ -316,13 +316,13 @@ func firstBoundary(rb *reorderBuffer) int {
}
fd := &rb.f
info := fd.info(src, i)
for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); {
for n := 0; info.size != 0 && !info.boundaryBefore(); {
i += int(info.size)
if n++; n >= maxCombiningChars {
return i
}
if i >= nsrc {
if !fd.boundaryAfter(fd, info) {
if !info.boundaryAfter() {
return -1
}
return nsrc
......@@ -368,11 +368,11 @@ func lastBoundary(fd *formInfo, b []byte) int {
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
return i
}
if fd.boundaryAfter(fd, info) {
if info.boundaryAfter() {
return i
}
i = p
for n := 0; i >= 0 && !fd.boundaryBefore(fd, info); {
for n := 0; i >= 0 && !info.boundaryBefore(); {
info, p = lastRuneStart(fd, b[:i])
if n++; n >= maxCombiningChars {
return len(b)
......@@ -404,7 +404,7 @@ func decomposeSegment(rb *reorderBuffer, sp int) int {
break
}
info = rb.f.info(rb.src, sp)
bound := rb.f.boundaryBefore(&rb.f, info)
bound := info.boundaryBefore()
if bound || info.size == 0 {
break
}
......@@ -419,7 +419,7 @@ func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
}
if p < 0 {
return runeInfo{0, 0, 0, 0}, -1
return runeInfo{}, -1
}
return fd.info(inputBytes(buf), p), p
}
......@@ -433,7 +433,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
// illegal trailing continuation bytes
return buf
}
if rb.f.boundaryAfter(fd, info) {
if info.boundaryAfter() {
return buf
}
var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
......@@ -441,13 +441,13 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
padd := 1
n := 1
p := len(buf) - int(info.size)
for ; p >= 0 && !rb.f.boundaryBefore(fd, info); p -= int(info.size) {
for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
info, i = lastRuneStart(fd, buf[:p])
if int(info.size) != p-i {
break
}
// Check that decomposition doesn't result in overflow.
if info.flags.hasDecomposition() {
if info.hasDecomposition() {
dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
for i := 0; i < len(dcomp); {
inf := rb.f.info(inputBytes(dcomp), i)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment