Commit 5844fc1b authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/norm: introduced input interface to implement string versions

of methods.

R=r, mpvl
CC=golang-dev
https://golang.org/cl/5166045
parent 0da66a2e
...@@ -7,6 +7,7 @@ include ../../../Make.inc ...@@ -7,6 +7,7 @@ include ../../../Make.inc
TARG=exp/norm TARG=exp/norm
GOFILES=\ GOFILES=\
composition.go\ composition.go\
input.go\
forminfo.go\ forminfo.go\
normalize.go\ normalize.go\
readwriter.go\ readwriter.go\
......
...@@ -27,6 +27,26 @@ type reorderBuffer struct { ...@@ -27,6 +27,26 @@ type reorderBuffer struct {
nrune int // Number of runeInfos. nrune int // Number of runeInfos.
nbyte uint8 // Number or bytes. nbyte uint8 // Number or bytes.
f formInfo f formInfo
src input
nsrc int
srcBytes inputBytes
srcString inputString
tmpBytes inputBytes
}
func (rb *reorderBuffer) init(f Form, src []byte) {
rb.f = *formTable[f]
rb.srcBytes = inputBytes(src)
rb.src = &rb.srcBytes
rb.nsrc = len(src)
}
func (rb *reorderBuffer) initString(f Form, src string) {
rb.f = *formTable[f]
rb.srcString = inputString(src)
rb.src = &rb.srcString
rb.nsrc = len(src)
} }
// reset discards all characters from the buffer. // reset discards all characters from the buffer.
...@@ -75,45 +95,17 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool { ...@@ -75,45 +95,17 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
// insert inserts the given rune in the buffer ordered by CCC. // insert inserts the given rune in the buffer ordered by CCC.
// It returns true if the buffer was large enough to hold the decomposed rune. // It returns true if the buffer was large enough to hold the decomposed rune.
func (rb *reorderBuffer) insert(src []byte, info runeInfo) bool { func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
if info.size == 3 && isHangul(src) { if info.size == 3 {
rune, _ := utf8.DecodeRune(src) if rune := src.hangul(i); rune != 0 {
return rb.decomposeHangul(uint32(rune)) return rb.decomposeHangul(uint32(rune))
}
if info.flags.hasDecomposition() {
dcomp := rb.f.decompose(src)
for i := 0; i < len(dcomp); {
info = rb.f.info(dcomp[i:])
pos := rb.nbyte
if !rb.insertOrdered(info) {
return false
}
end := i + int(info.size)
copy(rb.byte[pos:], dcomp[i:end])
i = end
} }
} else {
// insertOrder changes nbyte
pos := rb.nbyte
if !rb.insertOrdered(info) {
return false
}
copy(rb.byte[pos:], src[:info.size])
}
return true
}
// insertString inserts the given rune in the buffer ordered by CCC.
// It returns true if the buffer was large enough to hold the decomposed rune.
func (rb *reorderBuffer) insertString(src string, info runeInfo) bool {
if info.size == 3 && isHangulString(src) {
rune, _ := utf8.DecodeRuneInString(src)
return rb.decomposeHangul(uint32(rune))
} }
if info.flags.hasDecomposition() { if info.flags.hasDecomposition() {
dcomp := rb.f.decomposeString(src) dcomp := rb.f.decompose(src, i)
rb.tmpBytes = inputBytes(dcomp)
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
info = rb.f.info(dcomp[i:]) info = rb.f.info(&rb.tmpBytes, i)
pos := rb.nbyte pos := rb.nbyte
if !rb.insertOrdered(info) { if !rb.insertOrdered(info) {
return false return false
...@@ -128,7 +120,7 @@ func (rb *reorderBuffer) insertString(src string, info runeInfo) bool { ...@@ -128,7 +120,7 @@ func (rb *reorderBuffer) insertString(src string, info runeInfo) bool {
if !rb.insertOrdered(info) { if !rb.insertOrdered(info) {
return false return false
} }
copy(rb.byte[pos:], src[:info.size]) src.copySlice(rb.byte[pos:], i, i+int(info.size))
} }
return true return true
} }
......
...@@ -15,21 +15,19 @@ type TestCase struct { ...@@ -15,21 +15,19 @@ type TestCase struct {
type insertFunc func(rb *reorderBuffer, rune int) bool type insertFunc func(rb *reorderBuffer, rune int) bool
func insert(rb *reorderBuffer, rune int) bool { func insert(rb *reorderBuffer, rune int) bool {
b := []byte(string(rune)) src := inputString(string(rune))
return rb.insert(b, rb.f.info(b)) return rb.insert(src, 0, rb.f.info(src, 0))
} }
func insertString(rb *reorderBuffer, rune int) bool { func runTests(t *testing.T, name string, fm Form, f insertFunc, tests []TestCase) {
s := string(rune) rb := reorderBuffer{}
return rb.insertString(s, rb.f.infoString(s)) rb.init(fm, nil)
}
func runTests(t *testing.T, name string, rb *reorderBuffer, f insertFunc, tests []TestCase) {
for i, test := range tests { for i, test := range tests {
rb.reset() rb.reset()
for j, rune := range test.in { for j, rune := range test.in {
b := []byte(string(rune)) b := []byte(string(rune))
if !rb.insert(b, rb.f.info(b)) { src := inputBytes(b)
if !rb.insert(src, 0, rb.f.info(src, 0)) {
t.Errorf("%s:%d: insert failed for rune %d", name, i, j) t.Errorf("%s:%d: insert failed for rune %d", name, i, j)
} }
} }
...@@ -50,7 +48,8 @@ func runTests(t *testing.T, name string, rb *reorderBuffer, f insertFunc, tests ...@@ -50,7 +48,8 @@ func runTests(t *testing.T, name string, rb *reorderBuffer, f insertFunc, tests
} }
func TestFlush(t *testing.T) { func TestFlush(t *testing.T) {
rb := &reorderBuffer{f: *formTable[NFC]} rb := reorderBuffer{}
rb.init(NFC, nil)
out := make([]byte, 0) out := make([]byte, 0)
out = rb.flush(out) out = rb.flush(out)
...@@ -59,7 +58,7 @@ func TestFlush(t *testing.T) { ...@@ -59,7 +58,7 @@ func TestFlush(t *testing.T) {
} }
for _, r := range []int("world!") { for _, r := range []int("world!") {
insert(rb, r) insert(&rb, r)
} }
out = []byte("Hello ") out = []byte("Hello ")
...@@ -88,13 +87,7 @@ var insertTests = []TestCase{ ...@@ -88,13 +87,7 @@ var insertTests = []TestCase{
} }
func TestInsert(t *testing.T) { func TestInsert(t *testing.T) {
rb := &reorderBuffer{f: *formTable[NFD]} runTests(t, "TestInsert", NFD, insert, insertTests)
runTests(t, "TestInsert", rb, insert, insertTests)
}
func TestInsertString(t *testing.T) {
rb := &reorderBuffer{f: *formTable[NFD]}
runTests(t, "TestInsertString", rb, insertString, insertTests)
} }
var decompositionNFDTest = []TestCase{ var decompositionNFDTest = []TestCase{
...@@ -113,11 +106,8 @@ var decompositionNFKDTest = []TestCase{ ...@@ -113,11 +106,8 @@ var decompositionNFKDTest = []TestCase{
} }
func TestDecomposition(t *testing.T) { func TestDecomposition(t *testing.T) {
rb := &reorderBuffer{} runTests(t, "TestDecompositionNFD", NFD, insert, decompositionNFDTest)
rb.f = *formTable[NFD] runTests(t, "TestDecompositionNFKD", NFKD, insert, decompositionNFKDTest)
runTests(t, "TestDecompositionNFD", rb, insert, decompositionNFDTest)
rb.f = *formTable[NFKD]
runTests(t, "TestDecompositionNFKD", rb, insert, decompositionNFKDTest)
} }
var compositionTest = []TestCase{ var compositionTest = []TestCase{
...@@ -133,6 +123,5 @@ var compositionTest = []TestCase{ ...@@ -133,6 +123,5 @@ var compositionTest = []TestCase{
} }
func TestComposition(t *testing.T) { func TestComposition(t *testing.T) {
rb := &reorderBuffer{f: *formTable[NFC]} runTests(t, "TestComposition", NFC, insert, compositionTest)
runTests(t, "TestComposition", rb, insert, compositionTest)
} }
...@@ -15,10 +15,8 @@ type runeInfo struct { ...@@ -15,10 +15,8 @@ type runeInfo struct {
// functions dispatchable per form // functions dispatchable per form
type boundaryFunc func(f *formInfo, info runeInfo) bool type boundaryFunc func(f *formInfo, info runeInfo) bool
type lookupFunc func(b []byte) runeInfo type lookupFunc func(b input, i int) runeInfo
type lookupFuncString func(s string) runeInfo type decompFunc func(b input, i int) []byte
type decompFunc func(b []byte) []byte
type decompFuncString func(s string) []byte
// formInfo holds Form-specific functions and tables. // formInfo holds Form-specific functions and tables.
type formInfo struct { type formInfo struct {
...@@ -26,12 +24,10 @@ type formInfo struct { ...@@ -26,12 +24,10 @@ type formInfo struct {
composing, compatibility bool // form type composing, compatibility bool // form type
decompose decompFunc decompose decompFunc
decomposeString decompFuncString info lookupFunc
info lookupFunc boundaryBefore boundaryFunc
infoString lookupFuncString boundaryAfter boundaryFunc
boundaryBefore boundaryFunc
boundaryAfter boundaryFunc
} }
var formTable []*formInfo var formTable []*formInfo
...@@ -46,14 +42,10 @@ func init() { ...@@ -46,14 +42,10 @@ func init() {
if Form(i) == NFKD || Form(i) == NFKC { if Form(i) == NFKD || Form(i) == NFKC {
f.compatibility = true f.compatibility = true
f.decompose = decomposeNFKC f.decompose = decomposeNFKC
f.decomposeString = decomposeStringNFKC
f.info = lookupInfoNFKC f.info = lookupInfoNFKC
f.infoString = lookupInfoStringNFKC
} else { } else {
f.decompose = decomposeNFC f.decompose = decomposeNFC
f.decomposeString = decomposeStringNFC
f.info = lookupInfoNFC f.info = lookupInfoNFC
f.infoString = lookupInfoStringNFC
} }
if Form(i) == NFC || Form(i) == NFKC { if Form(i) == NFC || Form(i) == NFKC {
f.composing = true f.composing = true
...@@ -123,29 +115,15 @@ func (r runeInfo) isInert() bool { ...@@ -123,29 +115,15 @@ func (r runeInfo) isInert() bool {
// array of UTF-8 decomposition sequences. The first byte is the number // array of UTF-8 decomposition sequences. The first byte is the number
// of bytes in the decomposition (excluding this length byte). The actual // of bytes in the decomposition (excluding this length byte). The actual
// sequence starts at the offset+1. // sequence starts at the offset+1.
func decomposeNFC(b []byte) []byte { func decomposeNFC(s input, i int) []byte {
p := nfcDecompTrie.lookupUnsafe(b) p := s.decomposeNFC(i)
n := decomps[p] n := decomps[p]
p++ p++
return decomps[p : p+uint16(n)] return decomps[p : p+uint16(n)]
} }
func decomposeNFKC(b []byte) []byte { func decomposeNFKC(s input, i int) []byte {
p := nfkcDecompTrie.lookupUnsafe(b) p := s.decomposeNFKC(i)
n := decomps[p]
p++
return decomps[p : p+uint16(n)]
}
func decomposeStringNFC(s string) []byte {
p := nfcDecompTrie.lookupStringUnsafe(s)
n := decomps[p]
p++
return decomps[p : p+uint16(n)]
}
func decomposeStringNFKC(s string) []byte {
p := nfkcDecompTrie.lookupStringUnsafe(s)
n := decomps[p] n := decomps[p]
p++ p++
return decomps[p : p+uint16(n)] return decomps[p : p+uint16(n)]
...@@ -168,22 +146,12 @@ func combine(a, b uint32) uint32 { ...@@ -168,22 +146,12 @@ func combine(a, b uint32) uint32 {
// 0..7 CCC value. // 0..7 CCC value.
// 8..11 qcInfo for NFC/NFD // 8..11 qcInfo for NFC/NFD
// 12..15 qcInfo for NFKC/NFKD // 12..15 qcInfo for NFKC/NFKD
func lookupInfoNFC(b []byte) runeInfo { func lookupInfoNFC(b input, i int) runeInfo {
v, sz := charInfoTrie.lookup(b) v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 8)}
}
func lookupInfoStringNFC(s string) runeInfo {
v, sz := charInfoTrie.lookupString(s)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 8)} return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 8)}
} }
func lookupInfoNFKC(b []byte) runeInfo { func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := charInfoTrie.lookup(b) v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 12)}
}
func lookupInfoStringNFKC(s string) runeInfo {
v, sz := charInfoTrie.lookupString(s)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 12)} return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 12)}
} }
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "utf8"
type input interface {
skipASCII(p int) int
skipNonStarter() int
appendSlice(buf []byte, s, e int) []byte
copySlice(buf []byte, s, e int)
charinfo(p int) (uint16, int)
decomposeNFC(p int) uint16
decomposeNFKC(p int) uint16
hangul(p int) uint32
}
type inputString string
func (s inputString) skipASCII(p int) int {
for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
}
return p
}
func (s inputString) skipNonStarter() int {
p := 0
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
}
return p
}
func (s inputString) appendSlice(buf []byte, b, e int) []byte {
for i := b; i < e; i++ {
buf = append(buf, s[i])
}
return buf
}
func (s inputString) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
}
func (s inputString) charinfo(p int) (uint16, int) {
return charInfoTrie.lookupString(string(s[p:]))
}
func (s inputString) decomposeNFC(p int) uint16 {
return nfcDecompTrie.lookupStringUnsafe(string(s[p:]))
}
func (s inputString) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
}
func (s inputString) hangul(p int) uint32 {
if !isHangulString(string(s[p:])) {
return 0
}
rune, _ := utf8.DecodeRuneInString(string(s[p:]))
return uint32(rune)
}
type inputBytes []byte
func (s inputBytes) skipASCII(p int) int {
for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
}
return p
}
func (s inputBytes) skipNonStarter() int {
p := 0
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
}
return p
}
func (s inputBytes) appendSlice(buf []byte, b, e int) []byte {
return append(buf, s[b:e]...)
}
func (s inputBytes) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
}
func (s inputBytes) charinfo(p int) (uint16, int) {
return charInfoTrie.lookup(s[p:])
}
func (s inputBytes) decomposeNFC(p int) uint16 {
return nfcDecompTrie.lookupUnsafe(s[p:])
}
func (s inputBytes) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupUnsafe(s[p:])
}
func (s inputBytes) hangul(p int) uint32 {
if !isHangul(s[p:]) {
return 0
}
rune, _ := utf8.DecodeRune(s[p:])
return uint32(rune)
}
...@@ -56,15 +56,15 @@ func (f Form) String(s string) string { ...@@ -56,15 +56,15 @@ func (f Form) String(s string) string {
// IsNormal returns true if b == f(b). // IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool { func (f Form) IsNormal(b []byte) bool {
fd := formTable[f] rb := reorderBuffer{}
bp := quickSpan(fd, b) rb.init(f, b)
bp := quickSpan(&rb, 0)
if bp == len(b) { if bp == len(b) {
return true return true
} }
rb := reorderBuffer{f: *fd}
for bp < len(b) { for bp < len(b) {
decomposeSegment(&rb, b[bp:]) decomposeSegment(&rb, bp)
if fd.composing { if rb.f.composing {
rb.compose() rb.compose()
} }
for i := 0; i < rb.nrune; i++ { for i := 0; i < rb.nrune; i++ {
...@@ -82,14 +82,42 @@ func (f Form) IsNormal(b []byte) bool { ...@@ -82,14 +82,42 @@ func (f Form) IsNormal(b []byte) bool {
} }
} }
rb.reset() rb.reset()
bp += quickSpan(fd, b[bp:]) bp = quickSpan(&rb, bp)
} }
return true return true
} }
// IsNormalString returns true if s == f(s). // IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool { func (f Form) IsNormalString(s string) bool {
panic("not implemented") rb := reorderBuffer{}
rb.initString(f, s)
bp := quickSpan(&rb, 0)
if bp == len(s) {
return true
}
for bp < len(s) {
decomposeSegment(&rb, bp)
if rb.f.composing {
rb.compose()
}
for i := 0; i < rb.nrune; i++ {
info := rb.rune[i]
if bp+int(info.size) > len(s) {
return false
}
p := info.pos
pe := p + info.size
for ; p < pe; p++ {
if s[bp] != rb.byte[p] {
return false
}
bp++
}
}
rb.reset()
bp = quickSpan(&rb, bp)
}
return true
} }
// patchTail fixes a case where a rune may be incorrectly normalized // patchTail fixes a case where a rune may be incorrectly normalized
...@@ -113,12 +141,12 @@ func patchTail(rb *reorderBuffer, buf []byte) ([]byte, int) { ...@@ -113,12 +141,12 @@ func patchTail(rb *reorderBuffer, buf []byte) ([]byte, int) {
return buf, 0 return buf, 0
} }
func appendQuick(f *formInfo, dst, src []byte) ([]byte, int) { func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
if len(src) == 0 { if rb.nsrc == i {
return dst, 0 return dst, i
} }
end := quickSpan(f, src) end := quickSpan(rb, i)
return append(dst, src[:end]...), end return rb.src.appendSlice(dst, i, end), end
} }
// Append returns f(append(out, b...)). // Append returns f(append(out, b...)).
...@@ -127,22 +155,21 @@ func (f Form) Append(out []byte, src ...byte) []byte { ...@@ -127,22 +155,21 @@ func (f Form) Append(out []byte, src ...byte) []byte {
if len(src) == 0 { if len(src) == 0 {
return out return out
} }
fd := formTable[f] rb := reorderBuffer{}
rb := &reorderBuffer{f: *fd} rb.init(f, src)
return doAppend(rb, out, src) return doAppend(&rb, out)
} }
func doAppend(rb *reorderBuffer, out, src []byte) []byte { func doAppend(rb *reorderBuffer, out []byte) []byte {
src, n := rb.src, rb.nsrc
doMerge := len(out) > 0 doMerge := len(out) > 0
p := 0 p := 0
if !utf8.RuneStart(src[0]) { if p = src.skipNonStarter(); p > 0 {
// Move leading non-starters to destination. // Move leading non-starters to destination.
for p++; p < len(src) && !utf8.RuneStart(src[p]); p++ { out = src.appendSlice(out, 0, p)
}
out = append(out, src[:p]...)
buf, ndropped := patchTail(rb, out) buf, ndropped := patchTail(rb, out)
if ndropped > 0 { if ndropped > 0 {
out = append(buf, src[p-ndropped:p]...) out = src.appendSlice(buf, p-ndropped, p)
doMerge = false // no need to merge, ends with illegal UTF-8 doMerge = false // no need to merge, ends with illegal UTF-8
} else { } else {
out = decomposeToLastBoundary(rb, buf) // force decomposition out = decomposeToLastBoundary(rb, buf) // force decomposition
...@@ -151,8 +178,8 @@ func doAppend(rb *reorderBuffer, out, src []byte) []byte { ...@@ -151,8 +178,8 @@ func doAppend(rb *reorderBuffer, out, src []byte) []byte {
fd := &rb.f fd := &rb.f
if doMerge { if doMerge {
var info runeInfo var info runeInfo
if p < len(src) { if p < n {
info = fd.info(src[p:]) info = fd.info(src, p)
if p == 0 && !fd.boundaryBefore(fd, info) { if p == 0 && !fd.boundaryBefore(fd, info) {
out = decomposeToLastBoundary(rb, out) out = decomposeToLastBoundary(rb, out)
} }
...@@ -164,59 +191,63 @@ func doAppend(rb *reorderBuffer, out, src []byte) []byte { ...@@ -164,59 +191,63 @@ func doAppend(rb *reorderBuffer, out, src []byte) []byte {
out = rb.flush(out) out = rb.flush(out)
if info.size == 0 { if info.size == 0 {
// Append incomplete UTF-8 encoding. // Append incomplete UTF-8 encoding.
return append(out, src[p:]...) return src.appendSlice(out, p, n)
} }
} }
} }
if rb.nrune == 0 { if rb.nrune == 0 {
src = src[p:] out, p = appendQuick(rb, out, p)
out, p = appendQuick(fd, out, src)
} }
for n := 0; p < len(src); p += n { for p < n {
p += decomposeSegment(rb, src[p:]) p = decomposeSegment(rb, p)
if fd.composing { if fd.composing {
rb.compose() rb.compose()
} }
out = rb.flush(out) out = rb.flush(out)
out, n = appendQuick(fd, out, src[p:]) out, p = appendQuick(rb, out, p)
} }
return out return out
} }
// AppendString returns f(append(out, []byte(s))). // AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out). // The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, s string) []byte { func (f Form) AppendString(out []byte, src string) []byte {
panic("not implemented") if len(src) == 0 {
return out
}
rb := reorderBuffer{}
rb.initString(f, src)
return doAppend(&rb, out)
} }
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n. // It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int { func (f Form) QuickSpan(b []byte) int {
return quickSpan(formTable[f], b) rb := reorderBuffer{}
rb.init(f, b)
return quickSpan(&rb, 0)
} }
func quickSpan(fd *formInfo, b []byte) int { func quickSpan(rb *reorderBuffer, i int) int {
var lastCC uint8 var lastCC uint8
var lastSegStart int var nc int
var i, nc int lastSegStart := i
for i < len(b) { src, n := rb.src, rb.nsrc
if b[i] < utf8.RuneSelf { for i < n {
// Keep the loop tight for ASCII processing, as this is where if j := src.skipASCII(i); i != j {
// most of the time is spent for this case. i = j
for i++; i < len(b) && b[i] < utf8.RuneSelf; i++ {
}
lastSegStart = i - 1 lastSegStart = i - 1
lastCC = 0 lastCC = 0
nc = 0 nc = 0
continue continue
} }
info := fd.info(b[i:]) info := rb.f.info(src, i)
if info.size == 0 { if info.size == 0 {
// include incomplete runes // include incomplete runes
return len(b) return n
} }
cc := info.ccc cc := info.ccc
if fd.composing { if rb.f.composing {
if !info.flags.isYesC() { if !info.flags.isYesC() {
break break
} }
...@@ -243,10 +274,10 @@ func quickSpan(fd *formInfo, b []byte) int { ...@@ -243,10 +274,10 @@ func quickSpan(fd *formInfo, b []byte) int {
lastCC = cc lastCC = cc
i += int(info.size) i += int(info.size)
} }
if i == len(b) { if i == n {
return len(b) return n
} }
if fd.composing { if rb.f.composing {
return lastSegStart return lastSegStart
} }
return i return i
...@@ -255,32 +286,39 @@ func quickSpan(fd *formInfo, b []byte) int { ...@@ -255,32 +286,39 @@ func quickSpan(fd *formInfo, b []byte) int {
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]). // QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n. // It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int { func (f Form) QuickSpanString(s string) int {
panic("not implemented") rb := reorderBuffer{}
rb.initString(f, s)
return quickSpan(&rb, 0)
} }
// FirstBoundary returns the position i of the first boundary in b // FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary. // or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int { func (f Form) FirstBoundary(b []byte) int {
i := 0 rb := reorderBuffer{}
for ; i < len(b) && !utf8.RuneStart(b[i]); i++ { rb.init(f, b)
} return firstBoundary(&rb)
if i >= len(b) { }
func firstBoundary(rb *reorderBuffer) int {
src, nsrc := rb.src, rb.nsrc
i := src.skipNonStarter()
if i >= nsrc {
return -1 return -1
} }
fd := formTable[f] fd := &rb.f
info := fd.info(b[i:]) info := fd.info(src, i)
for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); { for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); {
i += int(info.size) i += int(info.size)
if n++; n >= maxCombiningChars { if n++; n >= maxCombiningChars {
return i return i
} }
if i >= len(b) { if i >= nsrc {
if !fd.boundaryAfter(fd, info) { if !fd.boundaryAfter(fd, info) {
return -1 return -1
} }
return len(b) return nsrc
} }
info = fd.info(b[i:]) info = fd.info(src, i)
} }
if info.size == 0 { if info.size == 0 {
return -1 return -1
...@@ -290,8 +328,10 @@ func (f Form) FirstBoundary(b []byte) int { ...@@ -290,8 +328,10 @@ func (f Form) FirstBoundary(b []byte) int {
// FirstBoundaryInString returns the position i of the first boundary in s // FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary. // or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) (i int, ok bool) { func (f Form) FirstBoundaryInString(s string) int {
panic("not implemented") rb := reorderBuffer{}
rb.initString(f, s)
return firstBoundary(&rb)
} }
// LastBoundary returns the position i of the last boundary in b // LastBoundary returns the position i of the last boundary in b
...@@ -349,19 +389,18 @@ func (f Form) LastBoundaryInString(s string) int { ...@@ -349,19 +389,18 @@ func (f Form) LastBoundaryInString(s string) int {
// It returns the number of bytes consumed from src. // It returns the number of bytes consumed from src.
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner) // TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
// when we detect a sequence of 30+ non-starter chars. // when we detect a sequence of 30+ non-starter chars.
func decomposeSegment(rb *reorderBuffer, src []byte) int { func decomposeSegment(rb *reorderBuffer, sp int) int {
// Force one character to be consumed. // Force one character to be consumed.
info := rb.f.info(src) info := rb.f.info(rb.src, sp)
if info.size == 0 { if info.size == 0 {
return 0 return 0
} }
sp := 0 for rb.insert(rb.src, sp, info) {
for rb.insert(src[sp:], info) {
sp += int(info.size) sp += int(info.size)
if sp >= len(src) { if sp >= rb.nsrc {
break break
} }
info = rb.f.info(src[sp:]) info = rb.f.info(rb.src, sp)
bound := rb.f.boundaryBefore(&rb.f, info) bound := rb.f.boundaryBefore(&rb.f, info)
if bound || info.size == 0 { if bound || info.size == 0 {
break break
...@@ -379,7 +418,7 @@ func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) { ...@@ -379,7 +418,7 @@ func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
if p < 0 { if p < 0 {
return runeInfo{0, 0, 0, 0}, -1 return runeInfo{0, 0, 0, 0}, -1
} }
return fd.info(buf[p:]), p return fd.info(inputBytes(buf), p), p
} }
// decomposeToLastBoundary finds an open segment at the end of the buffer // decomposeToLastBoundary finds an open segment at the end of the buffer
...@@ -406,9 +445,9 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte { ...@@ -406,9 +445,9 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
} }
// Check that decomposition doesn't result in overflow. // Check that decomposition doesn't result in overflow.
if info.flags.hasDecomposition() { if info.flags.hasDecomposition() {
dcomp := rb.f.decompose(buf[p-int(info.size):]) dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
inf := rb.f.info(dcomp[i:]) inf := rb.f.info(inputBytes(dcomp), i)
i += int(inf.size) i += int(inf.size)
n++ n++
} }
...@@ -424,7 +463,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte { ...@@ -424,7 +463,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
pp := p pp := p
for padd--; padd >= 0; padd-- { for padd--; padd >= 0; padd-- {
info = add[padd] info = add[padd]
rb.insert(buf[pp:], info) rb.insert(inputBytes(buf), pp, info)
pp += int(info.size) pp += int(info.size)
} }
return buf[:p] return buf[:p]
......
...@@ -18,9 +18,12 @@ type PositionTest struct { ...@@ -18,9 +18,12 @@ type PositionTest struct {
type positionFunc func(rb *reorderBuffer, s string) int type positionFunc func(rb *reorderBuffer, s string) int
func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) { func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
rb := reorderBuffer{f: *formTable[f]} rb := reorderBuffer{}
rb.init(f, nil)
for i, test := range tests { for i, test := range tests {
rb.reset() rb.reset()
rb.src = inputString(test.input)
rb.nsrc = len(test.input)
pos := fn(&rb, test.input) pos := fn(&rb, test.input)
if pos != test.pos { if pos != test.pos {
t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos) t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
...@@ -60,7 +63,9 @@ var decomposeSegmentTests = []PositionTest{ ...@@ -60,7 +63,9 @@ var decomposeSegmentTests = []PositionTest{
} }
func decomposeSegmentF(rb *reorderBuffer, s string) int { func decomposeSegmentF(rb *reorderBuffer, s string) int {
return decomposeSegment(rb, []byte(s)) rb.src = inputString(s)
rb.nsrc = len(s)
return decomposeSegment(rb, 0)
} }
func TestDecomposeSegment(t *testing.T) { func TestDecomposeSegment(t *testing.T) {
...@@ -90,12 +95,17 @@ var firstBoundaryTests = []PositionTest{ ...@@ -90,12 +95,17 @@ var firstBoundaryTests = []PositionTest{
{strings.Repeat("\u0300", maxCombiningChars+1), 60, ""}, {strings.Repeat("\u0300", maxCombiningChars+1), 60, ""},
} }
func firstBoundary(rb *reorderBuffer, s string) int { func firstBoundaryF(rb *reorderBuffer, s string) int {
return rb.f.form.FirstBoundary([]byte(s)) return rb.f.form.FirstBoundary([]byte(s))
} }
func firstBoundaryStringF(rb *reorderBuffer, s string) int {
return rb.f.form.FirstBoundaryInString(s)
}
func TestFirstBoundary(t *testing.T) { func TestFirstBoundary(t *testing.T) {
runPosTests(t, "TestFirstBoundary", NFC, firstBoundary, firstBoundaryTests) runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
} }
var decomposeToLastTests = []PositionTest{ var decomposeToLastTests = []PositionTest{
...@@ -275,11 +285,20 @@ func doQuickSpan(rb *reorderBuffer, s string) int { ...@@ -275,11 +285,20 @@ func doQuickSpan(rb *reorderBuffer, s string) int {
return rb.f.form.QuickSpan([]byte(s)) return rb.f.form.QuickSpan([]byte(s))
} }
func doQuickSpanString(rb *reorderBuffer, s string) int {
return rb.f.form.QuickSpanString(s)
}
func TestQuickSpan(t *testing.T) { func TestQuickSpan(t *testing.T) {
runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests) runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests)
runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests) runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests)
runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests) runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests)
runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests) runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests)
runPosTests(t, "TestQuickSpanStringNFD1", NFD, doQuickSpanString, quickSpanTests)
runPosTests(t, "TestQuickSpanStringNFD2", NFD, doQuickSpanString, quickSpanNFDTests)
runPosTests(t, "TestQuickSpanStringNFC1", NFC, doQuickSpanString, quickSpanTests)
runPosTests(t, "TestQuickSpanStringNFC2", NFC, doQuickSpanString, quickSpanNFCTests)
} }
var isNormalTests = []PositionTest{ var isNormalTests = []PositionTest{
...@@ -334,7 +353,7 @@ var isNormalNFCTests = []PositionTest{ ...@@ -334,7 +353,7 @@ var isNormalNFCTests = []PositionTest{
{"같은", 1, ""}, {"같은", 1, ""},
} }
func isNormal(rb *reorderBuffer, s string) int { func isNormalF(rb *reorderBuffer, s string) int {
if rb.f.form.IsNormal([]byte(s)) { if rb.f.form.IsNormal([]byte(s)) {
return 1 return 1
} }
...@@ -342,10 +361,10 @@ func isNormal(rb *reorderBuffer, s string) int { ...@@ -342,10 +361,10 @@ func isNormal(rb *reorderBuffer, s string) int {
} }
func TestIsNormal(t *testing.T) { func TestIsNormal(t *testing.T) {
runPosTests(t, "TestIsNormalNFD1", NFD, isNormal, isNormalTests) runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
runPosTests(t, "TestIsNormalNFD2", NFD, isNormal, isNormalNFDTests) runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
runPosTests(t, "TestIsNormalNFC1", NFC, isNormal, isNormalTests) runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
runPosTests(t, "TestIsNormalNFC2", NFC, isNormal, isNormalNFCTests) runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
} }
type AppendTest struct { type AppendTest struct {
...@@ -452,8 +471,13 @@ func appendF(f Form, out []byte, s string) []byte { ...@@ -452,8 +471,13 @@ func appendF(f Form, out []byte, s string) []byte {
return f.Append(out, []byte(s)...) return f.Append(out, []byte(s)...)
} }
func appendStringF(f Form, out []byte, s string) []byte {
return f.AppendString(out, s)
}
func TestAppend(t *testing.T) { func TestAppend(t *testing.T) {
runAppendTests(t, "TestAppend", NFKC, appendF, appendTests) runAppendTests(t, "TestAppend", NFKC, appendF, appendTests)
runAppendTests(t, "TestAppendString", NFKC, appendStringF, appendTests)
} }
func doFormBenchmark(b *testing.B, f Form, s string) { func doFormBenchmark(b *testing.B, f Form, s string) {
......
...@@ -28,7 +28,9 @@ func (w *normWriter) Write(data []byte) (n int, err os.Error) { ...@@ -28,7 +28,9 @@ func (w *normWriter) Write(data []byte) (n int, err os.Error) {
if m > chunk { if m > chunk {
m = chunk m = chunk
} }
w.buf = doAppend(&w.rb, w.buf, data[:m]) w.rb.src = inputBytes(data[:m])
w.rb.nsrc = m
w.buf = doAppend(&w.rb, w.buf)
data = data[m:] data = data[m:]
n += m n += m
...@@ -65,7 +67,9 @@ func (w *normWriter) Close() os.Error { ...@@ -65,7 +67,9 @@ func (w *normWriter) Close() os.Error {
// an internal buffer to maintain state across Write calls. // an internal buffer to maintain state across Write calls.
// Calling its Close method writes any buffered data to w. // Calling its Close method writes any buffered data to w.
func (f Form) Writer(w io.Writer) io.WriteCloser { func (f Form) Writer(w io.Writer) io.WriteCloser {
return &normWriter{rb: reorderBuffer{f: *formTable[f]}, w: w} wr := &normWriter{rb: reorderBuffer{}, w: w}
wr.rb.init(f, nil)
return wr
} }
type normReader struct { type normReader struct {
...@@ -97,9 +101,10 @@ func (r *normReader) Read(p []byte) (int, os.Error) { ...@@ -97,9 +101,10 @@ func (r *normReader) Read(p []byte) (int, os.Error) {
r.bufStart = 0 r.bufStart = 0
n, err := r.r.Read(r.inbuf) n, err := r.r.Read(r.inbuf)
r.err = err // save error for when done with buffer r.rb.src = inputBytes(r.inbuf[0:n])
r.rb.nsrc, r.err = n, err
if n > 0 { if n > 0 {
r.outbuf = doAppend(&r.rb, r.outbuf, r.inbuf[0:n]) r.outbuf = doAppend(&r.rb, r.outbuf)
} }
if err == os.EOF { if err == os.EOF {
r.lastBoundary = len(r.outbuf) r.lastBoundary = len(r.outbuf)
...@@ -117,5 +122,8 @@ func (r *normReader) Read(p []byte) (int, os.Error) { ...@@ -117,5 +122,8 @@ func (r *normReader) Read(p []byte) (int, os.Error) {
// by reading data from r and returning f(data). // by reading data from r and returning f(data).
func (f Form) Reader(r io.Reader) io.Reader { func (f Form) Reader(r io.Reader) io.Reader {
const chunk = 4000 const chunk = 4000
return &normReader{rb: reorderBuffer{f: *formTable[f]}, r: r, inbuf: make([]byte, chunk)} buf := make([]byte, chunk)
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf}
rr.rb.init(f, buf)
return rr
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment