Commit fc77e826 authored by Russ Cox

unicode: add case folding tables

R=r, r
CC=golang-dev
https://golang.org/cl/4571074
parent 6e9b1a78
......@@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int {
}
return r
}
// foldPair is the element type of caseOrbit, which tables.go declares
// as a []foldPair. SimpleFold looks an entry up by From and returns To.
//
// Both fields are uint16 because every entry currently fits. Should a
// larger code point ever be needed, the constants in the composite
// literal will overflow uint16, compilation will fail, and the fields
// can then be widened to uint32.
type foldPair struct {
	From, To uint16
}
// SimpleFold steps through the set of Unicode code points that are
// equivalent to rune under the Unicode-defined simple case folding.
// Of the code points in that set (rune included), it returns the
// smallest one greater than rune if there is one, and otherwise the
// smallest one that is >= 0.
//
// For example:
//	SimpleFold('A') = 'a'
//	SimpleFold('a') = 'A'
//
//	SimpleFold('K') = 'k'
//	SimpleFold('k') = '\u212A' (Kelvin symbol)
//	SimpleFold('\u212A') = 'K'
//
//	SimpleFold('1') = '1'
//
func SimpleFold(rune int) int {
	// Binary search caseOrbit for the first entry whose From is >= rune.
	first, limit := 0, len(caseOrbit)
	for first < limit {
		mid := first + (limit-first)/2
		if int(caseOrbit[mid].From) >= rune {
			limit = mid
			continue
		}
		first = mid + 1
	}
	if first < len(caseOrbit) {
		if entry := caseOrbit[first]; int(entry.From) == rune {
			// Explicit orbit entry: it names the next code point directly.
			return int(entry.To)
		}
	}

	// No table entry, so the equivalence class has at most two members:
	// rune itself plus whichever of ToLower(rune) and ToUpper(rune)
	// differs from it.
	if lower := ToLower(rune); lower != rune {
		return lower
	}
	return ToUpper(rune)
}
......@@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) {
}
}
}
var simpleFoldTests = []string{
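// Each string lists one complete SimpleFold orbit: SimpleFold of the
// last rune must give the first rune, and SimpleFold of every other
// rune must give the rune that follows it.
// SimpleFold could visit an orbit in any order it wants,
// but we know it visits the runes in increasing order starting at its
// argument and looping around from MaxRune to 0.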
// Easy cases.
"Aa",
"aA",
"δΔ",
"Δδ",
// ASCII special cases.
"KkK",
"kKK",
"KKk",
"Ssſ",
"sſS",
"ſSs",
// Non-ASCII special cases.
"ρϱΡ",
"ϱΡρ",
"Ρρϱ",
"ͅΙιι",
"Ιιιͅ",
"ιιͅΙ",
"ιͅΙι",
// Extra special cases: has lower/upper but no case fold.
"İ",
"ı",
}
// TestSimpleFold walks every orbit in simpleFoldTests. Starting from the
// orbit's last rune, each call to SimpleFold must produce the next rune
// of the string, so one pass checks every step of the cycle, including
// the wrap from the last rune back to the first.
func TestSimpleFold(t *testing.T) {
	for _, orbit := range simpleFoldTests {
		runes := []int(orbit)
		in := runes[len(runes)-1]
		for _, want := range runes {
			got := SimpleFold(in)
			if got != want {
				t.Errorf("SimpleFold(%#U) = %#U, want %#U", in, got, want)
			}
			in = want
		}
	}
}
......@@ -24,15 +24,18 @@ import (
// main parses the command-line flags, loads the Unicode data files, and then
// runs each print step in turn.
func main() {
flag.Parse()
loadChars() // always needed
// loadCasefold stores the foldCase values that printCasefold reads, so it
// has to run before the print steps.
loadCasefold()
printCategories()
// NOTE(review): presumably false selects scripts and true selects
// properties — confirm against printScriptOrProperty.
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
printLatinProperties()
printCasefold()
// printSizes goes last: it reports foldPairCount, which printCaseOrbit
// (called from printCasefold) increments as it prints.
printSizes()
}
// Command-line flags naming the Unicode data sources. The --data and
// --casefolding flags default to the empty string, which means "derive
// the location from --url".
var (
	dataURL        = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
	casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
	url            = flag.String("url", "http://www.unicode.org/Public/6.0.0/ucd/", "URL of Unicode database directory")
)
......@@ -119,6 +122,8 @@ type Char struct {
upperCase int
lowerCase int
titleCase int
foldCase int // simple case folding
caseOrbit int // next in simple case folding orbit
}
// Scripts.txt has form:
......@@ -308,8 +313,53 @@ func loadChars() {
resp.Body.Close()
}
// loadCasefold fetches CaseFolding.txt and records the simple case folding
// of each listed code point in chars[code].foldCase.
//
// If --casefolding was not given, the URL is derived from --url. Each data
// line is expected to have four "; "-separated fields: the code point, the
// folding kind, the folded code point, and a trailing comment. Only the
// 'C' (common) and 'S' (simple) kinds are recorded. Comment lines and blank
// lines are skipped. Any fetch or parse error is fatal.
func loadCasefold() {
	if *casefoldingURL == "" {
		flag.Set("casefolding", *url+"CaseFolding.txt")
	}
	resp, err := http.Get(*casefoldingURL)
	if err != nil {
		logger.Fatal(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		// Fatalf rather than Fatal: Fatal would run the two strings
		// together with no separator.
		logger.Fatalf("bad GET status for CaseFolding.txt: %s", resp.Status)
	}
	input := bufio.NewReader(resp.Body)
	for {
		line, err := input.ReadString('\n')
		if err != nil {
			if err == os.EOF {
				break
			}
			logger.Fatal(err)
		}
		// line still carries its trailing newline, so it is never empty
		// here; a blank line shows up as pure white space and must be
		// skipped before the field count is checked.
		if line[0] == '#' || strings.TrimSpace(line) == "" {
			continue
		}
		field := strings.Split(line, "; ", -1)
		if len(field) != 4 {
			logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
		}
		kind := field[1]
		if kind != "C" && kind != "S" {
			// Only care about 'common' and 'simple' foldings.
			continue
		}
		p1, err := strconv.Btoui64(field[0], 16)
		if err != nil {
			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
		}
		p2, err := strconv.Btoui64(field[2], 16)
		if err != nil {
			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
		}
		chars[p1].foldCase = int(p2)
	}
}
const progHeader = `// Generated by running
// maketables --tables=%s --data=%s
// maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT
package unicode
......@@ -330,7 +380,7 @@ func printCategories() {
fullCategoryTest(list)
return
}
fmt.Printf(progHeader, *tablelist, *dataURL)
fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
......@@ -837,13 +887,13 @@ func printCases() {
}
fmt.Printf(
"// Generated by running\n"+
"// maketables --data=%s\n"+
"// maketables --data=%s --casefolding=%s\n"+
"// DO NOT EDIT\n\n"+
"// CaseRanges is the table describing case mappings for all letters with\n"+
"// non-self mappings.\n"+
"var CaseRanges = _CaseRanges\n"+
"var _CaseRanges = []CaseRange {\n",
*dataURL)
*dataURL, *casefoldingURL)
var startState *caseState // the start of a run; nil for not active
var prevState = &caseState{} // the state of the previous character
......@@ -946,13 +996,246 @@ func printLatinProperties() {
if code == ' ' {
property = "pZ | pp"
}
fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
}
fmt.Printf("}\n\n")
}
// printCasefold computes the simple case folding orbits from the foldCase
// values stored in chars, records them in chars[...].caseOrbit, and prints
// the resulting tables: caseOrbit (via printCaseOrbit), then FoldCategory
// and FoldScript (via printCatFold).
func printCasefold() {
	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
	var caseOrbit = make([][]int, MaxChar+1)
	for i := range chars {
		c := &chars[i]
		if c.foldCase == 0 {
			continue
		}
		orb := caseOrbit[c.foldCase]
		if orb == nil {
			// First member seen: the group starts with the folded char itself.
			orb = append(orb, c.foldCase)
		}
		caseOrbit[c.foldCase] = append(orb, i)
	}

	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong:
	// the char has an upper or lower case mapping but takes part in no folding group.
	for i := range chars {
		c := &chars[i]
		f := c.foldCase
		if f == 0 {
			f = i
		}
		orb := caseOrbit[f]
		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
			// Default assumption of [lower, upper] is wrong.
			caseOrbit[i] = []int{i}
		}
	}

	// Delete the groups for which assuming [lower, upper] is right.
	for i, orb := range caseOrbit {
		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
			caseOrbit[i] = nil
		}
	}

	// Record orbit information in chars: each member points at the next
	// larger member, and the largest wraps around to the smallest.
	for _, orb := range caseOrbit {
		if orb == nil {
			continue
		}
		sort.SortInts(orb)
		c := orb[len(orb)-1]
		for _, d := range orb {
			chars[c].caseOrbit = d
			c = d
		}
	}

	printCaseOrbit()

	// Tables of category and script folding exceptions: code points
	// that must be added when interpreting a particular category/script
	// in a case-folding context.
	cat := make(map[string]map[int]bool)
	for name := range category {
		if x := foldExceptions(inCategory(name)); len(x) > 0 {
			cat[name] = x
		}
	}

	scr := make(map[string]map[int]bool)
	for name := range scripts {
		if x := foldExceptions(inScript(name)); len(x) > 0 {
			// Script exceptions belong in scr; storing them in cat would
			// leave FoldScript empty and add script names to FoldCategory.
			scr[name] = x
		}
	}

	printCatFold("FoldCategory", cat)
	printCatFold("FoldScript", scr)
}
// inCategory returns a list of all the runes in the category.
//
// A one-letter name selects a whole major category: it matches every rune
// whose longer category name begins with that letter, in addition to any
// rune whose category is exactly name. The runes are returned in
// increasing order. The function only builds the list; it prints nothing.
func inCategory(name string) []int {
	var x []int
	for i := range chars {
		c := &chars[i]
		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
			x = append(x, i)
		}
	}
	return x
}
// NOTE(review): range16Count and range32Count are declared a second time,
// together with foldPairCount, just above printSizes. This pair looks like
// the old side of the diff hunk; keep only one declaration of each.
var range16Count = 0 // Number of entries in the 16-bit range tables.
var range32Count = 0 // Number of entries in the 32-bit range tables.
// inScript expands the ranges recorded for the named script into a flat
// list holding every rune the script contains. A name with no recorded
// ranges yields a nil list.
func inScript(name string) []int {
	var runes []int
	for _, span := range scripts[name] {
		code := span.lo
		for code <= span.hi {
			runes = append(runes, int(code))
			code++
		}
	}
	return runes
}
// foldExceptions computes the set of runes that are fold-equivalent to
// some rune in class without being members of class themselves. The set
// is returned as a map whose keys are the exception runes.
func foldExceptions(class []int) map[int]bool {
	// First collect class together with everything fold-equivalent to it.
	equiv := make(map[int]bool)
	for _, start := range class {
		info := &chars[start]
		if info.caseOrbit != 0 {
			// The rune has an explicit orbit: follow it all the way
			// around until it comes back to the starting rune.
			cur := start
			for {
				equiv[cur] = true
				cur = chars[cur].caseOrbit
				if cur == start {
					break
				}
			}
			continue
		}

		// No explicit orbit: the class is just the rune plus its upper
		// and lower case forms, where those are set.
		if upper := info.upperCase; upper != 0 {
			equiv[upper] = true
		}
		if lower := info.lowerCase; lower != 0 {
			equiv[lower] = true
		}
		equiv[start] = true
	}

	// Then delete the members of class itself, leaving the exceptions.
	for _, member := range class {
		equiv[member] = false, false
	}
	return equiv
}
// comment holds the doc comment that printCatFold prints ahead of each
// generated map, keyed by the name of the map variable.
var comment = map[string]string{
	"FoldCategory": `// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
`,

	"FoldScript": `// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
`,
}
// printCaseOrbit prints the caseOrbit table: one {from, to} pair for every
// code point whose chars entry has a non-zero caseOrbit, counting the pairs
// in foldPairCount.
//
// In test mode it prints nothing and instead checks unicode.SimpleFold
// against the computed data, reporting each mismatch on standard error.
func printCaseOrbit() {
	if !*test {
		fmt.Printf("var caseOrbit = []foldPair{\n")
		for code := range chars {
			if next := chars[code].caseOrbit; next != 0 {
				fmt.Printf("\t{0x%04X, 0x%04X},\n", code, next)
				foldPairCount++
			}
		}
		fmt.Printf("}\n\n")
		return
	}

	for code := range chars {
		info := &chars[code]
		want := info.caseOrbit
		if want == 0 {
			// No explicit orbit: expect the lower case form if it
			// differs, else the upper case form if it differs, else
			// the code point itself.
			switch {
			case info.lowerCase != code && info.lowerCase != 0:
				want = info.lowerCase
			case info.upperCase != code && info.upperCase != 0:
				want = info.upperCase
			default:
				want = code
			}
		}
		if got := unicode.SimpleFold(code); got != want {
			fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", code, got, want)
		}
	}
}
// printCatFold prints the map variable called name ("FoldCategory" or
// "FoldScript"), which maps each key of m to a generated fold<key> range
// table, and then prints those range tables with dumpRange.
//
// In test mode it prints no tables. It compares the map of the same name
// in package unicode against m and reports every difference on standard
// error: a differing number of entries (which ends the comparison), a
// missing key, a code point that should not be in a table, and a table
// with the wrong number of code points.
func printCatFold(name string, m map[string]map[int]bool) {
	if !*test {
		fmt.Print(comment[name])
		fmt.Printf("var %s = map[string]*RangeTable{\n", name)
		for key := range m {
			fmt.Printf("\t%q: fold%s,\n", key, key)
		}
		fmt.Printf("}\n\n")
		for key, set := range m {
			dumpRange(
				fmt.Sprintf("var fold%s = &RangeTable{\n", key),
				func(code int) bool { return set[code] })
		}
		return
	}

	// Any name other than "FoldCategory" is checked against FoldScript.
	pkgMap := unicode.FoldScript
	if name == "FoldCategory" {
		pkgMap = unicode.FoldCategory
	}
	if len(pkgMap) != len(m) {
		fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
		return
	}
	for key, want := range m {
		table, ok := pkgMap[key]
		if !ok {
			fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, key)
			continue
		}
		count := 0
		// visit checks one code point of the package table and counts it.
		visit := func(code int) {
			if !want[code] {
				fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, key, code)
			}
			count++
		}
		for _, r16 := range table.R16 {
			for code := int(r16.Lo); code <= int(r16.Hi); code += int(r16.Stride) {
				visit(code)
			}
		}
		for _, r32 := range table.R32 {
			for code := int(r32.Lo); code <= int(r32.Hi); code += int(r32.Stride) {
				visit(code)
			}
		}
		if count != len(want) {
			fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, key, count, len(want))
		}
	}
}
// Running totals accumulated while the tables are printed and reported
// by printSizes.
var (
	range16Count  = 0 // Number of entries in the 16-bit range tables.
	range32Count  = 0 // Number of entries in the 32-bit range tables.
	foldPairCount = 0 // Number of fold pairs in the exception tables.
)
func printSizes() {
if *test {
......@@ -963,4 +1246,6 @@ func printSizes() {
range16Bytes := range16Count * 3 * 2
range32Bytes := range32Count * 3 * 4
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
fmt.Println()
fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
}
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment