Commit 0de328ed authored by Rob Pike

unicode: make the tables smaller.

By splitting the ranges into 16-bit values and 32-bit values,
we can reduce about 3000 entries by 48 bits per entry, or about
16KB, at the cost of a little more complexity in the code.

R=iant, bradfitz, rsc, r
CC=golang-dev
https://golang.org/cl/4547066
parent 2c6a2a97
......@@ -11,13 +11,30 @@ const (
ReplacementChar = 0xFFFD // Represents invalid code points.
)
// RangeTable defines a set of Unicode code points by listing the ranges of
// code points within the set. The ranges are listed in two slices
// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
// The two slices must be in sorted order and non-overlapping.
type RangeTable struct {
R16 []Range16 // ranges whose Lo, Hi and Stride are held as 16-bit values
R32 []Range32 // ranges whose Lo, Hi and Stride are held as 32-bit values
}
// The representation of a range of Unicode code points. The range runs from Lo to Hi
// Range16 represents a range of 16-bit Unicode code points. The range runs from Lo to Hi
// inclusive and has the specified stride.
type Range struct {
Lo int
Hi int
Stride int
type Range16 struct {
Lo uint16
Hi uint16
Stride uint16
}
// Range32 represents a range of Unicode code points and is used when one or
// more of the values will not fit in 16 bits. The range runs from Lo to Hi
// inclusive and has the specified stride.
type Range32 struct {
Lo uint32
Hi uint32
Stride uint32
}
// CaseRange represents a range of Unicode code points for simple (one
......@@ -60,22 +77,28 @@ const (
UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
)
// Is tests whether rune is in the specified table of ranges.
func Is(ranges []Range, rune int) bool {
// common case: rune is ASCII or Latin-1
if rune < 0x100 {
for _, r := range ranges {
if rune > r.Hi {
continue
// is16 uses binary search to test whether rune is in the specified slice of 16-bit ranges.
func is16(ranges []Range16, rune uint16) bool {
// binary search over ranges
lo := 0
hi := len(ranges)
for lo < hi {
m := lo + (hi-lo)/2
r := ranges[m]
if r.Lo <= rune && rune <= r.Hi {
return (rune-r.Lo)%r.Stride == 0
}
if rune < r.Lo {
return false
hi = m
} else {
lo = m + 1
}
return (rune-r.Lo)%r.Stride == 0
}
return false
}
}
// is32 uses binary search to test whether rune is in the specified slice of 32-bit ranges.
func is32(ranges []Range32, rune uint32) bool {
// binary search over ranges
lo := 0
hi := len(ranges)
......@@ -94,6 +117,43 @@ func Is(ranges []Range, rune int) bool {
return false
}
// Is tests whether rune is in the specified table of ranges.
//
// Values that cannot appear in the table (negative values, or values above
// the last 32-bit range) are reported as absent. They are never truncated to
// 16 or 32 bits, where they could alias an unrelated table entry.
func Is(rangeTab *RangeTable, rune int) bool {
	// A negative value would wrap around in the uint16/uint32 conversions
	// below and could then match a real code point.
	if rune < 0 {
		return false
	}
	// common case: rune is ASCII or Latin-1; scan the ranges linearly.
	if rune < 0x100 {
		r16 := uint16(rune)
		for _, r := range rangeTab.R16 {
			if r16 > r.Hi {
				continue
			}
			if r16 < r.Lo {
				return false
			}
			return (r16-r.Lo)%r.Stride == 0
		}
		r32 := uint32(rune)
		for _, r := range rangeTab.R32 {
			if r32 > r.Hi {
				continue
			}
			if r32 < r.Lo {
				return false
			}
			return (r32-r.Lo)%r.Stride == 0
		}
		return false
	}
	// The tables are sorted, so the last 16-bit range bounds everything
	// that can be found in R16; only then is the uint16 conversion lossless.
	r16 := rangeTab.R16
	if len(r16) > 0 && rune <= int(r16[len(r16)-1].Hi) {
		return is16(r16, uint16(rune))
	}
	// Likewise bound the value by the first and last 32-bit ranges so that a
	// value wider than 32 bits is rejected rather than truncated.
	r32 := rangeTab.R32
	if len(r32) > 0 && rune >= int(r32[0].Lo) && rune <= int(r32[len(r32)-1].Hi) {
		return is32(r32, uint32(rune))
	}
	return false
}
// IsUpper reports whether the rune is an upper case letter.
func IsUpper(rune int) bool {
if rune < 0x80 { // quick ASCII check
......
......@@ -28,6 +28,7 @@ func main() {
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
printSizes()
}
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
......@@ -278,16 +279,16 @@ func loadChars() {
switch parseCategory(line[0 : len(line)-1]) {
case SNormal:
if first != 0 {
logger.Fatalf("bad state normal at U+%04X", lastChar)
logger.Fatalf("bad state normal at %U", lastChar)
}
case SFirst:
if first != 0 {
logger.Fatalf("bad state first at U+%04X", lastChar)
logger.Fatalf("bad state first at %U", lastChar)
}
first = lastChar
case SLast:
if first == 0 {
logger.Fatalf("bad state last at U+%04X", lastChar)
logger.Fatalf("bad state last at %U", lastChar)
}
for i := first + 1; i <= lastChar; i++ {
chars[i] = chars[first]
......@@ -299,6 +300,15 @@ func loadChars() {
resp.Body.Close()
}
// progHeader is the format string for the comment header and package clause
// printed at the top of the generated output. Its two %s verbs are filled
// with the --tables and --data flag values, in that order.
const progHeader = `// Generated by running
// maketables --tables=%s --data=%s
// DO NOT EDIT
package unicode
`
func printCategories() {
if *tablelist == "" {
return
......@@ -312,20 +322,14 @@ func printCategories() {
fullCategoryTest(list)
return
}
fmt.Printf(
"// Generated by running\n"+
"// maketables --tables=%s --data=%s\n"+
"// DO NOT EDIT\n\n"+
"package unicode\n\n",
*tablelist,
*dataURL)
fmt.Printf(progHeader, *tablelist, *dataURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
if *tablelist == "all" {
fmt.Println("// Categories is the set of Unicode data tables.")
fmt.Println("var Categories = map[string] []Range {")
fmt.Println("var Categories = map[string] *RangeTable {")
for k := range category {
fmt.Printf("\t%q: %s,\n", k, k)
}
......@@ -364,12 +368,12 @@ func printCategories() {
ndecl++
if name == "letter" { // special case
dumpRange(
"var letter = []Range {\n",
"var letter = &RangeTable{\n",
letterOp)
continue
}
dumpRange(
fmt.Sprintf("var _%s = []Range {\n", name),
fmt.Sprintf("var _%s = &RangeTable{\n", name),
func(code int) bool { return chars[code].category == name })
}
decl.Sort()
......@@ -382,12 +386,15 @@ func printCategories() {
type Op func(code int) bool
const format = "\t{0x%04x, 0x%04x, %d},\n"
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
func dumpRange(header string, inCategory Op) {
fmt.Print(header)
next := 0
fmt.Print("\tR16: []Range16{\n")
// one Range for each iteration
count := &range16Count
size := 16
for {
// look for start of range
for next < len(chars) && !inCategory(next) {
......@@ -427,10 +434,18 @@ func dumpRange(header string, inCategory Op) {
break
}
}
if size == 16 && (lo >= 1<<16 || hi >= 1<<16) {
fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n")
size = 32
count = &range32Count
}
fmt.Printf(format, lo, hi, stride)
*count++
// next range: start looking where this range ends
next = hi + 1
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
......@@ -454,12 +469,12 @@ func fullCategoryTest(list []string) {
}
}
func verifyRange(name string, inCategory Op, table []unicode.Range) {
func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
for i := range chars {
web := inCategory(i)
pkg := unicode.Is(table, i)
if web != pkg {
fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
}
}
}
......@@ -497,22 +512,22 @@ func parseScript(line string, scripts map[string][]Script) {
}
// The script tables have a lot of adjacent elements. Fold them together.
func foldAdjacent(r []Script) []unicode.Range {
s := make([]unicode.Range, 0, len(r))
func foldAdjacent(r []Script) []unicode.Range32 {
s := make([]unicode.Range32, 0, len(r))
j := 0
for i := 0; i < len(r); i++ {
if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
s[j-1].Hi = int(r[i].hi)
if j > 0 && r[i].lo == s[j-1].Hi+1 {
s[j-1].Hi = r[i].hi
} else {
s = s[0 : j+1]
s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1}
j++
}
}
return s
}
func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
for _, name := range list {
if _, ok := scripts[name]; !ok {
logger.Fatal("unknown script", name)
......@@ -524,7 +539,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts
for _, script := range scripts[name] {
for r := script.lo; r <= script.hi; r++ {
if !unicode.Is(installed[name], int(r)) {
fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
}
}
}
......@@ -589,10 +604,10 @@ func printScriptOrProperty(doProps bool) {
if flaglist == "all" {
if doProps {
fmt.Println("// Properties is the set of Unicode property tables.")
fmt.Println("var Properties = map[string] []Range {")
fmt.Println("var Properties = map[string] *RangeTable{")
} else {
fmt.Println("// Scripts is the set of Unicode script tables.")
fmt.Println("var Scripts = map[string] []Range {")
fmt.Println("var Scripts = map[string] *RangeTable{")
}
for k := range table {
fmt.Printf("\t%q: %s,\n", k, k)
......@@ -613,11 +628,22 @@ func printScriptOrProperty(doProps bool) {
name, name, name, name)
}
ndecl++
fmt.Printf("var _%s = []Range {\n", name)
fmt.Printf("var _%s = &RangeTable {\n", name)
fmt.Print("\tR16: []Range16{\n")
ranges := foldAdjacent(table[name])
size := 16
count := &range16Count
for _, s := range ranges {
if size == 16 && (s.Lo >= 1<<16 || s.Hi >= 1<<16) {
fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n")
size = 32
count = &range32Count
}
*count++
fmt.Printf(format, s.Lo, s.Hi, s.Stride)
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
decl.Sort()
......@@ -808,7 +834,7 @@ func printCaseRange(lo, hi *caseState) {
fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
lo.point, hi.point)
case hi.point > lo.point && lo.isLowerUpper():
logger.Fatalf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point)
logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
lo.point, hi.point)
default:
......@@ -831,17 +857,28 @@ func fullCaseTest() {
lower := unicode.ToLower(i)
want := caseIt(i, c.lowerCase)
if lower != want {
fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
}
upper := unicode.ToUpper(i)
want = caseIt(i, c.upperCase)
if upper != want {
fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
}
title := unicode.ToTitle(i)
want = caseIt(i, c.titleCase)
if title != want {
fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
}
}
}
// Counters incremented as range entries are printed, split by entry width.
var (
	range16Count = 0 // Number of entries in the 16-bit range tables.
	range32Count = 0 // Number of entries in the 32-bit range tables.
)

// printSizes prints a blank line followed by two comment lines that report
// how many range entries were counted and how many bytes they occupy,
// broken down into 16-bit entries, 32-bit entries and the total.
func printSizes() {
	const (
		fieldsPerRange = 3                  // Lo, Hi and Stride
		entry16Bytes   = fieldsPerRange * 2 // three 16-bit fields
		entry32Bytes   = fieldsPerRange * 4 // three 32-bit fields
	)
	totalCount := range16Count + range32Count
	bytes16 := range16Count * entry16Bytes
	bytes32 := range32Count * entry32Bytes

	fmt.Print("\n")
	fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, totalCount)
	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", bytes16, bytes32, bytes16+bytes32)
}
This diff is collapsed.
......@@ -1028,7 +1028,8 @@ func isNameByte(c byte) bool {
// and then reformatting. First corresponds to (Letter | '_' | ':')
// and second corresponds to NameChar.
var first = []unicode.Range{
var first = &unicode.RangeTable{
R16: []unicode.Range16{
{0x003A, 0x003A, 1},
{0x0041, 0x005A, 1},
{0x005F, 0x005F, 1},
......@@ -1219,9 +1220,11 @@ var first = []unicode.Range{
{0x3105, 0x312C, 1},
{0x4E00, 0x9FA5, 1},
{0xAC00, 0xD7A3, 1},
},
}
var second = []unicode.Range{
var second = &unicode.RangeTable{
R16: []unicode.Range16{
{0x002D, 0x002E, 1},
{0x0030, 0x0039, 1},
{0x00B7, 0x00B7, 1},
......@@ -1334,6 +1337,7 @@ var second = []unicode.Range{
{0x3099, 0x309A, 1},
{0x309D, 0x309E, 1},
{0x30FC, 0x30FE, 1},
},
}
// HTMLEntity is an entity map containing translations for the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment