unicode: make the tables smaller.

By splitting the ranges into 16-bit values and 32-bit values, we can reduce about 3000 entries by 48 bits per entry, or about 16KB, at the cost of a little more complexity in the code.

R=iant, bradfitz, rsc, r
CC=golang-dev
https://golang.org/cl/4547066

unicode: make the tables smaller.
By splitting the ranges into 16-bit values and 32-bit values, we can reduce about 3000 entries by 48 bits per entry, or about 16KB, at the cost of a little more complexity in the code.

R=iant, bradfitz, rsc, r
CC=golang-dev
https://golang.org/cl/4547066
0de328ed · Rob Pike · 2c6a2a97 · 0de328ed · 0de328ed · 0de328ed
Commit 0de328ed authored May 31, 2011 by Rob Pike
4 changed files
--- a/src/pkg/unicode/letter.go
+++ b/src/pkg/unicode/letter.go
@@ -11,13 +11,30 @@ const (
 	ReplacementChar = 0xFFFD   // Represents invalid code points.
 )

+// RangeTable defines a set of Unicode code points by listing the ranges of
+// code points within the set. The ranges are listed in two slices
+// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
+// The two slices must be in sorted order and non-overlapping.
+type RangeTable struct {
+	R16 []Range16
+	R32 []Range32
+}

-// The representation of a range of Unicode code points.  The range runs from Lo to Hi
+// Range16 represents a range of 16-bit Unicode code points.  The range runs from Lo to Hi
 // inclusive and has the specified stride.
-type Range struct {
-	Lo     int
-	Hi     int
-	Stride int
+type Range16 struct {
+	Lo     uint16
+	Hi     uint16
+	Stride uint16
+}
+
+// Range32 represents a range of Unicode code points and is used when one or
+// more of the values will not fit in 16 bits.  The range runs from Lo to Hi
+// inclusive and has the specified stride.
+type Range32 struct {
+	Lo     uint32
+	Hi     uint32
+	Stride uint32
 }

 // CaseRange represents a range of Unicode code points for simple (one
@@ -60,22 +77,28 @@ const (
 	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
 )

-// Is tests whether rune is in the specified table of ranges.
-func Is(ranges []Range, rune int) bool {
-	// common case: rune is ASCII or Latin-1
-	if rune < 0x100 {
-		for _, r := range ranges {
-			if rune > r.Hi {
-				continue
+// is16 uses binary search to test whether rune is in the specified slice of 16-bit ranges.
+func is16(ranges []Range16, rune uint16) bool {
+	// binary search over ranges
+	lo := 0
+	hi := len(ranges)
+	for lo < hi {
+		m := lo + (hi-lo)/2
+		r := ranges[m]
+		if r.Lo <= rune && rune <= r.Hi {
+			return (rune-r.Lo)%r.Stride == 0
 		}
 		if rune < r.Lo {
-				return false
+			hi = m
+		} else {
+			lo = m + 1
 		}
-			return (rune-r.Lo)%r.Stride == 0
 	}
 	return false
-	}
+}

+// is32 uses binary search to test whether rune is in the specified slice of 32-bit ranges.
+func is32(ranges []Range32, rune uint32) bool {
 	// binary search over ranges
 	lo := 0
 	hi := len(ranges)
@@ -94,6 +117,43 @@ func Is(ranges []Range, rune int) bool {
 	return false
 }

+// Is tests whether rune is in the specified table of ranges.
+func Is(rangeTab *RangeTable, rune int) bool {
+	// common case: rune is ASCII or Latin-1.
+	if rune < 0x100 {
+		r16 := uint16(rune)
+		for _, r := range rangeTab.R16 {
+			if r16 > r.Hi {
+				continue
+			}
+			if r16 < r.Lo {
+				return false
+			}
+			return (r16-r.Lo)%r.Stride == 0
+		}
+		r32 := uint32(rune)
+		for _, r := range rangeTab.R32 {
+			if r32 > r.Hi {
+				continue
+			}
+			if r32 < r.Lo {
+				return false
+			}
+			return (r32-r.Lo)%r.Stride == 0
+		}
+		return false
+	}
+	r16 := rangeTab.R16
+	if len(r16) > 0 && rune <= int(r16[len(r16)-1].Hi) {
+		return is16(r16, uint16(rune))
+	}
+	r32 := rangeTab.R32
+	if len(r32) > 0 && rune >= int(r32[0].Lo) {
+		return is32(r32, uint32(rune))
+	}
+	return false
+}
+
 // IsUpper reports whether the rune is an upper case letter.
 func IsUpper(rune int) bool {
 	if rune < 0x80 { // quick ASCII check

--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -28,6 +28,7 @@ func main() {
 	printScriptOrProperty(false)
 	printScriptOrProperty(true)
 	printCases()
+	printSizes()
 }

 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
@@ -278,16 +279,16 @@ func loadChars() {
 		switch parseCategory(line[0 : len(line)-1]) {
 		case SNormal:
 			if first != 0 {
-				logger.Fatalf("bad state normal at U+%04X", lastChar)
+				logger.Fatalf("bad state normal at %U", lastChar)
 			}
 		case SFirst:
 			if first != 0 {
-				logger.Fatalf("bad state first at U+%04X", lastChar)
+				logger.Fatalf("bad state first at %U", lastChar)
 			}
 			first = lastChar
 		case SLast:
 			if first == 0 {
-				logger.Fatalf("bad state last at U+%04X", lastChar)
+				logger.Fatalf("bad state last at %U", lastChar)
 			}
 			for i := first + 1; i <= lastChar; i++ {
 				chars[i] = chars[first]
@@ -299,6 +300,15 @@ func loadChars() {
 	resp.Body.Close()
 }

+const progHeader = `// Generated by running
+//	maketables --tables=%s --data=%s
+// DO NOT EDIT
+
+package unicode
+
+`
+
+
 func printCategories() {
 	if *tablelist == "" {
 		return
@@ -312,20 +322,14 @@ func printCategories() {
 		fullCategoryTest(list)
 		return
 	}
-	fmt.Printf(
-		"// Generated by running\n"+
-			"//	maketables --tables=%s --data=%s\n"+
-			"// DO NOT EDIT\n\n"+
-			"package unicode\n\n",
-		*tablelist,
-		*dataURL)
+	fmt.Printf(progHeader, *tablelist, *dataURL)

 	fmt.Println("// Version is the Unicode edition from which the tables are derived.")
 	fmt.Printf("const Version = %q\n\n", version())

 	if *tablelist == "all" {
 		fmt.Println("// Categories is the set of Unicode data tables.")
-		fmt.Println("var Categories = map[string] []Range {")
+		fmt.Println("var Categories = map[string] *RangeTable {")
 		for k := range category {
 			fmt.Printf("\t%q: %s,\n", k, k)
 		}
@@ -364,12 +368,12 @@ func printCategories() {
 		ndecl++
 		if name == "letter" { // special case
 			dumpRange(
-				"var letter = []Range {\n",
+				"var letter = &RangeTable{\n",
 				letterOp)
 			continue
 		}
 		dumpRange(
-			fmt.Sprintf("var _%s = []Range {\n", name),
+			fmt.Sprintf("var _%s = &RangeTable{\n", name),
 			func(code int) bool { return chars[code].category == name })
 	}
 	decl.Sort()
@@ -382,12 +386,15 @@ func printCategories() {

 type Op func(code int) bool

-const format = "\t{0x%04x, 0x%04x, %d},\n"
+const format = "\t\t{0x%04x, 0x%04x, %d},\n"

 func dumpRange(header string, inCategory Op) {
 	fmt.Print(header)
 	next := 0
+	fmt.Print("\tR16: []Range16{\n")
 	// one Range for each iteration
+	count := &range16Count
+	size := 16
 	for {
 		// look for start of range
 		for next < len(chars) && !inCategory(next) {
@@ -427,10 +434,18 @@ func dumpRange(header string, inCategory Op) {
 				break
 			}
 		}
+		if size == 16 && (lo >= 1<<16 || hi >= 1<<16) {
+			fmt.Print("\t},\n")
+			fmt.Print("\tR32: []Range32{\n")
+			size = 32
+			count = &range32Count
+		}
 		fmt.Printf(format, lo, hi, stride)
+		*count++
 		// next range: start looking where this range ends
 		next = hi + 1
 	}
+	fmt.Print("\t},\n")
 	fmt.Print("}\n\n")
 }

@@ -454,12 +469,12 @@ func fullCategoryTest(list []string) {
 	}
 }

-func verifyRange(name string, inCategory Op, table []unicode.Range) {
+func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
 	for i := range chars {
 		web := inCategory(i)
 		pkg := unicode.Is(table, i)
 		if web != pkg {
-			fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
+			fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
 		}
 	}
 }
@@ -497,22 +512,22 @@ func parseScript(line string, scripts map[string][]Script) {
 }

 // The script tables have a lot of adjacent elements. Fold them together.
-func foldAdjacent(r []Script) []unicode.Range {
-	s := make([]unicode.Range, 0, len(r))
+func foldAdjacent(r []Script) []unicode.Range32 {
+	s := make([]unicode.Range32, 0, len(r))
 	j := 0
 	for i := 0; i < len(r); i++ {
-		if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
-			s[j-1].Hi = int(r[i].hi)
+		if j > 0 && r[i].lo == s[j-1].Hi+1 {
+			s[j-1].Hi = r[i].hi
 		} else {
 			s = s[0 : j+1]
-			s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
+			s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1}
 			j++
 		}
 	}
 	return s
 }

-func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
+func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
 	for _, name := range list {
 		if _, ok := scripts[name]; !ok {
 			logger.Fatal("unknown script", name)
@@ -524,7 +539,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts
 		for _, script := range scripts[name] {
 			for r := script.lo; r <= script.hi; r++ {
 				if !unicode.Is(installed[name], int(r)) {
-					fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
+					fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
 				}
 			}
 		}
@@ -589,10 +604,10 @@ func printScriptOrProperty(doProps bool) {
 	if flaglist == "all" {
 		if doProps {
 			fmt.Println("// Properties is the set of Unicode property tables.")
-			fmt.Println("var Properties = map[string] []Range {")
+			fmt.Println("var Properties = map[string] *RangeTable{")
 		} else {
 			fmt.Println("// Scripts is the set of Unicode script tables.")
-			fmt.Println("var Scripts = map[string] []Range {")
+			fmt.Println("var Scripts = map[string] *RangeTable{")
 		}
 		for k := range table {
 			fmt.Printf("\t%q: %s,\n", k, k)
@@ -613,11 +628,22 @@ func printScriptOrProperty(doProps bool) {
 				name, name, name, name)
 		}
 		ndecl++
-		fmt.Printf("var _%s = []Range {\n", name)
+		fmt.Printf("var _%s = &RangeTable {\n", name)
+		fmt.Print("\tR16: []Range16{\n")
 		ranges := foldAdjacent(table[name])
+		size := 16
+		count := &range16Count
 		for _, s := range ranges {
+			if size == 16 && (s.Lo >= 1<<16 || s.Hi >= 1<<16) {
+				fmt.Print("\t},\n")
+				fmt.Print("\tR32: []Range32{\n")
+				size = 32
+				count = &range32Count
+			}
+			*count++
 			fmt.Printf(format, s.Lo, s.Hi, s.Stride)
 		}
+		fmt.Print("\t},\n")
 		fmt.Print("}\n\n")
 	}
 	decl.Sort()
@@ -808,7 +834,7 @@ func printCaseRange(lo, hi *caseState) {
 		fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
 			lo.point, hi.point)
 	case hi.point > lo.point && lo.isLowerUpper():
-		logger.Fatalf("LowerUpper sequence: should not happen: U+%04X.  If it's real, need to fix To()", lo.point)
+		logger.Fatalf("LowerUpper sequence: should not happen: %U.  If it's real, need to fix To()", lo.point)
 		fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
 			lo.point, hi.point)
 	default:
@@ -831,17 +857,28 @@ func fullCaseTest() {
 		lower := unicode.ToLower(i)
 		want := caseIt(i, c.lowerCase)
 		if lower != want {
-			fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
+			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
 		}
 		upper := unicode.ToUpper(i)
 		want = caseIt(i, c.upperCase)
 		if upper != want {
-			fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
+			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
 		}
 		title := unicode.ToTitle(i)
 		want = caseIt(i, c.titleCase)
 		if title != want {
-			fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
+			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
 		}
 	}
 }
+
+var range16Count = 0 // Number of entries in the 16-bit range tables.
+var range32Count = 0 // Number of entries in the 32-bit range tables.
+
+func printSizes() {
+	fmt.Println()
+	fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
+	range16Bytes := range16Count * 3 * 2
+	range32Bytes := range32Count * 3 * 4
+	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+}
--- a/src/pkg/unicode/tables.go
+++ b/src/pkg/unicode/tables.go
--- a/src/pkg/xml/xml.go
+++ b/src/pkg/xml/xml.go
@@ -1028,7 +1028,8 @@ func isNameByte(c byte) bool {
 // and then reformatting.  First corresponds to (Letter | '_' | ':')
 // and second corresponds to NameChar.

-var first = []unicode.Range{
+var first = &unicode.RangeTable{
+	R16: []unicode.Range16{
 		{0x003A, 0x003A, 1},
 		{0x0041, 0x005A, 1},
 		{0x005F, 0x005F, 1},
@@ -1219,9 +1220,11 @@ var first = []unicode.Range{
 		{0x3105, 0x312C, 1},
 		{0x4E00, 0x9FA5, 1},
 		{0xAC00, 0xD7A3, 1},
+	},
 }

-var second = []unicode.Range{
+var second = &unicode.RangeTable{
+	R16: []unicode.Range16{
 		{0x002D, 0x002E, 1},
 		{0x0030, 0x0039, 1},
 		{0x00B7, 0x00B7, 1},
@@ -1334,6 +1337,7 @@ var second = []unicode.Range{
 		{0x3099, 0x309A, 1},
 		{0x309D, 0x309E, 1},
 		{0x30FC, 0x30FE, 1},
+	},
 }

 // HTMLEntity is an entity map containing translations for the