first cut at case mapping tables and library.

next cut will do the optimization for alternating sequences. R=rsc DELTA=1658 (1620 added, 9 deleted, 29 changed) OCL=34072 CL=34075

first cut at case mapping tables and library.
next cut will do the optimization for alternating sequences. R=rsc DELTA=1658 (1620 added, 9 deleted, 29 changed) OCL=34072 CL=34075
22c2b476 · Rob Pike · 30dcb134 · 22c2b476 · 22c2b476 · 22c2b476
Commit 22c2b476 authored Aug 28, 2009 by Rob Pike
5 changed files
--- a/src/pkg/unicode/digit_test.go
+++ b/src/pkg/unicode/digit_test.go
@@ -103,12 +103,12 @@ var testLetter = []int {
 func TestDigit(t *testing.T) {
 	for i, r := range testDigit {
 		if !IsDigit(r) {
-			t.Errorf("IsDigit(%#x) = false, want true\n", r);
+			t.Errorf("IsDigit(U+%04X) = false, want true\n", r);
 		}
 	}
 	for i, r := range testLetter {
 		if IsDigit(r) {
-			t.Errorf("IsDigit(%#x) = true, want false\n", r);
+			t.Errorf("IsDigit(U+%04X) = true, want false\n", r);
 		}
 	}
 }
--- a/src/pkg/unicode/letter.go
+++ b/src/pkg/unicode/letter.go
@@ -9,11 +9,39 @@ package unicode
 // The representation of a range of Unicode code points.  The range runs from Lo to Hi
 // inclusive and has the specified stride.
 type Range struct {
-	Lo int;
-	Hi int;
-	Stride int;
+	Lo	int;
+	Hi	int;
+	Stride	int;
 }

+// The representation of a range of Unicode code points for case conversion.
+// The range runs from Lo to Hi inclusive, with a fixed stride of 1.  Each
+// entry of Delta, indexed by UpperCase, LowerCase, or TitleCase, is the number
+// to add to a code point to reach its form in that case.  Deltas may be
+// negative; a zero delta means the character is already in that case.
+type CaseRange struct {
+	Lo	int;
+	Hi	int;
+	Delta	d;
+}
+
+// Indices into the Delta arrays inside CaseRanges for case mapping.
+const (
+	UpperCase = iota;
+	LowerCase;
+	TitleCase;
+	MaxCase;
+)
+type d [MaxCase]int32	// to make the CaseRanges text shorter
+
+// If the Delta field of a CaseRange is UpperLower or LowerUpper, it means
+// this CaseRange represents a sequence of the form (say) Upper Lower Upper
+// Lower.  To does not interpret these values yet; a later change will.
+const (
+	UpperLower	= 1;
+	LowerUpper	= -1;
+)
+
 // Is tests whether rune is in the specified table of ranges.
 func Is(ranges []Range, rune int) bool {
 	// common case: rune is ASCII or Latin-1
@@ -80,3 +108,59 @@ func IsLetter(rune int) bool {
 	}
 	return Is(Letter, rune);
 }
+
+// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
+func To(_case int, rune int) int {
+	if _case < 0 || MaxCase <= _case {
+		return 0xFFFD	// as reasonable an error as any
+	}
+	// binary search over ranges
+	lo := 0;
+	hi := len(CaseRanges);
+	for lo < hi {
+		m := lo + (hi - lo)/2;
+		r := CaseRanges[m];
+		if r.Lo <= rune && rune <= r.Hi {
+			return rune + int(r.Delta[_case]);
+		}
+		if rune < r.Lo {
+			hi = m;
+		} else {
+			lo = m+1;
+		}
+	}
+	return rune;
+}
+
+// ToUpper maps the rune to upper case.
+func ToUpper(rune int) int {
+	if rune < 0x80 {	// quick ASCII check
+		if 'a' <= rune && rune <= 'z' {
+			rune &^= ' '
+		}
+		return rune
+	}
+	return To(UpperCase, rune);
+}
+
+// ToLower maps the rune to lower case.
+func ToLower(rune int) int {
+	if rune < 0x80 {	// quick ASCII check
+		if 'A' <= rune && rune <= 'Z' {
+			rune |= ' '
+		}
+		return rune
+	}
+	return To(LowerCase, rune);
+}
+
+// ToTitle maps the rune to title case.
+func ToTitle(rune int) int {
+	if rune < 0x80 {	// quick ASCII check
+		if 'a' <= rune && rune <= 'z' {	// title case is upper case for ASCII
+			rune &^= ' '
+		}
+		return rune
+	}
+	return To(TitleCase, rune);
+}
--- a/src/pkg/unicode/letter_test.go
+++ b/src/pkg/unicode/letter_test.go
@@ -89,20 +89,127 @@ var notletterTest = []int{
 	0x10ffff,
 }

+type caseT struct {
+	cas, in, out int
+}
+
+var caseTest = []caseT {
+	// errors
+	caseT{-1,	'\n',	0xFFFD},
+	caseT{UpperCase,	-1,	-1},
+	caseT{UpperCase,	1<<30,	1<<30},
+
+	// ASCII (special-cased so test carefully)
+	caseT{UpperCase,	'\n',	'\n'},
+	caseT{UpperCase,	'a',	'A'},
+	caseT{UpperCase,	'A',	'A'},
+	caseT{UpperCase,	'7',	'7'},
+	caseT{LowerCase,	'\n',	'\n'},
+	caseT{LowerCase,	'a',	'a'},
+	caseT{LowerCase,	'A',	'a'},
+	caseT{LowerCase,	'7',	'7'},
+	caseT{TitleCase,	'\n',	'\n'},
+	caseT{TitleCase,	'a',	'A'},
+	caseT{TitleCase,	'A',	'A'},
+	caseT{TitleCase,	'7',	'7'},
+
+	// Latin-1: easy to read the tests!
+	caseT{UpperCase,	0x80,	0x80},
+	caseT{UpperCase,	'Å',	'Å'},
+	caseT{UpperCase,	'å',	'Å'},
+	caseT{LowerCase,	0x80,	0x80},
+	caseT{LowerCase,	'Å',	'å'},
+	caseT{LowerCase,	'å',	'å'},
+	caseT{TitleCase,	0x80,	0x80},
+	caseT{TitleCase,	'Å',	'Å'},
+	caseT{TitleCase,	'å',	'Å'},
+
+	// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
+	caseT{UpperCase,	0x0131,	'I'},
+	caseT{LowerCase,	0x0131,	0x0131},
+	caseT{TitleCase,	0x0131,	'I'},
+
+	// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
+	caseT{UpperCase,	0x0133,	0x0132},
+	caseT{LowerCase,	0x0133,	0x0133},
+	caseT{TitleCase,	0x0133,	0x0132},
+
+	// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
+	caseT{UpperCase,	0x212A,	0x212A},
+	caseT{LowerCase,	0x212A,	'k'},
+	caseT{TitleCase,	0x212A,	0x212A},
+
+	// From an UpperLower sequence
+	// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
+	caseT{UpperCase,	0xA640,	0xA640},
+	caseT{LowerCase,	0xA640,	0xA641},
+	caseT{TitleCase,	0xA640,	0xA640},
+	// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
+	caseT{UpperCase,	0xA641,	0xA640},
+	caseT{LowerCase,	0xA641,	0xA641},
+	caseT{TitleCase,	0xA641,	0xA640},
+	// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
+	caseT{UpperCase,	0xA64E,	0xA64E},
+	caseT{LowerCase,	0xA64E,	0xA64F},
+	caseT{TitleCase,	0xA64E,	0xA64E},
+	// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
+	caseT{UpperCase,	0xA65F,	0xA65E},
+	caseT{LowerCase,	0xA65F,	0xA65F},
+	caseT{TitleCase,	0xA65F,	0xA65E},
+
+	// From a LowerUpper sequence
+	// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
+	caseT{UpperCase,	0x0139,	0x0139},
+	caseT{LowerCase,	0x0139,	0x013A},
+	caseT{TitleCase,	0x0139,	0x0139},
+	// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
+	caseT{UpperCase,	0x013f,	0x013f},
+	caseT{LowerCase,	0x013f,	0x0140},
+	caseT{TitleCase,	0x013f,	0x013f},
+	// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
+	caseT{UpperCase,	0x0148,	0x0147},
+	caseT{LowerCase,	0x0148,	0x0148},
+	caseT{TitleCase,	0x0148,	0x0147},
+
+	// Last block in the 5.1.0 table
+	// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
+	caseT{UpperCase,	0x10400,	0x10400},
+	caseT{LowerCase,	0x10400,	0x10428},
+	caseT{TitleCase,	0x10400,	0x10400},
+	// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
+	caseT{UpperCase,	0x10427,	0x10427},
+	caseT{LowerCase,	0x10427,	0x1044F},
+	caseT{TitleCase,	0x10427,	0x10427},
+	// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
+	caseT{UpperCase,	0x10428,	0x10400},
+	caseT{LowerCase,	0x10428,	0x10428},
+	caseT{TitleCase,	0x10428,	0x10400},
+	// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
+	caseT{UpperCase,	0x1044F,	0x10427},
+	caseT{LowerCase,	0x1044F,	0x1044F},
+	caseT{TitleCase,	0x1044F,	0x10427},
+
+	// First one not in the 5.1.0 table
+	// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
+	caseT{UpperCase,	0x10450,	0x10450},
+	caseT{LowerCase,	0x10450,	0x10450},
+	caseT{TitleCase,	0x10450,	0x10450},
+}
+
 func TestIsLetter(t *testing.T) {
 	for i, r := range upperTest {
 		if !IsLetter(r) {
-			t.Errorf("IsLetter(%#x) = false, want true\n", r);
+			t.Errorf("IsLetter(U+%04X) = false, want true\n", r);
 		}
 	}
 	for i, r := range letterTest {
 		if !IsLetter(r) {
-			t.Errorf("IsLetter(%#x) = false, want true\n", r);
+			t.Errorf("IsLetter(U+%04X) = false, want true\n", r);
 		}
 	}
 	for i, r := range notletterTest {
 		if IsLetter(r) {
-			t.Errorf("IsLetter(%#x) = true, want false\n", r);
+			t.Errorf("IsLetter(U+%04X) = true, want false\n", r);
 		}
 	}
 }
@@ -110,17 +217,74 @@ func TestIsLetter(t *testing.T) {
 func TestIsUpper(t *testing.T) {
 	for i, r := range upperTest {
 		if !IsUpper(r) {
-			t.Errorf("IsUpper(%#x) = false, want true\n", r);
+			t.Errorf("IsUpper(U+%04X) = false, want true\n", r);
 		}
 	}
 	for i, r := range notupperTest {
 		if IsUpper(r) {
-			t.Errorf("IsUpper(%#x) = true, want false\n", r);
+			t.Errorf("IsUpper(U+%04X) = true, want false\n", r);
 		}
 	}
 	for i, r := range notletterTest {
 		if IsUpper(r) {
-			t.Errorf("IsUpper(%#x) = true, want false\n", r);
+			t.Errorf("IsUpper(U+%04X) = true, want false\n", r);
+		}
+	}
+}
+
+func caseString(c int) string {
+	switch c {
+	case UpperCase:
+		return "UpperCase"
+	case LowerCase:
+		return "LowerCase"
+	case TitleCase:
+		return "TitleCase"
+	}
+	return "ErrorCase"
+}
+
+func TestTo(t *testing.T) {
+	for i, c := range caseTest {
+		r := To(c.cas, c.in);
+		if c.out != r {
+			t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X\n", c.in, caseString(c.cas), r, c.out);
+		}
+	}
+}
+
+func TestToUpperCase(t *testing.T) {
+	for i, c := range caseTest {
+		if c.cas != UpperCase {
+			continue
+		}
+		r := ToUpper(c.in);
+		if c.out != r {
+			t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
+		}
+	}
+}
+
+func TestToLowerCase(t *testing.T) {
+	for i, c := range caseTest {
+		if c.cas != LowerCase {
+			continue
+		}
+		r := ToLower(c.in);
+		if c.out != r {
+			t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
+		}
+	}
+}
+
+func TestToTitleCase(t *testing.T) {
+	for i, c := range caseTest {
+		if c.cas != TitleCase {
+			continue
+		}
+		r := ToTitle(c.in);
+		if c.out != r {
+			t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X\n", c.in, r, c.out);
 		}
 	}
 }
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -21,6 +21,14 @@ import (
 	"unicode";
 )

+func main() {
+	flag.Parse();
+	loadChars();	// always needed
+	printCategories();
+	printScripts();
+	printCases();
+}
+
 var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
 var url = flag.String("url",
 	"http://www.unicode.org/Public/5.1.0/ucd/",
@@ -31,6 +39,9 @@ var tablelist = flag.String("tables",
 var scriptlist = flag.String("scripts",
 	"all",
 	"comma-separated list of which script tables to generate");
+var cases = flag.Bool("cases",
+	true,
+	"generate case tables");
 var test = flag.Bool("test",
 	false,
 	"test existing tables; can be used to compare web data with package data");
@@ -44,7 +55,7 @@ var category = map[string] bool{ "letter":true }	// Nd Lu etc. letter is a speci
 //	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
 //	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
-// The fields
+// The fields:
 const (
 	FCodePoint = iota;
 	FName;
@@ -87,11 +98,11 @@ var fieldName = []string{
 // This contains only the properties we're interested in.
 type Char struct {
 	field	[]string; 	// debugging only; could be deleted if we take out char.dump()
-	codePoint	uint32;	// redundant (it's the index in the chars table) but useful
+	codePoint	uint32;	// if zero, this index is not a valid code point.
 	category	string;
-	upperCase	uint32;
-	lowerCase	uint32;
-	titleCase	uint32;
+	upperCase	int;
+	lowerCase	int;
+	titleCase	int;
 }

 // Scripts.txt has form:
@@ -104,26 +115,21 @@ type Script struct {
 	script	string;
 }

-func main() {
-	flag.Parse();
-	printCategories();
-	printScripts();
-}
-
-var chars = make([]Char, MaxChar)
+var chars = make([]Char, MaxChar+1)
 var scripts = make(map[string] []Script)

 var lastChar uint32 = 0;

 // In UnicodeData.txt, some ranges are marked like this:
-// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 // parseCategory returns a state variable indicating the weirdness.
 type State int
 const (
 	SNormal State = iota;	// known to be zero for the type
 	SFirst;
 	SLast;
+	SMissing;
 )

 func parseCategory(line string) (state State) {
@@ -139,7 +145,7 @@ func parseCategory(line string) (state State) {
 	if point == 0 {
 		return	// not interesting and we use 0 as unset
 	}
-	if point >= MaxChar {
+	if point > MaxChar {
 		return;
 	}
 	char := &chars[point];
@@ -189,7 +195,7 @@ func (char *Char) letter(u, l, t string) {
 	char.titleCase = char.letterValue(t, "T");
 }

-func (char *Char) letterValue(s string, cas string) uint32 {
+func (char *Char) letterValue(s string, cas string) int {
 	if s == "" {
 		return 0
 	}
@@ -198,7 +204,7 @@ func (char *Char) letterValue(s string, cas string) uint32 {
 		char.dump(cas);
 		die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
 	}
-	return uint32(v)
+	return int(v)
 }

 func allCategories() []string {
@@ -242,10 +248,7 @@ func letterOp(code int) bool {
 	return false
 }

-func printCategories() {
-	if *tablelist == "" {
-		return
-	}
+func loadChars() {
 	if *dataUrl == "" {
 		flag.Set("data", *url + "UnicodeData.txt");
 	}
@@ -288,6 +291,12 @@ func printCategories() {
 		}
 	}
 	resp.Body.Close();
+}
+
+func printCategories() {
+	if *tablelist == "" {
+		return
+	}
 	// Find out which categories to dump
 	list := strings.Split(*tablelist, ",", 0);
 	if *tablelist == "all" {
@@ -299,11 +308,11 @@ func printCategories() {
 	}
 	fmt.Printf(
 		"// Generated by running\n"
-		"//	maketables --tables=%s --url=%s\n"
+		"//	maketables --tables=%s --data=%s\n"
 		"// DO NOT EDIT\n\n"
 		"package unicode\n\n",
 		*tablelist,
-		*url
+		*dataUrl
 	);

 	fmt.Println("// Version is the Unicode edition from which the tables are derived.");
@@ -496,6 +505,9 @@ func parseScript(line string) {
 }

 func printScripts() {
+	if *scriptlist == "" {
+		return
+	}
 	var err os.Error;
 	scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
 	if err != nil {
@@ -604,3 +616,148 @@ func fullScriptTest(list []string) {
 		}
 	}
 }
+
+const (
+	CaseUpper = 1 << iota;
+	CaseLower;
+	CaseTitle;
+	CaseNone = 0;	// must be zero
+	CaseMissing = -1;	// character not present; not a valid case state
+)
+
+type caseState struct {
+	point	int;
+	_case	int;
+	deltaToUpper	int;
+	deltaToLower	int;
+	deltaToTitle	int;
+}
+
+// Is d a continuation of the state of c?
+func (c *caseState) adjacent(d *caseState) bool {
+	if d.point < c.point {
+		return d.adjacent(c)
+	}
+	switch {
+	case d.point != c.point+1:
+		return false
+	case d._case != c._case:
+		return false
+	case c._case == CaseNone:
+		return false
+	case c._case == CaseMissing:
+		return false
+	case d.deltaToUpper != c.deltaToUpper:
+		return false
+	case d.deltaToLower != c.deltaToLower:
+		return false
+	case d.deltaToTitle != c.deltaToTitle:
+		return false
+	}
+	return true; 
+}
+
+func getCaseState(i int) (c *caseState) {
+	c = &caseState{ point: i, _case: CaseNone };
+	ch := &chars[i];
+	switch int(ch.codePoint) {
+	case 0:
+		c._case = CaseMissing;	// Will get NUL wrong but that doesn't matter
+		return;
+	case ch.upperCase:
+		c._case = CaseUpper;
+	case ch.lowerCase:
+		c._case = CaseLower;
+	case ch.titleCase:
+		c._case = CaseTitle;
+	}
+	if ch.upperCase != 0 {
+		c.deltaToUpper = ch.upperCase - i
+	}
+	if ch.lowerCase != 0 {
+		c.deltaToLower = ch.lowerCase - i
+	}
+	if ch.titleCase != 0 {
+		c.deltaToTitle = ch.titleCase - i
+	}
+	return;
+}
+
+func printCases() {
+	if !*cases {
+		return
+	}
+	if *test {
+		fullCaseTest();
+		return
+	}
+	fmt.Printf(
+		"// Generated by running\n"
+		"//	maketables --data=%s\n"
+		"// DO NOT EDIT\n\n"
+		"// CaseRanges is the table describing case mappings for all letters with\n"
+		"// non-self mappings.\n"
+		"var CaseRanges = _CaseRanges\n"
+		"var _CaseRanges = []CaseRange {\n",
+		*dataUrl
+	);
+
+	var startState *caseState;	// the start of a run; nil for not active
+	var prevState = &caseState{};	// the state of the previous character
+	for i, c := range chars {
+		state := getCaseState(i);
+		if state.adjacent(prevState) {
+			prevState = state;
+			continue;
+		}
+		// end of run (possibly)
+		printCaseRange(startState, prevState);
+		startState = nil;
+		if state._case != CaseMissing && state._case != CaseNone {
+			startState = state;
+		}
+		prevState = state;
+	}
+	fmt.Printf("}\n");
+}
+
+func printCaseRange(lo, hi *caseState) {
+	if lo == nil {
+		return
+	}
+	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
+		// character represents itself in all cases - no need to mention it
+		return
+	}
+	fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
+		lo.point, hi.point,
+		lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
+}
+
+// If the cased value in the Char is 0, it means use the rune itself.
+func caseIt(rune, cased int) int {
+	if cased == 0 {
+		return rune
+	}
+	return cased
+}
+
+func fullCaseTest() {
+	for i, c := range chars {
+		lower := unicode.ToLower(i);
+		want := caseIt(i, c.lowerCase);
+		if lower != want {
+			fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower);
+		}
+		upper := unicode.ToUpper(i);
+		want = caseIt(i, c.upperCase);
+		if upper != want {
+			fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper);
+		}
+		title := unicode.ToTitle(i);
+		want = caseIt(i, c.titleCase);
+		if title != want {
+			fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title);
+		}
+	}
+}
--- a/src/pkg/unicode/tables.go
+++ b/src/pkg/unicode/tables.go