Commit 396b47bb authored by Rob Pike's avatar Rob Pike

generate the unicode tables directly from web database

after this CL, two more to come:
	1) add an exhaustive test, probably as a variant of maketables
	2) add ToUpper, ToLower, ToTitle and associated tests

R=rsc
DELTA=1578  (1007 added, 559 deleted, 12 changed)
OCL=33902
CL=33907
parent 2aea4a06
...@@ -68,7 +68,7 @@ func (c *common) setNext(i instr) { c._next = i } ...@@ -68,7 +68,7 @@ func (c *common) setNext(i instr) { c._next = i }
func (c *common) index() int { return c._index } func (c *common) index() int { return c._index }
func (c *common) setIndex(i int) { c._index = i } func (c *common) setIndex(i int) { c._index = i }
// The representation of a compiled regular expression. // Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods. // The public interface is entirely through methods.
type Regexp struct { type Regexp struct {
expr string; // the original expression expr string; // the original expression
......
...@@ -7,6 +7,15 @@ include $(GOROOT)/src/Make.$(GOARCH) ...@@ -7,6 +7,15 @@ include $(GOROOT)/src/Make.$(GOARCH)
TARG=unicode TARG=unicode
GOFILES=\ GOFILES=\
decimaldigit.go\ decimaldigit.go\
digittables.go\
letter.go\ letter.go\
lettertables.go\
include $(GOROOT)/src/Make.pkg include $(GOROOT)/src/Make.pkg
tables:
$(GC) maketables.go
$(LD) -o maketables maketables.$O
maketables --digits > digittables.go
maketables > lettertables.go
rm -f maketables
...@@ -4,46 +4,6 @@ ...@@ -4,46 +4,6 @@
package unicode package unicode
// TODO: Generated by hand starting with
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// These ranges are the characters with the third field "Nd".
// Should generate automatically etc.
// Decimal digit is the set of Unicode characters with the "decimal digit" property.
var DecimalDigit = []Range{
Range{0x0030, 0x0039, 1},
Range{0x0660, 0x0669, 1},
Range{0x06F0, 0x06F9, 1},
Range{0x07C0, 0x07C9, 1},
Range{0x0966, 0x096F, 1},
Range{0x09E6, 0x09EF, 1},
Range{0x0A66, 0x0A6F, 1},
Range{0x0AE6, 0x0AEF, 1},
Range{0x0B66, 0x0B6F, 1},
Range{0x0BE6, 0x0BEF, 1},
Range{0x0C66, 0x0C6F, 1},
Range{0x0CE6, 0x0CEF, 1},
Range{0x0D66, 0x0D6F, 1},
Range{0x0E50, 0x0E59, 1},
Range{0x0ED0, 0x0ED9, 1},
Range{0x0F20, 0x0F29, 1},
Range{0x1040, 0x1049, 1},
Range{0x1090, 0x1099, 1},
Range{0x17E0, 0x17E9, 1},
Range{0x1810, 0x1819, 1},
Range{0x1946, 0x194F, 1},
Range{0x19D0, 0x19D9, 1},
Range{0x1B50, 0x1B59, 1},
Range{0x1BB0, 0x1BB9, 1},
Range{0x1C40, 0x1C49, 1},
Range{0x1C50, 0x1C59, 1},
Range{0xA620, 0xA629, 1},
Range{0xA8D0, 0xA8D9, 1},
Range{0xA900, 0xA909, 1},
Range{0xAA50, 0xAA59, 1},
Range{0xFF10, 0xFF19, 1},
}
// IsDecimalDigit reports whether the rune is a decimal digit. // IsDecimalDigit reports whether the rune is a decimal digit.
func IsDecimalDigit(rune int) bool { func IsDecimalDigit(rune int) bool {
return Is(DecimalDigit, rune); return Is(DecimalDigit, rune);
......
// Generated by running
// tables --digits=true --url=http://www.unicode.org/Public/5.1.0/ucd/UnicodeData.txt
// DO NOT EDIT
package unicode
// DecimalDigit is the set of Unicode characters with the "decimal digit" property.
var DecimalDigit = decimalDigit
var decimalDigit = []Range {
Range{0x0030, 0x0039, 1},
Range{0x0660, 0x0669, 1},
Range{0x06f0, 0x06f9, 1},
Range{0x07c0, 0x07c9, 1},
Range{0x0966, 0x096f, 1},
Range{0x09e6, 0x09ef, 1},
Range{0x0a66, 0x0a6f, 1},
Range{0x0ae6, 0x0aef, 1},
Range{0x0b66, 0x0b6f, 1},
Range{0x0be6, 0x0bef, 1},
Range{0x0c66, 0x0c6f, 1},
Range{0x0ce6, 0x0cef, 1},
Range{0x0d66, 0x0d6f, 1},
Range{0x0e50, 0x0e59, 1},
Range{0x0ed0, 0x0ed9, 1},
Range{0x0f20, 0x0f29, 1},
Range{0x1040, 0x1049, 1},
Range{0x1090, 0x1099, 1},
Range{0x17e0, 0x17e9, 1},
Range{0x1810, 0x1819, 1},
Range{0x1946, 0x194f, 1},
Range{0x19d0, 0x19d9, 1},
Range{0x1b50, 0x1b59, 1},
Range{0x1bb0, 0x1bb9, 1},
Range{0x1c40, 0x1c49, 1},
Range{0x1c50, 0x1c59, 1},
Range{0xa620, 0xa629, 1},
Range{0xa8d0, 0xa8d9, 1},
Range{0xa900, 0xa909, 1},
Range{0xaa50, 0xaa59, 1},
Range{0xff10, 0xff19, 1},
Range{0x104a0, 0x104a9, 1},
Range{0x1d7ce, 0x1d7ff, 1},
}
This diff is collapsed.
...@@ -6,7 +6,7 @@ package unicode ...@@ -6,7 +6,7 @@ package unicode
import "testing" import "testing"
var upper = []int{ var upperTest = []int{
0x41, 0x41,
0xc0, 0xc0,
0xd8, 0xd8,
...@@ -30,7 +30,7 @@ var upper = []int{ ...@@ -30,7 +30,7 @@ var upper = []int{
0x1d7ca, 0x1d7ca,
} }
var notupper = []int{ var notupperTest = []int{
0x40, 0x40,
0x5b, 0x5b,
0x61, 0x61,
...@@ -43,7 +43,7 @@ var notupper = []int{ ...@@ -43,7 +43,7 @@ var notupper = []int{
0x10000, 0x10000,
} }
var letter = []int{ var letterTest = []int{
0x41, 0x41,
0x61, 0x61,
0xaa, 0xaa,
...@@ -78,7 +78,7 @@ var letter = []int{ ...@@ -78,7 +78,7 @@ var letter = []int{
0x2fa1d, 0x2fa1d,
} }
var notletter = []int{ var notletterTest = []int{
0x20, 0x20,
0x35, 0x35,
0x375, 0x375,
...@@ -90,17 +90,17 @@ var notletter = []int{ ...@@ -90,17 +90,17 @@ var notletter = []int{
} }
func TestIsLetter(t *testing.T) { func TestIsLetter(t *testing.T) {
for i, r := range upper { for i, r := range upperTest {
if !IsLetter(r) { if !IsLetter(r) {
t.Errorf("IsLetter(%#x) = false, want true\n", r); t.Errorf("IsLetter(%#x) = false, want true\n", r);
} }
} }
for i, r := range letter { for i, r := range letterTest {
if !IsLetter(r) { if !IsLetter(r) {
t.Errorf("IsLetter(%#x) = false, want true\n", r); t.Errorf("IsLetter(%#x) = false, want true\n", r);
} }
} }
for i, r := range notletter { for i, r := range notletterTest {
if IsLetter(r) { if IsLetter(r) {
t.Errorf("IsLetter(%#x) = true, want false\n", r); t.Errorf("IsLetter(%#x) = true, want false\n", r);
} }
...@@ -108,17 +108,17 @@ func TestIsLetter(t *testing.T) { ...@@ -108,17 +108,17 @@ func TestIsLetter(t *testing.T) {
} }
func TestIsUpper(t *testing.T) { func TestIsUpper(t *testing.T) {
for i, r := range upper { for i, r := range upperTest {
if !IsUpper(r) { if !IsUpper(r) {
t.Errorf("IsUpper(%#x) = false, want true\n", r); t.Errorf("IsUpper(%#x) = false, want true\n", r);
} }
} }
for i, r := range notupper { for i, r := range notupperTest {
if IsUpper(r) { if IsUpper(r) {
t.Errorf("IsUpper(%#x) = true, want false\n", r); t.Errorf("IsUpper(%#x) = true, want false\n", r);
} }
} }
for i, r := range notletter { for i, r := range notletterTest {
if IsUpper(r) { if IsUpper(r) {
t.Errorf("IsUpper(%#x) = true, want false\n", r); t.Errorf("IsUpper(%#x) = true, want false\n", r);
} }
......
This diff is collapsed.
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode table generator.
// Data read from the web.
package main
import (
"bufio";
"flag";
"fmt";
"http";
"log";
"os";
"strconv";
"strings";
)
// url is the command-line flag naming the UnicodeData.txt file that main
// fetches and parses; it defaults to the Unicode 5.1.0 database.
var url = flag.String("url", "http://www.unicode.org/Public/5.1.0/ucd/UnicodeData.txt", "URL of Unicode database")
// digits is the command-line flag selecting which tables main prints: the
// decimal digit table when set, the letter tables otherwise.
var digits = flag.Bool("digits", false, "whether to generate digit tables; default is letter tables");
// die is the logger used for fatal errors; it writes to standard error.
// NOTE(review): it is created with log.Lexit|log.Lshortfile, so presumably a
// call to die.Log/die.Logf exits the program after printing - confirm against
// the log package in use.
var die = log.New(os.Stderr, nil, "", log.Lexit|log.Lshortfile);
// Each line of the database is one record of semicolon-separated fields,
// for example:
//	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
//	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for a full explanation.

// The F constants are the positions of the fields within a record, in the
// order they appear on the line; parse uses them to index the split line.
// NumField is the number of fields a well-formed record must have.
// MaxChar is the size of the chars table; parse ignores any code point at
// or above it.
const (
FCodePoint = iota;
FName;
FGeneralCategory;
FCanonicalCombiningClass;
FBidiClass;
FDecompositionType;
FDecompositionMapping;
FNumericType;
FNumericValue;
FBidiMirrored;
FUnicode1Name;
FISOComment;
FSimpleUppercaseMapping;
FSimpleLowercaseMapping;
FSimpleTitlecaseMapping;
NumField;
MaxChar = 0xF0000; // anything above this doesn't have useful properties
)
// fieldName gives a printable name for each field of a record, listed in the
// same order as the F constants.  Char.dump indexes it by field position when
// printing a record for debugging.
var fieldName = []string{
"CodePoint",
"Name",
"GeneralCategory",
"CanonicalCombiningClass",
"BidiClass",
"DecompositionType",
"DecompositionMapping",
"NumericType",
"NumericValue",
"BidiMirrored",
"Unicode1Name",
"ISOComment",
"SimpleUppercaseMapping",
"SimpleLowercaseMapping",
"SimpleTitlecaseMapping"
}
// Char records what parse extracted from one database record.
// It contains only the properties we're interested in.
// A zero codePoint marks an entry that parse has not filled in, and a zero
// case mapping means the corresponding field of the record was empty.
type Char struct {
field []string; // debugging only; could be deleted if we take out char.dump()
codePoint uint32; // redundant (it's the index in the chars table) but useful
category string; // the GeneralCategory field, such as "Nd" or "Lu"
numValue int; // value of the NumericValue field; set only for category "Nd"
upperCase uint32; // upper case code point; set only for letter categories
lowerCase uint32; // lower case code point; set only for letter categories
titleCase uint32; // title case code point; set only for letter categories
}
// chars holds one Char per code point below MaxChar, indexed by code point.
// Entries that parse never fills in keep their zero value.
var chars = make([]Char, MaxChar)
// lastChar is the code point of the record most recently handled by parse,
// including records that parse then ignores.
var lastChar uint32 = 0;
func parse(line string) {
field := strings.Split(line, ";", -1);
if len(field) != NumField {
die.Logf("%.5s...: %d fields (expected %d)\n", line, len(field), NumField);
}
point, err := strconv.Btoui64(field[FCodePoint], 16);
if err != nil {
die.Log("%.5s...:", err)
}
lastChar = uint32(point);
if point == 0 {
return // not interesting and we use 0 as unset
}
if point >= MaxChar {
fmt.Fprintf(os.Stderr, "ignoring char U+%04x\n", point);
return;
}
char := &chars[point];
char.field=field;
if char.codePoint != 0 {
die.Logf("point U+%04x reused\n");
}
char.codePoint = lastChar;
char.category = field[FGeneralCategory];
switch char.category {
case "Nd":
// Decimal digit
v, err := strconv.Atoi(field[FNumericValue]);
if err != nil {
die.Log("U+%04x: bad numeric field: %s", point, err);
}
char.numValue = v;
case "Lu":
char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]);
case "Ll":
char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]);
case "Lt":
char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]);
case "Lm", "Lo":
char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]);
}
}
// dump writes the tag s followed by every raw field of the record, each as
// name:"value", on a single line of standard output.  It is a debugging aid
// for records that cannot be decoded.
func (char *Char) dump(s string) {
	fmt.Print(s, " ");
	for pos, value := range char.field {
		fmt.Printf("%s:%q ", fieldName[pos], value);
	}
	fmt.Print("\n");
}
// letter records the case mappings of a letter.  u, l and t are the
// hexadecimal upper, lower and title case code points taken from the record;
// they are decoded in that order by letterValue, which is also told which
// mapping ("U", "L" or "T") it is decoding.
func (char *Char) letter(u, l, t string) {
	upper := char.letterValue(u, "U");
	lower := char.letterValue(l, "L");
	title := char.letterValue(t, "T");
	char.upperCase, char.lowerCase, char.titleCase = upper, lower, title;
}
// letterValue decodes the hexadecimal code point s and returns it.  An empty
// s returns 0, meaning "no mapping".  If s cannot be decoded, the record is
// dumped with the tag cas and the failure is reported through die.
func (char *Char) letterValue(s string, cas string) uint32 {
	if len(s) == 0 {
		return 0
	}
	code, err := strconv.Btoui64(s, 16);
	if err == nil {
		return uint32(code)
	}
	char.dump(cas);
	die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err);
	return uint32(code)
}
func main() {
flag.Parse();
resp, _, err := http.Get(*url);
if err != nil {
die.Log(err);
}
input := bufio.NewReader(resp.Body);
for {
line, err := input.ReadLineString('\n', false);
if err != nil {
if err == os.EOF {
break;
}
die.Log(err);
}
parse(line);
}
resp.Body.Close();
fmt.Printf(
"// Generated by running\n"
"// tables --digits=%t --url=%s\n"
"// DO NOT EDIT\n\n"
"package unicode\n",
*digits,
*url
);
// We generate an UpperCase name to serve as concise documentation and a lowerCase
// name to store the data. This stops godoc dumping all the tables but keeps them
// available to clients.
if *digits {
dumpRange(
"\n// DecimalDigit is the set of Unicode characters with the \"decimal digit\" property.\n"
"var DecimalDigit = decimalDigit\n"
"var decimalDigit = []Range {\n",
func(code int) bool { return chars[code].category == "Nd" },
"}\n"
);
} else {
dumpRange(
"\n// Letter is the set of Unicode letters.\n"
"var Letter = letter\n"
"var letter = []Range {\n",
func(code int) bool {
switch chars[code].category {
case "Lu", "Ll", "Lt", "Lm", "Lo":
return true
}
return false
},
"}\n"
);
dumpRange(
"\n// Upper is the set of Unicode upper case letters.\n"
"var Upper = upper\n"
"var upper = []Range {\n",
func(code int) bool { return chars[code].category == "Lu" },
"}\n"
);
dumpRange(
"\n// Lower is the set of Unicode lower case letters.\n"
"var Lower = lower\n"
"var lower = []Range {\n",
func(code int) bool { return chars[code].category == "Ll" },
"}\n"
);
dumpRange(
"\n// Title is the set of Unicode title case letters.\n"
"var Title = title\n"
"var title = []Range {\n",
func(code int) bool { return chars[code].category == "Lt" },
"}\n"
);
}
}
// Op reports whether the character with the given code point belongs to the
// category being dumped; dumpRange calls it with indexes into chars.
type Op func(code int) bool
// dumpRange prints header, then one "Range{lo, hi, stride}," line for each run
// of code points in chars accepted by inCategory, then trailer, all on
// standard output.
//
// Each run starts at the first unreported member.  Its stride is the distance
// to the next member, and the run extends for as long as membership agrees
// exactly with the stride: every code point on the stride is a member and no
// code point between strides is.  A final member with nothing after it is
// printed as a single-element range with stride 1.
func dumpRange(header string, inCategory Op, trailer string) {
	fmt.Print(header);
	const format = "\tRange{0x%04x, 0x%04x, %d},\n";
	limit := len(chars);
	// seek returns the first member at or after from, or limit if none remains.
	seek := func(from int) int {
		for from < limit && !inCategory(from) {
			from++
		}
		return from
	};
	// one Range for each iteration
	for pos := seek(0); pos < limit; pos = seek(pos) {
		first := pos;
		last := pos;
		// the next member, if any, fixes the stride
		second := seek(first + 1);
		if second >= limit {
			// first is the last member of all
			fmt.Printf(format, first, last, 1);
			break;
		}
		step := second - first;
		// extend the run while membership and stride position agree
		for probe := second; probe < limit; probe++ {
			member := inCategory(probe);
			onStride := (probe-first)%step == 0;
			if member != onStride {
				break
			}
			if member {
				last = probe
			}
		}
		fmt.Printf(format, first, last, step);
		// the next run is searched for just past the end of this one
		pos = last + 1;
	}
	fmt.Print(trailer);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment