Commit 1e55e4a3 authored by Rob Pike

add property tables

R=rsc
DELTA=1087  (1001 added, 78 deleted, 8 changed)
OCL=34137
CL=34147
parent 04a77ac7
...@@ -25,7 +25,8 @@ func main() { ...@@ -25,7 +25,8 @@ func main() {
flag.Parse(); flag.Parse();
loadChars(); // always needed loadChars(); // always needed
printCategories(); printCategories();
printScripts(); printScriptOrProperty(false);
printScriptOrProperty(true);
printCases(); printCases();
} }
...@@ -39,6 +40,9 @@ var tablelist = flag.String("tables", ...@@ -39,6 +40,9 @@ var tablelist = flag.String("tables",
var scriptlist = flag.String("scripts", var scriptlist = flag.String("scripts",
"all", "all",
"comma-separated list of which script tables to generate"); "comma-separated list of which script tables to generate");
var proplist = flag.String("props",
"all",
"comma-separated list of which property tables to generate");
var cases = flag.Bool("cases", var cases = flag.Bool("cases",
true, true,
"generate case tables"); "generate case tables");
...@@ -117,8 +121,11 @@ type Script struct { ...@@ -117,8 +121,11 @@ type Script struct {
var chars = make([]Char, MaxChar+1) var chars = make([]Char, MaxChar+1)
var scripts = make(map[string] []Script) var scripts = make(map[string] []Script)
var props = make(map[string] []Script) // a property looks like a script; can share the format
var lastChar uint32 = 0
var lastChar uint32 = 0; const scriptParseExpression = `([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)`
// In UnicodeData.txt, some ranges are marked like this: // In UnicodeData.txt, some ranges are marked like this:
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
...@@ -217,7 +224,7 @@ func allCategories() []string { ...@@ -217,7 +224,7 @@ func allCategories() []string {
return a; return a;
} }
func allScripts() []string { func all(scripts map[string] []Script) []string {
a := make([]string, len(scripts)); a := make([]string, len(scripts));
i := 0; i := 0;
for k := range scripts { for k := range scripts {
...@@ -462,7 +469,7 @@ func verifyRange(name string, inCategory Op, table []unicode.Range) { ...@@ -462,7 +469,7 @@ func verifyRange(name string, inCategory Op, table []unicode.Range) {
} }
} }
func parseScript(line string) { func parseScript(line string, scripts map[string] []Script) {
comment := strings.Index(line, "#"); comment := strings.Index(line, "#");
if comment >= 0 { if comment >= 0 {
line = line[0:comment] line = line[0:comment]
...@@ -504,21 +511,69 @@ func parseScript(line string) { ...@@ -504,21 +511,69 @@ func parseScript(line string) {
scripts[name] = s; scripts[name] = s;
} }
func printScripts() { // The script tables have a lot of adjacent elements. Fold them together.
if *scriptlist == "" { func foldAdjacent(r []Script) []unicode.Range {
s := make([]unicode.Range, 0, len(r));
j := 0;
for i := 0; i < len(r); i++ {
if j>0 && int(r[i].lo) == s[j-1].Hi+1 {
s[j-1].Hi = int(r[i].hi);
} else {
s = s[0:j+1];
s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1};
j++;
}
}
return s;
}
// fullScriptTest checks, for every name in list, that each code point in the
// ranges parsed from the data file (scripts[name]) is reported as a member by
// the table already installed in package unicode (installed[name]).
// Code points missing from the installed table are reported on standard error.
// A name absent from either map is reported through die.Log
// (presumably fatal -- confirm against the definition of die).
// The function is shared by script and property tables, although the
// diagnostics still say "script".
func fullScriptTest(list []string, installed map[string] []unicode.Range, scripts map[string] []Script) {
	for _, name := range list {
		if _, ok := scripts[name]; !ok {
			die.Log("unknown script", name);
		}
		// Look the installed table up once and reuse it for every code point.
		table, ok := installed[name];
		if !ok {
			die.Log("unknown table", name);
		}
		for _, script := range scripts[name] {
			for r := script.lo; r <= script.hi; r++ {
				if !unicode.Is(table, int(r)) {
					fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name);
				}
			}
		}
	}
}
// PropList.txt has the same format as Scripts.txt so we can share its parser.
func printScriptOrProperty(doProps bool) {
flag := "scripts";
flaglist := *scriptlist;
file := "Scripts.txt";
table := scripts;
installed := unicode.Scripts;
if doProps {
flag = "props";
flaglist = *proplist;
file = "PropList.txt";
table = props;
installed = unicode.Props;
}
if flaglist == "" {
return return
} }
var err os.Error; var err os.Error;
scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`); scriptRe, err = regexp.Compile(scriptParseExpression);
if err != nil { if err != nil {
die.Log("re error:", err) die.Log("re error:", err)
} }
resp, _, err := http.Get(*url + "Scripts.txt"); resp, _, err := http.Get(*url + file);
if err != nil { if err != nil {
die.Log(err); die.Log(err);
} }
if resp.StatusCode != 200 { if resp.StatusCode != 200 {
die.Log("bad GET status for Scripts.txt", resp.Status); die.Log("bad GET status for ", file, ":", resp.Status);
} }
input := bufio.NewReader(resp.Body); input := bufio.NewReader(resp.Body);
for { for {
...@@ -529,31 +584,37 @@ func printScripts() { ...@@ -529,31 +584,37 @@ func printScripts() {
} }
die.Log(err); die.Log(err);
} }
parseScript(line[0:len(line)-1]); parseScript(line[0:len(line)-1], table);
} }
resp.Body.Close(); resp.Body.Close();
// Find out which scripts to dump // Find out which scripts to dump
list := strings.Split(*scriptlist, ",", 0); list := strings.Split(flaglist, ",", 0);
if *scriptlist == "all" { if flaglist == "all" {
list = allScripts(); list = all(table);
} }
if *test { if *test {
fullScriptTest(list); fullScriptTest(list, installed, table);
return; return;
} }
fmt.Printf( fmt.Printf(
"// Generated by running\n" "// Generated by running\n"
"// maketables --scripts=%s --url=%s\n" "// maketables --%s=%s --url=%s\n"
"// DO NOT EDIT\n\n", "// DO NOT EDIT\n\n",
*scriptlist, flag,
flaglist,
*url *url
); );
if *scriptlist == "all" { if flaglist == "all" {
fmt.Println("// Scripts is the set of Unicode script tables."); if doProps {
fmt.Println("// Props is the set of Unicode property tables.");
fmt.Println("var Props = map[string] []Range {");
} else {
fmt.Println("// Scripts is the set of Unicode script tables.");
fmt.Println("var Scripts = map[string] []Range {"); fmt.Println("var Scripts = map[string] []Range {");
for k, _ := range scripts { }
for k, _ := range table {
fmt.Printf("\t%q: %s,\n", k, k); fmt.Printf("\t%q: %s,\n", k, k);
} }
fmt.Printf("}\n\n"); fmt.Printf("}\n\n");
...@@ -562,13 +623,20 @@ func printScripts() { ...@@ -562,13 +623,20 @@ func printScripts() {
decl := make(sort.StringArray, len(list)); decl := make(sort.StringArray, len(list));
ndecl := 0; ndecl := 0;
for _, name := range list { for _, name := range list {
decl[ndecl] = fmt.Sprintf( if doProps {
"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", decl[ndecl] = fmt.Sprintf(
name, name, name, name "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
); name, name, name, name
);
} else {
decl[ndecl] = fmt.Sprintf(
"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
name, name, name, name
);
}
ndecl++; ndecl++;
fmt.Printf("var _%s = []Range {\n", name); fmt.Printf("var _%s = []Range {\n", name);
ranges := foldAdjacent(scripts[name]); ranges := foldAdjacent(table[name]);
for _, s := range ranges { for _, s := range ranges {
fmt.Printf(format, s.Lo, s.Hi, s.Stride); fmt.Printf(format, s.Lo, s.Hi, s.Stride);
} }
...@@ -582,41 +650,6 @@ func printScripts() { ...@@ -582,41 +650,6 @@ func printScripts() {
fmt.Println(")\n"); fmt.Println(")\n");
} }
// foldAdjacent converts r into a slice of unicode.Range with stride 1,
// merging each entry whose lo immediately follows the Hi of the range
// emitted just before it. The script tables contain many such adjacent
// entries, so folding keeps the generated tables small.
func foldAdjacent(r []Script) []unicode.Range {
	folded := make([]unicode.Range, 0, len(r));
	n := 0;	// number of ranges emitted so far
	for _, script := range r {
		lo, hi := int(script.lo), int(script.hi);
		if n == 0 || lo != folded[n-1].Hi+1 {
			// Not contiguous with the previous range: start a new one.
			folded = folded[0:n+1];
			folded[n] = unicode.Range{lo, hi, 1};
			n++;
			continue;
		}
		// Contiguous: extend the previous range to cover this entry.
		folded[n-1].Hi = hi;
	}
	return folded;
}
// fullScriptTest checks, for every script name in list, that each code point
// in the ranges parsed into the package-level scripts map is reported as a
// member by the matching table in unicode.Scripts.
// Code points missing from the installed table are reported on standard error.
// A name absent from either map is reported through die.Log
// (presumably fatal -- confirm against the definition of die).
func fullScriptTest(list []string) {
	for _, name := range list {
		if _, ok := scripts[name]; !ok {
			die.Log("unknown script", name);
		}
		// Look the installed table up once and reuse it for every code point.
		table, ok := unicode.Scripts[name];
		if !ok {
			die.Log("unknown table", name);
		}
		for _, script := range scripts[name] {
			for r := script.lo; r <= script.hi; r++ {
				if !unicode.Is(table, int(r)) {
					fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name);
				}
			}
		}
	}
}
const ( const (
CaseUpper = 1 << iota; CaseUpper = 1 << iota;
CaseLower; CaseLower;
......
...@@ -134,12 +134,50 @@ var inCategoryTest = []T { ...@@ -134,12 +134,50 @@ var inCategoryTest = []T {
T{0x04aa, "letter"}, T{0x04aa, "letter"},
} }
// inPropTest pairs a code point with the name of a property table that is
// expected to contain it. TestProps looks each name up in Props, checks
// membership with Is, and then reports every key of Props that has no entry
// here, so a newly generated property table needs a sample added to this list.
var inPropTest = []T {
T{0x0046, "ASCII_Hex_Digit"},
T{0x200F, "Bidi_Control"},
T{0x2212, "Dash"},
T{0xE0001, "Deprecated"},
T{0x00B7, "Diacritic"},
T{0x30FE, "Extender"},
T{0xFF46, "Hex_Digit"},
T{0x2E17, "Hyphen"},
T{0x2FFB, "IDS_Binary_Operator"},
T{0x2FF3, "IDS_Trinary_Operator"},
T{0xFA6A, "Ideographic"},
T{0x200D, "Join_Control"},
T{0x0EC4, "Logical_Order_Exception"},
T{0x2FFFF, "Noncharacter_Code_Point"},
T{0x065E, "Other_Alphabetic"},
T{0x2069, "Other_Default_Ignorable_Code_Point"},
T{0x0BD7, "Other_Grapheme_Extend"},
T{0x0387, "Other_ID_Continue"},
T{0x212E, "Other_ID_Start"},
T{0x2094, "Other_Lowercase"},
T{0x2040, "Other_Math"},
T{0x216F, "Other_Uppercase"},
T{0x0027, "Pattern_Syntax"},
T{0x0020, "Pattern_White_Space"},
T{0x300D, "Quotation_Mark"},
T{0x2EF3, "Radical"},
T{0x061F, "STerm"},
T{0x2071, "Soft_Dotted"},
T{0x003A, "Terminal_Punctuation"},
T{0x9FC3, "Unified_Ideograph"},
T{0xFE0F, "Variation_Selector"},
T{0x0020, "White_Space"},
}
func TestScripts(t *testing.T) { func TestScripts(t *testing.T) {
notTested := make(map[string] bool); notTested := make(map[string] bool);
for k := range Scripts { for k := range Scripts {
notTested[k] = true notTested[k] = true
} }
for i, test := range inTest { for i, test := range inTest {
if _, ok := Scripts[test.script]; !ok {
t.Fatal(test.script, "not a known script")
}
if !Is(Scripts[test.script], test.rune) { if !Is(Scripts[test.script], test.rune) {
t.Errorf("IsScript(%#x, %s) = false, want true\n", test.rune, test.script); t.Errorf("IsScript(%#x, %s) = false, want true\n", test.rune, test.script);
} }
...@@ -161,6 +199,9 @@ func TestCategories(t *testing.T) { ...@@ -161,6 +199,9 @@ func TestCategories(t *testing.T) {
notTested[k] = true notTested[k] = true
} }
for i, test := range inCategoryTest { for i, test := range inCategoryTest {
if _, ok := Categories[test.script]; !ok {
t.Fatal(test.script, "not a known category")
}
if !Is(Categories[test.script], test.rune) { if !Is(Categories[test.script], test.rune) {
t.Errorf("IsCategory(%#x, %s) = false, want true\n", test.rune, test.script); t.Errorf("IsCategory(%#x, %s) = false, want true\n", test.rune, test.script);
} }
...@@ -171,3 +212,21 @@ func TestCategories(t *testing.T) { ...@@ -171,3 +212,21 @@ func TestCategories(t *testing.T) {
} }
} }
// TestProps checks that every sample in inPropTest names a table present in
// Props and that the table contains the sample's code point. It then fails
// for each property table in Props that no sample exercised, so coverage
// gaps in inPropTest are visible.
func TestProps(t *testing.T) {
	// Start with every known property marked untested.
	notTested := make(map[string] bool);
	for k := range Props {
		notTested[k] = true
	}
	for _, test := range inPropTest {
		table, ok := Props[test.script];
		if !ok {
			t.Fatal(test.script, "not a known prop")
		}
		if !Is(table, test.rune) {
			t.Errorf("IsProp(%#x, %s) = false, want true\n", test.rune, test.script);
		}
		// Assigning with a false second operand deletes the key, so only
		// properties that were never exercised remain in the map.
		notTested[test.script] = false, false
	}
	for k := range notTested {
		t.Error("not tested:", k)
	}
}
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment