Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
go
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
go
Commits
fc77e826
Commit
fc77e826
authored
Jun 16, 2011
by
Russ Cox
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
unicode: add case folding tables
R=r, r CC=golang-dev
https://golang.org/cl/4571074
parent
6e9b1a78
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
921 additions
and
109 deletions
+921
-109
src/pkg/unicode/letter.go
src/pkg/unicode/letter.go
+49
-0
src/pkg/unicode/letter_test.go
src/pkg/unicode/letter_test.go
+46
-0
src/pkg/unicode/maketables.go
src/pkg/unicode/maketables.go
+293
-8
src/pkg/unicode/tables.go
src/pkg/unicode/tables.go
+533
-101
No files found.
src/pkg/unicode/letter.go
View file @
fc77e826
...
...
@@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int {
}
return
r
}
// foldPair is the element type of the caseOrbit table defined in tables.go.
// SimpleFold looks a code point up by From and returns the matching To.
//
// Every entry currently fits in 16 bits, so both fields are uint16. If that
// ever stops being true, the constants in the caseOrbit composite literal
// will no longer fit, compilation will fail, and the fields can be widened
// to uint32.
type foldPair struct {
	From, To uint16
}
// SimpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest r >= rune if one exists, or else the smallest r >= 0.
//
// For example:
// SimpleFold('A') = 'a'
// SimpleFold('a') = 'A'
//
// SimpleFold('K') = 'k'
// SimpleFold('k') = '\u212A' (Kelvin symbol, K)
// SimpleFold('\u212A') = 'K'
//
// SimpleFold('1') = '1'
//
func
SimpleFold
(
rune
int
)
int
{
// Consult caseOrbit table for special cases.
lo
:=
0
hi
:=
len
(
caseOrbit
)
for
lo
<
hi
{
m
:=
lo
+
(
hi
-
lo
)
/
2
if
int
(
caseOrbit
[
m
]
.
From
)
<
rune
{
lo
=
m
+
1
}
else
{
hi
=
m
}
}
if
lo
<
len
(
caseOrbit
)
&&
int
(
caseOrbit
[
lo
]
.
From
)
==
rune
{
return
int
(
caseOrbit
[
lo
]
.
To
)
}
// No folding specified. This is a one- or two-element
// equivalence class containing rune and ToLower(rune)
// and ToUpper(rune) if they are different from rune.
if
l
:=
ToLower
(
rune
);
l
!=
rune
{
return
l
}
return
ToUpper
(
rune
)
}
src/pkg/unicode/letter_test.go
View file @
fc77e826
...
...
@@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) {
}
}
}
// simpleFoldTests lists fold orbits for TestSimpleFold. Each string is one
// orbit: the test starts from the last rune of the string and expects
// successive SimpleFold calls to produce the runes of the string in order.
var
simpleFoldTests
=
[]
string
{
// SimpleFold returns one rune per call. The order in which repeated calls
// visit an orbit is an implementation detail, but we know the current
// implementation visits members in increasing code point order starting
// from the input, wrapping around from the largest member to the smallest.
// Easy cases.
"Aa"
,
"aA"
,
"δΔ"
,
"Δδ"
,
// ASCII special cases.
"KkK"
,
"kKK"
,
"KKk"
,
"Ssſ"
,
"sſS"
,
"ſSs"
,
// Non-ASCII special cases.
"ρϱΡ"
,
"ϱΡρ"
,
"Ρρϱ"
,
"ͅΙιι"
,
"Ιιιͅ"
,
"ιιͅΙ"
,
"ιͅΙι"
,
// Extra special cases: has lower/upper but no case fold.
// Each is a one-rune orbit, so SimpleFold must return the rune itself.
"İ"
,
"ı"
,
}
func
TestSimpleFold
(
t
*
testing
.
T
)
{
for
_
,
tt
:=
range
simpleFoldTests
{
cycle
:=
[]
int
(
tt
)
rune
:=
cycle
[
len
(
cycle
)
-
1
]
for
_
,
out
:=
range
cycle
{
if
r
:=
SimpleFold
(
rune
);
r
!=
out
{
t
.
Errorf
(
"SimpleFold(%#U) = %#U, want %#U"
,
rune
,
r
,
out
)
}
rune
=
out
}
}
}
src/pkg/unicode/maketables.go
View file @
fc77e826
...
...
@@ -24,15 +24,18 @@ import (
func
main
()
{
flag
.
Parse
()
loadChars
()
// always needed
loadCasefold
()
printCategories
()
printScriptOrProperty
(
false
)
printScriptOrProperty
(
true
)
printCases
()
printLatinProperties
()
printCasefold
()
printSizes
()
}
// Flags selecting where the Unicode data files are fetched from. The two
// file-specific flags default to empty and are documented as falling back
// to a file under --url.
var (
	dataURL = flag.String("data", "",
		"full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
	casefoldingURL = flag.String("casefolding", "",
		"full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
	url = flag.String("url",
		"http://www.unicode.org/Public/6.0.0/ucd/",
		"URL of Unicode database directory")
)
...
...
@@ -119,6 +122,8 @@ type Char struct {
upperCase
int
lowerCase
int
titleCase
int
foldCase
int
// simple case folding
caseOrbit
int
// next in simple case folding orbit
}
// Scripts.txt has form:
...
...
@@ -308,8 +313,53 @@ func loadChars() {
resp
.
Body
.
Close
()
}
func
loadCasefold
()
{
if
*
casefoldingURL
==
""
{
flag
.
Set
(
"casefolding"
,
*
url
+
"CaseFolding.txt"
)
}
resp
,
err
:=
http
.
Get
(
*
casefoldingURL
)
if
err
!=
nil
{
logger
.
Fatal
(
err
)
}
if
resp
.
StatusCode
!=
200
{
logger
.
Fatal
(
"bad GET status for CaseFolding.txt"
,
resp
.
Status
)
}
input
:=
bufio
.
NewReader
(
resp
.
Body
)
for
{
line
,
err
:=
input
.
ReadString
(
'\n'
)
if
err
!=
nil
{
if
err
==
os
.
EOF
{
break
}
logger
.
Fatal
(
err
)
}
if
line
[
0
]
==
'#'
{
continue
}
field
:=
strings
.
Split
(
line
,
"; "
,
-
1
)
if
len
(
field
)
!=
4
{
logger
.
Fatalf
(
"CaseFolding.txt %.5s...: %d fields (expected %d)
\n
"
,
line
,
len
(
field
),
4
)
}
kind
:=
field
[
1
]
if
kind
!=
"C"
&&
kind
!=
"S"
{
// Only care about 'common' and 'simple' foldings.
continue
}
p1
,
err
:=
strconv
.
Btoui64
(
field
[
0
],
16
)
if
err
!=
nil
{
logger
.
Fatalf
(
"CaseFolding.txt %.5s...: %s"
,
line
,
err
)
}
p2
,
err
:=
strconv
.
Btoui64
(
field
[
2
],
16
)
if
err
!=
nil
{
logger
.
Fatalf
(
"CaseFolding.txt %.5s...: %s"
,
line
,
err
)
}
chars
[
p1
]
.
foldCase
=
int
(
p2
)
}
resp
.
Body
.
Close
()
}
const
progHeader
=
`// Generated by running
// maketables --tables=%s --data=%s
// maketables --tables=%s --data=%s
--casefolding=%s
// DO NOT EDIT
package unicode
...
...
@@ -330,7 +380,7 @@ func printCategories() {
fullCategoryTest
(
list
)
return
}
fmt
.
Printf
(
progHeader
,
*
tablelist
,
*
dataURL
)
fmt
.
Printf
(
progHeader
,
*
tablelist
,
*
dataURL
,
*
casefoldingURL
)
fmt
.
Println
(
"// Version is the Unicode edition from which the tables are derived."
)
fmt
.
Printf
(
"const Version = %q
\n\n
"
,
version
())
...
...
@@ -837,13 +887,13 @@ func printCases() {
}
fmt
.
Printf
(
"// Generated by running
\n
"
+
"// maketables --data=%s
\n
"
+
"// maketables --data=%s
--casefolding=%s
\n
"
+
"// DO NOT EDIT
\n\n
"
+
"// CaseRanges is the table describing case mappings for all letters with
\n
"
+
"// non-self mappings.
\n
"
+
"var CaseRanges = _CaseRanges
\n
"
+
"var _CaseRanges = []CaseRange {
\n
"
,
*
dataURL
)
*
dataURL
,
*
casefoldingURL
)
var
startState
*
caseState
// the start of a run; nil for not active
var
prevState
=
&
caseState
{}
// the state of the previous character
...
...
@@ -946,13 +996,246 @@ func printLatinProperties() {
if
code
==
' '
{
property
=
"pZ | pp"
}
fmt
.
Printf
(
"
\t
0x%.2X: %s, // %q
\n
"
,
code
,
property
,
code
)
fmt
.
Printf
(
"
\t
0x%02X: %s, // %q
\n
"
,
code
,
property
,
code
)
}
fmt
.
Printf
(
"}
\n\n
"
)
}
func
printCasefold
()
{
// Build list of case-folding groups attached to each canonical folded char (typically lower case).
var
caseOrbit
=
make
([][]
int
,
MaxChar
+
1
)
for
i
:=
range
chars
{
c
:=
&
chars
[
i
]
if
c
.
foldCase
==
0
{
continue
}
orb
:=
caseOrbit
[
c
.
foldCase
]
if
orb
==
nil
{
orb
=
append
(
orb
,
c
.
foldCase
)
}
caseOrbit
[
c
.
foldCase
]
=
append
(
orb
,
i
)
}
// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
for
i
:=
range
chars
{
c
:=
&
chars
[
i
]
f
:=
c
.
foldCase
if
f
==
0
{
f
=
i
}
orb
:=
caseOrbit
[
f
]
if
orb
==
nil
&&
(
c
.
upperCase
!=
0
&&
c
.
upperCase
!=
i
||
c
.
lowerCase
!=
0
&&
c
.
lowerCase
!=
i
)
{
// Default assumption of [upper, lower] is wrong.
caseOrbit
[
i
]
=
[]
int
{
i
}
}
}
// Delete the groups for which assuming [lower, upper] is right.
for
i
,
orb
:=
range
caseOrbit
{
if
len
(
orb
)
==
2
&&
chars
[
orb
[
0
]]
.
upperCase
==
orb
[
1
]
&&
chars
[
orb
[
1
]]
.
lowerCase
==
orb
[
0
]
{
caseOrbit
[
i
]
=
nil
}
}
// Record orbit information in chars.
for
_
,
orb
:=
range
caseOrbit
{
if
orb
==
nil
{
continue
}
sort
.
SortInts
(
orb
)
c
:=
orb
[
len
(
orb
)
-
1
]
for
_
,
d
:=
range
orb
{
chars
[
c
]
.
caseOrbit
=
d
c
=
d
}
}
printCaseOrbit
()
// Tables of category and script folding exceptions: code points
// that must be added when interpreting a particular category/script
// in a case-folding context.
cat
:=
make
(
map
[
string
]
map
[
int
]
bool
)
for
name
:=
range
category
{
if
x
:=
foldExceptions
(
inCategory
(
name
));
len
(
x
)
>
0
{
cat
[
name
]
=
x
}
}
scr
:=
make
(
map
[
string
]
map
[
int
]
bool
)
for
name
:=
range
scripts
{
if
x
:=
foldExceptions
(
inScript
(
name
));
len
(
x
)
>
0
{
cat
[
name
]
=
x
}
}
printCatFold
(
"FoldCategory"
,
cat
)
printCatFold
(
"FoldScript"
,
scr
)
}
// inCategory returns a list of all the runes in the category.
func
inCategory
(
name
string
)
[]
int
{
var
x
[]
int
for
i
:=
range
chars
{
c
:=
&
chars
[
i
]
if
c
.
category
==
name
||
len
(
name
)
==
1
&&
len
(
c
.
category
)
>
1
&&
c
.
category
[
0
]
==
name
[
0
]
{
x
=
append
(
x
,
i
)
}
}
fmt
.
Println
(
"}"
)
return
x
}
// NOTE(review): range16Count and range32Count are declared again further
// down, next to foldPairCount and just before printSizes. This earlier copy
// looks like removed-line residue from the diff view; two declarations of
// the same name in one package do not compile — confirm and keep only one.
var
range16Count
=
0
// Number of entries in the 16-bit range tables.
var
range32Count
=
0
// Number of entries in the 32-bit range tables.
// inScript returns a list of all the runes in the script.
func
inScript
(
name
string
)
[]
int
{
var
x
[]
int
for
_
,
s
:=
range
scripts
[
name
]
{
for
c
:=
s
.
lo
;
c
<=
s
.
hi
;
c
++
{
x
=
append
(
x
,
int
(
c
))
}
}
return
x
}
// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
func
foldExceptions
(
class
[]
int
)
map
[
int
]
bool
{
// Create map containing class and all fold-equivalent chars.
m
:=
make
(
map
[
int
]
bool
)
for
_
,
r
:=
range
class
{
c
:=
&
chars
[
r
]
if
c
.
caseOrbit
==
0
{
// Just upper and lower.
if
u
:=
c
.
upperCase
;
u
!=
0
{
m
[
u
]
=
true
}
if
l
:=
c
.
lowerCase
;
l
!=
0
{
m
[
l
]
=
true
}
m
[
r
]
=
true
continue
}
// Otherwise walk orbit.
r0
:=
r
for
{
m
[
r
]
=
true
r
=
chars
[
r
]
.
caseOrbit
if
r
==
r0
{
break
}
}
}
// Remove class itself.
for
_
,
r
:=
range
class
{
m
[
r
]
=
false
,
false
}
// What's left is the exceptions.
return
m
}
// comment holds the doc comment text that printCatFold prints ahead of each
// generated table, keyed by the table's variable name.
var comment = map[string]string{
	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
		"// code points outside the category that are equivalent under\n" +
		"// simple case folding to code points inside the category.\n" +
		"// If there is no entry for a category name, there are no such points.\n",

	"FoldScript": "// FoldScript maps a script name to a table of\n" +
		"// code points outside the script that are equivalent under\n" +
		"// simple case folding to code points inside the script.\n" +
		"// If there is no entry for a script name, there are no such points.\n",
}
func
printCaseOrbit
()
{
if
*
test
{
for
i
:=
range
chars
{
c
:=
&
chars
[
i
]
f
:=
c
.
caseOrbit
if
f
==
0
{
if
c
.
lowerCase
!=
i
&&
c
.
lowerCase
!=
0
{
f
=
c
.
lowerCase
}
else
if
c
.
upperCase
!=
i
&&
c
.
upperCase
!=
0
{
f
=
c
.
upperCase
}
else
{
f
=
i
}
}
if
g
:=
unicode
.
SimpleFold
(
i
);
g
!=
f
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.SimpleFold(%#U) = %#U, want %#U
\n
"
,
i
,
g
,
f
)
}
}
return
}
fmt
.
Printf
(
"var caseOrbit = []foldPair{
\n
"
)
for
i
:=
range
chars
{
c
:=
&
chars
[
i
]
if
c
.
caseOrbit
!=
0
{
fmt
.
Printf
(
"
\t
{0x%04X, 0x%04X},
\n
"
,
i
,
c
.
caseOrbit
)
foldPairCount
++
}
}
fmt
.
Printf
(
"}
\n\n
"
)
}
func
printCatFold
(
name
string
,
m
map
[
string
]
map
[
int
]
bool
)
{
if
*
test
{
var
pkgMap
map
[
string
]
*
unicode
.
RangeTable
if
name
==
"FoldCategory"
{
pkgMap
=
unicode
.
FoldCategory
}
else
{
pkgMap
=
unicode
.
FoldScript
}
if
len
(
pkgMap
)
!=
len
(
m
)
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.%s has %d elements, want %d
\n
"
,
name
,
len
(
pkgMap
),
len
(
m
))
return
}
for
k
,
v
:=
range
m
{
t
,
ok
:=
pkgMap
[
k
]
if
!
ok
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.%s[%q] missing
\n
"
,
name
,
k
)
continue
}
n
:=
0
for
_
,
r
:=
range
t
.
R16
{
for
c
:=
int
(
r
.
Lo
);
c
<=
int
(
r
.
Hi
);
c
+=
int
(
r
.
Stride
)
{
if
!
v
[
c
]
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.%s[%q] contains %#U, should not
\n
"
,
name
,
k
,
c
)
}
n
++
}
}
for
_
,
r
:=
range
t
.
R32
{
for
c
:=
int
(
r
.
Lo
);
c
<=
int
(
r
.
Hi
);
c
+=
int
(
r
.
Stride
)
{
if
!
v
[
c
]
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.%s[%q] contains %#U, should not
\n
"
,
name
,
k
,
c
)
}
n
++
}
}
if
n
!=
len
(
v
)
{
fmt
.
Fprintf
(
os
.
Stderr
,
"unicode.%s[%q] has %d code points, want %d
\n
"
,
name
,
k
,
n
,
len
(
v
))
}
}
return
}
fmt
.
Print
(
comment
[
name
])
fmt
.
Printf
(
"var %s = map[string]*RangeTable{
\n
"
,
name
)
for
name
:=
range
m
{
fmt
.
Printf
(
"
\t
%q: fold%s,
\n
"
,
name
,
name
)
}
fmt
.
Printf
(
"}
\n\n
"
)
for
name
,
class
:=
range
m
{
dumpRange
(
fmt
.
Sprintf
(
"var fold%s = &RangeTable{
\n
"
,
name
),
func
(
code
int
)
bool
{
return
class
[
code
]
})
}
}
// Size counters for the generated tables. They start at zero; foldPairCount
// is incremented by printCaseOrbit, and printSizes reads all three.
var (
	range16Count  = 0 // Number of entries in the 16-bit range tables.
	range32Count  = 0 // Number of entries in the 32-bit range tables.
	foldPairCount = 0 // Number of fold pairs in the exception tables.
)
func
printSizes
()
{
if
*
test
{
...
...
@@ -963,4 +1246,6 @@ func printSizes() {
range16Bytes
:=
range16Count
*
3
*
2
range32Bytes
:=
range32Count
*
3
*
4
fmt
.
Printf
(
"// Range bytes: %d 16-bit, %d 32-bit, %d total.
\n
"
,
range16Bytes
,
range32Bytes
,
range16Bytes
+
range32Bytes
)
fmt
.
Println
()
fmt
.
Printf
(
"// Fold orbit bytes: %d pairs, %d bytes
\n
"
,
foldPairCount
,
foldPairCount
*
2
*
2
)
}
src/pkg/unicode/tables.go
View file @
fc77e826
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment