Commit e607abbf authored by Egon Elbre's avatar Egon Elbre Committed by Brad Fitzpatrick

unicode: improve SimpleFold performance for ascii

This change significantly speeds up case-insensitive regexp matching.

benchmark                      old ns/op      new ns/op      delta
BenchmarkMatchEasy0i_32-8      2690           1473           -45.24%
BenchmarkMatchEasy0i_1K-8      80404          42269          -47.43%
BenchmarkMatchEasy0i_32K-8     3272187        2076118        -36.55%
BenchmarkMatchEasy0i_1M-8      104805990      66503805       -36.55%
BenchmarkMatchEasy0i_32M-8     3360192200     2126121600     -36.73%

benchmark                      old MB/s     new MB/s     speedup
BenchmarkMatchEasy0i_32-8      11.90        21.72        1.83x
BenchmarkMatchEasy0i_1K-8      12.74        24.23        1.90x
BenchmarkMatchEasy0i_32K-8     10.01        15.78        1.58x
BenchmarkMatchEasy0i_1M-8      10.00        15.77        1.58x
BenchmarkMatchEasy0i_32M-8     9.99         15.78        1.58x

Issue #13288

Change-Id: I94af7bb29e75d60b4f6ee760124867ab271b9642
Reviewed-on: https://go-review.googlesource.com/16943Reviewed-by: default avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 6e4a8615
...@@ -672,6 +672,7 @@ func benchmark(b *testing.B, re string, n int) { ...@@ -672,6 +672,7 @@ func benchmark(b *testing.B, re string, n int) {
const ( const (
easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
easy0i = "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"
easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
...@@ -682,6 +683,11 @@ func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } ...@@ -682,6 +683,11 @@ func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) }
func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
func BenchmarkMatchEasy0i_32(b *testing.B) { benchmark(b, easy0i, 32<<0) }
func BenchmarkMatchEasy0i_1K(b *testing.B) { benchmark(b, easy0i, 1<<10) }
func BenchmarkMatchEasy0i_32K(b *testing.B) { benchmark(b, easy0i, 32<<10) }
func BenchmarkMatchEasy0i_1M(b *testing.B) { benchmark(b, easy0i, 1<<20) }
func BenchmarkMatchEasy0i_32M(b *testing.B) { benchmark(b, easy0i, 32<<20) }
func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) } func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
......
...@@ -332,6 +332,10 @@ type foldPair struct { ...@@ -332,6 +332,10 @@ type foldPair struct {
// SimpleFold('1') = '1' // SimpleFold('1') = '1'
// //
func SimpleFold(r rune) rune { func SimpleFold(r rune) rune {
if int(r) < len(asciiFold) {
return rune(asciiFold[r])
}
// Consult caseOrbit table for special cases. // Consult caseOrbit table for special cases.
lo := 0 lo := 0
hi := len(caseOrbit) hi := len(caseOrbit)
......
...@@ -1172,6 +1172,7 @@ func printCasefold() { ...@@ -1172,6 +1172,7 @@ func printCasefold() {
} }
} }
printAsciiFold()
printCaseOrbit() printCaseOrbit()
// Tables of category and script folding exceptions: code points // Tables of category and script folding exceptions: code points
...@@ -1269,6 +1270,25 @@ var comment = map[string]string{ ...@@ -1269,6 +1270,25 @@ var comment = map[string]string{
"// If there is no entry for a script name, there are no such points.\n", "// If there is no entry for a script name, there are no such points.\n",
} }
func printAsciiFold() {
printf("var asciiFold = [MaxASCII + 1]uint16{\n")
for i := rune(0); i <= unicode.MaxASCII; i++ {
c := chars[i]
f := c.caseOrbit
if f == 0 {
if c.lowerCase != i && c.lowerCase != 0 {
f = c.lowerCase
} else if c.upperCase != i && c.upperCase != 0 {
f = c.upperCase
} else {
f = i
}
}
printf("\t0x%04X,\n", f)
}
printf("}\n\n")
}
func printCaseOrbit() { func printCaseOrbit() {
if *test { if *test {
for j := range chars { for j := range chars {
......
...@@ -6834,6 +6834,137 @@ var properties = [MaxLatin1 + 1]uint8{ ...@@ -6834,6 +6834,137 @@ var properties = [MaxLatin1 + 1]uint8{
0xFF: pLl | pp, // 'ÿ' 0xFF: pLl | pp, // 'ÿ'
} }
var asciiFold = [MaxASCII + 1]uint16{
0x0000,
0x0001,
0x0002,
0x0003,
0x0004,
0x0005,
0x0006,
0x0007,
0x0008,
0x0009,
0x000A,
0x000B,
0x000C,
0x000D,
0x000E,
0x000F,
0x0010,
0x0011,
0x0012,
0x0013,
0x0014,
0x0015,
0x0016,
0x0017,
0x0018,
0x0019,
0x001A,
0x001B,
0x001C,
0x001D,
0x001E,
0x001F,
0x0020,
0x0021,
0x0022,
0x0023,
0x0024,
0x0025,
0x0026,
0x0027,
0x0028,
0x0029,
0x002A,
0x002B,
0x002C,
0x002D,
0x002E,
0x002F,
0x0030,
0x0031,
0x0032,
0x0033,
0x0034,
0x0035,
0x0036,
0x0037,
0x0038,
0x0039,
0x003A,
0x003B,
0x003C,
0x003D,
0x003E,
0x003F,
0x0040,
0x0061,
0x0062,
0x0063,
0x0064,
0x0065,
0x0066,
0x0067,
0x0068,
0x0069,
0x006A,
0x006B,
0x006C,
0x006D,
0x006E,
0x006F,
0x0070,
0x0071,
0x0072,
0x0073,
0x0074,
0x0075,
0x0076,
0x0077,
0x0078,
0x0079,
0x007A,
0x005B,
0x005C,
0x005D,
0x005E,
0x005F,
0x0060,
0x0041,
0x0042,
0x0043,
0x0044,
0x0045,
0x0046,
0x0047,
0x0048,
0x0049,
0x004A,
0x212A,
0x004C,
0x004D,
0x004E,
0x004F,
0x0050,
0x0051,
0x0052,
0x017F,
0x0054,
0x0055,
0x0056,
0x0057,
0x0058,
0x0059,
0x005A,
0x007B,
0x007C,
0x007D,
0x007E,
0x007F,
}
var caseOrbit = []foldPair{ var caseOrbit = []foldPair{
{0x004B, 0x006B}, {0x004B, 0x006B},
{0x0053, 0x0073}, {0x0053, 0x0073},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment