Commit bebfd4ba authored by Martin Möhrmann

strings: speed up Fields

- use a table lookup to detect whether a single byte is a space character
- determine the exact number of fields for ASCII strings and
  an approximate, possibly underestimated, number of fields for non-ASCII
  strings by doing a separate byte-by-byte scan of the input string
  before collecting the fields in an extra pass
- provide a fast path for ASCII-only strings when collecting the fields
- avoid utf8.DecodeRuneInString and unicode.IsSpace for ASCII characters

Used golang.org/cl/33108 from Joe Tsai as starting point.

name                      old time/op    new time/op     delta
Fields/ASCII/16              284ns ± 1%      116ns ± 2%   -59.30%  (p=0.000 n=9+10)
Fields/ASCII/256            3.81µs ± 1%     0.80µs ± 1%   -79.10%  (p=0.000 n=10+10)
Fields/ASCII/4096           61.4µs ± 1%     12.3µs ± 1%   -79.96%  (p=0.000 n=10+9)
Fields/ASCII/65536           982µs ± 1%      235µs ± 0%   -76.04%  (p=0.000 n=10+9)
Fields/ASCII/1048576        16.7ms ± 2%      5.4ms ± 1%   -67.52%  (p=0.000 n=10+10)
Fields/Mixed/16              314ns ± 1%      168ns ± 1%   -46.33%  (p=0.000 n=9+10)
Fields/Mixed/256            3.92µs ± 1%     1.17µs ± 1%   -70.19%  (p=0.000 n=10+10)
Fields/Mixed/4096           69.1µs ± 1%     19.0µs ± 1%   -72.53%  (p=0.000 n=10+10)
Fields/Mixed/65536          1.12ms ± 1%     0.39ms ± 0%   -65.37%  (p=0.000 n=10+9)
Fields/Mixed/1048576        19.0ms ± 2%      7.3ms ± 4%   -61.75%  (p=0.000 n=10+9)

name                      old speed      new speed       delta
Fields/ASCII/16           56.3MB/s ± 1%  138.1MB/s ± 2%  +145.31%  (p=0.000 n=9+10)
Fields/ASCII/256          67.1MB/s ± 1%  321.0MB/s ± 1%  +378.26%  (p=0.000 n=10+10)
Fields/ASCII/4096         66.7MB/s ± 1%  333.0MB/s ± 1%  +398.97%  (p=0.000 n=10+9)
Fields/ASCII/65536        66.7MB/s ± 1%  278.4MB/s ± 0%  +317.39%  (p=0.000 n=10+9)
Fields/ASCII/1048576      62.7MB/s ± 2%  192.9MB/s ± 1%  +207.82%  (p=0.000 n=10+10)
Fields/Mixed/16           51.0MB/s ± 2%   94.9MB/s ± 1%   +85.87%  (p=0.000 n=10+10)
Fields/Mixed/256          65.4MB/s ± 1%  219.2MB/s ± 1%  +235.33%  (p=0.000 n=10+10)
Fields/Mixed/4096         59.3MB/s ± 1%  215.7MB/s ± 1%  +263.98%  (p=0.000 n=10+10)
Fields/Mixed/65536        58.6MB/s ± 1%  169.1MB/s ± 0%  +188.73%  (p=0.000 n=10+9)
Fields/Mixed/1048576      55.1MB/s ± 2%  144.0MB/s ± 4%  +161.44%  (p=0.000 n=10+9)

Updates #19789
Updates #17856

Change-Id: If2ce1479542702e9cd65a82a462ba55ac8eb3876
Reviewed-on: https://go-review.googlesource.com/37959
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
parent 5cadc91b
......@@ -290,11 +290,118 @@ func SplitAfter(s, sep string) []string {
return genSplit(s, sep, len(sep), -1)
}
// asciiSpace is a lookup table indexed by byte value. An entry is 1 if the
// byte is one of the ASCII white space characters '\t', '\n', '\v', '\f',
// '\r' or ' ', and 0 for every other byte value.
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields splits the string s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an
// empty slice if s contains only white space.
func Fields(s string) []string {
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		// Both operands are 0 or 1, so this adds 1 exactly when the previous
		// byte was a space (or we are at the start) and this byte is not,
		// i.e. at the first byte of a field.
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits < utf8.RuneSelf { // ASCII fast path
		// No byte has its high bit set, so n is exact and every field
		// can be stored by index without growing the slice.
		a := make([]string, n)
		na := 0
		i := 0
		// Skip spaces in the front of the input.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart := i
		for i < len(s) {
			if asciiSpace[s[i]] == 0 {
				i++
				continue
			}
			a[na] = s[fieldStart:i]
			na++
			i++
			// Skip spaces in between fields.
			for i < len(s) && asciiSpace[s[i]] != 0 {
				i++
			}
			fieldStart = i
		}
		if fieldStart < len(s) { // Last field might end at EOF.
			a[na] = s[fieldStart:]
		}
		return a
	}

	// Some runes in the input string are not ASCII.
	// Same general approach as in the ASCII path but
	// uses DecodeRuneInString and unicode.IsSpace if
	// a non-ASCII rune needs to be decoded and checked
	// if it corresponds to a space.
	// n is only a capacity hint here, so fields are appended.
	a := make([]string, 0, n)
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) {
		if c := s[i]; c < utf8.RuneSelf {
			if asciiSpace[c] == 0 {
				break
			}
			i++
		} else {
			r, w := utf8.DecodeRuneInString(s[i:])
			if !unicode.IsSpace(r) {
				break
			}
			i += w
		}
	}
	fieldStart := i
	for i < len(s) {
		if c := s[i]; c < utf8.RuneSelf {
			if asciiSpace[c] == 0 {
				i++
				continue
			}
			a = append(a, s[fieldStart:i])
			i++
		} else {
			r, w := utf8.DecodeRuneInString(s[i:])
			if !unicode.IsSpace(r) {
				i += w
				continue
			}
			a = append(a, s[fieldStart:i])
			i += w
		}
		// Skip spaces in between fields.
		for i < len(s) {
			if c := s[i]; c < utf8.RuneSelf {
				if asciiSpace[c] == 0 {
					break
				}
				i++
			} else {
				r, w := utf8.DecodeRuneInString(s[i:])
				if !unicode.IsSpace(r) {
					break
				}
				i += w
			}
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a = append(a, s[fieldStart:])
	}
	return a
}
// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
......
......@@ -452,6 +452,7 @@ var fieldstests = []FieldsTest{
{"", []string{}},
{" ", []string{}},
{" \t ", []string{}},
{"\u2000", []string{}},
{" abc ", []string{"abc"}},
{"1 2 3 4", []string{"1", "2", "3", "4"}},
{"1 2 3 4", []string{"1", "2", "3", "4"}},
......@@ -459,6 +460,9 @@ var fieldstests = []FieldsTest{
{"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
{"\u2000\u2001\u2002", []string{}},
{"\n\t\n", []string{"™", "™"}},
{"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}},
{"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}},
{"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}},
{faces, []string{faces}},
}
......@@ -1473,19 +1477,55 @@ var makeFieldsInput = func() string {
return string(x)
}
// fieldsInput holds the string returned by makeFieldsInput, computed once
// when the package-level variables are initialized.
var fieldsInput = makeFieldsInput()
// makeFieldsInputASCII builds a 1<<20 byte ASCII-only string for the Fields
// benchmarks. Each byte is chosen independently: a space with probability
// 1 in 10 (rand.Intn(10) == 0), otherwise the letter 'x'.
var makeFieldsInputASCII = func() string {
	const size = 1 << 20
	buf := make([]byte, size)
	for pos := 0; pos < size; pos++ {
		// Default to a non-space byte; roughly 10% become spaces.
		c := byte('x')
		if rand.Intn(10) == 0 {
			c = ' '
		}
		buf[pos] = c
	}
	return string(buf)
}
// stringdata is the table of named inputs shared by the Fields benchmarks.
// The "ASCII" entry is produced by makeFieldsInputASCII and the "Mixed"
// entry by makeFieldsInput; both are generated once at package
// initialization.
var stringdata = []struct{ name, data string }{
	{name: "ASCII", data: makeFieldsInputASCII()},
	{name: "Mixed", data: makeFieldsInput()},
}
func BenchmarkFields(b *testing.B) {
b.SetBytes(int64(len(fieldsInput)))
for i := 0; i < b.N; i++ {
Fields(fieldsInput)
for _, sd := range stringdata {
b.Run(sd.name, func(b *testing.B) {
for j := 1 << 4; j <= 1<<20; j <<= 4 {
b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
b.ReportAllocs()
b.SetBytes(int64(j))
data := sd.data[:j]
for i := 0; i < b.N; i++ {
Fields(data)
}
})
}
})
}
}
func BenchmarkFieldsFunc(b *testing.B) {
b.SetBytes(int64(len(fieldsInput)))
for i := 0; i < b.N; i++ {
FieldsFunc(fieldsInput, unicode.IsSpace)
for _, sd := range stringdata {
b.Run(sd.name, func(b *testing.B) {
for j := 1 << 4; j <= 1<<20; j <<= 4 {
b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
b.ReportAllocs()
b.SetBytes(int64(j))
data := sd.data[:j]
for i := 0; i < b.N; i++ {
FieldsFunc(data, unicode.IsSpace)
}
})
}
})
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment