Commit 5b920281 authored by Ingo Oeser's avatar Ingo Oeser Committed by Brad Fitzpatrick

html: speed up UnescapeString

Add benchmarks for for sparsely escaped and densely escaped strings.
Then speed up the sparse unescaping part heavily by using IndexByte and
copy to skip the parts containing no escaping very fast.

Unescaping densely escaped strings slower because of
the new function call overhead. But sparsely encoded strings are seen
more often in the utf8 enabled web.

We win part of the speed back by looking up entityName differently.

	benchmark                  old ns/op    new ns/op    delta
	BenchmarkEscape                31680        31396   -0.90%
	BenchmarkEscapeNone             6507         6872   +5.61%
	BenchmarkUnescape              36481        48298  +32.39%
	BenchmarkUnescapeNone            332          325   -2.11%
	BenchmarkUnescapeSparse         8836         3221  -63.55%
	BenchmarkUnescapeDense         30639        32224   +5.17%

Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606
Reviewed-on: https://go-review.googlesource.com/10172Reviewed-by: default avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 5f859ba8
...@@ -57,8 +57,9 @@ var replacementTable = [...]rune{ ...@@ -57,8 +57,9 @@ var replacementTable = [...]rune{
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors. // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src. // Precondition: b[src] == '&' && dst <= src.
// attribute should be true if parsing an attribute value. func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { const attribute = false
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'. // i starts at 1 because we already know that s[0] == '&'.
...@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { ...@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
break break
} }
entityName := string(s[1:i]) entityName := s[1:i]
if entityName == "" { if len(entityName) == 0 {
// No-op. // No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op. // No-op.
} else if x := entity[entityName]; x != 0 { } else if x := entity[string(entityName)]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 { } else if x := entity2[string(entityName)]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute { } else if !attribute {
...@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { ...@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
maxLen = longestEntityWithoutSemicolon maxLen = longestEntityWithoutSemicolon
} }
for j := maxLen; j > 1; j-- { for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 { if x := entity[string(entityName[:j])]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
} }
} }
...@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { ...@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
return dst1, src1 return dst1, src1
} }
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
func unescape(b []byte) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, false)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, false)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
}
return b
}
var htmlEscaper = strings.NewReplacer( var htmlEscaper = strings.NewReplacer(
`&`, "&amp;", `&`, "&amp;",
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5. `'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
...@@ -208,8 +189,29 @@ func EscapeString(s string) string { ...@@ -208,8 +189,29 @@ func EscapeString(s string) string {
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true. // always true.
func UnescapeString(s string) string { func UnescapeString(s string) string {
if !strings.Contains(s, "&") { i := strings.IndexByte(s, '&')
if i < 0 {
return s return s
} }
return string(unescape([]byte(s)))
b := []byte(s)
dst, src := unescapeEntity(b, i, i)
for len(s[src:]) > 0 {
if s[src] == '&' {
i = 0
} else {
i = strings.IndexByte(s[src:], '&')
}
if i < 0 {
dst += copy(b[dst:], s[src:])
break
}
if i > 0 {
copy(b[dst:], s[src:src+i])
}
dst, src = unescapeEntity(b, dst+i, src+i)
}
return string(b[:dst])
} }
...@@ -120,6 +120,8 @@ func TestUnescapeEscape(t *testing.T) { ...@@ -120,6 +120,8 @@ func TestUnescapeEscape(t *testing.T) {
var ( var (
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100) benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100) benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
) )
func BenchmarkEscape(b *testing.B) { func BenchmarkEscape(b *testing.B) {
...@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) { ...@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
n += len(UnescapeString(s)) n += len(UnescapeString(s))
} }
} }
func BenchmarkUnescapeSparse(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeSparse))
}
}
func BenchmarkUnescapeDense(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeDense))
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment