Commit 54697702 authored by Daniel Martí's avatar Daniel Martí

encoding/json: avoid work when unquoting strings, take 2

This is a re-submission of CL 151157, since it was reverted in CL 190909
due to an introduced crash found by a fuzzer. The revert CL included
regression tests, while this CL includes a fixed version of the original
change.

In particular, what we forgot in the original optimization was that we
still need the length and trailing quote checks at the beginning of
unquoteBytes. Without those, we could end up in a crash later on.

We can work out how many bytes can be unquoted trivially in
rescanLiteral, which already iterates over a string's bytes.

Removing the extra loop in unquoteBytes simplifies the function and
speeds it up, especially when decoding simple strings, which are common.

While at it, we can remove the check that s[0]=='"', since all call
sites already meet that condition.

name           old time/op    new time/op    delta
CodeDecoder-8    10.6ms ± 2%    10.5ms ± 1%  -1.01%  (p=0.004 n=20+10)

name           old speed      new speed      delta
CodeDecoder-8   183MB/s ± 2%   185MB/s ± 1%  +1.02%  (p=0.003 n=20+10)

Updates #28923.

Change-Id: I8c6b13302bcd86a364bc998d72451332c0809cde
Reviewed-on: https://go-review.googlesource.com/c/go/+/190659
Run-TryBot: Daniel Martí <mvdan@mvdan.cc>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarPeter Weinberger <pjw@google.com>
parent 10855608
...@@ -213,6 +213,9 @@ type decodeState struct { ...@@ -213,6 +213,9 @@ type decodeState struct {
savedError error savedError error
useNumber bool useNumber bool
disallowUnknownFields bool disallowUnknownFields bool
// safeUnquote is the number of current string literal bytes that don't
// need to be unquoted. When negative, no bytes need unquoting.
safeUnquote int
} }
// readIndex returns the position of the last byte read. // readIndex returns the position of the last byte read.
...@@ -314,13 +317,27 @@ func (d *decodeState) rescanLiteral() { ...@@ -314,13 +317,27 @@ func (d *decodeState) rescanLiteral() {
Switch: Switch:
switch data[i-1] { switch data[i-1] {
case '"': // string case '"': // string
// safeUnquote is initialized at -1, which means that all bytes
// checked so far can be unquoted at a later time with no work
// at all. When reaching the closing '"', if safeUnquote is
// still -1, all bytes can be unquoted with no work. Otherwise,
// only those bytes up until the first '\\' or non-ascii rune
// can be safely unquoted.
safeUnquote := -1
for ; i < len(data); i++ { for ; i < len(data); i++ {
switch data[i] { if c := data[i]; c == '\\' {
case '\\': if safeUnquote < 0 { // first unsafe byte
safeUnquote = int(i - d.off)
}
i++ // escaped char i++ // escaped char
case '"': } else if c == '"' {
d.safeUnquote = safeUnquote
i++ // tokenize the closing quote too i++ // tokenize the closing quote too
break Switch break Switch
} else if c >= utf8.RuneSelf {
if safeUnquote < 0 { // first unsafe byte
safeUnquote = int(i - d.off)
}
} }
} }
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
...@@ -674,7 +691,7 @@ func (d *decodeState) object(v reflect.Value) error { ...@@ -674,7 +691,7 @@ func (d *decodeState) object(v reflect.Value) error {
start := d.readIndex() start := d.readIndex()
d.rescanLiteral() d.rescanLiteral()
item := d.data[start:d.readIndex()] item := d.data[start:d.readIndex()]
key, ok := unquoteBytes(item) key, ok := d.unquoteBytes(item)
if !ok { if !ok {
panic(phasePanicMsg) panic(phasePanicMsg)
} }
...@@ -875,7 +892,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool ...@@ -875,7 +892,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())}) d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
return nil return nil
} }
s, ok := unquoteBytes(item) s, ok := d.unquoteBytes(item)
if !ok { if !ok {
if fromQuoted { if fromQuoted {
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type()) return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
...@@ -926,7 +943,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool ...@@ -926,7 +943,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
} }
case '"': // string case '"': // string
s, ok := unquoteBytes(item) s, ok := d.unquoteBytes(item)
if !ok { if !ok {
if fromQuoted { if fromQuoted {
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type()) return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
...@@ -1086,7 +1103,7 @@ func (d *decodeState) objectInterface() map[string]interface{} { ...@@ -1086,7 +1103,7 @@ func (d *decodeState) objectInterface() map[string]interface{} {
start := d.readIndex() start := d.readIndex()
d.rescanLiteral() d.rescanLiteral()
item := d.data[start:d.readIndex()] item := d.data[start:d.readIndex()]
key, ok := unquote(item) key, ok := d.unquote(item)
if !ok { if !ok {
panic(phasePanicMsg) panic(phasePanicMsg)
} }
...@@ -1135,7 +1152,7 @@ func (d *decodeState) literalInterface() interface{} { ...@@ -1135,7 +1152,7 @@ func (d *decodeState) literalInterface() interface{} {
return c == 't' return c == 't'
case '"': // string case '"': // string
s, ok := unquote(item) s, ok := d.unquote(item)
if !ok { if !ok {
panic(phasePanicMsg) panic(phasePanicMsg)
} }
...@@ -1178,38 +1195,26 @@ func getu4(s []byte) rune { ...@@ -1178,38 +1195,26 @@ func getu4(s []byte) rune {
// unquote converts a quoted JSON string literal s into an actual string t. // unquote converts a quoted JSON string literal s into an actual string t.
// The rules are different than for Go, so cannot use strconv.Unquote. // The rules are different than for Go, so cannot use strconv.Unquote.
func unquote(s []byte) (t string, ok bool) { // The first byte in s must be '"'.
s, ok = unquoteBytes(s) func (d *decodeState) unquote(s []byte) (t string, ok bool) {
s, ok = d.unquoteBytes(s)
t = string(s) t = string(s)
return return
} }
func unquoteBytes(s []byte) (t []byte, ok bool) { func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' { // We already know that s[0] == '"'. However, we don't know that the
// closing quote exists in all cases, such as when the string is nested
// via the ",string" option.
if len(s) < 2 || s[len(s)-1] != '"' {
return return
} }
s = s[1 : len(s)-1] s = s[1 : len(s)-1]
// Check for unusual characters. If there are none, // If there are no unusual characters, no unquoting is needed, so return
// then no unquoting is needed, so return a slice of the // a slice of the original bytes.
// original bytes. r := d.safeUnquote
r := 0 if r == -1 {
for r < len(s) {
c := s[r]
if c == '\\' || c == '"' || c < ' ' {
break
}
if c < utf8.RuneSelf {
r++
continue
}
rr, size := utf8.DecodeRune(s[r:])
if rr == utf8.RuneError && size == 1 {
break
}
r += size
}
if r == len(s) {
return s, true return s, true
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment