strconv: Teach UnquoteChar to distinguish unexpected EOF from syntax errors

This is needed to get proper error reporting when UnquoteChar is used to decode an input stream step by step: if the input is a truncated version of a valid character escape, always returning ErrSyntax prevents the caller from determining whether it hit a real syntax error or an unexpected end of stream. Unquote's error behaviour is preserved: it still always returns ErrSyntax, because Unquote operates on the whole input at once - not on a stream - so anything wrong there really is a syntax error. Since UnquoteChar is used internally by Unquote, and the existing Unquote tests already cover all kinds of valid input, we only need to add tests covering UnquoteChar's error behaviour. Unfortunately, the existing Unquote error-behaviour tests cannot easily be reused for this. P.S. My original use case for this change is ogórek, where UnquoteChar is used to decode Unicode strings encoded in Python pickles: https://github.com/kisielk/og-rek/blob/c7dbf2e4/ogorek.go#L530 Change-Id: I611e7f5795560da488396bc93135a81a56482b75

strconv: Teach UnquoteChar to distinguish unexpected EOF from syntax errors
This is needed to get proper error reporting when UnquoteChar is used to decode an input stream step by step: if the input is a truncated version of a valid character escape, always returning ErrSyntax prevents the caller from determining whether it hit a real syntax error or an unexpected end of stream. Unquote's error behaviour is preserved: it still always returns ErrSyntax, because Unquote operates on the whole input at once - not on a stream - so anything wrong there really is a syntax error. Since UnquoteChar is used internally by Unquote, and the existing Unquote tests already cover all kinds of valid input, we only need to add tests covering UnquoteChar's error behaviour. Unfortunately, the existing Unquote error-behaviour tests cannot easily be reused for this. P.S. My original use case for this change is ogórek, where UnquoteChar is used to decode Unicode strings encoded in Python pickles: https://github.com/kisielk/og-rek/blob/c7dbf2e4/ogorek.go#L530 Change-Id: I611e7f5795560da488396bc93135a81a56482b75
5c39341f · Kirill Smelkov · 79fab70a · 5c39341f · 5c39341f
Commit 5c39341f authored Feb 15, 2017 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 50 additions and 5 deletions

src/strconv/quote.go src/strconv/quote.go +9 -5

src/strconv/quote_test.go src/strconv/quote_test.go +41 -0

No files found.
--- a/src/strconv/quote.go
+++ b/src/strconv/quote.go
@@ -6,7 +6,10 @@

 package strconv

-import "unicode/utf8"
+import (
+	"io"
+	"unicode/utf8"
+)

 const lowerhex = "0123456789abcdef"

@@ -250,7 +253,7 @@ func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string,

 	// hard case: c is backslash
 	if len(s) <= 1 {
-		err = ErrSyntax
+		err = io.ErrUnexpectedEOF
 		return
 	}
 	c := s[1]
@@ -283,7 +286,7 @@ func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string,
 		}
 		var v rune
 		if len(s) < n {
-			err = ErrSyntax
+			err = io.ErrUnexpectedEOF
 			return
 		}
 		for j := 0; j < n; j++ {
@@ -309,7 +312,7 @@ func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string,
 	case '0', '1', '2', '3', '4', '5', '6', '7':
 		v := rune(c) - '0'
 		if len(s) < 2 {
-			err = ErrSyntax
+			err = io.ErrUnexpectedEOF
 			return
 		}
 		for j := 0; j < 2; j++ { // one digit already; two more
@@ -399,7 +402,8 @@ func Unquote(s string) (string, error) {
 	for len(s) > 0 {
 		c, multibyte, ss, err := UnquoteChar(s, quote)
 		if err != nil {
-			return "", err
+			// turn any error from UnquoteChar into syntax error
+			return "", ErrSyntax
 		}
 		s = ss
 		if c < utf8.RuneSelf || !multibyte {

--- a/src/strconv/quote_test.go
+++ b/src/strconv/quote_test.go
@@ -5,6 +5,7 @@
 package strconv_test

 import (
+	"io"
 	. "strconv"
 	"testing"
 	"unicode"
@@ -305,6 +306,29 @@ var misquoted = []string{
 	"'\n'",
 }

+type unQuoteCharErrTest struct {
+	in  string
+	err error
+}
+
+var misquotedChars = []unQuoteCharErrTest{
+	{`\000`, nil}, // nil means UnquoteChar should succeed on the full input; each
+	{`\x00`, nil}, // truncated prefix is then expected to yield io.ErrUnexpectedEOF
+	{`\u0000`, nil},
+	{`\U00000000`, nil},
+
+	{`"`, ErrSyntax},
+	{`\'`, ErrSyntax},
+	{`\q`, ErrSyntax},
+	{`\z`, ErrSyntax},
+	{`\008`, ErrSyntax},
+	{`\400`, ErrSyntax},
+	{`\x0z`, ErrSyntax},
+	{`\u000z`, ErrSyntax},
+	{`\U0000000z`, ErrSyntax},
+	{`\U12345678`, ErrSyntax},
+}
+
 func TestUnquote(t *testing.T) {
 	for _, tt := range unquotetests {
 		if out, err := Unquote(tt.in); err != nil || out != tt.out {
@@ -324,6 +348,23 @@ func TestUnquote(t *testing.T) {
 			t.Errorf("Unquote(%#q) = %q, %v want %q, %v", s, out, err, "", ErrSyntax)
 		}
 	}
+
+	for _, tt := range misquotedChars {
+		_, _, _, err := UnquoteChar(tt.in, '"')
+		if err != tt.err {
+			t.Errorf("UnquoteChar(%#q) -> err = %v want %v", tt.in, err, tt.err)
+		}
+
+		if tt.err == nil {
+			// truncated valid input should result in unexpected EOF
+			for l := len(tt.in) - 1; l > 0; l-- {
+				_, _, _, err2 := UnquoteChar(tt.in[:l], '"')
+				if err2 != io.ErrUnexpectedEOF {
+					t.Errorf("UnquoteChar(%#q) -> err = %v want %v", tt.in[:l], err2, io.ErrUnexpectedEOF)
+				}
+			}
+		}
+	}
 }

 func BenchmarkUnquoteEasy(b *testing.B) {