encoder: Fix protocol 0 UNICODE emission for invalid UTF-8

In 9daf6a2a (Fix UNICODE decoding) I fixed protocol 0 UNICODE decoding by implementing "raw-unicode-escape" decoder and switching unpickler to use that to match 1-to-1 what Python unpickler does. In 57f875fd (encoder: Fix protocol 0 UNICODE emission) I further fixed protocol 0 UNICODE encoding by implementing "raw-unicode-escape" encoder and switching pickler to use that, trying to match 1-to-1 what Python pickler does. However there is a difference in between Python unicode and unicode on Go side: in Python unicode is immutable array of UCS code points. On Go side, unicode is immutable array of bytes, that, similarly to Go string are treated as being mostly UTF-8. We did not pick e.g. []rune to represent unicode on Go side because []rune is not immutable and so cannot be used as map keys. So Go unicode can be either valid UTF-8 or invalid UTF-8. For valid UTF-8 case everything is working as intended: our raw-unicode-escape decoder, because it works in terms of UCS, can produce only valid UTF-8, while our raw-unicode-escape encoder also matches 1-to-1 what python does because for valid UTF-8 utf8.DecodeRuneInString gives good rune and the encoder further works in terms of UCS. However for the case of invalid UTF-8, there is a difference in between what Python and Go raw-unicode-escape encoders do: for Python this case is simply impossible because there input is []UCS. For the Go case, in 57f875fd I extended the encoder to do: // invalid UTF-8 -> emit byte as is case r == utf8.RuneError: out = append(out, s[0]) which turned out to be not very thoughtful and wrong because original raw-unicode-escape also emits UCS < 0x100 as those plain bytes and so if we also emit invalid UTF-8 as is then the following two inputs would be encoded into the same representation "\x93": unicode("\x93") // invalid UTF-8 unicode("\xc2\x93) // UTF-8 of \u93 which would break `decode/encode = identity` invariant and corrupt the data because when loaded back it will be "\xc2\x93" instead of original "\x93". -> Fix it by rejecting to encode such invalid UTF-8 via protocol 0 UNICODE. Unfortunately rejection is the only reasonable choice because raw-unicode-escape codec does not allow \xAA to be present in the output stream and so there is simply no way to represent arbitrary bytes there. Better to give explicit error instead of corrupting the data. For protocols ≥ 1 arbitrary unicode - both valid and invalid UTF-8 - can still be loaded and saved because *BINUNICODE opcodes come with bytestring argument and there is no need to decode/encode those bytestrings.

encoder: Fix protocol 0 UNICODE emission for invalid UTF-8
In 9daf6a2a (Fix UNICODE decoding) I fixed protocol 0 UNICODE decoding by implementing "raw-unicode-escape" decoder and switching unpickler to use that to match 1-to-1 what Python unpickler does. In 57f875fd (encoder: Fix protocol 0 UNICODE emission) I further fixed protocol 0 UNICODE encoding by implementing "raw-unicode-escape" encoder and switching pickler to use that, trying to match 1-to-1 what Python pickler does. However there is a difference in between Python unicode and unicode on Go side: in Python unicode is immutable array of UCS code points. On Go side, unicode is immutable array of bytes, that, similarly to Go string are treated as being mostly UTF-8. We did not pick e.g. []rune to represent unicode on Go side because []rune is not immutable and so cannot be used as map keys. So Go unicode can be either valid UTF-8 or invalid UTF-8. For valid UTF-8 case everything is working as intended: our raw-unicode-escape decoder, because it works in terms of UCS, can produce only valid UTF-8, while our raw-unicode-escape encoder also matches 1-to-1 what python does because for valid UTF-8 utf8.DecodeRuneInString gives good rune and the encoder further works in terms of UCS. However for the case of invalid UTF-8, there is a difference in between what Python and Go raw-unicode-escape encoders do: for Python this case is simply impossible because there input is []UCS. For the Go case, in 57f875fd I extended the encoder to do: // invalid UTF-8 -> emit byte as is case r == utf8.RuneError: out = append(out, s[0]) which turned out to be not very thoughtful and wrong because original raw-unicode-escape also emits UCS < 0x100 as those plain bytes and so if we also emit invalid UTF-8 as is then the following two inputs would be encoded into the same representation "\x93": unicode("\x93") // invalid UTF-8 unicode("\xc2\x93) // UTF-8 of \u93 which would break `decode/encode = identity` invariant and corrupt the data because when loaded back it will be "\xc2\x93" instead of original "\x93". -> Fix it by rejecting to encode such invalid UTF-8 via protocol 0 UNICODE. Unfortunately rejection is the only reasonable choice because raw-unicode-escape codec does not allow \xAA to be present in the output stream and so there is simply no way to represent arbitrary bytes there. Better to give explicit error instead of corrupting the data. For protocols ≥ 1 arbitrary unicode - both valid and invalid UTF-8 - can still be loaded and saved because *BINUNICODE opcodes come with bytestring argument and there is no need to decode/encode those bytestrings.
9ee383b6 · Kirill Smelkov · 57927121 · 9ee383b6 · 9ee383b6 · 9ee383b6
Commit 9ee383b6 authored Jul 12, 2024 by Kirill Smelkov
7 changed files
--- a/encode.go
+++ b/encode.go
@@ -366,6 +366,8 @@ func (e *Encoder) encodeString(s string) error {
 	return e.emitf("%c%s\n", opString, pyquote(s))
 }

+var errP0UnicodeUTF8Only = errors.New(`protocol 0: unicode: raw-unicode-escape cannot represent invalid UTF-8`)
+
 // encodeUnicode emits UTF-8 encoded string s as unicode pickle object.
 func (e *Encoder) encodeUnicode(s string) error {
 	// protocol >= 1  -> BINUNICODE*
@@ -392,7 +394,14 @@ func (e *Encoder) encodeUnicode(s string) error {
 	}

 	// protocol 0: UNICODE
-	return e.emitf("%c%s\n", opUnicode, pyencodeRawUnicodeEscape(s))
+	uesc, err := pyencodeRawUnicodeEscape(s)
+	if err != nil {
+		if err != errPyRawUnicodeEscapeInvalidUTF8 {
+			panic(err) // errPyRawUnicodeEscapeInvalidUTF8 is the only possible error
+		}
+		return errP0UnicodeUTF8Only
+	}
+	return e.emitf("%c%s\n", opUnicode, uesc)
 }

 func (e *Encoder) encodeFloat(f float64) error {

--- a/fuzz.go
+++ b/fuzz.go
@@ -38,6 +38,10 @@ func Fuzz(data []byte) int {
 				// we cannot encode non-string Ref at proto=0
 				continue

+			case proto == 0 && err == errP0UnicodeUTF8Only:
+				// we cannot encode non-UTF8 Unicode at proto=0
+				continue
+
 			case proto <= 3 && err == errP0123GlobalStringLineOnly:
 				// we cannot encode Class (GLOBAL opcode) with \n at proto <= 4
 				continue

--- a/fuzz/corpus/test-a445a25b1972d141441449ed7a7c374d434eb665.pickle
+++ b/fuzz/corpus/test-a445a25b1972d141441449ed7a7c374d434eb665.pickle
+.
\ No newline at end of file
--- a/fuzz/corpus/test-f5fb2a13d0ae173186a134f5003ff5c9f5fca1dc.pickle
+++ b/fuzz/corpus/test-f5fb2a13d0ae173186a134f5003ff5c9f5fca1dc.pickle
--- a/ogorek_test.go
+++ b/ogorek_test.go
@@ -141,6 +141,7 @@ var (
 	P0123 = PP(0,1,2,3)
 	P0_   = PP(0,1,2,3,4,5)
 	P12   = PP(  1,2)
+	P123  = PP(  1,2,3)
 	P1_   = PP(  1,2,3,4,5)
 	P23   = PP(    2,3)
 	P2_   = PP(    2,3,4,5)
@@ -265,6 +266,12 @@ var tests = []TestEntry{

 		// TODO BINUNICODE8

+	// NOTE loosy because *UNICODE currently decodes as string
+	Xloosy("unicode(non-utf8)", unicode("\x93"), "\x93",
+		P0(errP0UnicodeUTF8Only),       // UNICODE cannot represent non-UTF8 sequences
+		P123("X\x01\x00\x00\x00\x93."), // BINUNICODE
+		P4_("\x8c\x01\x93.")),          // SHORT_BINUNICODE
+
 	// str/unicode with many control characters at P0
 	// this exercises escape-based STRING/UNICODE coding


--- a/pyquote.go
+++ b/pyquote.go
 package ogórek

 import (
+	"errors"
 	"fmt"
 	"strconv"
 	"unicode/utf8"
@@ -141,7 +142,10 @@ loop:
 	return string(out), nil
 }

-// pyencodeRawUnicodeEscape encodes input according to "raw-unicode-escape" Python codec..
+// errPyRawUnicodeEscapeInvalidUTF8 is returned by pyencodeRawUnicodeEscape on invalid UTF-8 input.
+var errPyRawUnicodeEscapeInvalidUTF8 = errors.New("pyencodeRawUnicodeEscape: invalid UTF-8")
+
+// pyencodeRawUnicodeEscape encodes input according to "raw-unicode-escape" Python codec.
 //
 // It is somewhat similar to escaping done by strconv.QuoteToASCII but uses
 // only "\u" and "\U", not e.g. \n or \xAA.
@@ -149,8 +153,13 @@ loop:
 // This encoding - not Go quoting - must be used when emitting unicode text
 // for UNICODE opcode argument.
 //
+// Since \xAA is not allowed to be present in the output stream it is not
+// possible to encode invalid UTF-8 input - errPyRawUnicodeEscapeInvalidUTF8 is
+// returned in such case. Otherwise the encoding always succeeds and
+// errPyRawUnicodeEscapeInvalidUTF8 is the only possible returned error.
+//
 // Please see pydecodeRawUnicodeEscape for details on the codec.
-func pyencodeRawUnicodeEscape(s string) string {
+func pyencodeRawUnicodeEscape(s string) (string, error) {
 	out := make([]byte, 0, len(s))

 	for {
@@ -160,9 +169,9 @@ func pyencodeRawUnicodeEscape(s string) string {
 		}

 		switch {
-		// invalid UTF-8 -> emit byte as is
+		// invalid UTF-8 -> cannot encode
 		case r == utf8.RuneError:
-			out = append(out, s[0])
+			return "", errPyRawUnicodeEscapeInvalidUTF8

 		// not strictly needed for encoding to "raw-unicode-escape", but pickle does it
 		case r == '\\' || r == '\n':
@@ -189,7 +198,7 @@ func pyencodeRawUnicodeEscape(s string) string {
 		s = s[width:]
 	}

-	return string(out)
+	return string(out), nil
 }

 // pydecodeRawUnicodeEscape decodes input according to "raw-unicode-escape" Python codec.

--- a/pyquote_test.go
+++ b/pyquote_test.go
@@ -6,22 +6,23 @@ import (

 // CodecTestCase represents 1 test case of a coder or decoder.
 //
-// Under the given transformation function in must be transformed to out.
+// Under the given transformation function in must be transformed to outOK.
 type CodecTestCase struct {
-	in, out string
+	in    string
+	outOK interface{} // string | error
 }

 // testCodec tests transform func applied to all test cases from testv.
 func testCodec(t *testing.T, transform func(in string)(string, error), testv []CodecTestCase) {
 	for _, tt := range testv {
 		s, err := transform(tt.in)
+		var out interface{} = s
 		if err != nil {
-			t.Errorf("%q -> error: %s", tt.in, err)
-			continue
+			out = err
 		}

-		if s != tt.out {
-			t.Errorf("%q -> unexpected:\nhave: %q\nwant: %q", tt.in, s, tt.out)
+		if out != tt.outOK {
+			t.Errorf("%q -> unexpected:\nhave: %#v\nwant: %#v", tt.in, out, tt.outOK)
 		}
 	}
 }
@@ -51,11 +52,10 @@ func TestPyDecodeStringEscape(t *testing.T) {
 }

 func TestPyEncodeRawUnicodeEscape(t *testing.T) {
-	testCodec(t, func(in string) (string, error) {
-		return pyencodeRawUnicodeEscape(in), nil
-	}, []CodecTestCase{
-		{"\xc3\x28", "\xc3\x28"}, // invalid UTF-8
-		{"\x00\x01\x80\xfe\xffabc", "\x00\x01\x80\xfe\xffabc"},
+	testCodec(t, pyencodeRawUnicodeEscape, []CodecTestCase{
+		{"\x93", errPyRawUnicodeEscapeInvalidUTF8},     // invalid UTF-8
+		{"\xc3\x28", errPyRawUnicodeEscapeInvalidUTF8}, // invalid UTF-8
+		{"\x00\x01abc", "\x00\x01abc"},
 		{`\`, `\u005c`},
 		{"\n", `\u000a`},
 		{`"'`, `"'`},
@@ -63,6 +63,7 @@ func TestPyEncodeRawUnicodeEscape(t *testing.T) {
 		{"hello\nÐ¼Ð¸Ñ\u0080\x01", `hello\u000aмир`+"\x01"},
 		{"\u1234\U00004321", `\u1234\u4321`},
 		{"\U00012345", `\U00012345`},
+		{"\u007f\u0080\u0093\u00ff", "\x7f\x80\x93\xff"},
 	})
 }