Commit 57f875fd authored by Kirill Smelkov's avatar Kirill Smelkov Committed by Kamil Kisiel

encoder: Fix protocol 0 UNICODE emission

Previously, we were quoting UNICODE opcode argument with strconv.QuoteToASCII().
However that function, in addition to \u and \U escapes, can produce
e.g. \n, \r, \xAA etc escapes. And all of the latter variants are not
treated as special escapes of a unicode literal by Python, thus leading
to data being wrongly received.

Fix it by doing exactly the same that Python pickle encoder does - the
UNICODE argument comes as "raw-unicode-escape" encoded.

This patch contains only codec tests - not end-to-end pickle tests,
because currently Encoder.encodeUnicode() is called only from under
Encoder.encodeString(), and there only from under

	if e.config.Protocol >= 3

We will indirectly add tests for encodeUnicode @ protocol=0 in the next
patches, while adding support for Python bytes.
parent 9936cf9d
...@@ -8,7 +8,6 @@ import ( ...@@ -8,7 +8,6 @@ import (
"math" "math"
"math/big" "math/big"
"reflect" "reflect"
"strconv"
"strings" "strings"
) )
...@@ -327,9 +326,7 @@ func (e *Encoder) encodeUnicode(s string) error { ...@@ -327,9 +326,7 @@ func (e *Encoder) encodeUnicode(s string) error {
} }
// protocol 0: UNICODE // protocol 0: UNICODE
us := strconv.QuoteToASCII(s) // "hello\nмир" -> `"hello\n\u043c\u0438\u0440"` return e.emitf("%c%s\n", opUnicode, pyencodeRawUnicodeEscape(s))
us = us[1 : len(us)-1] // -> `hello\n\u043c\u0438\u0440`
return e.emitf("%c%s\n", opUnicode, us)
} }
func (e *Encoder) encodeFloat(f float64) error { func (e *Encoder) encodeFloat(f float64) error {
......
...@@ -6,6 +6,8 @@ import ( ...@@ -6,6 +6,8 @@ import (
"unicode/utf8" "unicode/utf8"
) )
const hexdigits = "0123456789abcdef"
// pyquote, similarly to strconv.Quote, quotes s with " but does not use "\u" and "\U" inside. // pyquote, similarly to strconv.Quote, quotes s with " but does not use "\u" and "\U" inside.
// //
// We need to avoid \u and friends, since for regular strings Python translates // We need to avoid \u and friends, since for regular strings Python translates
...@@ -17,7 +19,6 @@ import ( ...@@ -17,7 +19,6 @@ import (
// Dumping strings in a way that is possible to copy/paste into Python and use // Dumping strings in a way that is possible to copy/paste into Python and use
// pickletools.dis and pickle.loads there to verify a pickle is also handy. // pickletools.dis and pickle.loads there to verify a pickle is also handy.
func pyquote(s string) string { func pyquote(s string) string {
const hexdigits = "0123456789abcdef"
out := make([]byte, 0, len(s)) out := make([]byte, 0, len(s))
for { for {
...@@ -140,6 +141,57 @@ loop: ...@@ -140,6 +141,57 @@ loop:
return string(out), nil return string(out), nil
} }
// pyencodeRawUnicodeEscape encodes input according to "raw-unicode-escape" Python codec.
//
// It is somewhat similar to escaping done by strconv.QuoteToASCII but uses
// only "\u" and "\U", not e.g. \n or \xAA.
//
// This encoding - not Go quoting - must be used when emitting unicode text
// for UNICODE opcode argument.
//
// Bytes of s that do not form valid UTF-8 are copied to the output as is.
//
// Please see pydecodeRawUnicodeEscape for details on the codec.
func pyencodeRawUnicodeEscape(s string) string {
	out := make([]byte, 0, len(s))

	for len(s) > 0 {
		r, width := utf8.DecodeRuneInString(s)

		switch {
		// invalid UTF-8 -> emit byte as is.
		//
		// DecodeRuneInString reports an invalid byte as (RuneError, 1), while
		// a well-formed U+FFFD decodes to (RuneError, 3). Only the former is
		// invalid input; the latter must fall through and be escaped as
		// \ufffd instead of being truncated to its first byte.
		case r == utf8.RuneError && width == 1:
			out = append(out, s[0])

		// not strictly needed for encoding to "raw-unicode-escape", but pickle does it
		case r == '\\' || r == '\n':
			out = append(out, `\u00`...)
			out = append(out, hexdigits[r>>4], hexdigits[r&0xf])

		// outside of BMP -> \U + 8 hex digits
		case r >= 0x10000:
			out = append(out, `\U`...)
			for i := (8 - 1) * 4; i >= 0; i -= 4 {
				out = append(out, hexdigits[(r>>uint(i))&0xf])
			}

		// does not fit into 1 byte -> \u + 4 hex digits
		case r >= 0x100:
			out = append(out, `\u`...)
			for i := (4 - 1) * 4; i >= 0; i -= 4 {
				out = append(out, hexdigits[(r>>uint(i))&0xf])
			}

		// rune <= 0xff -> emit via 1 raw byte
		default:
			out = append(out, byte(r))
		}

		s = s[width:]
	}

	return string(out)
}
// pydecodeRawUnicodeEscape decodes input according to "raw-unicode-escape" Python codec. // pydecodeRawUnicodeEscape decodes input according to "raw-unicode-escape" Python codec.
// //
// The codec is essentially defined here: // The codec is essentially defined here:
......
...@@ -50,6 +50,22 @@ func TestPyDecodeStringEscape(t *testing.T) { ...@@ -50,6 +50,22 @@ func TestPyDecodeStringEscape(t *testing.T) {
}) })
} }
// TestPyEncodeRawUnicodeEscape verifies pyencodeRawUnicodeEscape against a
// table of input -> expected "raw-unicode-escape" encoded output pairs.
func TestPyEncodeRawUnicodeEscape(t *testing.T) {
	testCodec(t, func(in string) (string, error) {
		return pyencodeRawUnicodeEscape(in), nil
	}, []CodecTestCase{
		{"\xc3\x28", "\xc3\x28"}, // invalid UTF-8
		{"\x00\x01\x80\xfe\xffabc", "\x00\x01\x80\xfe\xffabc"},
		{`\`, `\u005c`},
		{"\n", `\u000a`},
		{`"'`, `"'`},
		{"hello\nмир", `hello\u000a\u043c\u0438\u0440`},
		// The input runes are the UTF-8 bytes of "мир" (d0 bc d0 b8 d1 80)
		// read as Latin-1 characters. Every such rune is <= 0xff and is
		// emitted as 1 raw byte, so the output is the UTF-8 bytes of "мир".
		// The runes are spelled with \u escapes so that the literal cannot
		// be silently altered by text re-encoding.
		{"hello\n\u00d0\u00bc\u00d0\u00b8\u00d1\u0080\x01", `hello\u000aмир` + "\x01"},
		{"\u1234\U00004321", `\u1234\u4321`},
		{"\U00012345", `\U00012345`},
	})
}
func TestPyDecodeRawUnicodeEscape(t *testing.T) { func TestPyDecodeRawUnicodeEscape(t *testing.T) {
testCodec(t, pydecodeRawUnicodeEscape, []CodecTestCase{ testCodec(t, pydecodeRawUnicodeEscape, []CodecTestCase{
{`hello`, "hello"}, {`hello`, "hello"},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment