Commit e7d96969 authored by Kirill Smelkov, committed by Kamil Kisiel

encoder: Fix string wrt protocol version

- we can use BINSTRING* only if protocol >= 1;
- at protocol 0 we thus have to use text STRING;
- if protocol >= 3 we have to emit the string as unicode pickle object
  the same way as Python3 does. If we don't, Python3 will generally
  not be able to load our pickle:

	In [1]: s = b'U\x06\xd0\xbc\xd0\xb8\xd1\x80q\x00.'

  	In [2]: from pickletools import dis

  	In [3]: dis(s)
  	    0: U    SHORT_BINSTRING 'миÑ\x80'
  	    8: q    BINPUT     0
  	   10: .    STOP
  	highest protocol among opcodes = 1

  	In [4]: import pickle

  	In [5]: pickle.loads(s)
  	---------------------------------------------------------------------------
  	UnicodeDecodeError                        Traceback (most recent call last)
  	<ipython-input-5-764e4625bc41> in <module>()
  	----> 1 pickle.loads(s)

  	UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)

We already decode unicode pickle objects into string, so
decode(encode(string)) always returns the original string.
parent 98fb1987
...@@ -8,6 +8,7 @@ import ( ...@@ -8,6 +8,7 @@ import (
"math" "math"
"math/big" "math/big"
"reflect" "reflect"
"strconv"
"strings" "strings"
) )
...@@ -259,9 +260,20 @@ func (e *Encoder) encodeBool(b bool) error { ...@@ -259,9 +260,20 @@ func (e *Encoder) encodeBool(b bool) error {
} }
func (e *Encoder) encodeBytes(byt []byte) error { func (e *Encoder) encodeBytes(byt []byte) error {
return e.encodeString(string(byt))
}
func (e *Encoder) encodeString(s string) error {
// protocol >= 3 -> encode string as unicode object
// (as python3 does)
if e.config.Protocol >= 3 {
return e.encodeUnicode(s)
}
l := len(byt) l := len(s)
// protocol >= 1 -> BINSTRING*
if e.config.Protocol >= 1 {
if l < 256 { if l < 256 {
err := e.emit(opShortBinstring, byte(l)) err := e.emit(opShortBinstring, byte(l))
if err != nil { if err != nil {
...@@ -277,7 +289,44 @@ func (e *Encoder) encodeBytes(byt []byte) error { ...@@ -277,7 +289,44 @@ func (e *Encoder) encodeBytes(byt []byte) error {
} }
} }
return e.emitb(byt) return e.emits(s)
}
// protocol 0: STRING
// XXX Python uses both ' and " for quoting - we quote with " only.
// XXX -> use https://godoc.org/lab.nexedi.com/kirr/go123/xfmt#AppendQuotePy ?
return e.emitf("%c%q\n", opString, s)
}
// encodeUnicode emits UTF-8 encoded string s as unicode pickle object.
func (e *Encoder) encodeUnicode(s string) error {
// protocol >= 1 -> BINUNICODE*
if e.config.Protocol >= 1 {
l := len(s)
// protocol >= 4 -> SHORT_BINUNICODE
if l < 256 && e.config.Protocol >= 4 {
err := e.emit(opShortBinUnicode, byte(l))
if err != nil {
return err
}
} else {
var b = [1+4]byte{opBinunicode}
binary.LittleEndian.PutUint32(b[1:], uint32(l))
err := e.emitb(b[:])
if err != nil {
return err
}
}
return e.emits(s)
}
// protocol 0: UNICODE
us := strconv.QuoteToASCII(s) // "hello\nмир" -> `"hello\n\u043c\u0438\u0440"`
us = us[1 : len(us)-1] // -> `hello\n\u043c\u0438\u0440`
return e.emitf("%c%s\n", opUnicode, us)
} }
func (e *Encoder) encodeFloat(f float64) error { func (e *Encoder) encodeFloat(f float64) error {
...@@ -360,10 +409,6 @@ func (e *Encoder) encodeMap(m reflect.Value) error { ...@@ -360,10 +409,6 @@ func (e *Encoder) encodeMap(m reflect.Value) error {
return nil return nil
} }
func (e *Encoder) encodeString(s string) error {
return e.encodeBytes([]byte(s))
}
func (e *Encoder) encodeCall(v *Call) error { func (e *Encoder) encodeCall(v *Call) error {
err := e.emitf("%c%s\n%s\n", opGlobal, v.Callable.Module, v.Callable.Name) err := e.emitf("%c%s\n%s\n", opGlobal, v.Callable.Module, v.Callable.Name)
if err != nil { if err != nil {
......
...@@ -215,15 +215,36 @@ var tests = []TestEntry{ ...@@ -215,15 +215,36 @@ var tests = []TestEntry{
I("(lp0\nI1\naI2\naI3\naI01\na.")), I("(lp0\nI1\naI2\naI3\naI01\na.")),
X("str('abc')", "abc", X("str('abc')", "abc",
I("S'abc'\np0\n.")), P0("S\"abc\"\n."), // STRING
P12("U\x03abc."), // SHORT_BINSTRING
P3("X\x03\x00\x00\x00abc."), // BINUNICODE
P4_("\x8c\x03abc."), // SHORT_BINUNICODE
I("T\x03\x00\x00\x00abc."), // BINSTRING
I("S'abc'\np0\n."),
I("S'abc'\n.")),
X("unicode('日本語')", "日本語", X("unicode('日本語')", "日本語",
I("\x8c\t\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\x94."), // SHORT_BINUNICODE P0("S\"日本語\"\n."), // STRING
I("V\\u65e5\\u672c\\u8a9e\np0\n.")), // UNICODE P12("U\x09日本語."), // SHORT_BINSTRING
P3("X\x09\x00\x00\x00日本語."), // BINUNICODE
P4_("\x8c\x09\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e."), // SHORT_BINUNICODE
I("V\\u65e5\\u672c\\u8a9e\np0\n."), // UNICODE
I("X\x09\x00\x00\x00\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e.")), // BINUNICODE
// TODO BINUNICODE8
X("unicode('\\' 知事少时烦恼少、识人多处是非多。')", "' 知事少时烦恼少、识人多处是非多。", X("unicode('\\' 知事少时烦恼少、识人多处是非多。')", "' 知事少时烦恼少、识人多处是非多。",
// UNICODE // UNICODE
I("V' \\u77e5\\u4e8b\\u5c11\\u65f6\\u70e6\\u607c\\u5c11\\u3001\\u8bc6\\u4eba\\u591a\\u5904\\u662f\\u975e\\u591a\\u3002\n.")), I("V' \\u77e5\\u4e8b\\u5c11\\u65f6\\u70e6\\u607c\\u5c11\\u3001\\u8bc6\\u4eba\\u591a\\u5904\\u662f\\u975e\\u591a\\u3002\n."),
// BINUNICODE
P3("X\x32\x00\x00\x00' \xe7\x9f\xa5\xe4\xba\x8b\xe5\xb0\x91\xe6\x97\xb6\xe7\x83\xa6\xe6\x81\xbc\xe5\xb0\x91\xe3\x80\x81\xe8\xaf\x86\xe4\xba\xba\xe5\xa4\x9a\xe5\xa4\x84\xe6\x98\xaf\xe9\x9d\x9e\xe5\xa4\x9a\xe3\x80\x82."),
// SHORT_BINUNICODE
P4_("\x8c\x32' \xe7\x9f\xa5\xe4\xba\x8b\xe5\xb0\x91\xe6\x97\xb6\xe7\x83\xa6\xe6\x81\xbc\xe5\xb0\x91\xe3\x80\x81\xe8\xaf\x86\xe4\xba\xba\xe5\xa4\x9a\xe5\xa4\x84\xe6\x98\xaf\xe9\x9d\x9e\xe5\xa4\x9a\xe3\x80\x82.")),
// TODO BINUNICODE8
X("dict({})", make(map[interface{}]interface{}), X("dict({})", make(map[interface{}]interface{}),
I("(dp0\n.")), I("(dp0\n.")),
...@@ -239,11 +260,15 @@ var tests = []TestEntry{ ...@@ -239,11 +260,15 @@ var tests = []TestEntry{
X(`persref("abc")`, Ref{"abc"}, X(`persref("abc")`, Ref{"abc"},
P0("Pabc\n."), // PERSID P0("Pabc\n."), // PERSID
P12("U\x03abcQ.")), // SHORT_BINSTRING + BINPERSID P12("U\x03abcQ."), // SHORT_BINSTRING + BINPERSID
P3("X\x03\x00\x00\x00abcQ."), // BINUNICODE + BINPERSID
P4_("\x8c\x03abcQ.")), // SHORT_BINUNICODE + BINPERSID
X(`persref("abc\nd")`, Ref{"abc\nd"}, X(`persref("abc\nd")`, Ref{"abc\nd"},
P0(errP0PersIDStringLineOnly), // cannot be encoded P0(errP0PersIDStringLineOnly), // cannot be encoded
P12("U\x05abc\ndQ.")), // SHORT_BINSTRING + BINPERSID P12("U\x05abc\ndQ."), // SHORT_BINSTRING + BINPERSID
P3("X\x05\x00\x00\x00abc\ndQ."), // BINUNICODE + BINPERSID
P4_("\x8c\x05abc\ndQ.")), // SHORT_BINUNICODE + BINPERSID
X(`persref((1, 2))`, Ref{Tuple{int64(1), int64(2)}}, X(`persref((1, 2))`, Ref{Tuple{int64(1), int64(2)}},
P0(errP0PersIDStringLineOnly), // cannot be encoded P0(errP0PersIDStringLineOnly), // cannot be encoded
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment