Commit fb69a8ca authored by Kirill Smelkov's avatar Kirill Smelkov

go/*: Switch to StrictUnicode mode for pickling/unpickling

Because without StrictUnicode both bytestrings (str from py2) and
unicode (unicode from py2 and str from py3) load into the same Go type
string so it becomes impossible to distinguish them from each other.
Re-saving data, thus, also generally introduces changes as e.g.
string loaded via bytestring will be saved as unicode when pickling with
protocol 3. Or a loaded unicode will be saved as bytestring with
pickling via protocol=2.

-> Switching to StrictUnicode mode solves all those problems.

Please see updated documentatin for zodbpickle.go and
https://github.com/kisielk/og-rek/commit/b28613c2 for more details about
StrictUnicode mode.

For ZODB/go this is change in behaviour exposed to outside. However
there is currently only one known ZODB/go user - WCFS in Wendelin.core -
and that user will be updated correspondingly as well.
parent 3bbd2680
...@@ -23,7 +23,17 @@ ...@@ -23,7 +23,17 @@
// forking code from anywhere. // forking code from anywhere.
// //
// Use NewPickler/NewUnpickler to create pickle encoder and decoder // Use NewPickler/NewUnpickler to create pickle encoder and decoder
// correspondingly. // correspondingly. The decoder uses StrictUnicode=y mode to prevent loading
// *STRING and *UNICODE opcodes as the same type "string" on Go side and thus
// loosing information. The encoder uses StrictUnicode=y mode as well for
// symmetry so that decode/encode cycle is identity. As the result
//
// - py bytestring (str from py2) is represented as Go type ogórek.ByteString;
// - py unicode (unicode from py2 and str from py3) is represented as Go type string;
// - py bytes (bytes from py3 and zodbpickle.binary from py2) is represented as Go type ogórek.Bytes.
//
// At application level utilities like ogórek.AsBytes and ogórek.AsString are
// handy to work with unpickled data for pickles generated by either py2 or py3.
// //
// The encoder emits pickles with protocol=2 in order to support pristine python2. // The encoder emits pickles with protocol=2 in order to support pristine python2.
// //
...@@ -49,16 +59,21 @@ func NewPickler(w io.Writer, getref func(obj any) *pickle.Ref) *pickle.Encoder { ...@@ -49,16 +59,21 @@ func NewPickler(w io.Writer, getref func(obj any) *pickle.Ref) *pickle.Encoder {
// allow pristine python2 to decode the pickle. // allow pristine python2 to decode the pickle.
// TODO 2 -> 3 since ZODB5 switched to it and uses zodbpickle. // TODO 2 -> 3 since ZODB5 switched to it and uses zodbpickle.
Protocol: 2, // see top-level doc Protocol: 2, // see top-level doc
StrictUnicode: true, // see top-level doc
PersistentRef: getref, PersistentRef: getref,
}) })
} }
// NewUnpickler creates new pickle decoder. // NewUnpickler creates new pickle decoder.
// //
// The decoder adheres to semantic explained in top-level documentation of
// zodbpickle package.
//
// loadref, if !nil, will be used by the decoder to handle persistent // loadref, if !nil, will be used by the decoder to handle persistent
// references. See documentation of ogórek.DecoderConfig.PersistentLoad for details. // references. See documentation of ogórek.DecoderConfig.PersistentLoad for details.
func NewUnpickler(r io.Reader, loadref func(ref pickle.Ref) (any, error)) *pickle.Decoder { func NewUnpickler(r io.Reader, loadref func(ref pickle.Ref) (any, error)) *pickle.Decoder {
return pickle.NewDecoderWithConfig(r, &pickle.DecoderConfig{ return pickle.NewDecoderWithConfig(r, &pickle.DecoderConfig{
StrictUnicode: true, // see top-level doc
PersistentLoad: loadref, PersistentLoad: loadref,
}) })
} }
// Copyright (C) 2018-2021 Nexedi SA and Contributors. // Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com> // Kirill Smelkov <kirr@nexedi.com>
// //
// This program is free software: you can Use, Study, Modify and Redistribute // This program is free software: you can Use, Study, Modify and Redistribute
...@@ -31,6 +31,8 @@ import ( ...@@ -31,6 +31,8 @@ import (
"lab.nexedi.com/kirr/neo/go/transaction" "lab.nexedi.com/kirr/neo/go/transaction"
"lab.nexedi.com/kirr/neo/go/zodb" "lab.nexedi.com/kirr/neo/go/zodb"
_ "lab.nexedi.com/kirr/neo/go/zodb/wks" _ "lab.nexedi.com/kirr/neo/go/zodb/wks"
pickle "github.com/kisielk/og-rek"
) )
// kv is one (key, value) pair. // kv is one (key, value) pair.
...@@ -168,12 +170,12 @@ func TestBTree(t *testing.T) { ...@@ -168,12 +170,12 @@ func TestBTree(t *testing.T) {
} }
if !ok { if !ok {
t.Errorf("%s: get %v -> ø; want %v", tt.oid, kv.key, kv.value) t.Errorf("%s: get %v -> ø; want %#v", tt.oid, kv.key, kv.value)
continue continue
} }
if value != kv.value { if value != kv.value {
t.Errorf("%s: get %v -> %v; want %v", tt.oid, kv.key, value, kv.value) t.Errorf("%s: get %v -> %#v; want %#v", tt.oid, kv.key, value, kv.value)
} }
// XXX .next == nil // XXX .next == nil
...@@ -306,3 +308,9 @@ func TestBTree(t *testing.T) { ...@@ -306,3 +308,9 @@ func TestBTree(t *testing.T) {
t.Errorf("VMaxKey(): visit:\nhave: %v\nwant: %v", visitMax, visitMaxOK) t.Errorf("VMaxKey(): visit:\nhave: %v\nwant: %v", visitMax, visitMaxOK)
} }
} }
// ---- misc ----
// ztestdata_* use bstr
type bstr = pickle.ByteString
#!/usr/bin/env python2 #!/usr/bin/env python2
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors. # Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -91,6 +91,7 @@ def main2(): ...@@ -91,6 +91,7 @@ def main2():
for k, v in b.items(): for k, v in b.items():
if isinstance(v, str): if isinstance(v, str):
v = qq(v) v = qq(v)
v = "bstr(%s)" % v
elif isinstance(v, int): elif isinstance(v, int):
v = "int64(%d)" % v v = "int64(%d)" % v
else: else:
......
...@@ -5,10 +5,10 @@ package btree ...@@ -5,10 +5,10 @@ package btree
var smallTestv = [...]testEntry{ var smallTestv = [...]testEntry{
testEntry{oid: 7, kind: kindBucket, itemv: []kv{}}, testEntry{oid: 7, kind: kindBucket, itemv: []kv{}},
testEntry{oid: 4, kind: kindBucket, itemv: []kv{{10, int64(17)}, }}, testEntry{oid: 4, kind: kindBucket, itemv: []kv{{10, int64(17)}, }},
testEntry{oid: 1, kind: kindBucket, itemv: []kv{{15, int64(1)}, {23, "hello"}, }}, testEntry{oid: 1, kind: kindBucket, itemv: []kv{{15, int64(1)}, {23, bstr("hello")}, }},
testEntry{oid: 3, kind: kindBTree, itemv: []kv{}}, testEntry{oid: 3, kind: kindBTree, itemv: []kv{}},
testEntry{oid: 8, kind: kindBTree, itemv: []kv{{5, int64(4)}, }}, testEntry{oid: 8, kind: kindBTree, itemv: []kv{{5, int64(4)}, }},
testEntry{oid: 5, kind: kindBTree, itemv: []kv{{7, int64(3)}, {9, "world"}, }}, testEntry{oid: 5, kind: kindBTree, itemv: []kv{{7, int64(3)}, {9, bstr("world")}, }},
} }
const B3_oid = 6 const B3_oid = 6
......
...@@ -29,26 +29,9 @@ import ( ...@@ -29,26 +29,9 @@ import (
pickle "github.com/kisielk/og-rek" pickle "github.com/kisielk/og-rek"
) )
// Xstrbytes verifies and extacts str|bytes from unpickled value.
func Xstrbytes(x interface{}) (string, error) {
var s string
switch x := x.(type) {
default:
return "", fmt.Errorf("expect str|bytes; got %T", x)
case string:
s = x
case pickle.Bytes:
s = string(x)
}
return s, nil
}
// Xstrbytes8 verifies and extracts [8](str|bytes) from unpickled value as big-endian u64. // Xstrbytes8 verifies and extracts [8](str|bytes) from unpickled value as big-endian u64.
func Xstrbytes8(x interface{}) (uint64, error) { func Xstrbytes8(x interface{}) (uint64, error) {
s, err := Xstrbytes(x) s, err := pickle.AsBytes(x)
if err != nil { if err != nil {
return 0, err return 0, err
} }
......
...@@ -207,8 +207,10 @@ func xpyclass(xklass interface{}) (_ pickle.Class, err error) { ...@@ -207,8 +207,10 @@ func xpyclass(xklass interface{}) (_ pickle.Class, err error) {
if len(t) != 2 { if len(t) != 2 {
return pickle.Class{}, fmt.Errorf("xklass: expect [2](); got [%d]()", len(t)) return pickle.Class{}, fmt.Errorf("xklass: expect [2](); got [%d]()", len(t))
} }
modname, ok1 := t[0].(string) __, ok1 := t[0].(pickle.ByteString)
classname, ok2 := t[1].(string) modname := string(__)
__, ok2 := t[1].(pickle.ByteString)
classname := string(__)
if !(ok1 && ok2) { if !(ok1 && ok2) {
return pickle.Class{}, fmt.Errorf("xklass: expect (str, str); got (%T, %T)", t[0], t[1]) return pickle.Class{}, fmt.Errorf("xklass: expect (str, str); got (%T, %T)", t[0], t[1])
} }
......
...@@ -133,8 +133,8 @@ func (fsi *Index) Save(w io.Writer) (err error) { ...@@ -133,8 +133,8 @@ func (fsi *Index) Save(w io.Writer) (err error) {
if oidPrefix != oidPrefixCur || errStop != nil { if oidPrefix != oidPrefixCur || errStop != nil {
// emit (oid[0:6], oid[6:8]oid[6:8]...pos[2:8]pos[2:8]...) // emit (oid[0:6], oid[6:8]oid[6:8]...pos[2:8]pos[2:8]...)
binary.BigEndian.PutUint64(oidb[:], uint64(oidPrefixCur)) binary.BigEndian.PutUint64(oidb[:], uint64(oidPrefixCur))
t[0] = string(oidb[0:6]) t[0] = pickle.ByteString(oidb[0:6])
t[1] = string(bytes.Join([][]byte{oidBuf, posBuf}, nil)) t[1] = pickle.ByteString(bytes.Join([][]byte{oidBuf, posBuf}, nil))
err = p.Encode(pickle.Tuple(t[:])) err = p.Encode(pickle.Tuple(t[:]))
if err != nil { if err != nil {
return err return err
...@@ -280,7 +280,7 @@ loop: ...@@ -280,7 +280,7 @@ loop:
// decode oidPrefix // decode oidPrefix
xoidPrefixStr := v[0] xoidPrefixStr := v[0]
oidPrefixStr, ok := xoidPrefixStr.(string) oidPrefixStr, ok := xoidPrefixStr.(pickle.ByteString)
if !ok { if !ok {
return nil, fmt.Errorf("invalid oidPrefix: type %T", xoidPrefixStr) return nil, fmt.Errorf("invalid oidPrefix: type %T", xoidPrefixStr)
} }
...@@ -292,7 +292,7 @@ loop: ...@@ -292,7 +292,7 @@ loop:
// check fsBucket // check fsBucket
xkvStr := v[1] xkvStr := v[1]
kvStr, ok := xkvStr.(string) kvStr, ok := xkvStr.(pickle.ByteString)
if !ok { if !ok {
return nil, fmt.Errorf("invalid fsBucket: type %T", xkvStr) return nil, fmt.Errorf("invalid fsBucket: type %T", xkvStr)
} }
...@@ -301,7 +301,7 @@ loop: ...@@ -301,7 +301,7 @@ loop:
} }
// load btree from fsBucket entries // load btree from fsBucket entries
kvBuf := mem.Bytes(kvStr) kvBuf := mem.Bytes(string(kvStr))
n := len(kvBuf) / 8 n := len(kvBuf) / 8
oidBuf := kvBuf[:n*2] oidBuf := kvBuf[:n*2]
......
...@@ -152,7 +152,8 @@ func pktDecodeZ(pkb *pktBuf) (msg, error) { ...@@ -152,7 +152,8 @@ func pktDecodeZ(pkb *pktBuf) (msg, error) {
// XXX check flags are in range? // XXX check flags are in range?
m.flags = msgFlags(flags) m.flags = msgFlags(flags)
m.method, ok = tpkt[2].(string) __, ok := tpkt[2].(pickle.ByteString)
m.method = string(__)
if !ok { if !ok {
return m, derrf(".%d: method: got %T; expected str", m.msgid, tpkt[2]) return m, derrf(".%d: method: got %T; expected str", m.msgid, tpkt[2])
} }
...@@ -347,7 +348,7 @@ func (e encoding) xuint64Pack(v uint64) interface{} { ...@@ -347,7 +348,7 @@ func (e encoding) xuint64Pack(v uint64) interface{} {
case 'Z': case 'Z':
// pickle: -> str XXX do we need to emit bytes instead of str? // pickle: -> str XXX do we need to emit bytes instead of str?
return mem.String(b[:]) return pickle.ByteString(mem.String(b[:]))
case 'M': case 'M':
// msgpack: -> bin // msgpack: -> bin
...@@ -386,11 +387,11 @@ func (e encoding) asBytes(xb interface{}) ([]byte, bool) { ...@@ -386,11 +387,11 @@ func (e encoding) asBytes(xb interface{}) ([]byte, bool) {
case 'Z': case 'Z':
// pickle: str|bytes // pickle: str|bytes
s, err := pickletools.Xstrbytes(xb) s, err := pickle.AsBytes(xb)
if err != nil { if err != nil {
return nil, false return nil, false
} }
return mem.Bytes(s), true return mem.Bytes(string(s)), true
case 'M': case 'M':
// msgpack: bin // msgpack: bin
...@@ -407,7 +408,8 @@ func (e encoding) asString(xs interface{}) (string, bool) { ...@@ -407,7 +408,8 @@ func (e encoding) asString(xs interface{}) (string, bool) {
case 'Z': case 'Z':
// pickle: str // pickle: str
s, ok := xs.(string) __, ok := xs.(pickle.ByteString)
s := string(__)
return s, ok return s, ok
case 'M': case 'M':
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment