Commit a00d35fe authored by Marius Gedminas's avatar Marius Gedminas

Python 3: pickle bytestrings using SHORT_BINSTRING

This uses bytes_as_strings=True option introduced in zodbpickle 0.2 for
this purpose.

This way pickles produced on Python 3 are nearly the same as on Python 2.
There are some slight differences (Python 3 seems to perform more
memoizations which grows the size of some pickles by a couple of bytes),
but they're immaterial.

Now we can use zodbpickle's noload() on Python 3 to scan pickles for
persistent references.  We couldn't do that before, because Python 3
normally pickles byte strings as calls to codecs.encode(u'latin1-data',
'latin-1'), and noload() doesn't interpret the REDUCE opcode involved in
that representation.

Note that when you're pickling byte strings using bytes_as_strings=True,
you have to load them using encoding='bytes' (which breaks instances, so
cannot be used) or using errors='bytes' (which mean some bytestrings may
get unpickled as unicode instead).  I've tried hard to discover every
place that unpickles OIDs and added conversion to bytes in those places.

Applications dealing with binary data be prepared to handle bytestrings
that unexpectedly become unicode on unpickling.  That's the price of
Python 2 compatibility.
parent dc3ba252
......@@ -133,7 +133,7 @@ setup(name="ZODB",
'zc.lockfile',
'zdaemon >= 4.0.0a1',
'zope.interface',
] + (['zodbpickle'] if PY3 else []),
] + (['zodbpickle >= 0.2'] if PY3 else []),
zip_safe = False,
entry_points = """
[console_scripts]
......
......@@ -136,7 +136,7 @@ class PersistentReference(object):
# it. Fortunately, a class reference in a persistent
# reference is allowed to be a module+name tuple.
self.data = self.oid, klass.args
elif isinstance(data, bytes):
elif isinstance(data, (bytes, str)):
self.oid = data
else: # a list
reference_type = data[0]
......@@ -162,6 +162,10 @@ class PersistentReference(object):
assert len(data) == 1, 'unknown reference format'
self.oid = data[0]
self.weak = True
if not isinstance(self.oid, (bytes, type(None))):
assert isinstance(self.oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
self.oid = self.oid.encode('ascii')
def __cmp__(self, other):
if self is other or (
......
......@@ -120,6 +120,11 @@ class ExportImport:
if isinstance(ooid, tuple):
ooid, klass = ooid
if not isinstance(ooid, bytes):
assert isinstance(ooid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
ooid = ooid.encode('ascii')
if ooid in oids:
oid = oids[ooid]
else:
......
......@@ -33,6 +33,9 @@ checker = renormalizing.RENormalizing([
# this changes all the offsets in iterator.test
(re.compile('data.fs:207766'), 'data.fs:117080'),
(re.compile('data.fs:57991'), 'data.fs:35936'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile('data.fs:117679'), 'data.fs:117080'),
(re.compile('data.fs:36241'), 'data.fs:35936'),
])
def pack_keep_old():
......
......@@ -20,11 +20,17 @@ try:
except ImportError:
# Python 3.x: can't use stdlib's pickle because
# http://bugs.python.org/issue6784
from zodbpickle.pickle import Pickler, dump, dumps
from zodbpickle.pickle import Unpickler as _Unpickler, loads as _loads
import zodbpickle.pickle
from _compat_pickle import IMPORT_MAPPING, NAME_MAPPING
class Unpickler(_Unpickler):
class Pickler(zodbpickle.pickle.Pickler):
def __init__(self, f, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
super(Pickler, self).__init__(f, protocol, bytes_as_strings=True)
class Unpickler(zodbpickle.pickle.Unpickler):
def __init__(self, f):
super(Unpickler, self).__init__(f, encoding='ASCII', errors='bytes')
......@@ -38,11 +44,22 @@ except ImportError:
return super(Unpickler, self).find_class(modulename, name)
return self.find_global(modulename, name)
def dump(o, f, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
return zodbpickle.pickle.dump(o, f, protocol, bytes_as_strings=True)
def dumps(o, protocol=None):
if protocol:
# we want to be backwards-compatible with Python 2
assert 0 <= protocol < 3
return zodbpickle.pickle.dumps(o, protocol, bytes_as_strings=True)
def loads(s):
return _loads(s, encoding='ASCII', errors='bytes')
return zodbpickle.pickle.loads(s, encoding='ASCII', errors='bytes')
# XXX: overridable Unpickler.find_global as used in serialize.py?
# XXX: consistent spelling of inst_persistent_id/persistent_id?
# e.g. StorageTestBase and probably elsewhere
......
......@@ -63,6 +63,11 @@ def prefix_minus_one(s):
num = str2num(s)
return num2str(num - 1)
def ensure_bytes(s):
# on Python 3 we might pickle bytes and unpickle unicode strings
return s.encode('ascii') if not isinstance(s, bytes) else s
class fsIndex(object):
def __init__(self, data=None):
......@@ -85,14 +90,19 @@ class fsIndex(object):
def _setstate_0(self, state):
self.__dict__.clear()
self.__dict__.update(state)
self._data = OOBTree([
(ensure_bytes(k), v)
for (k, v) in self._data.items()
])
def _setstate_1(self, state):
self._data = OOBTree([
(k, fsBucket().fromString(v))
self._data = OOBTree([
(ensure_bytes(k), fsBucket().fromString(ensure_bytes(v)))
for (k, v) in state['_data']
])
def __getitem__(self, key):
assert isinstance(key, bytes)
return str2num(self._data[key[:6]][key[6:]])
def save(self, pos, fname):
......@@ -110,6 +120,10 @@ class fsIndex(object):
unpickler = Unpickler(f)
pos = unpickler.load()
if not isinstance(pos, int):
# NB: this might contain OIDs that got unpickled
# into Unicode strings on Python 3; hope the caller
# will pipe the result to fsIndex().update() to normalize
# the keys
return pos # Old format
index = class_()
data = index._data
......@@ -118,10 +132,11 @@ class fsIndex(object):
if not v:
break
k, v = v
data[k] = fsBucket().fromString(v)
data[ensure_bytes(k)] = fsBucket().fromString(ensure_bytes(v))
return dict(pos=pos, index=index)
def get(self, key, default=None):
assert isinstance(key, bytes)
tree = self._data.get(key[:6], default)
if tree is default:
return default
......@@ -131,6 +146,7 @@ class fsIndex(object):
return str2num(v)
def __setitem__(self, key, value):
assert isinstance(key, bytes)
value = num2str(value)
treekey = key[:6]
tree = self._data.get(treekey)
......@@ -140,6 +156,7 @@ class fsIndex(object):
tree[key[6:]] = value
def __delitem__(self, key):
assert isinstance(key, bytes)
treekey = key[:6]
tree = self._data.get(treekey)
if tree is None:
......@@ -156,13 +173,14 @@ class fsIndex(object):
def update(self, mapping):
for k, v in mapping.items():
self[k] = v
self[ensure_bytes(k)] = v
def has_key(self, key):
v = self.get(key, self)
return v is not self
def __contains__(self, key):
assert isinstance(key, bytes)
tree = self._data.get(key[:6])
if tree is None:
return False
......
......@@ -489,7 +489,7 @@ class ObjectReader:
def _persistent_load(self, reference):
if isinstance(reference, tuple):
return self.load_persistent(*reference)
elif isinstance(reference, bytes):
elif isinstance(reference, (bytes, str)):
return self.load_oid(reference)
else:
try:
......@@ -504,6 +504,11 @@ class ObjectReader:
# Quick instance reference. We know all we need to know
# to create the instance w/o hitting the db, so go for it!
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = self._cache.get(oid, None)
if obj is not None:
return obj
......@@ -538,6 +543,10 @@ class ObjectReader:
def load_persistent_weakref(self, oid, database_name=None):
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = WeakRef.__new__(WeakRef)
obj.oid = oid
if database_name is None:
......@@ -556,6 +565,10 @@ class ObjectReader:
loaders['w'] = load_persistent_weakref
def load_oid(self, oid):
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
obj = self._cache.get(oid, None)
if obj is not None:
return obj
......@@ -632,15 +645,9 @@ def referencesf(p, oids=None):
refs = []
u = Unpickler(BytesIO(p))
if sys.version_info[0] < 3:
u.persistent_load = refs
u.noload()
u.noload()
else:
# Py3: There is no `noload()` in Python 3.
u.persistent_load = refs.append
u.load()
u.load()
u.persistent_load = refs.append
u.noload()
u.noload()
# Now we have a list of referencs. Need to convert to list of
# oids:
......@@ -651,12 +658,17 @@ def referencesf(p, oids=None):
for reference in refs:
if isinstance(reference, tuple):
oid = reference[0]
elif isinstance(reference, bytes):
elif isinstance(reference, (bytes, str)):
oid = reference
else:
assert isinstance(reference, list)
continue
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
oids.append(oid)
return oids
......@@ -675,15 +687,9 @@ def get_refs(a_pickle):
refs = []
u = Unpickler(BytesIO(a_pickle))
if sys.version_info[0] < 3:
u.persistent_load = refs
u.noload()
u.noload()
else:
# Py3: There is no `noload()` in Python 3.
u.persistent_load = refs.append
u.load()
u.load()
u.persistent_load = refs.append
u.noload()
u.noload()
# Now we have a list of references. Need to convert to list of
# oids and class info:
......@@ -692,13 +698,18 @@ def get_refs(a_pickle):
for reference in refs:
if isinstance(reference, tuple):
data = reference
elif isinstance(reference, bytes):
data = reference, None
oid, klass = reference
elif isinstance(reference, (bytes, str)):
data, klass = reference, None
else:
assert isinstance(reference, list)
continue
result.append(data)
if not isinstance(oid, bytes):
assert isinstance(oid, str)
# this happens on Python 3 when all bytes in the oid are < 0x80
oid = oid.encode('ascii')
result.append((oid, klass))
return result
......@@ -58,6 +58,13 @@ class Object(object):
def getoid(self):
return self._oid
def __setstate__(self, state):
self.__dict__.clear()
self.__dict__.update(state)
if not isinstance(self._oid, bytes):
# Python 3
self._oid = self._oid.encode('ascii')
class C(Persistent):
pass
......@@ -89,7 +96,7 @@ def dumps(obj):
def pdumps(obj):
s = BytesIO()
p = Pickler(s)
p = Pickler(s, _protocol)
p.dump(obj)
p.dump(None)
return s.getvalue()
......
......@@ -82,6 +82,8 @@ checker = renormalizing.RENormalizing([
(re.compile(r'\bsize=65\b'), 'size=60'),
(re.compile(r'\offset=206\b'), 'offset=201'),
(re.compile(r'\bsize=156\b'), 'size=107'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile(r'\bsize=113\b'), 'size=107'),
])
......@@ -89,4 +91,5 @@ def test_suite():
return doctest.DocTestSuite(
setUp=zope.testing.setupstack.setUpDirectory,
tearDown=zope.testing.setupstack.tearDown,
optionflags=doctest.REPORT_NDIFF,
checker=ZODB.tests.util.checker + checker)
......@@ -11,11 +11,11 @@
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from pickle import Pickler, Unpickler
from ZODB.blob import Blob
from ZODB.DB import DB
from ZODB.FileStorage import FileStorage
from ZODB.tests.testConfig import ConfigTestBase
from ZODB._compat import Pickler, Unpickler
import os
if os.environ.get('USE_ZOPE_TESTING_DOCTEST'):
......
......@@ -183,8 +183,13 @@ checker = renormalizing.RENormalizing([
(re.compile(r'(PersistentMapping|OOBTree) at 206\b'), r'\1 at 201'),
(re.compile(r'(OOBTree) at 404\b'), r'\1 at 350'),
(re.compile(r'(PersistentMapping|OOBTree) at 531\b'), r'\1 at 477'),
# even with Pickler(bytes_as_strings=True) some of our pickles are larger
(re.compile(r'(OOBTree) at 361\b'), r'\1 at 350'),
(re.compile(r'\boffset=440\b'), 'offset=429'),
(re.compile(r'(PersistentMapping|OOBTree) at 488\b'), r'\1 at 477'),
])
def test_suite():
return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker)
return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker,
optionflags=doctest.REPORT_NDIFF)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment