From a00d35fe1873357bc621fc29013b6e2ca1bd0b0f Mon Sep 17 00:00:00 2001 From: Marius Gedminas <marius@gedmin.as> Date: Tue, 5 Mar 2013 17:03:49 +0200 Subject: [PATCH] Python 3: pickle bytestrings using SHORT_BINSTRING This uses bytes_as_strings=True option introduced in zodbpickle 0.2 for this purpose. This way pickles produced on Python 3 are nearly the same as on Python 2. There are some slight differences (Python 3 seems to perform more memoizations which grows the size of some pickles by a couple of bytes), but they're immaterial. Now we can use zodbpickle's noload() on Python 3 to scan pickles for persistent references. We couldn't do that before, because Python 3 normally pickles byte strings as calls to codecs.encode(u'latin1-data', 'latin-1'), and noload() doesn't interpret the REDUCE opcode involved in that representation. Note that when you're pickling byte strings using bytes_as_strings=True, you have to load them using encoding='bytes' (which breaks instances, so cannot be used) or using errors='bytes' (which mean some bytestrings may get unpickled as unicode instead). I've tried hard to discover every place that unpickles OIDs and added conversion to bytes in those places. Applications dealing with binary data be prepared to handle bytestrings that unexpectedly become unicode on unpickling. That's the price of Python 2 compatibility. --- setup.py | 2 +- src/ZODB/ConflictResolution.py | 6 +++- src/ZODB/ExportImport.py | 5 +++ src/ZODB/FileStorage/tests.py | 3 ++ src/ZODB/_compat.py | 27 +++++++++++--- src/ZODB/fsIndex.py | 26 +++++++++++--- src/ZODB/serialize.py | 59 ++++++++++++++++++------------- src/ZODB/tests/PackableStorage.py | 9 ++++- src/ZODB/tests/test_fsdump.py | 3 ++ src/ZODB/tests/testblob.py | 2 +- src/ZODB/tests/testfsoids.py | 7 +++- 11 files changed, 111 insertions(+), 38 deletions(-) diff --git a/setup.py b/setup.py index 487958c8..0cdea042 100644 --- a/setup.py +++ b/setup.py @@ -133,7 +133,7 @@ setup(name="ZODB", 'zc.lockfile', 'zdaemon >= 4.0.0a1', 'zope.interface', - ] + (['zodbpickle'] if PY3 else []), + ] + (['zodbpickle >= 0.2'] if PY3 else []), zip_safe = False, entry_points = """ [console_scripts] diff --git a/src/ZODB/ConflictResolution.py b/src/ZODB/ConflictResolution.py index 3e4ddbcc..5d70b90c 100644 --- a/src/ZODB/ConflictResolution.py +++ b/src/ZODB/ConflictResolution.py @@ -136,7 +136,7 @@ class PersistentReference(object): # it. Fortunately, a class reference in a persistent # reference is allowed to be a module+name tuple. self.data = self.oid, klass.args - elif isinstance(data, bytes): + elif isinstance(data, (bytes, str)): self.oid = data else: # a list reference_type = data[0] @@ -162,6 +162,10 @@ class PersistentReference(object): assert len(data) == 1, 'unknown reference format' self.oid = data[0] self.weak = True + if not isinstance(self.oid, (bytes, type(None))): + assert isinstance(self.oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + self.oid = self.oid.encode('ascii') def __cmp__(self, other): if self is other or ( diff --git a/src/ZODB/ExportImport.py b/src/ZODB/ExportImport.py index 9eb64db3..d2edb5cb 100644 --- a/src/ZODB/ExportImport.py +++ b/src/ZODB/ExportImport.py @@ -120,6 +120,11 @@ class ExportImport: if isinstance(ooid, tuple): ooid, klass = ooid + if not isinstance(ooid, bytes): + assert isinstance(ooid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + ooid = ooid.encode('ascii') + if ooid in oids: oid = oids[ooid] else: diff --git a/src/ZODB/FileStorage/tests.py b/src/ZODB/FileStorage/tests.py index a053bb05..b6ef613f 100644 --- a/src/ZODB/FileStorage/tests.py +++ b/src/ZODB/FileStorage/tests.py @@ -33,6 +33,9 @@ checker = renormalizing.RENormalizing([ # this changes all the offsets in iterator.test (re.compile('data.fs:207766'), 'data.fs:117080'), (re.compile('data.fs:57991'), 'data.fs:35936'), + # even with Pickler(bytes_as_strings=True) some of our pickles are larger + (re.compile('data.fs:117679'), 'data.fs:117080'), + (re.compile('data.fs:36241'), 'data.fs:35936'), ]) def pack_keep_old(): diff --git a/src/ZODB/_compat.py b/src/ZODB/_compat.py index eea284a2..1f2cde31 100644 --- a/src/ZODB/_compat.py +++ b/src/ZODB/_compat.py @@ -20,11 +20,17 @@ try: except ImportError: # Python 3.x: can't use stdlib's pickle because # http://bugs.python.org/issue6784 - from zodbpickle.pickle import Pickler, dump, dumps - from zodbpickle.pickle import Unpickler as _Unpickler, loads as _loads + import zodbpickle.pickle from _compat_pickle import IMPORT_MAPPING, NAME_MAPPING - class Unpickler(_Unpickler): + class Pickler(zodbpickle.pickle.Pickler): + def __init__(self, f, protocol=None): + if protocol: + # we want to be backwards-compatible with Python 2 + assert 0 <= protocol < 3 + super(Pickler, self).__init__(f, protocol, bytes_as_strings=True) + + class Unpickler(zodbpickle.pickle.Unpickler): def __init__(self, f): super(Unpickler, self).__init__(f, encoding='ASCII', errors='bytes') @@ -38,11 +44,22 @@ except ImportError: return super(Unpickler, self).find_class(modulename, name) return self.find_global(modulename, name) + def dump(o, f, protocol=None): + if protocol: + # we want to be backwards-compatible with Python 2 + assert 0 <= protocol < 3 + return zodbpickle.pickle.dump(o, f, protocol, bytes_as_strings=True) + + def dumps(o, protocol=None): + if protocol: + # we want to be backwards-compatible with Python 2 + assert 0 <= protocol < 3 + return zodbpickle.pickle.dumps(o, protocol, bytes_as_strings=True) + def loads(s): - return _loads(s, encoding='ASCII', errors='bytes') + return zodbpickle.pickle.loads(s, encoding='ASCII', errors='bytes') -# XXX: overridable Unpickler.find_global as used in serialize.py? # XXX: consistent spelling of inst_persistent_id/persistent_id? # e.g. StorageTestBase and probably elsewhere diff --git a/src/ZODB/fsIndex.py b/src/ZODB/fsIndex.py index 5a5c5b6f..47f684e5 100644 --- a/src/ZODB/fsIndex.py +++ b/src/ZODB/fsIndex.py @@ -63,6 +63,11 @@ def prefix_minus_one(s): num = str2num(s) return num2str(num - 1) +def ensure_bytes(s): + # on Python 3 we might pickle bytes and unpickle unicode strings + return s.encode('ascii') if not isinstance(s, bytes) else s + + class fsIndex(object): def __init__(self, data=None): @@ -85,14 +90,19 @@ class fsIndex(object): def _setstate_0(self, state): self.__dict__.clear() self.__dict__.update(state) + self._data = OOBTree([ + (ensure_bytes(k), v) + for (k, v) in self._data.items() + ]) def _setstate_1(self, state): - self._data = OOBTree([ - (k, fsBucket().fromString(v)) + self._data = OOBTree([ + (ensure_bytes(k), fsBucket().fromString(ensure_bytes(v))) for (k, v) in state['_data'] ]) def __getitem__(self, key): + assert isinstance(key, bytes) return str2num(self._data[key[:6]][key[6:]]) def save(self, pos, fname): @@ -110,6 +120,10 @@ class fsIndex(object): unpickler = Unpickler(f) pos = unpickler.load() if not isinstance(pos, int): + # NB: this might contain OIDs that got unpickled + # into Unicode strings on Python 3; hope the caller + # will pipe the result to fsIndex().update() to normalize + # the keys return pos # Old format index = class_() data = index._data @@ -118,10 +132,11 @@ class fsIndex(object): if not v: break k, v = v - data[k] = fsBucket().fromString(v) + data[ensure_bytes(k)] = fsBucket().fromString(ensure_bytes(v)) return dict(pos=pos, index=index) def get(self, key, default=None): + assert isinstance(key, bytes) tree = self._data.get(key[:6], default) if tree is default: return default @@ -131,6 +146,7 @@ class fsIndex(object): return str2num(v) def __setitem__(self, key, value): + assert isinstance(key, bytes) value = num2str(value) treekey = key[:6] tree = self._data.get(treekey) @@ -140,6 +156,7 @@ class fsIndex(object): tree[key[6:]] = value def __delitem__(self, key): + assert isinstance(key, bytes) treekey = key[:6] tree = self._data.get(treekey) if tree is None: @@ -156,13 +173,14 @@ class fsIndex(object): def update(self, mapping): for k, v in mapping.items(): - self[k] = v + self[ensure_bytes(k)] = v def has_key(self, key): v = self.get(key, self) return v is not self def __contains__(self, key): + assert isinstance(key, bytes) tree = self._data.get(key[:6]) if tree is None: return False diff --git a/src/ZODB/serialize.py b/src/ZODB/serialize.py index 70ba06a2..ad797986 100644 --- a/src/ZODB/serialize.py +++ b/src/ZODB/serialize.py @@ -489,7 +489,7 @@ class ObjectReader: def _persistent_load(self, reference): if isinstance(reference, tuple): return self.load_persistent(*reference) - elif isinstance(reference, bytes): + elif isinstance(reference, (bytes, str)): return self.load_oid(reference) else: try: @@ -504,6 +504,11 @@ class ObjectReader: # Quick instance reference. We know all we need to know # to create the instance w/o hitting the db, so go for it! + if not isinstance(oid, bytes): + assert isinstance(oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + oid = oid.encode('ascii') + obj = self._cache.get(oid, None) if obj is not None: return obj @@ -538,6 +543,10 @@ class ObjectReader: def load_persistent_weakref(self, oid, database_name=None): + if not isinstance(oid, bytes): + assert isinstance(oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + oid = oid.encode('ascii') obj = WeakRef.__new__(WeakRef) obj.oid = oid if database_name is None: @@ -556,6 +565,10 @@ class ObjectReader: loaders['w'] = load_persistent_weakref def load_oid(self, oid): + if not isinstance(oid, bytes): + assert isinstance(oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + oid = oid.encode('ascii') obj = self._cache.get(oid, None) if obj is not None: return obj @@ -632,15 +645,9 @@ def referencesf(p, oids=None): refs = [] u = Unpickler(BytesIO(p)) - if sys.version_info[0] < 3: - u.persistent_load = refs - u.noload() - u.noload() - else: - # Py3: There is no `noload()` in Python 3. - u.persistent_load = refs.append - u.load() - u.load() + u.persistent_load = refs.append + u.noload() + u.noload() # Now we have a list of referencs. Need to convert to list of # oids: @@ -651,12 +658,17 @@ def referencesf(p, oids=None): for reference in refs: if isinstance(reference, tuple): oid = reference[0] - elif isinstance(reference, bytes): + elif isinstance(reference, (bytes, str)): oid = reference else: assert isinstance(reference, list) continue + if not isinstance(oid, bytes): + assert isinstance(oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + oid = oid.encode('ascii') + oids.append(oid) return oids @@ -675,15 +687,9 @@ def get_refs(a_pickle): refs = [] u = Unpickler(BytesIO(a_pickle)) - if sys.version_info[0] < 3: - u.persistent_load = refs - u.noload() - u.noload() - else: - # Py3: There is no `noload()` in Python 3. - u.persistent_load = refs.append - u.load() - u.load() + u.persistent_load = refs.append + u.noload() + u.noload() # Now we have a list of references. Need to convert to list of # oids and class info: @@ -692,13 +698,18 @@ def get_refs(a_pickle): for reference in refs: if isinstance(reference, tuple): - data = reference - elif isinstance(reference, bytes): - data = reference, None + oid, klass = reference + elif isinstance(reference, (bytes, str)): + data, klass = reference, None else: assert isinstance(reference, list) continue - result.append(data) + if not isinstance(oid, bytes): + assert isinstance(oid, str) + # this happens on Python 3 when all bytes in the oid are < 0x80 + oid = oid.encode('ascii') + + result.append((oid, klass)) return result diff --git a/src/ZODB/tests/PackableStorage.py b/src/ZODB/tests/PackableStorage.py index 9c5883a1..d3c983ec 100644 --- a/src/ZODB/tests/PackableStorage.py +++ b/src/ZODB/tests/PackableStorage.py @@ -58,6 +58,13 @@ class Object(object): def getoid(self): return self._oid + def __setstate__(self, state): + self.__dict__.clear() + self.__dict__.update(state) + if not isinstance(self._oid, bytes): + # Python 3 + self._oid = self._oid.encode('ascii') + class C(Persistent): pass @@ -89,7 +96,7 @@ def dumps(obj): def pdumps(obj): s = BytesIO() - p = Pickler(s) + p = Pickler(s, _protocol) p.dump(obj) p.dump(None) return s.getvalue() diff --git a/src/ZODB/tests/test_fsdump.py b/src/ZODB/tests/test_fsdump.py index a58eb38e..e21f7e59 100644 --- a/src/ZODB/tests/test_fsdump.py +++ b/src/ZODB/tests/test_fsdump.py @@ -82,6 +82,8 @@ checker = renormalizing.RENormalizing([ (re.compile(r'\bsize=65\b'), 'size=60'), (re.compile(r'\offset=206\b'), 'offset=201'), (re.compile(r'\bsize=156\b'), 'size=107'), + # even with Pickler(bytes_as_strings=True) some of our pickles are larger + (re.compile(r'\bsize=113\b'), 'size=107'), ]) @@ -89,4 +91,5 @@ def test_suite(): return doctest.DocTestSuite( setUp=zope.testing.setupstack.setUpDirectory, tearDown=zope.testing.setupstack.tearDown, + optionflags=doctest.REPORT_NDIFF, checker=ZODB.tests.util.checker + checker) diff --git a/src/ZODB/tests/testblob.py b/src/ZODB/tests/testblob.py index 9d64b091..0cf0d967 100644 --- a/src/ZODB/tests/testblob.py +++ b/src/ZODB/tests/testblob.py @@ -11,11 +11,11 @@ # FOR A PARTICULAR PURPOSE. # ############################################################################## -from pickle import Pickler, Unpickler from ZODB.blob import Blob from ZODB.DB import DB from ZODB.FileStorage import FileStorage from ZODB.tests.testConfig import ConfigTestBase +from ZODB._compat import Pickler, Unpickler import os if os.environ.get('USE_ZOPE_TESTING_DOCTEST'): diff --git a/src/ZODB/tests/testfsoids.py b/src/ZODB/tests/testfsoids.py index f57eaeb5..147311bc 100644 --- a/src/ZODB/tests/testfsoids.py +++ b/src/ZODB/tests/testfsoids.py @@ -183,8 +183,13 @@ checker = renormalizing.RENormalizing([ (re.compile(r'(PersistentMapping|OOBTree) at 206\b'), r'\1 at 201'), (re.compile(r'(OOBTree) at 404\b'), r'\1 at 350'), (re.compile(r'(PersistentMapping|OOBTree) at 531\b'), r'\1 at 477'), + # even with Pickler(bytes_as_strings=True) some of our pickles are larger + (re.compile(r'(OOBTree) at 361\b'), r'\1 at 350'), + (re.compile(r'\boffset=440\b'), 'offset=429'), + (re.compile(r'(PersistentMapping|OOBTree) at 488\b'), r'\1 at 477'), ]) def test_suite(): - return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker) + return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker, + optionflags=doctest.REPORT_NDIFF) -- 2.30.9