From a00d35fe1873357bc621fc29013b6e2ca1bd0b0f Mon Sep 17 00:00:00 2001
From: Marius Gedminas <marius@gedmin.as>
Date: Tue, 5 Mar 2013 17:03:49 +0200
Subject: [PATCH] Python 3: pickle bytestrings using SHORT_BINSTRING

This uses bytes_as_strings=True option introduced in zodbpickle 0.2 for
this purpose.

This way pickles produced on Python 3 are nearly the same as on Python 2.
There are some slight differences (Python 3 seems to perform more
memoizations which grows the size of some pickles by a couple of bytes),
but they're immaterial.

Now we can use zodbpickle's noload() on Python 3 to scan pickles for
persistent references.  We couldn't do that before, because Python 3
normally pickles byte strings as calls to codecs.encode(u'latin1-data',
'latin-1'), and noload() doesn't interpret the REDUCE opcode involved in
that representation.

Note that when you're pickling byte strings using bytes_as_strings=True,
you have to load them using encoding='bytes' (which breaks instances, so
cannot be used) or using errors='bytes' (which mean some bytestrings may
get unpickled as unicode instead).  I've tried hard to discover every
place that unpickles OIDs and added conversion to bytes in those places.

Applications dealing with binary data be prepared to handle bytestrings
that unexpectedly become unicode on unpickling.  That's the price of
Python 2 compatibility.
---
 setup.py                          |  2 +-
 src/ZODB/ConflictResolution.py    |  6 +++-
 src/ZODB/ExportImport.py          |  5 +++
 src/ZODB/FileStorage/tests.py     |  3 ++
 src/ZODB/_compat.py               | 27 +++++++++++---
 src/ZODB/fsIndex.py               | 26 +++++++++++---
 src/ZODB/serialize.py             | 59 ++++++++++++++++++-------------
 src/ZODB/tests/PackableStorage.py |  9 ++++-
 src/ZODB/tests/test_fsdump.py     |  3 ++
 src/ZODB/tests/testblob.py        |  2 +-
 src/ZODB/tests/testfsoids.py      |  7 +++-
 11 files changed, 111 insertions(+), 38 deletions(-)

diff --git a/setup.py b/setup.py
index 487958c8..0cdea042 100644
--- a/setup.py
+++ b/setup.py
@@ -133,7 +133,7 @@ setup(name="ZODB",
         'zc.lockfile',
         'zdaemon >= 4.0.0a1',
         'zope.interface',
-        ] + (['zodbpickle'] if PY3 else []),
+        ] + (['zodbpickle >= 0.2'] if PY3 else []),
       zip_safe = False,
       entry_points = """
       [console_scripts]
diff --git a/src/ZODB/ConflictResolution.py b/src/ZODB/ConflictResolution.py
index 3e4ddbcc..5d70b90c 100644
--- a/src/ZODB/ConflictResolution.py
+++ b/src/ZODB/ConflictResolution.py
@@ -136,7 +136,7 @@ class PersistentReference(object):
                 # it.  Fortunately, a class reference in a persistent
                 # reference is allowed to be a module+name tuple.
                 self.data = self.oid, klass.args
-        elif isinstance(data, bytes):
+        elif isinstance(data, (bytes, str)):
             self.oid = data
         else: # a list
             reference_type = data[0]
@@ -162,6 +162,10 @@ class PersistentReference(object):
                 assert len(data) == 1, 'unknown reference format'
                 self.oid = data[0]
                 self.weak = True
+        if not isinstance(self.oid, (bytes, type(None))):
+            assert isinstance(self.oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            self.oid = self.oid.encode('ascii')
 
     def __cmp__(self, other):
         if self is other or (
diff --git a/src/ZODB/ExportImport.py b/src/ZODB/ExportImport.py
index 9eb64db3..d2edb5cb 100644
--- a/src/ZODB/ExportImport.py
+++ b/src/ZODB/ExportImport.py
@@ -120,6 +120,11 @@ class ExportImport:
             if isinstance(ooid, tuple):
                 ooid, klass = ooid
 
+            if not isinstance(ooid, bytes):
+                assert isinstance(ooid, str)
+                # this happens on Python 3 when all bytes in the oid are < 0x80
+                ooid = ooid.encode('ascii')
+
             if ooid in oids:
                 oid = oids[ooid]
             else:
diff --git a/src/ZODB/FileStorage/tests.py b/src/ZODB/FileStorage/tests.py
index a053bb05..b6ef613f 100644
--- a/src/ZODB/FileStorage/tests.py
+++ b/src/ZODB/FileStorage/tests.py
@@ -33,6 +33,9 @@ checker = renormalizing.RENormalizing([
     # this changes all the offsets in iterator.test
     (re.compile('data.fs:207766'), 'data.fs:117080'),
     (re.compile('data.fs:57991'), 'data.fs:35936'),
+    # even with Pickler(bytes_as_strings=True) some of our pickles are larger
+    (re.compile('data.fs:117679'), 'data.fs:117080'),
+    (re.compile('data.fs:36241'), 'data.fs:35936'),
 ])
 
 def pack_keep_old():
diff --git a/src/ZODB/_compat.py b/src/ZODB/_compat.py
index eea284a2..1f2cde31 100644
--- a/src/ZODB/_compat.py
+++ b/src/ZODB/_compat.py
@@ -20,11 +20,17 @@ try:
 except ImportError:
     # Python 3.x: can't use stdlib's pickle because
     # http://bugs.python.org/issue6784
-    from zodbpickle.pickle import Pickler, dump, dumps
-    from zodbpickle.pickle import Unpickler as _Unpickler, loads as _loads
+    import zodbpickle.pickle
     from _compat_pickle import IMPORT_MAPPING, NAME_MAPPING
 
-    class Unpickler(_Unpickler):
+    class Pickler(zodbpickle.pickle.Pickler):
+        def __init__(self, f, protocol=None):
+            if protocol:
+                # we want to be backwards-compatible with Python 2
+                assert 0 <= protocol < 3
+            super(Pickler, self).__init__(f, protocol, bytes_as_strings=True)
+
+    class Unpickler(zodbpickle.pickle.Unpickler):
         def __init__(self, f):
             super(Unpickler, self).__init__(f, encoding='ASCII', errors='bytes')
 
@@ -38,11 +44,22 @@ except ImportError:
                 return super(Unpickler, self).find_class(modulename, name)
             return self.find_global(modulename, name)
 
+    def dump(o, f, protocol=None):
+        if protocol:
+            # we want to be backwards-compatible with Python 2
+            assert 0 <= protocol < 3
+        return zodbpickle.pickle.dump(o, f, protocol, bytes_as_strings=True)
+
+    def dumps(o, protocol=None):
+        if protocol:
+            # we want to be backwards-compatible with Python 2
+            assert 0 <= protocol < 3
+        return zodbpickle.pickle.dumps(o, protocol, bytes_as_strings=True)
+
     def loads(s):
-        return _loads(s, encoding='ASCII', errors='bytes')
+        return zodbpickle.pickle.loads(s, encoding='ASCII', errors='bytes')
 
 
-# XXX: overridable Unpickler.find_global as used in serialize.py?
 # XXX: consistent spelling of inst_persistent_id/persistent_id?
 #      e.g. StorageTestBase and probably elsewhere
 
diff --git a/src/ZODB/fsIndex.py b/src/ZODB/fsIndex.py
index 5a5c5b6f..47f684e5 100644
--- a/src/ZODB/fsIndex.py
+++ b/src/ZODB/fsIndex.py
@@ -63,6 +63,11 @@ def prefix_minus_one(s):
     num = str2num(s)
     return num2str(num - 1)
 
+def ensure_bytes(s):
+    # on Python 3 we might pickle bytes and unpickle unicode strings
+    return s.encode('ascii') if not isinstance(s, bytes) else s
+
+
 class fsIndex(object):
 
     def __init__(self, data=None):
@@ -85,14 +90,19 @@ class fsIndex(object):
     def _setstate_0(self, state):
         self.__dict__.clear()
         self.__dict__.update(state)
+        self._data = OOBTree([
+            (ensure_bytes(k), v)
+            for (k, v) in self._data.items()
+            ])
 
     def _setstate_1(self, state):
-        self._data =  OOBTree([
-            (k, fsBucket().fromString(v))
+        self._data = OOBTree([
+            (ensure_bytes(k), fsBucket().fromString(ensure_bytes(v)))
             for (k, v) in state['_data']
             ])
 
     def __getitem__(self, key):
+        assert isinstance(key, bytes)
         return str2num(self._data[key[:6]][key[6:]])
 
     def save(self, pos, fname):
@@ -110,6 +120,10 @@ class fsIndex(object):
             unpickler = Unpickler(f)
             pos = unpickler.load()
             if not isinstance(pos, int):
+                # NB: this might contain OIDs that got unpickled
+                # into Unicode strings on Python 3; hope the caller
+                # will pipe the result to fsIndex().update() to normalize
+                # the keys
                 return pos                  # Old format
             index = class_()
             data = index._data
@@ -118,10 +132,11 @@ class fsIndex(object):
                 if not v:
                     break
                 k, v = v
-                data[k] = fsBucket().fromString(v)
+                data[ensure_bytes(k)] = fsBucket().fromString(ensure_bytes(v))
             return dict(pos=pos, index=index)
 
     def get(self, key, default=None):
+        assert isinstance(key, bytes)
         tree = self._data.get(key[:6], default)
         if tree is default:
             return default
@@ -131,6 +146,7 @@ class fsIndex(object):
         return str2num(v)
 
     def __setitem__(self, key, value):
+        assert isinstance(key, bytes)
         value = num2str(value)
         treekey = key[:6]
         tree = self._data.get(treekey)
@@ -140,6 +156,7 @@ class fsIndex(object):
         tree[key[6:]] = value
 
     def __delitem__(self, key):
+        assert isinstance(key, bytes)
         treekey = key[:6]
         tree = self._data.get(treekey)
         if tree is None:
@@ -156,13 +173,14 @@ class fsIndex(object):
 
     def update(self, mapping):
         for k, v in mapping.items():
-            self[k] = v
+            self[ensure_bytes(k)] = v
 
     def has_key(self, key):
         v = self.get(key, self)
         return v is not self
 
     def __contains__(self, key):
+        assert isinstance(key, bytes)
         tree = self._data.get(key[:6])
         if tree is None:
             return False
diff --git a/src/ZODB/serialize.py b/src/ZODB/serialize.py
index 70ba06a2..ad797986 100644
--- a/src/ZODB/serialize.py
+++ b/src/ZODB/serialize.py
@@ -489,7 +489,7 @@ class ObjectReader:
     def _persistent_load(self, reference):
         if isinstance(reference, tuple):
             return self.load_persistent(*reference)
-        elif isinstance(reference, bytes):
+        elif isinstance(reference, (bytes, str)):
             return self.load_oid(reference)
         else:
             try:
@@ -504,6 +504,11 @@ class ObjectReader:
         # Quick instance reference.  We know all we need to know
         # to create the instance w/o hitting the db, so go for it!
 
+        if not isinstance(oid, bytes):
+            assert isinstance(oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            oid = oid.encode('ascii')
+
         obj = self._cache.get(oid, None)
         if obj is not None:
             return obj
@@ -538,6 +543,10 @@ class ObjectReader:
 
 
     def load_persistent_weakref(self, oid, database_name=None):
+        if not isinstance(oid, bytes):
+            assert isinstance(oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            oid = oid.encode('ascii')
         obj = WeakRef.__new__(WeakRef)
         obj.oid = oid
         if database_name is None:
@@ -556,6 +565,10 @@ class ObjectReader:
     loaders['w'] = load_persistent_weakref
 
     def load_oid(self, oid):
+        if not isinstance(oid, bytes):
+            assert isinstance(oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            oid = oid.encode('ascii')
         obj = self._cache.get(oid, None)
         if obj is not None:
             return obj
@@ -632,15 +645,9 @@ def referencesf(p, oids=None):
 
     refs = []
     u = Unpickler(BytesIO(p))
-    if sys.version_info[0] < 3:
-        u.persistent_load = refs
-        u.noload()
-        u.noload()
-    else:
-        # Py3: There is no `noload()` in Python 3.
-        u.persistent_load = refs.append
-        u.load()
-        u.load()
+    u.persistent_load = refs.append
+    u.noload()
+    u.noload()
 
     # Now we have a list of referencs.  Need to convert to list of
     # oids:
@@ -651,12 +658,17 @@ def referencesf(p, oids=None):
     for reference in refs:
         if isinstance(reference, tuple):
             oid = reference[0]
-        elif isinstance(reference, bytes):
+        elif isinstance(reference, (bytes, str)):
             oid = reference
         else:
             assert isinstance(reference, list)
             continue
 
+        if not isinstance(oid, bytes):
+            assert isinstance(oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            oid = oid.encode('ascii')
+
         oids.append(oid)
 
     return oids
@@ -675,15 +687,9 @@ def get_refs(a_pickle):
 
     refs = []
     u = Unpickler(BytesIO(a_pickle))
-    if sys.version_info[0] < 3:
-        u.persistent_load = refs
-        u.noload()
-        u.noload()
-    else:
-        # Py3: There is no `noload()` in Python 3.
-        u.persistent_load = refs.append
-        u.load()
-        u.load()
+    u.persistent_load = refs.append
+    u.noload()
+    u.noload()
 
     # Now we have a list of references.  Need to convert to list of
     # oids and class info:
@@ -692,13 +698,18 @@ def get_refs(a_pickle):
 
     for reference in refs:
         if isinstance(reference, tuple):
-            data = reference
-        elif isinstance(reference, bytes):
-            data = reference, None
+            oid, klass = reference
+        elif isinstance(reference, (bytes, str)):
+            data, klass = reference, None
         else:
             assert isinstance(reference, list)
             continue
 
-        result.append(data)
+        if not isinstance(oid, bytes):
+            assert isinstance(oid, str)
+            # this happens on Python 3 when all bytes in the oid are < 0x80
+            oid = oid.encode('ascii')
+
+        result.append((oid, klass))
 
     return result
diff --git a/src/ZODB/tests/PackableStorage.py b/src/ZODB/tests/PackableStorage.py
index 9c5883a1..d3c983ec 100644
--- a/src/ZODB/tests/PackableStorage.py
+++ b/src/ZODB/tests/PackableStorage.py
@@ -58,6 +58,13 @@ class Object(object):
     def getoid(self):
         return self._oid
 
+    def __setstate__(self, state):
+        self.__dict__.clear()
+        self.__dict__.update(state)
+        if not isinstance(self._oid, bytes):
+            # Python 3
+            self._oid = self._oid.encode('ascii')
+
 
 class C(Persistent):
     pass
@@ -89,7 +96,7 @@ def dumps(obj):
 
 def pdumps(obj):
     s = BytesIO()
-    p = Pickler(s)
+    p = Pickler(s, _protocol)
     p.dump(obj)
     p.dump(None)
     return s.getvalue()
diff --git a/src/ZODB/tests/test_fsdump.py b/src/ZODB/tests/test_fsdump.py
index a58eb38e..e21f7e59 100644
--- a/src/ZODB/tests/test_fsdump.py
+++ b/src/ZODB/tests/test_fsdump.py
@@ -82,6 +82,8 @@ checker = renormalizing.RENormalizing([
     (re.compile(r'\bsize=65\b'), 'size=60'),
     (re.compile(r'\offset=206\b'), 'offset=201'),
     (re.compile(r'\bsize=156\b'), 'size=107'),
+    # even with Pickler(bytes_as_strings=True) some of our pickles are larger
+    (re.compile(r'\bsize=113\b'), 'size=107'),
 ])
 
 
@@ -89,4 +91,5 @@ def test_suite():
     return doctest.DocTestSuite(
         setUp=zope.testing.setupstack.setUpDirectory,
         tearDown=zope.testing.setupstack.tearDown,
+        optionflags=doctest.REPORT_NDIFF,
         checker=ZODB.tests.util.checker + checker)
diff --git a/src/ZODB/tests/testblob.py b/src/ZODB/tests/testblob.py
index 9d64b091..0cf0d967 100644
--- a/src/ZODB/tests/testblob.py
+++ b/src/ZODB/tests/testblob.py
@@ -11,11 +11,11 @@
 # FOR A PARTICULAR PURPOSE.
 #
 ##############################################################################
-from pickle import Pickler, Unpickler
 from ZODB.blob import Blob
 from ZODB.DB import DB
 from ZODB.FileStorage import FileStorage
 from ZODB.tests.testConfig import ConfigTestBase
+from ZODB._compat import Pickler, Unpickler
 
 import os
 if os.environ.get('USE_ZOPE_TESTING_DOCTEST'):
diff --git a/src/ZODB/tests/testfsoids.py b/src/ZODB/tests/testfsoids.py
index f57eaeb5..147311bc 100644
--- a/src/ZODB/tests/testfsoids.py
+++ b/src/ZODB/tests/testfsoids.py
@@ -183,8 +183,13 @@ checker = renormalizing.RENormalizing([
     (re.compile(r'(PersistentMapping|OOBTree) at 206\b'), r'\1 at 201'),
     (re.compile(r'(OOBTree) at 404\b'), r'\1 at 350'),
     (re.compile(r'(PersistentMapping|OOBTree) at 531\b'), r'\1 at 477'),
+    # even with Pickler(bytes_as_strings=True) some of our pickles are larger
+    (re.compile(r'(OOBTree) at 361\b'), r'\1 at 350'),
+    (re.compile(r'\boffset=440\b'), 'offset=429'),
+    (re.compile(r'(PersistentMapping|OOBTree) at 488\b'), r'\1 at 477'),
 ])
 
 
 def test_suite():
-    return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker)
+    return doctest.DocTestSuite(checker=ZODB.tests.util.checker + checker,
+                                optionflags=doctest.REPORT_NDIFF)
-- 
2.30.9