The FileStorage iterator now handles large files better. When

iterating from a starting transaction near the end of the file, the iterator will scan backward from the end of the file to find the starting point.

The FileStorage iterator now handles large files better. When
iterating from a starting transaction near the end of the file, the iterator will scan backward from the end of the file to find the starting point.
cf9f715d · Jim Fulton · 66a5e06a · cf9f715d · cf9f715d · cf9f715d
Commit cf9f715d authored Dec 22, 2008 by Jim Fulton
4 changed files
--- a/src/CHANGES.txt
+++ b/src/CHANGES.txt
@@ -42,6 +42,11 @@ New Features
 - As a small convenience (mainly for tests), you can now specify
  initial data as a string argument to the Blob constructor.
+- The FileStorage iterator now handles large files better.  When
+  iterating from a starting transaction near the end of the file, the
+  iterator will scan backward from the end of the file to find the
+  starting point.
 3.9.0a8 (2008-12-15)
 ====================

--- a/src/ZODB/FileStorage/FileStorage.py
+++ b/src/ZODB/FileStorage/FileStorage.py
@@ -21,7 +21,7 @@ from persistent.TimeStamp import TimeStamp
 from struct import pack, unpack
 from types import StringType
 from zc.lockfile import LockFile
-from ZODB.FileStorage.format import CorruptedDataError
+from ZODB.FileStorage.format import CorruptedError, CorruptedDataError
 from ZODB.FileStorage.format import FileStorageFormatter, DataHeader
 from ZODB.FileStorage.format import TRANS_HDR, TRANS_HDR_LEN
 from ZODB.FileStorage.format import TxnHeader, DATA_HDR, DATA_HDR_LEN
@@ -50,7 +50,6 @@ packed_version = "FS21"
 logger = logging.getLogger('ZODB.FileStorage')
 def panic(message, *data):
    logger.critical(message, *data)
    raise CorruptedTransactionError(message)
@@ -1641,16 +1640,23 @@ class FileIterator(FileStorageFormatter):
        assert isinstance(filename, str)
        file = open(filename, 'rb')
        self._file = file
+        self._file_name = filename
        if file.read(4) != packed_version:
            raise FileStorageFormatError(file.name)
        file.seek(0,2)
        self._file_size = file.tell()
+        if (pos < 4) or pos > self._file_size:
+            raise ValueError("Given position is greater than the file size",
+                             pos, self._file_size)
        self._pos = pos
        assert start is None or isinstance(start, str)
        assert stop is None or isinstance(stop, str)
+        self._start = start
+        self._stop = stop
        if start:
+            if self._file_size <= 4:
+                return
            self._skip_to_start(start)
-        self._stop = stop
    def __len__(self):
        # Define a bogus __len__() to make the iterator work
@@ -1674,32 +1680,87 @@ class FileIterator(FileStorageFormatter):
            file.close()
    def _skip_to_start(self, start):
-        # Scan through the transaction records doing almost no sanity
-        # checks.
        file = self._file
-        read = file.read
+        pos1 = self._pos
-        seek = file.seek
+        file.seek(pos1)
+        tid1 = file.read(8)
+        if len(tid1) < 8:
+            raise CorruptedError("Couldn't read tid.")
+        if start < tid1:
+            pos2 = pos1
+            tid2 = tid1
+            file.seek(4)
+            tid1 = file.read(8)
+            if start <= tid1:
+                self._pos = 4
+                return
+            pos1 = 4
+        else:
+            if start == tid1:
+                return
+            # Try to read the last transaction. We could be unlucky and
+            # opened the file while committing a transaction.  In that
+            # case, we'll just scan from the beginning if the file is
+            # small enough, otherwise we'll fail.
+            file.seek(self._file_size-8)
+            l = u64(file.read(8))
+            if not (l + 12 <= self._file_size and
+                    self._read_num(self._file_size-l) == l):
+                if self._file_size < (1<<20):
+                    return self._scan_foreward(start)
+                raise ValueError("Can't find last transaction in large file")
+            pos2 = self._file_size-l-8
+            file.seek(pos2)
+            tid2 = file.read(8)
+            if tid2 < tid1:
+                raise CorruptedError("Tids out of order.")
+            if tid2 <= start:
+                if tid2 == start:
+                    self._pos = pos2
+                else:
+                    self._pos = self._file_size
+                return
+        t1 = ZODB.TimeStamp.TimeStamp(tid1).timeTime()
+        t2 = ZODB.TimeStamp.TimeStamp(tid2).timeTime()
+        ts = ZODB.TimeStamp.TimeStamp(start).timeTime()
+        if (ts - t1) < (t2 - ts):
+            return self._scan_forward(pos1, start)
+        else:
+            return self._scan_backward(pos2, start)
+    def _scan_forward(self, pos, start):
+        logger.debug("Scan forward %s:%s looking for %r",
+                     self._file_name, pos, start)
+        file = self._file
        while 1:
-            seek(self._pos)
+            # Read the transaction record
-            h = read(16)
+            h = self._read_txn_header(pos)
-            if len(h) < 16:
+            if h.tid >= start:
+                self._pos = pos
                return
-            tid, stl = unpack(">8s8s", h)
-            if tid >= start:
+            pos += h.tlen + 8
+    def _scan_backward(self, pos, start):
+        logger.debug("Scan backward %s:%s looking for %r",
+                     self._file_name, pos, start)
+        file = self._file
+        seek = file.seek
+        read = file.read
+        while 1:
+            pos -= 8
+            seek(pos)
+            tlen = ZODB.utils.u64(read(8))
+            pos -= tlen
+            h = self._read_txn_header(pos)
+            if h.tid <= start:
+                if h.tid == start:
+                    self._pos = pos
+                else:
+                    self._pos = pos + tlen + 8
                return
-            tl = u64(stl)
-            try:
-                self._pos += tl + 8
-            except OverflowError:
-                self._pos = long(self._pos) + tl + 8
-            if __debug__:
-                # Sanity check
-                seek(self._pos - 8, 0)
-                rtl = read(8)
-                if rtl != stl:
-                    pos = file.tell() - 8
-                    panic("%s has inconsistent transaction length at %s "
-                          "(%s != %s)", file.name, pos, u64(rtl), u64(stl))
    # Iterator protocol
    def __iter__(self):

--- a/src/ZODB/FileStorage/iterator.test
+++ b/src/ZODB/FileStorage/iterator.test
+FileStorage-specific iterator tests
+===================================
+The FileStorage iterator has some special features that deserve some
+special tests.
+We'll make some assertions about time, so we'll take it over:
+    >>> now = 1229959248
+    >>> def faux_time():
+    ...     global now
+    ...     now += 0.1
+    ...     return now
+    >>> import time
+    >>> time_time = time.time
+    >>> time.time = faux_time
+Commit a bunch of transactions:
+    >>> import ZODB.FileStorage, transaction
+    >>> db = ZODB.DB('data.fs')
+    >>> tids = [db.storage.lastTransaction()]
+    >>> poss = [db.storage._pos]
+    >>> conn = db.open()
+    >>> for i in range(100):
+    ...     conn.root()[i] = conn.root().__class__()
+    ...     transaction.commit()
+    ...     tids.append(db.storage.lastTransaction())
+    ...     poss.append(db.storage._pos)
+Deciding where to start
+-----------------------
+By default, we start at the beginning:
+    >>> it = ZODB.FileStorage.FileIterator('data.fs')
+    >>> it.next().tid == tids[0]
+    True
+The file iterator has an optimization to deal with large files.  It
+can search from either the front or the back of the file, depending
+on the starting transaction given.  To see this, we'll turn on debug
+logging:
+    >>> import logging, sys
+    >>> old_log_level = logging.getLogger().getEffectiveLevel()
+    >>> logging.getLogger().setLevel(logging.DEBUG)
+    >>> handler = logging.StreamHandler(sys.stdout)
+    >>> logging.getLogger().addHandler(handler)
+If we specify a start transaction, we'll scan forward or backward, as
+seems best and set the next record to that:
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[0])
+    >>> it.next().tid == tids[0]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[1])
+    Scan forward data.fs:4 looking for '\x03z\xbd\xd8\xd06\x9c\xcc'
+    >>> it.next().tid == tids[1]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[30])
+    Scan forward data.fs:4 looking for '\x03z\xbd\xd8\xdc\x96.\xcc'
+    >>> it.next().tid == tids[30]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[70])
+    Scan backward data.fs:118274 looking for '\x03z\xbd\xd8\xed\xa7>\xcc'
+    >>> it.next().tid == tids[70]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-2])
+    Scan backward data.fs:118274 looking for '\x03z\xbd\xd8\xfa\x06\xd0\xcc'
+    >>> it.next().tid == tids[-2]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-1])
+    >>> it.next().tid == tids[-1]
+    True
+We can also supply a file position.  This can speed up finding the
+starting point, or just pick up where another iterator left off:
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', pos=poss[50])
+    >>> it.next().tid == tids[51]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[0], pos=4)
+    >>> it.next().tid == tids[0]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-1], pos=poss[-2])
+    >>> it.next().tid == tids[-1]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[50], pos=poss[50])
+    Scan backward data.fs:36542 looking for '\x03z\xbd\xd8\xe5\x1e\xb6\xcc'
+    >>> it.next().tid == tids[50]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[49], pos=poss[50])
+    Scan backward data.fs:36542 looking for '\x03z\xbd\xd8\xe4\xb1|\xcc'
+    >>> it.next().tid == tids[49]
+    True
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', tids[51], pos=poss[50])
+    >>> it.next().tid == tids[51]
+    True
+    >>> logging.getLogger().setLevel(old_log_level)
+    >>> logging.getLogger().removeHandler(handler)
+If a starting transaction is before the first transaction in the file,
+then the first transaction is returned.
+    >>> from ZODB.utils import p64, u64
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[0])-1))
+    >>> it.next().tid == tids[0]
+    True
+If it is after the last transaction, then iteration will be empty:
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[-1])+1))
+    >>> list(it)
+    []
+Even if we write more transactions:
+    >>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[-1])+1))
+    >>> for i in range(10):
+    ...     conn.root()[i] = conn.root().__class__()
+    ...     transaction.commit()
+    >>> list(it)
+    []
+.. Cleanup
+    >>> time.time = time_time
+    >>> it.close()
+    >>> db.close()
--- a/src/ZODB/FileStorage/tests.py
+++ b/src/ZODB/FileStorage/tests.py
@@ -93,7 +93,7 @@ The pack_keep_old constructor argument controls whether a .old file (and .old di
 def test_suite():
    return unittest.TestSuite((
        doctest.DocFileSuite(
-            'zconfig.txt',
+            'zconfig.txt', 'iterator.test',
            setUp=ZODB.tests.util.setUp, tearDown=ZODB.tests.util.tearDown,
            ),
        doctest.DocTestSuite(