Commit cf9f715d authored by Jim Fulton's avatar Jim Fulton

The FileStorage iterator now handles large files better. When

iterating from a starting transaction near the end of the file, the
iterator will scan backward from the end of the file to find the
starting point.
parent 66a5e06a
...@@ -42,6 +42,11 @@ New Features ...@@ -42,6 +42,11 @@ New Features
- As a small convenience (mainly for tests), you can now specify - As a small convenience (mainly for tests), you can now specify
initial data as a string argument to the Blob constructor. initial data as a string argument to the Blob constructor.
- The FileStorage iterator now handles large files better. When
iterating from a starting transaction near the end of the file, the
iterator will scan backward from the end of the file to find the
starting point.
3.9.0a8 (2008-12-15) 3.9.0a8 (2008-12-15)
==================== ====================
......
...@@ -21,7 +21,7 @@ from persistent.TimeStamp import TimeStamp ...@@ -21,7 +21,7 @@ from persistent.TimeStamp import TimeStamp
from struct import pack, unpack from struct import pack, unpack
from types import StringType from types import StringType
from zc.lockfile import LockFile from zc.lockfile import LockFile
from ZODB.FileStorage.format import CorruptedDataError from ZODB.FileStorage.format import CorruptedError, CorruptedDataError
from ZODB.FileStorage.format import FileStorageFormatter, DataHeader from ZODB.FileStorage.format import FileStorageFormatter, DataHeader
from ZODB.FileStorage.format import TRANS_HDR, TRANS_HDR_LEN from ZODB.FileStorage.format import TRANS_HDR, TRANS_HDR_LEN
from ZODB.FileStorage.format import TxnHeader, DATA_HDR, DATA_HDR_LEN from ZODB.FileStorage.format import TxnHeader, DATA_HDR, DATA_HDR_LEN
...@@ -50,7 +50,6 @@ packed_version = "FS21" ...@@ -50,7 +50,6 @@ packed_version = "FS21"
logger = logging.getLogger('ZODB.FileStorage') logger = logging.getLogger('ZODB.FileStorage')
def panic(message, *data): def panic(message, *data):
logger.critical(message, *data) logger.critical(message, *data)
raise CorruptedTransactionError(message) raise CorruptedTransactionError(message)
...@@ -1641,16 +1640,23 @@ class FileIterator(FileStorageFormatter): ...@@ -1641,16 +1640,23 @@ class FileIterator(FileStorageFormatter):
assert isinstance(filename, str) assert isinstance(filename, str)
file = open(filename, 'rb') file = open(filename, 'rb')
self._file = file self._file = file
self._file_name = filename
if file.read(4) != packed_version: if file.read(4) != packed_version:
raise FileStorageFormatError(file.name) raise FileStorageFormatError(file.name)
file.seek(0,2) file.seek(0,2)
self._file_size = file.tell() self._file_size = file.tell()
if (pos < 4) or pos > self._file_size:
raise ValueError("Given position is greater than the file size",
pos, self._file_size)
self._pos = pos self._pos = pos
assert start is None or isinstance(start, str) assert start is None or isinstance(start, str)
assert stop is None or isinstance(stop, str) assert stop is None or isinstance(stop, str)
self._start = start
self._stop = stop
if start: if start:
if self._file_size <= 4:
return
self._skip_to_start(start) self._skip_to_start(start)
self._stop = stop
def __len__(self): def __len__(self):
# Define a bogus __len__() to make the iterator work # Define a bogus __len__() to make the iterator work
...@@ -1674,32 +1680,87 @@ class FileIterator(FileStorageFormatter): ...@@ -1674,32 +1680,87 @@ class FileIterator(FileStorageFormatter):
file.close() file.close()
def _skip_to_start(self, start): def _skip_to_start(self, start):
# Scan through the transaction records doing almost no sanity
# checks.
file = self._file file = self._file
read = file.read pos1 = self._pos
seek = file.seek file.seek(pos1)
tid1 = file.read(8)
if len(tid1) < 8:
raise CorruptedError("Couldn't read tid.")
if start < tid1:
pos2 = pos1
tid2 = tid1
file.seek(4)
tid1 = file.read(8)
if start <= tid1:
self._pos = 4
return
pos1 = 4
else:
if start == tid1:
return
# Try to read the last transaction. We could be unlucky and have
# opened the file while committing a transaction. In that
# case, we'll just scan from the beginning if the file is
# small enough, otherwise we'll fail.
file.seek(self._file_size-8)
l = u64(file.read(8))
if not (l + 12 <= self._file_size and
self._read_num(self._file_size-l) == l):
if self._file_size < (1<<20):
return self._scan_foreward(start)
raise ValueError("Can't find last transaction in large file")
pos2 = self._file_size-l-8
file.seek(pos2)
tid2 = file.read(8)
if tid2 < tid1:
raise CorruptedError("Tids out of order.")
if tid2 <= start:
if tid2 == start:
self._pos = pos2
else:
self._pos = self._file_size
return
t1 = ZODB.TimeStamp.TimeStamp(tid1).timeTime()
t2 = ZODB.TimeStamp.TimeStamp(tid2).timeTime()
ts = ZODB.TimeStamp.TimeStamp(start).timeTime()
if (ts - t1) < (t2 - ts):
return self._scan_forward(pos1, start)
else:
return self._scan_backward(pos2, start)
def _scan_forward(self, pos, start):
logger.debug("Scan forward %s:%s looking for %r",
self._file_name, pos, start)
file = self._file
while 1: while 1:
seek(self._pos) # Read the transaction record
h = read(16) h = self._read_txn_header(pos)
if len(h) < 16: if h.tid >= start:
self._pos = pos
return return
tid, stl = unpack(">8s8s", h)
if tid >= start: pos += h.tlen + 8
def _scan_backward(self, pos, start):
logger.debug("Scan backward %s:%s looking for %r",
self._file_name, pos, start)
file = self._file
seek = file.seek
read = file.read
while 1:
pos -= 8
seek(pos)
tlen = ZODB.utils.u64(read(8))
pos -= tlen
h = self._read_txn_header(pos)
if h.tid <= start:
if h.tid == start:
self._pos = pos
else:
self._pos = pos + tlen + 8
return return
tl = u64(stl)
try:
self._pos += tl + 8
except OverflowError:
self._pos = long(self._pos) + tl + 8
if __debug__:
# Sanity check
seek(self._pos - 8, 0)
rtl = read(8)
if rtl != stl:
pos = file.tell() - 8
panic("%s has inconsistent transaction length at %s "
"(%s != %s)", file.name, pos, u64(rtl), u64(stl))
# Iterator protocol # Iterator protocol
def __iter__(self): def __iter__(self):
......
FileStorage-specific iterator tests
===================================
The FileStorage iterator has some special features that deserve some
special tests.
We'll make some assertions about time, so we'll take it over:
>>> now = 1229959248
>>> def faux_time():
... global now
... now += 0.1
... return now
>>> import time
>>> time_time = time.time
>>> time.time = faux_time
Commit a bunch of transactions:
>>> import ZODB.FileStorage, transaction
>>> db = ZODB.DB('data.fs')
>>> tids = [db.storage.lastTransaction()]
>>> poss = [db.storage._pos]
>>> conn = db.open()
>>> for i in range(100):
... conn.root()[i] = conn.root().__class__()
... transaction.commit()
... tids.append(db.storage.lastTransaction())
... poss.append(db.storage._pos)
Deciding where to start
-----------------------
By default, we start at the beginning:
>>> it = ZODB.FileStorage.FileIterator('data.fs')
>>> it.next().tid == tids[0]
True
The file iterator has an optimization to deal with large files. It
can search from either the front or the back of the file, depending
on the starting transaction given. To see this, we'll turn on debug
logging:
>>> import logging, sys
>>> old_log_level = logging.getLogger().getEffectiveLevel()
>>> logging.getLogger().setLevel(logging.DEBUG)
>>> handler = logging.StreamHandler(sys.stdout)
>>> logging.getLogger().addHandler(handler)
If we specify a start transaction, we'll scan forward or backward, as
seems best, and set the next record to that:
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[0])
>>> it.next().tid == tids[0]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[1])
Scan forward data.fs:4 looking for '\x03z\xbd\xd8\xd06\x9c\xcc'
>>> it.next().tid == tids[1]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[30])
Scan forward data.fs:4 looking for '\x03z\xbd\xd8\xdc\x96.\xcc'
>>> it.next().tid == tids[30]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[70])
Scan backward data.fs:118274 looking for '\x03z\xbd\xd8\xed\xa7>\xcc'
>>> it.next().tid == tids[70]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-2])
Scan backward data.fs:118274 looking for '\x03z\xbd\xd8\xfa\x06\xd0\xcc'
>>> it.next().tid == tids[-2]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-1])
>>> it.next().tid == tids[-1]
True
We can also supply a file position. This can speed up finding the
starting point, or just pick up where another iterator left off:
>>> it = ZODB.FileStorage.FileIterator('data.fs', pos=poss[50])
>>> it.next().tid == tids[51]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[0], pos=4)
>>> it.next().tid == tids[0]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[-1], pos=poss[-2])
>>> it.next().tid == tids[-1]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[50], pos=poss[50])
Scan backward data.fs:36542 looking for '\x03z\xbd\xd8\xe5\x1e\xb6\xcc'
>>> it.next().tid == tids[50]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[49], pos=poss[50])
Scan backward data.fs:36542 looking for '\x03z\xbd\xd8\xe4\xb1|\xcc'
>>> it.next().tid == tids[49]
True
>>> it = ZODB.FileStorage.FileIterator('data.fs', tids[51], pos=poss[50])
>>> it.next().tid == tids[51]
True
>>> logging.getLogger().setLevel(old_log_level)
>>> logging.getLogger().removeHandler(handler)
If a starting transaction is before the first transaction in the file,
then the first transaction is returned.
>>> from ZODB.utils import p64, u64
>>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[0])-1))
>>> it.next().tid == tids[0]
True
If it is after the last transaction, then iteration will be empty:
>>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[-1])+1))
>>> list(it)
[]
Even if we write more transactions:
>>> it = ZODB.FileStorage.FileIterator('data.fs', p64(u64(tids[-1])+1))
>>> for i in range(10):
... conn.root()[i] = conn.root().__class__()
... transaction.commit()
>>> list(it)
[]
.. Cleanup
>>> time.time = time_time
>>> it.close()
>>> db.close()
...@@ -93,7 +93,7 @@ The pack_keep_old constructor argument controls whether a .old file (and .old di ...@@ -93,7 +93,7 @@ The pack_keep_old constructor argument controls whether a .old file (and .old di
def test_suite(): def test_suite():
return unittest.TestSuite(( return unittest.TestSuite((
doctest.DocFileSuite( doctest.DocFileSuite(
'zconfig.txt', 'zconfig.txt', 'iterator.test',
setUp=ZODB.tests.util.setUp, tearDown=ZODB.tests.util.tearDown, setUp=ZODB.tests.util.setUp, tearDown=ZODB.tests.util.tearDown,
), ),
doctest.DocTestSuite( doctest.DocTestSuite(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment