Commit bd901052 authored by Jim Fulton's avatar Jim Fulton

Major refactoring of cache code to reduce memory consumption, which

was astronomical.  Also greatly simplified the code.
parent a6605d20
Whats new in ZODB 3.8.1 Whats new in ZODB 3.8.1
======================= =======================
...@@ -17,6 +19,9 @@ Bugs Fixed: ...@@ -17,6 +19,9 @@ Bugs Fixed:
- A bug in the cache-opening logic led to cache failure in the - A bug in the cache-opening logic led to cache failure in the
unlikely event that a cache has no free blocks. unlikely event that a cache has no free blocks.
- The cache used an excessive amount of memory, causing applications
with large caches to exhaust available memory.
- When using ZEO Client Storages, Errors occured when trying to store - When using ZEO Client Storages, Errors occured when trying to store
objects too big to fit in the ZEO cache file. objects too big to fit in the ZEO cache file.
......
...@@ -22,51 +22,54 @@ it to map this richer API onto the simple key-based API of the lower-level ...@@ -22,51 +22,54 @@ it to map this richer API onto the simple key-based API of the lower-level
FileCache. FileCache.
""" """
from struct import pack, unpack
import bisect import bisect
import BTrees.LLBTree
import BTrees.LOBTree
import logging import logging
import os import os
import struct
import tempfile import tempfile
import time import time
import ZODB.fsIndex
import ZODB.lock_file import ZODB.lock_file
from ZODB.utils import z64, u64 from ZODB.utils import p64, u64, z64
logger = logging.getLogger("ZEO.cache") logger = logging.getLogger("ZEO.cache")
##
# A disk-based cache for ZEO clients. # A disk-based cache for ZEO clients.
# <p> #
# This class provides an interface to a persistent, disk-based cache # This class provides an interface to a persistent, disk-based cache
# used by ZEO clients to store copies of database records from the # used by ZEO clients to store copies of database records from the
# server. # server.
# <p> #
# The details of the constructor as unspecified at this point. # The details of the constructor as unspecified at this point.
# <p> #
# Each entry in the cache is valid for a particular range of transaction # Each entry in the cache is valid for a particular range of transaction
# ids. The lower bound is the transaction that wrote the data. The # ids. The lower bound is the transaction that wrote the data. The
# upper bound is the next transaction that wrote a revision of the # upper bound is the next transaction that wrote a revision of the
# object. If the data is current, the upper bound is stored as None; # object. If the data is current, the upper bound is stored as None;
# the data is considered current until an invalidate() call is made. # the data is considered current until an invalidate() call is made.
# <p> #
# It is an error to call store() twice with the same object without an # It is an error to call store() twice with the same object without an
# intervening invalidate() to set the upper bound on the first cache # intervening invalidate() to set the upper bound on the first cache
# entry. <em>Perhaps it will be necessary to have a call the removes # entry. Perhaps it will be necessary to have a call the removes
# something from the cache outright, without keeping a non-current # something from the cache outright, without keeping a non-current
# entry.</em> # entry.
# <h3>Cache verification</h3>
# <p> # Cache verification
#
# When the client is connected to the server, it receives # When the client is connected to the server, it receives
# invalidations every time an object is modified. When the client is # invalidations every time an object is modified. When the client is
# disconnected then reconnects, it must perform cache verification to make # disconnected then reconnects, it must perform cache verification to make
# sure its cached data is synchronized with the storage's current state. # sure its cached data is synchronized with the storage's current state.
# <p> #
# quick verification # quick verification
# full verification # full verification
# <p> #
##
# FileCache stores a cache in a single on-disk file. # FileCache stores a cache in a single on-disk file.
# #
# On-disk cache structure. # On-disk cache structure.
...@@ -100,11 +103,13 @@ ZEC3_HEADER_SIZE = 12 ...@@ -100,11 +103,13 @@ ZEC3_HEADER_SIZE = 12
# #
# 1 byte allocation status ('a'). # 1 byte allocation status ('a').
# 4 bytes block size, >I format. # 4 bytes block size, >I format.
# 16 bytes oid + tid, string. # 8 byte oid
# size-OBJECT_HEADER_SIZE bytes, the serialization of an Object (see # 8 byte start_tid
# class Object for details). # 8 byte end_tid
# 2 byte version length
OBJECT_HEADER_SIZE = 1 + 4 + 16 # 4 byte data size
# version
# data
# The cache's currentofs goes around the file, circularly, forever. # The cache's currentofs goes around the file, circularly, forever.
# It's always the starting offset of some block. # It's always the starting offset of some block.
...@@ -120,11 +125,6 @@ OBJECT_HEADER_SIZE = 1 + 4 + 16 ...@@ -120,11 +125,6 @@ OBJECT_HEADER_SIZE = 1 + 4 + 16
class ClientCache(object): class ClientCache(object):
"""A simple in-memory cache.""" """A simple in-memory cache."""
##
# Do we put the constructor here?
# @param path path of persistent snapshot of cache state (a file path)
# @param size size of cache file, in bytes
# The default size of 200MB makes a lot more sense than the traditional # The default size of 200MB makes a lot more sense than the traditional
# default of 20MB. The default here is misleading, though, since # default of 20MB. The default here is misleading, though, since
# ClientStorage is the only user of ClientCache, and it always passes an # ClientStorage is the only user of ClientCache, and it always passes an
...@@ -140,57 +140,22 @@ class ClientCache(object): ...@@ -140,57 +140,22 @@ class ClientCache(object):
# to change the cache size in that case # to change the cache size in that case
self.maxsize = size self.maxsize = size
# The cache stores objects in a dict mapping (oid, tid) pairs # The number of records in the cache.
# to Object() records (see below). The tid is the transaction self._len = 0
# id that wrote the object. An object record includes data,
# serialno, and end tid. It has auxillary data structures to
# compute the appropriate tid, given the oid and a transaction id
# representing an arbitrary point in history.
#
# The serialized form of the cache just stores the Object()
# records. The in-memory form can be reconstructed from these
# records.
# Maps oid to current tid. Used to compute key for objects. # {oid -> pos}
self.current = {} self.current = ZODB.fsIndex.fsIndex()
# Maps oid to list of (start_tid, end_tid) pairs in sorted order. # {oid -> {tid->pos}}
# Used to find matching key for load of non-current data. # Note that caches in the wild seem to have very little non-current
self.noncurrent = {} # data, so this would seem to have little impact on memory consumption.
# I wonder if we even need to store non-current data in the cache.
# Map oid to (version, tid) pair. If there is no entry, the object self.noncurrent = BTrees.LOBTree.LOBTree()
# is not modified in a version.
self.version = {}
# tid for the most recent transaction we know about. This is also # tid for the most recent transaction we know about. This is also
# stored near the start of the file. # stored near the start of the file.
self.tid = None self.tid = None
# There's one Entry instance, kept in memory, for each currently
# allocated block in the file, and there's one allocated block in the
# file per serialized Object. filemap retrieves the Entry given the
# starting offset of a block, and key2entry retrieves the Entry given
# an object revision's key (an (oid, start_tid) pair). From an
# Entry, we can get the Object's key and file offset.
# Map offset in file to pair (data record size, Entry).
# Entry is None iff the block starting at offset is free.
# filemap always contains a complete account of what's in the
# file -- study method _verify_filemap for executable checking
# of the relevant invariants. An offset is at the start of a
# block iff it's a key in filemap. The data record size is
# stored in the file too, so we could just seek to the offset
# and read it up; keeping it in memory is an optimization.
self.filemap = {}
# Map key to Entry. After
# obj = key2entry[key]
# then
# obj.key == key
# is true. An object is currently stored on disk iff its key is in
# key2entry.
self.key2entry = {}
# Always the offset into the file of the start of a block. # Always the offset into the file of the start of a block.
# New and relocated objects are always written starting at # New and relocated objects are always written starting at
# currentofs. # currentofs.
...@@ -224,11 +189,9 @@ class ClientCache(object): ...@@ -224,11 +189,9 @@ class ClientCache(object):
self.f.write(magic) self.f.write(magic)
self.f.write(z64) self.f.write(z64)
# and one free block. # and one free block.
self.f.write('f' + struct.pack(">I", self.maxsize - self.f.write('f' + pack(">I", self.maxsize -
ZEC3_HEADER_SIZE)) ZEC3_HEADER_SIZE))
sync(self.f) sync(self.f)
self.filemap[ZEC3_HEADER_SIZE] = (self.maxsize - ZEC3_HEADER_SIZE,
None)
# Statistics: _n_adds, _n_added_bytes, # Statistics: _n_adds, _n_added_bytes,
# _n_evicts, _n_evicted_bytes, # _n_evicts, _n_evicted_bytes,
...@@ -237,12 +200,17 @@ class ClientCache(object): ...@@ -237,12 +200,17 @@ class ClientCache(object):
self._setup_trace(path) self._setup_trace(path)
# Backward compatibility. Client code used to have to use the fc
# attr to get to the file cache to get cache stats.
@property
def fc(self):
return self
## ##
# Scan the current contents of the cache file, calling `install` # Scan the current contents of the cache file, calling `install`
# for each object found in the cache. This method should only # for each object found in the cache. This method should only
# be called once to initialize the cache from disk. # be called once to initialize the cache from disk.
def scan(self, install): def open(self):
if self.f is not None: # we're not (re)using a pre-existing file if self.f is not None: # we're not (re)using a pre-existing file
return return
fsize = os.path.getsize(self.path) fsize = os.path.getsize(self.path)
...@@ -252,10 +220,12 @@ class ClientCache(object): ...@@ -252,10 +220,12 @@ class ClientCache(object):
fsize, self.maxsize) fsize, self.maxsize)
self.maxsize = fsize self.maxsize = fsize
self.f = open(self.path, 'rb+') self.f = open(self.path, 'rb+')
_magic = self.f.read(4) read = self.f.read
seek = self.f.seek
_magic = read(4)
if _magic != magic: if _magic != magic:
raise ValueError("unexpected magic number: %r" % _magic) raise ValueError("unexpected magic number: %r" % _magic)
self.tid = self.f.read(8) self.tid = read(8)
if len(self.tid) != 8: if len(self.tid) != 8:
raise ValueError("cache file too small -- no tid at start") raise ValueError("cache file too small -- no tid at start")
...@@ -263,38 +233,48 @@ class ClientCache(object): ...@@ -263,38 +233,48 @@ class ClientCache(object):
# file, and tell our parent about it too (via the `install` callback). # file, and tell our parent about it too (via the `install` callback).
# Remember the location of the largest free block. That seems a # Remember the location of the largest free block. That seems a
# decent place to start currentofs. # decent place to start currentofs.
max_free_size = 0 max_free_size = l = 0
ofs = max_free_offset = ZEC3_HEADER_SIZE ofs = max_free_offset = ZEC3_HEADER_SIZE
current = self.current
while ofs < fsize: while ofs < fsize:
self.f.seek(ofs) seek(ofs)
ent = None status = read(1)
status = self.f.read(1)
if status == 'a': if status == 'a':
size, rawkey = struct.unpack(">I16s", self.f.read(20)) size, oid, start_tid, end_tid = unpack(">I8s8s8s", read(28))
key = rawkey[:8], rawkey[8:]
assert key not in self.key2entry if end_tid == z64:
self.key2entry[key] = ent = Entry(key, ofs) current[oid] = ofs
install(self.f, ent) else:
assert start_tid < end_tid
self._set_noncurrent(oid, start_tid, ofs)
l += 1
elif status == 'f': elif status == 'f':
size, = struct.unpack(">I", self.f.read(4)) size, = unpack(">I", read(4))
elif status in '1234': elif status in '1234':
size = int(status) size = int(status)
else: else:
raise ValueError("unknown status byte value %s in client " raise ValueError("unknown status byte value %s in client "
"cache file" % 0, hex(ord(status))) "cache file" % 0, hex(ord(status)))
self.filemap[ofs] = size, ent
if ent is None and size > max_free_size:
max_free_size, max_free_offset = size, ofs
ofs += size ofs += size
if ofs != fsize: if ofs != fsize:
raise ValueError("final offset %s != file size %s in client " raise ValueError("final offset %s != file size %s in client "
"cache file" % (ofs, fsize)) "cache file" % (ofs, fsize))
if __debug__:
self._verify_filemap()
self.currentofs = max_free_offset self.currentofs = max_free_offset
self._len = l
def _set_noncurrent(self, oid, tid, ofs):
noncurrent_for_oid = self.noncurrent.get(u64(oid))
if noncurrent_for_oid is None:
noncurrent_for_oid = BTrees.LLBTree.LLBTree()
self.noncurrent[u64(oid)] = noncurrent_for_oid
noncurrent_for_oid[u64(tid)] = ofs
def _del_noncurrent(self, oid, tid):
noncurrent_for_oid = self.noncurrent[u64(oid)]
del noncurrent_for_oid[u64(tid)]
if not noncurrent_for_oid:
del self.noncurrent[u64(oid)]
def clearStats(self): def clearStats(self):
self._n_adds = self._n_added_bytes = 0 self._n_adds = self._n_added_bytes = 0
...@@ -310,17 +290,7 @@ class ClientCache(object): ...@@ -310,17 +290,7 @@ class ClientCache(object):
## ##
# The number of objects currently in the cache. # The number of objects currently in the cache.
def __len__(self): def __len__(self):
return len(self.key2entry) return self._len
##
# Iterate over the objects in the cache, producing an Entry for each.
def __iter__(self):
return self.key2entry.itervalues()
##
# Test whether an (oid, tid) pair is in the cache.
def __contains__(self, key):
return key in self.key2entry
## ##
# Close the underlying file. No methods accessing the cache should be # Close the underlying file. No methods accessing the cache should be
...@@ -349,159 +319,30 @@ class ClientCache(object): ...@@ -349,159 +319,30 @@ class ClientCache(object):
if self.currentofs + nbytes > self.maxsize: if self.currentofs + nbytes > self.maxsize:
self.currentofs = ZEC3_HEADER_SIZE self.currentofs = ZEC3_HEADER_SIZE
ofs = self.currentofs ofs = self.currentofs
seek = self.f.seek
read = self.f.read
current = self.current
while nbytes > 0: while nbytes > 0:
size, e = self.filemap.pop(ofs) seek(ofs)
if e is not None: status = read(1)
del self.key2entry[e.key] if status == 'a':
self._evictobj(e, size) size, oid, start_tid, end_tid = unpack(">I8s8s8s", read(28))
self._n_evicts += 1
self._n_evicted_bytes += size
if end_tid == z64:
del current[oid]
else:
self._del_noncurrent(oid, start_tid)
self._len -= 1
else:
if status == 'f':
size = unpack(">I", read(4))[0]
else:
size = int(status)
ofs += size ofs += size
nbytes -= size nbytes -= size
return ofs - self.currentofs return ofs - self.currentofs
##
# Write Object obj, with data, to file starting at currentofs.
# nfreebytes are already available for overwriting, and it's
# guranteed that's enough. obj.offset is changed to reflect the
# new data record position, and filemap and key2entry are updated to
# match.
def _writeobj(self, obj, nfreebytes):
size = OBJECT_HEADER_SIZE + obj.size
assert size <= nfreebytes
excess = nfreebytes - size
# If there's any excess (which is likely), we need to record a
# free block following the end of the data record. That isn't
# expensive -- it's all a contiguous write.
if excess == 0:
extra = ''
elif excess < 5:
extra = "01234"[excess]
else:
extra = 'f' + struct.pack(">I", excess)
self.f.seek(self.currentofs)
# Before writing data, we'll write a free block for the space freed.
# We'll come back with a last atomic write to rewrite the start of the
# allocated-block header.
self.f.write('f'+struct.pack(">I", nfreebytes))
# Now write the rest of the allocation block header and object data.
self.f.write(struct.pack(">8s8s", obj.key[0], obj.key[1]))
obj.serialize(self.f)
self.f.write(extra)
# Now, we'll go back and rewrite the beginning of the
# allocated block header.
self.f.seek(self.currentofs)
self.f.write('a'+struct.pack(">I", size))
# Update index
e = Entry(obj.key, self.currentofs)
self.key2entry[obj.key] = e
self.filemap[self.currentofs] = size, e
self.currentofs += size
if excess:
# We need to record the free block in filemap, but there's
# no need to advance currentofs beyond it. Instead it
# gives some breathing room for the next object to get
# written.
self.filemap[self.currentofs] = excess, None
##
# Add Object object to the cache. This may evict existing objects, to
# make room (and almost certainly will, in steady state once the cache
# is first full). The object must not already be in the cache. If the
# object is too large for the cache, False is returned, otherwise True.
def add(self, object):
size = OBJECT_HEADER_SIZE + object.size
# A number of cache simulation experiments all concluded that the
# 2nd-level ZEO cache got a much higher hit rate if "very large"
# objects simply weren't cached. For now, we ignore the request
# only if the entire cache file is too small to hold the object.
if size > self.maxsize - ZEC3_HEADER_SIZE:
return False
assert object.key not in self.key2entry
assert len(object.key[0]) == 8
assert len(object.key[1]) == 8
self._n_adds += 1
self._n_added_bytes += size
available = self._makeroom(size)
self._writeobj(object, available)
return True
##
# Evict the object represented by Entry `e` from the cache, freeing
# `size` bytes in the file for reuse. `size` is used only for summary
# statistics. This does not alter the file, or self.filemap or
# self.key2entry (those are the caller's responsibilities). It does
# invoke _evicted(Object) on our parent.
def _evictobj(self, e, size):
self._n_evicts += 1
self._n_evicted_bytes += size
# Load the object header into memory so we know how to
# update the parent's in-memory data structures.
self.f.seek(e.offset + OBJECT_HEADER_SIZE)
o = Object.fromFile(self.f, e.key, skip_data=True)
self._evicted(o)
##
# Return Object for key, or None if not in cache.
def access(self, key):
self._n_accesses += 1
e = self.key2entry.get(key)
if e is None:
return None
offset = e.offset
size, e2 = self.filemap[offset]
assert e is e2
self.f.seek(offset + OBJECT_HEADER_SIZE)
return Object.fromFile(self.f, key)
##
# Remove Object for key from cache, if present.
def remove(self, key):
# If an object is being explicitly removed, we need to load
# its header into memory and write a free block marker to the
# disk where the object was stored. We need to load the
# header to update the in-memory data structures held by
# ClientCache.
# We could instead just keep the header in memory at all times.
e = self.key2entry.pop(key, None)
if e is None:
return
offset = e.offset
size, e2 = self.filemap[offset]
assert e is e2
self.filemap[offset] = size, None
self.f.seek(offset + OBJECT_HEADER_SIZE)
o = Object.fromFile(self.f, key, skip_data=True)
assert size >= 5 # only free blocks are tiny
# Because `size` >= 5, we can change an allocated block to a free
# block just by overwriting the 'a' status byte with 'f' -- the
# size field stays the same.
self.f.seek(offset)
self.f.write('f')
self.f.flush()
self._evicted(o)
##
# Update on-disk representation of Object obj.
#
# This method should be called when the object header is modified.
# obj must be in the cache. The only real use for this is during
# invalidation, to set the end_tid field on a revision that was current
# (and so had an end_tid of None, but no longer does).
def update(self, obj):
e = self.key2entry[obj.key]
self.f.seek(e.offset + OBJECT_HEADER_SIZE)
obj.serialize_header(self.f)
## ##
# Update our idea of the most recent tid. This is stored in the # Update our idea of the most recent tid. This is stored in the
# instance, and also written out near the start of the cache file. The # instance, and also written out near the start of the cache file. The
...@@ -518,100 +359,6 @@ class ClientCache(object): ...@@ -518,100 +359,6 @@ class ClientCache(object):
self.f.write(tid) self.f.write(tid)
self.f.flush() self.f.flush()
##
# This debug method marches over the entire cache file, verifying that
# the current contents match the info in self.filemap and self.key2entry.
def _verify_filemap(self, display=False):
a = ZEC3_HEADER_SIZE
f = self.f
while a < self.maxsize:
f.seek(a)
status = f.read(1)
if status in 'af':
size, = struct.unpack(">I", f.read(4))
else:
size = int(status)
if display:
if a == self.currentofs:
print '*****',
print "%c%d" % (status, size),
size2, obj = self.filemap[a]
assert size == size2
assert (obj is not None) == (status == 'a')
if obj is not None:
assert obj.offset == a
assert self.key2entry[obj.key] is obj
a += size
if display:
print
assert a == self.maxsize
def open(self):
self.scan(self.install)
##
# Callback for FileCache.scan(), when a pre-existing file cache is
# used. For each object in the file, `install()` is invoked. `f`
# is the file object, positioned at the start of the serialized Object.
# `ent` is an Entry giving the object's key ((oid, start_tid) pair).
def install(self, f, ent):
# Called by cache storage layer to insert object.
o = Object.fromFile(f, ent.key, skip_data=True)
if o is None:
return
oid = o.key[0]
if o.version:
self.version[oid] = o.version, o.start_tid
elif o.end_tid is None:
self.current[oid] = o.start_tid
else:
assert o.start_tid < o.end_tid
this_span = o.start_tid, o.end_tid
span_list = self.noncurrent.get(oid)
if span_list:
bisect.insort_left(span_list, this_span)
else:
self.noncurrent[oid] = [this_span]
##
# Set the last transaction seen by the cache.
# @param tid a transaction id
# @exception ValueError attempt to set a new tid less than the current tid
## ##
# Return the last transaction seen by the cache. # Return the last transaction seen by the cache.
# @return a transaction id # @return a transaction id
...@@ -631,28 +378,29 @@ class ClientCache(object): ...@@ -631,28 +378,29 @@ class ClientCache(object):
# @defreturn 3-tuple: (string, string, string) # @defreturn 3-tuple: (string, string, string)
def load(self, oid, version=""): def load(self, oid, version=""):
tid = None ofs = self.current.get(oid)
if version: if ofs is None:
p = self.version.get(oid)
if p is None:
self._trace(0x20, oid, version)
return None
elif p[0] == version:
tid = p[1]
# Otherwise, we know the cache has version data but not
# for the requested version. Thus, we know it is safe
# to return the non-version data from the cache.
if tid is None:
tid = self.current.get(oid)
if tid is None:
self._trace(0x20, oid, version) self._trace(0x20, oid, version)
return None return None
o = self.access((oid, tid)) self.f.seek(ofs)
if o is None: read = self.f.read
assert read(1) == 'a'
size, saved_oid, tid, end_tid, lver, ldata = unpack(
">I8s8s8shI", read(34))
assert saved_oid == oid
if lver or version:
if lver != len(version) or read(lver) != version:
self._trace(0x20, oid, version) self._trace(0x20, oid, version)
return None return None
self._trace(0x22, oid, version, o.start_tid, o.end_tid, len(o.data))
return o.data, tid, o.version
data = read(ldata)
assert len(data) == ldata
assert read(8) == oid
self._n_accesses += 1
self._trace(0x22, oid, version, tid, end_tid, ldata)
return data, tid, version
## ##
# Return a non-current revision of oid that was current before tid. # Return a non-current revision of oid that was current before tid.
...@@ -661,30 +409,38 @@ class ClientCache(object): ...@@ -661,30 +409,38 @@ class ClientCache(object):
# @return data record, serial number, start tid, and end tid # @return data record, serial number, start tid, and end tid
# @defreturn 4-tuple: (string, string, string, string) # @defreturn 4-tuple: (string, string, string, string)
def loadBefore(self, oid, tid): def loadBefore(self, oid, before_tid):
L = self.noncurrent.get(oid) noncurrent_for_oid = self.noncurrent.get(u64(oid))
if L is None: if noncurrent_for_oid is None:
self._trace(0x24, oid, "", tid) self._trace(0x24, oid, "", before_tid)
return None return None
# A pair with None as the second element is less than any pair with
# the same first tid. Dubious: this relies on that None is less items = noncurrent_for_oid.items(None, u64(before_tid)-1)
# than any comparable non-None object in recent Pythons. if not items:
i = bisect.bisect_left(L, (tid, None)) self._trace(0x24, oid, "", before_tid)
# Now L[i-1] < (tid, None) < L[i], and the start_tid for everything in return None
# L[:i] is < tid, and the start_tid for everything in L[i:] is >= tid. tid, ofs = items[-1]
# Therefore the largest start_tid < tid must be at L[i-1]. If i is 0,
# there is no start_tid < tid: we don't have any data old enougn. self.f.seek(ofs)
if i == 0: read = self.f.read
self._trace(0x24, oid, "", tid) assert read(1) == 'a'
return size, saved_oid, saved_tid, end_tid, lver, ldata = unpack(
lo, hi = L[i-1] ">I8s8s8shI", read(34))
assert lo < tid assert saved_oid == oid
if tid > hi: # we don't have any data in the right range assert saved_tid == p64(tid)
self._trace(0x24, oid, "", tid) assert lver == 0
assert end_tid != z64
data = read(ldata)
assert len(data) == ldata
assert read(8) == oid
if end_tid < before_tid:
self._trace(0x24, oid, "", before_tid)
return None return None
o = self.access((oid, lo))
self._trace(0x26, oid, "", tid) self._n_accesses += 1
return o.data, o.start_tid, o.end_tid self._trace(0x26, oid, "", saved_tid)
return data, saved_tid, end_tid
## ##
# Return the version an object is modified in, or None for an # Return the version an object is modified in, or None for an
...@@ -693,12 +449,20 @@ class ClientCache(object): ...@@ -693,12 +449,20 @@ class ClientCache(object):
# @return name of version in which the object is modified # @return name of version in which the object is modified
# @defreturn string or None # @defreturn string or None
# XXX This approac is wrong, but who cares
def modifiedInVersion(self, oid): def modifiedInVersion(self, oid):
p = self.version.get(oid) ofs = self.current.get(oid)
if p is None: if ofs is None:
return None
self.f.seek(ofs)
read = self.f.read
assert self.f.read(1) == 'a'
size, saved_oid, saved_tid, end_tid, lver, ldata = unpack(
">I8s8s8shI", read(34))
assert saved_oid == oid
if lver:
return read(lver)
return None return None
version, tid = p
return version
## ##
# Store a new data record in the cache. # Store a new data record in the cache.
...@@ -720,62 +484,89 @@ class ClientCache(object): ...@@ -720,62 +484,89 @@ class ClientCache(object):
# the requested version, doesn't find it, then asks the server # the requested version, doesn't find it, then asks the server
# for that data. The server returns the non-version data, # for that data. The server returns the non-version data,
# which may already be in the cache. # which may already be in the cache.
if (oid, start_tid) in self: seek = self.f.seek
if end_tid is None:
ofs = self.current.get(oid)
if ofs:
seek(ofs)
read = self.f.read
assert read(1) == 'a'
size, saved_oid, saved_tid, end_tid = unpack(
">I8s8s8s", read(28))
assert saved_oid == oid
assert end_tid == z64
if saved_tid == start_tid:
return
raise ValueError("already have current data for oid")
else:
noncurrent_for_oid = self.noncurrent.get(u64(oid))
if noncurrent_for_oid and (u64(start_tid) in noncurrent_for_oid):
return return
o = Object((oid, start_tid), version, data, start_tid, end_tid)
if version: if version:
if end_tid is not None:
raise ValueError("cache only stores current version data") raise ValueError("cache only stores current version data")
if oid in self.version:
if self.version[oid] != (version, start_tid): size = 43 + len(version) + len(data)
raise ValueError("data already exists for version %r"
% self.version[oid][0]) # A number of cache simulation experiments all concluded that the
if not self.add(o): # 2nd-level ZEO cache got a much higher hit rate if "very large"
return # too large # objects simply weren't cached. For now, we ignore the request
self.version[oid] = version, start_tid # only if the entire cache file is too small to hold the object.
self._trace(0x50, oid, version, start_tid, dlen=len(data)) if size > self.maxsize - ZEC3_HEADER_SIZE:
return
self._n_adds += 1
self._n_added_bytes += size
self._len += 1
nfreebytes = self._makeroom(size)
assert size <= nfreebytes
excess = nfreebytes - size
# If there's any excess (which is likely), we need to record a
# free block following the end of the data record. That isn't
# expensive -- it's all a contiguous write.
if excess == 0:
extra = ''
elif excess < 5:
extra = "01234"[excess]
else: else:
if end_tid is None: extra = 'f' + pack(">I", excess)
_cur_start = self.current.get(oid)
if _cur_start: ofs = self.currentofs
if _cur_start != start_tid: seek(ofs)
raise ValueError( write = self.f.write
"already have current data for oid")
# Before writing data, we'll write a free block for the space freed.
# We'll come back with a last atomic write to rewrite the start of the
# allocated-block header.
write('f'+pack(">I", nfreebytes))
# Now write the rest of the allocation block header and object data.
write(pack(">8s8s8shi",
oid, start_tid, end_tid or z64, len(version), len(data),
))
if version:
write(version)
write(data)
write(oid)
write(extra)
# Now, we'll go back and rewrite the beginning of the
# allocated block header.
seek(ofs)
write('a'+pack(">I", size))
if end_tid:
self._set_noncurrent(oid, start_tid, ofs)
self._trace(0x54, oid, version, start_tid, end_tid, dlen=len(data))
else: else:
return self.current[oid] = ofs
if not self.add(o): if version:
return # too large self._trace(0x50, oid, version, start_tid, dlen=len(data))
self.current[oid] = start_tid
self._trace(0x52, oid, version, start_tid, dlen=len(data))
else: else:
L = self.noncurrent.setdefault(oid, []) self._trace(0x52, oid, version, start_tid, dlen=len(data))
p = start_tid, end_tid
if p in L:
return # duplicate store
if not self.add(o):
return # too large
bisect.insort_left(L, p)
self._trace(0x54, oid, version, start_tid, end_tid,
dlen=len(data))
## self.currentofs += size
# Remove all knowledge of noncurrent revisions of oid, both in
# self.noncurrent and in our FileCache. `version` and `tid` are used
# only for trace records.
def _remove_noncurrent_revisions(self, oid, version, tid):
noncurrent_list = self.noncurrent.get(oid)
if noncurrent_list:
# Note: must iterate over a copy of noncurrent_list. The
# FileCache remove() calls our _evicted() method, and that
# mutates the list.
for old_tid, dummy in noncurrent_list[:]:
# 0x1E = invalidate (hit, discarding current or non-current)
self._trace(0x1E, oid, version, tid)
self.remove((oid, old_tid))
# fc.remove() calling back to _evicted() should have removed
# the list from noncurrent when the last non-current revision
# was removed.
assert oid not in self.noncurrent
## ##
# If `tid` is None, or we have data for `oid` in a (non-empty) version, # If `tid` is None, or we have data for `oid` in a (non-empty) version,
...@@ -794,62 +585,36 @@ class ClientCache(object): ...@@ -794,62 +585,36 @@ class ClientCache(object):
if tid > self.tid and tid is not None: if tid > self.tid and tid is not None:
self.setLastTid(tid) self.setLastTid(tid)
remove_all_knowledge_of_oid = tid is None ofs = self.current.get(oid)
if ofs is None:
if oid in self.version:
# Forget we know about the version data.
# 0x1A = invalidate (hit, version)
self._trace(0x1A, oid, version, tid)
dllversion, dlltid = self.version[oid]
assert not version or version == dllversion, (version, dllversion)
self.remove((oid, dlltid))
assert oid not in self.version # .remove() got rid of it
# And continue: we must also remove any non-version data from
# the cache. Or, at least, I have such a poor understanding of
# versions that anything less drastic would probably be wrong.
remove_all_knowledge_of_oid = True
if remove_all_knowledge_of_oid:
self._remove_noncurrent_revisions(oid, version, tid)
# Only current, non-version data remains to be handled.
cur_tid = self.current.get(oid)
if not cur_tid:
# 0x10 == invalidate (miss) # 0x10 == invalidate (miss)
self._trace(0x10, oid, version, tid) self._trace(0x10, oid, version, tid)
return return
# We had current data for oid, but no longer. self.f.seek(ofs)
read = self.f.read
if remove_all_knowledge_of_oid: assert read(1) == 'a'
size, saved_oid, saved_tid, end_tid, lver = unpack(
">I8s8s8sh", read(30))
assert saved_oid == oid
assert end_tid == z64
del self.current[oid]
if tid is None or lver:
self.f.seek(ofs)
self.f.write('f'+pack(">I", size))
if lver:
# 0x1A = invalidate (hit, version)
self._trace(0x1A, oid, version, tid)
else:
# 0x1E = invalidate (hit, discarding current or non-current) # 0x1E = invalidate (hit, discarding current or non-current)
self._trace(0x1E, oid, version, tid) self._trace(0x1E, oid, version, tid)
self.remove((oid, cur_tid)) self._len -= 1
assert cur_tid not in self.current # .remove() got rid of it else:
return self.f.seek(ofs+21)
self.f.write(tid)
# Add the data we have to the list of non-current data for oid. self._set_noncurrent(oid, saved_tid, ofs)
assert tid is not None and cur_tid <= tid
# 0x1C = invalidate (hit, saving non-current) # 0x1C = invalidate (hit, saving non-current)
self._trace(0x1C, oid, version, tid) self._trace(0x1C, oid, version, tid)
del self.current[oid] # because we no longer have current data
# Update the end_tid half of oid's validity range on disk.
# TODO: Want to fetch object without marking it as accessed.
o = self.access((oid, cur_tid))
assert o is not None
assert o.end_tid is None # i.e., o was current
if o is None:
# TODO: Since we asserted o is not None above, this block
# should be removed; waiting on time to prove it can't happen.
return
o.end_tid = tid
self.update(o) # record the new end_tid on disk
# Add to oid's list of non-current data.
L = self.noncurrent.setdefault(oid, [])
bisect.insort_left(L, (cur_tid, tid))
## ##
# Generates (oid, serial, version) triples for all objects in the # Generates (oid, serial, version) triples for all objects in the
...@@ -857,13 +622,19 @@ class ClientCache(object): ...@@ -857,13 +622,19 @@ class ClientCache(object):
def contents(self): def contents(self):
# May need to materialize list instead of iterating; # May need to materialize list instead of iterating;
# depends on whether the caller may change the cache. # depends on whether the caller may change the cache.
for o in self: seek = self.f.seek
oid, tid = o.key read = self.f.read
if oid in self.version: for oid, ofs in self.current.iteritems():
obj = self.access(o.key) seek(ofs)
yield oid, tid, obj.version assert read(1) == 'a'
size, saved_oid, tid, end_tid, lver = unpack(">I8s8s8sh", read(30))
assert saved_oid == oid
assert end_tid == z64
if lver:
version = read(lver)
else: else:
yield oid, tid, "" version = ''
yield oid, tid, version
def dump(self): def dump(self):
from ZODB.utils import oid_repr from ZODB.utils import oid_repr
...@@ -880,29 +651,6 @@ class ClientCache(object): ...@@ -880,29 +651,6 @@ class ClientCache(object):
print oid_repr(x.key[0]), oid_repr(x.key[1]), oid_repr(end_tid) print oid_repr(x.key[0]), oid_repr(x.key[1]), oid_repr(end_tid)
print print
def _evicted(self, o):
# Called by the FileCache to signal that Object o has been evicted.
oid, tid = o.key
if o.end_tid is None:
if o.version:
del self.version[oid]
else:
del self.current[oid]
else:
# Although we use bisect to keep the list sorted,
# we never expect the list to be very long. So the
# brute force approach should normally be fine.
L = self.noncurrent[oid]
element = (o.start_tid, o.end_tid)
if len(L) == 1:
# We don't want to leave an empty list in the dict: if
# the oid is never referenced again, it would consume RAM
# forever more for no purpose.
assert L[0] == element
del self.noncurrent[oid]
else:
L.remove(element)
# If `path` isn't None (== we're using a persistent cache file), and # If `path` isn't None (== we're using a persistent cache file), and
# envar ZEO_CACHE_TRACE is set to a non-empty value, try to open # envar ZEO_CACHE_TRACE is set to a non-empty value, try to open
# path+'.trace' as a trace file, and store the file object in # path+'.trace' as a trace file, and store the file object in
...@@ -910,27 +658,22 @@ class ClientCache(object): ...@@ -910,27 +658,22 @@ class ClientCache(object):
# tracing by setting self._trace to a dummy function, and set # tracing by setting self._trace to a dummy function, and set
# self._tracefile to None. # self._tracefile to None.
def _setup_trace(self, path): def _setup_trace(self, path):
self._tracefile = None _tracefile = None
if path and os.environ.get("ZEO_CACHE_TRACE"): if path and os.environ.get("ZEO_CACHE_TRACE"):
tfn = path + ".trace" tfn = path + ".trace"
try: try:
self._tracefile = open(tfn, "ab") _tracefile = open(tfn, "ab")
self._trace(0x00)
except IOError, msg: except IOError, msg:
self._tracefile = None
logger.warning("cannot write tracefile %r (%s)", tfn, msg) logger.warning("cannot write tracefile %r (%s)", tfn, msg)
else: else:
logger.info("opened tracefile %r", tfn) logger.info("opened tracefile %r", tfn)
if self._tracefile is None: if _tracefile is None:
def notrace(*args, **kws): self._trace = lambda *a, **k: None
pass return
self._trace = notrace
def _trace(self, now = time.time
code, oid="", version="", tid=z64, end_tid=z64, dlen=0, def _trace(code, oid="", version="", tid=z64, end_tid=z64, dlen=0):
# The next two are just speed hacks.
time_time=time.time, struct_pack=struct.pack):
# The code argument is two hex digits; bits 0 and 7 must be zero. # The code argument is two hex digits; bits 0 and 7 must be zero.
# The first hex digit shows the operation, the second the outcome. # The first hex digit shows the operation, the second the outcome.
# This method has been carefully tuned to be as fast as possible. # This method has been carefully tuned to be as fast as possible.
...@@ -943,171 +686,16 @@ class ClientCache(object): ...@@ -943,171 +686,16 @@ class ClientCache(object):
if end_tid is None: if end_tid is None:
end_tid = z64 end_tid = z64
try: try:
self._tracefile.write( _tracefile.write(
struct_pack(">iiH8s8s", pack(">iiH8s8s",
time_time(), now(), encoded, len(oid), tid, end_tid) + oid,
encoded, )
len(oid),
tid, end_tid) + oid)
except: except:
print `tid`, `end_tid` print `tid`, `end_tid`
raise raise
## self._trace = _trace
# An Object stores the cached data for a single object. _trace(0x00)
# <p>
# The cached data includes the actual object data, the key, and three
# data fields that describe the validity period and version of the
# object. The key contains the oid and a redundant start_tid. The
# actual size of an object is variable, depending on the size of the
# data and whether it is in a version.
# <p>
# The serialized format does not include the key, because it is stored
# in the header used by the cache file's storage format.
# <p>
# Instances of Object are generally short-lived -- they're really a way to
# package data on the way to or from the disk file.
class Object(object):
__slots__ = (# pair (object id, txn id) -- something usable as a dict key;
# the second part of the pair is equal to start_tid
"key",
# string, tid of txn that wrote the data
"start_tid",
# string, tid of txn that wrote next revision, or None
# if the data is current; if not None, end_tid is strictly
# greater than start_tid
"end_tid",
# string, name of version
"version",
# string, the actual data record for the object
"data",
# total size of serialized object; this includes the
# data, version, and all overhead (header) bytes.
"size",
)
# A serialized Object on disk looks like:
#
# offset # bytes value
# ------ ------- -----
# 0 8 end_tid; string
# 8 2 len(version); 2-byte signed int
# 10 4 len(data); 4-byte signed int
# 14 len(version) version; string
# 14+len(version) len(data) the object pickle; string
# 14+len(version)+
# len(data) 8 oid; string
# The serialization format uses an end tid of "\0"*8 (z64), the least
# 8-byte string, to represent None. It isn't possible for an end_tid
# to be 0, because it must always be strictly greater than the start_tid.
fmt = ">8shi" # end_tid, len(self.version), len(self.data)
FIXED_HEADER_SIZE = struct.calcsize(fmt)
assert FIXED_HEADER_SIZE == 14
TOTAL_FIXED_SIZE = FIXED_HEADER_SIZE + 8 # +8 for the oid at the end
def __init__(self, key, version, data, start_tid, end_tid):
self.key = key
self.version = version
self.data = data
self.start_tid = start_tid
self.end_tid = end_tid
# The size of the serialized object on disk, including the
# 14-byte header, the lengths of data and version, and a
# copy of the 8-byte oid.
if data is not None:
self.size = self.TOTAL_FIXED_SIZE + len(data) + len(version)
##
# Return the fixed-sized serialization header as a string: pack end_tid,
# and the lengths of the .version and .data members.
def get_header(self):
return struct.pack(self.fmt,
self.end_tid or z64,
len(self.version),
len(self.data))
##
# Write the serialized representation of self to file f, at its current
# position.
def serialize(self, f):
f.writelines([self.get_header(),
self.version,
self.data,
self.key[0]])
##
# Write the fixed-size header for self, to file f at its current position.
# The only real use for this is when the current revision of an object
# in cache is invalidated. Then the end_tid field gets set to the tid
# of the transaction that caused the invalidation.
def serialize_header(self, f):
f.write(self.get_header())
##
# fromFile is a class constructor, unserializing an Object from the
# current position in file f. Exclusive access to f for the duration
# is assumed. The key is a (oid, start_tid) pair, and the oid must
# match the serialized oid. If `skip_data` is true, .data is left
# None in the Object returned, but all the other fields are populated.
# Else (`skip_data` is false, the default), all fields including .data
# are populated. .data can be big, so it's prudent to skip it when it
# isn't needed.
def fromFile(cls, f, key, skip_data=False):
s = f.read(cls.FIXED_HEADER_SIZE)
if not s:
return None
oid, start_tid = key
end_tid, vlen, dlen = struct.unpack(cls.fmt, s)
if end_tid == z64:
end_tid = None
version = f.read(vlen)
if vlen != len(version):
raise ValueError("corrupted record, version")
if skip_data:
data = None
f.seek(dlen, 1)
else:
data = f.read(dlen)
if dlen != len(data):
raise ValueError("corrupted record, data")
s = f.read(8)
if s != oid:
raise ValueError("corrupted record, oid")
return cls((oid, start_tid), version, data, start_tid, end_tid)
fromFile = classmethod(fromFile)
# Entry just associates a key with a file offset. It's used by FileCache.
class Entry(object):
__slots__ = (# object key -- something usable as a dict key.
'key',
# Offset from start of file to the object's data
# record; this includes all overhead bytes (status
# byte, size bytes, etc). The size of the data
# record is stored in the file near the start of the
# record, but for efficiency we also keep size in a
# dict (filemap; see later).
'offset',
)
def __init__(self, key=None, offset=None):
self.key = key
self.offset = offset
def sync(f): def sync(f):
......
...@@ -20,7 +20,7 @@ import zope.testing.setupstack ...@@ -20,7 +20,7 @@ import zope.testing.setupstack
from zope.testing import doctest from zope.testing import doctest
import ZEO.cache import ZEO.cache
from ZODB.utils import p64 from ZODB.utils import p64, u64
n1 = p64(1) n1 = p64(1)
n2 = p64(2) n2 = p64(2)
...@@ -45,9 +45,9 @@ class CacheTests(unittest.TestCase): ...@@ -45,9 +45,9 @@ class CacheTests(unittest.TestCase):
self.assertEqual(self.cache.getLastTid(), None) self.assertEqual(self.cache.getLastTid(), None)
self.cache.setLastTid(n2) self.cache.setLastTid(n2)
self.assertEqual(self.cache.getLastTid(), n2) self.assertEqual(self.cache.getLastTid(), n2)
self.cache.invalidate(None, "", n1) self.cache.invalidate(n1, "", n1)
self.assertEqual(self.cache.getLastTid(), n2) self.assertEqual(self.cache.getLastTid(), n2)
self.cache.invalidate(None, "", n3) self.cache.invalidate(n1, "", n3)
self.assertEqual(self.cache.getLastTid(), n3) self.assertEqual(self.cache.getLastTid(), n3)
self.assertRaises(ValueError, self.cache.setLastTid, n2) self.assertRaises(ValueError, self.cache.setLastTid, n2)
...@@ -99,9 +99,9 @@ class CacheTests(unittest.TestCase): ...@@ -99,9 +99,9 @@ class CacheTests(unittest.TestCase):
self.assertEqual(self.cache.loadBefore(n2, n4), None) self.assertEqual(self.cache.loadBefore(n2, n4), None)
def testException(self): def testException(self):
# Not allowed to save non-current version data
self.assertRaises(ValueError, self.assertRaises(ValueError,
self.cache.store, self.cache.store, n1, "version", n2, n3, "data")
n1, "version", n2, n3, "data")
self.cache.store(n1, "", n2, None, "data") self.cache.store(n1, "", n2, None, "data")
self.assertRaises(ValueError, self.assertRaises(ValueError,
self.cache.store, self.cache.store,
...@@ -149,9 +149,10 @@ class CacheTests(unittest.TestCase): ...@@ -149,9 +149,10 @@ class CacheTests(unittest.TestCase):
eq = self.assertEqual eq = self.assertEqual
eq(copy.getLastTid(), self.cache.getLastTid()) eq(copy.getLastTid(), self.cache.getLastTid())
eq(len(copy), len(self.cache)) eq(len(copy), len(self.cache))
eq(copy.version, self.cache.version) eq(dict(copy.current), dict(self.cache.current))
eq(copy.current, self.cache.current) eq(dict([(k, dict(v)) for (k, v) in copy.noncurrent.items()]),
eq(copy.noncurrent, self.cache.noncurrent) dict([(k, dict(v)) for (k, v) in self.cache.noncurrent.items()]),
)
def testCurrentObjectLargerThanCache(self): def testCurrentObjectLargerThanCache(self):
if self.cache.path: if self.cache.path:
...@@ -181,7 +182,7 @@ class CacheTests(unittest.TestCase): ...@@ -181,7 +182,7 @@ class CacheTests(unittest.TestCase):
self.assertEquals(None, self.cache.load(n1)) self.assertEquals(None, self.cache.load(n1))
# If an object cannot be stored in the cache, it must not be # If an object cannot be stored in the cache, it must not be
# recorded as non-current. # recorded as non-current.
self.assert_((n2, n3) not in cache.noncurrent[n1]) self.assert_(1 not in cache.noncurrent)
__test__ = dict( __test__ = dict(
kill_does_not_cause_cache_corruption = kill_does_not_cause_cache_corruption =
...@@ -233,9 +234,7 @@ __test__ = dict( ...@@ -233,9 +234,7 @@ __test__ = dict(
>>> import ZEO.cache, ZODB.utils >>> import ZEO.cache, ZODB.utils
>>> cache = ZEO.cache.ClientCache('cache', 1000) >>> cache = ZEO.cache.ClientCache('cache', 1000)
>>> data = 'X' * (1000 - ZEO.cache.ZEC3_HEADER_SIZE >>> data = 'X' * (1000 - ZEO.cache.ZEC3_HEADER_SIZE - 43)
... - ZEO.cache.OBJECT_HEADER_SIZE
... - ZEO.cache.Object.TOTAL_FIXED_SIZE)
>>> cache.store(ZODB.utils.p64(1), '', ZODB.utils.p64(1), None, data) >>> cache.store(ZODB.utils.p64(1), '', ZODB.utils.p64(1), None, data)
>>> cache.close() >>> cache.close()
>>> cache = ZEO.cache.ClientCache('cache', 1000) >>> cache = ZEO.cache.ClientCache('cache', 1000)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment