Commit 90c63e7e authored by Jim Fulton's avatar Jim Fulton

Reimplemented the ZEO Blob protocol:

- Avoid more than one round-trip call when loading blobs via copy from
  the server.

- Avoid loading large amounts of blob data into memory.  The old
  storeBlob implementation was likely to queue blob adta faster than
  it could be sent, leading to a large memory foot print for the
  queue. Now, iterators are used to read data from files only when the
  network layer is ready to send it.

- Fixed storeBlob to move the input file to the blob cache (when not
  sharing the blob directiry with the server).

- Extended the loadBlob locking model to work with multiple processes
  by using file locks rather than threading locks.  A common
  configuration is to use a client process per core, so that a machine
  is likely to have many client processes and it should be possible
  for the client processes to share a common blob cache.
parent 2745f437
This diff is collapsed.
......@@ -60,3 +60,18 @@ class ClientStorage:
def info(self, arg):
self.rpc.callAsync('info', arg)
def storeBlob(self, oid, serial, blobfilename):
def store():
yield ('recieveBlobStart', (oid, serial))
f = open(blobfilename, 'rb')
while 1:
chunk = f.read(59000)
if not chunk:
break
yield ('recieveBlobChunk', (oid, serial, chunk, ))
f.close()
yield ('recieveBlobStop', (oid, serial))
self.rpc.callAsyncIterator(store())
......@@ -13,6 +13,7 @@
##############################################################################
"""RPC stubs for interface exported by StorageServer."""
import os
import time
##
......@@ -219,11 +220,29 @@ class StorageServer:
def storea(self, oid, serial, data, version, id):
self.rpc.callAsync('storea', oid, serial, data, version, id)
def storeBlobEnd(self, oid, serial, data, version, id):
self.rpc.callAsync('storeBlobEnd', oid, serial, data, version, id)
def storeBlob(self, oid, serial, data, blobfilename, version, txn):
def storeBlob(self, oid, serial, chunk, version, id):
self.rpc.callAsync('storeBlob', oid, serial, chunk, version, id)
# Store a blob to the server. We don't want to real all of
# the data into memory, so we use a message iterator. This
# allows us to read the blob data as needed.
if blobfilename is None:
self.rpc.callAsync('storeEmptyBlob',
oid, serial, data, version, id(txn))
return
def store():
yield ('storeBlobStart', ())
f = open(blobfilename, 'rb')
while 1:
chunk = f.read(59000)
if not chunk:
break
yield ('storeBlobChunk', (chunk, ))
f.close()
yield ('storeBlobEnd', (oid, serial, data, version, id(txn)))
self.rpc.callAsyncIterator(store())
def storeBlobShared(self, oid, serial, data, filename, version, id):
self.rpc.callAsync('storeBlobShared', oid, serial, data, filename,
......@@ -271,8 +290,8 @@ class StorageServer:
def load(self, oid, version):
return self.rpc.call('load', oid, version)
def loadBlob(self, oid, serial, version, offset):
return self.rpc.call('loadBlob', oid, serial, version, offset)
def sendBlob(self, oid, serial):
return self.rpc.call('sendBlob', oid, serial)
def getTid(self, oid):
return self.rpc.call('getTid', oid)
......
......@@ -25,6 +25,7 @@ import cPickle
import logging
import os
import sys
import tempfile
import threading
import time
import warnings
......@@ -103,9 +104,8 @@ class ZEOStorage:
self.log_label = _label
self.authenticated = 0
self.auth_realm = auth_realm
self.blob_transfer = {}
self.blob_tempfile = None
self.blob_log = []
self.blob_loads = {}
# The authentication protocol may define extra methods.
self._extensions = {}
for func in self.extensions:
......@@ -525,25 +525,22 @@ class ZEOStorage:
self.stats.stores += 1
self.txnlog.store(oid, serial, data, version)
def storeBlobStart(self):
assert self.blob_tempfile is None
self.blob_tempfile = tempfile.mkstemp(
dir=self.storage.temporaryDirectory())
def storeBlobChunk(self, chunk):
os.write(self.blob_tempfile[0], chunk)
def storeBlobEnd(self, oid, serial, data, version, id):
key = (oid, id)
if key not in self.blob_transfer:
raise Exception, "Can't finish a non-started Blob"
tempname, tempfile = self.blob_transfer.pop(key)
tempfile.close()
fd, tempname = self.blob_tempfile
self.blob_tempfile = None
os.close(fd)
self.blob_log.append((oid, serial, data, tempname, version))
def storeBlob(self, oid, serial, chunk, version, id):
# XXX check that underlying storage supports blobs
key = (oid, id)
if key not in self.blob_transfer:
tempname = mktemp()
tempfile = open(tempname, "wb")
# XXX Force close and remove them when Storage closes
self.blob_transfer[key] = (tempname, tempfile)
else:
tempname, tempfile = self.blob_transfer[key]
tempfile.write(chunk)
def storeEmptyBlob(self, oid, serial, data, version, id):
self.blob_log.append((oid, serial, data, None, version))
def storeBlobShared(self, oid, serial, data, filename, version, id):
# Reconstruct the full path from the filename in the OID directory
......@@ -551,17 +548,8 @@ class ZEOStorage:
filename)
self.blob_log.append((oid, serial, data, filename, version))
def loadBlob(self, oid, serial, version, offset):
key = (oid, serial)
if not key in self.blob_loads:
self.blob_loads[key] = \
open(self.storage.loadBlob(oid, serial, version))
blobdata = self.blob_loads[key]
blobdata.seek(offset)
chunk = blobdata.read(4096)
if not chunk:
del self.blob_loads[key]
return chunk
def sendBlob(self, oid, serial):
self.client.storeBlob(oid, serial, self.storage.loadBlob(oid, serial))
# The following four methods return values, so they must acquire
# the storage lock and begin the transaction before returning.
......
......@@ -59,12 +59,14 @@ class TransactionBuffer:
self.closed = 0
self.count = 0
self.size = 0
self.blobs = []
# It's safe to use a fast pickler because the only objects
# stored are builtin types -- strings or None.
self.pickler = cPickle.Pickler(self.file, 1)
self.pickler.fast = 1
def close(self):
self.clear()
self.lock.acquire()
try:
self.closed = 1
......@@ -82,6 +84,9 @@ class TransactionBuffer:
finally:
self.lock.release()
def storeBlob(self, oid, blobfilename):
self.blobs.append((oid, blobfilename))
def _store(self, oid, version, data):
"""Store oid, version, data for later retrieval"""
if self.closed:
......@@ -113,6 +118,10 @@ class TransactionBuffer:
self.file.seek(0)
self.count = 0
self.size = 0
while self.blobs:
oid, serial, blobfilename = self.blobs.pop()
if os.path.exists(blobfilename):
os.remove(blobfilename)
finally:
self.lock.release()
......
......@@ -22,6 +22,7 @@ import random
import signal
import socket
import tempfile
import threading
import time
import unittest
import shutil
......@@ -520,91 +521,99 @@ class CommonBlobTests:
self._storage.tpc_abort(t)
raise
filename = self._storage.loadBlob(oid, serial, version)
filename = self._storage.loadBlob(oid, serial)
self.assertEquals(somedata, open(filename, 'rb').read())
class BlobAdaptedFileStorageTests(GenericTests, CommonBlobTests):
"""ZEO backed by a BlobStorage-adapted FileStorage."""
def setUp(self):
self.blobdir = tempfile.mkdtemp() # This is the blob directory on the ZEO server
self.blobdir = tempfile.mkdtemp() # blob directory on ZEO server
self.filestorage = tempfile.mktemp()
super(BlobAdaptedFileStorageTests, self).setUp()
def checkLoadBlobLocks(self):
def checkStoreAndLoadBlob(self):
from ZODB.utils import oid_repr, tid_repr
from ZODB.Blobs.Blob import Blob
from ZODB.Blobs.BlobStorage import BLOB_SUFFIX
from ZODB.tests.StorageTestBase import zodb_pickle, ZERO, \
handle_serials
import transaction
version = ''
somedata = 'a' * 10
somedata_path = os.path.join(self.blob_cache_dir, 'somedata')
somedata = open(somedata_path, 'w+b')
for i in range(1000000):
somedata.write("%s\n" % i)
somedata.seek(0)
blob = Blob()
bd_fh = blob.open('w')
bd_fh.write(somedata)
ZODB.utils.cp(somedata, bd_fh)
bd_fh.close()
tfname = bd_fh.name
oid = self._storage.new_oid()
data = zodb_pickle(blob)
self.assert_(os.path.exists(tfname))
t = transaction.Transaction()
try:
self._storage.tpc_begin(t)
r1 = self._storage.storeBlob(oid, ZERO, data, tfname, '', t)
r2 = self._storage.tpc_vote(t)
serial = handle_serials(oid, r1, r2)
revid = handle_serials(oid, r1, r2)
self._storage.tpc_finish(t)
except:
self._storage.tpc_abort(t)
raise
# The uncommitted data file should have been removed
self.assert_(not os.path.exists(tfname))
class Dummy:
def __init__(self):
self.acquired = 0
self.released = 0
def acquire(self):
self.acquired += 1
def release(self):
self.released += 1
class statusdict(dict):
def __init__(self):
self.added = []
self.removed = []
def __setitem__(self, k, v):
self.added.append(k)
super(statusdict, self).__setitem__(k, v)
def __delitem__(self, k):
self.removed.append(k)
super(statusdict, self).__delitem__(k)
# ensure that we do locking properly
filename = self._storage.fshelper.getBlobFilename(oid, serial)
thestatuslock = self._storage.blob_status_lock = Dummy()
thebloblock = Dummy()
def getBlobLock():
return thebloblock
# override getBlobLock to test that locking is performed
self._storage.getBlobLock = getBlobLock
thestatusdict = self._storage.blob_status = statusdict()
filename = self._storage.loadBlob(oid, serial, version)
def check_data(path):
self.assert_(os.path.exists(path))
f = open(path, 'rb')
somedata.seek(0)
d1 = d2 = 1
while d1 or d2:
d1 = f.read(8096)
d2 = somedata.read(8096)
self.assertEqual(d1, d2)
self.assertEqual(thestatuslock.acquired, 2)
self.assertEqual(thestatuslock.released, 2)
self.assertEqual(thebloblock.acquired, 1)
self.assertEqual(thebloblock.released, 1)
# The file should have been copied to the server:
filename = os.path.join(self.blobdir, oid_repr(oid),
tid_repr(revid) + BLOB_SUFFIX)
check_data(filename)
self.assertEqual(thestatusdict.added, [(oid, serial)])
self.assertEqual(thestatusdict.removed, [(oid, serial)])
# It should also be in the cache:
filename = os.path.join(self.blob_cache_dir, oid_repr(oid),
tid_repr(revid) + BLOB_SUFFIX)
check_data(filename)
# If we remove it from the cache and call loadBlob, it should
# come back. We can do this in many threads. We'll instrument
# the method that is used to request data from teh server to
# verify that it is only called once.
sendBlob_org = ZEO.ServerStub.StorageServer.sendBlob
calls = []
def sendBlob(self, oid, serial):
calls.append((oid, serial))
sendBlob_org(self, oid, serial)
os.remove(filename)
returns = []
threads = [
threading.Thread(
target=lambda :
returns.append(self._storage.loadBlob(oid, revid))
)
for i in range(10)
]
[thread.start() for thread in threads]
[thread.join() for thread in threads]
[self.assertEqual(r, filename) for r in returns]
check_data(filename)
class BlobWritableCacheTests(GenericTests, CommonBlobTests):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment