Commit 5e7f34d2 authored by Julien Muchembled's avatar Julien Muchembled

New algorithm for deadlock avoidance

The time complexity of previous one was too bad. With several tens of
concurrent transactions, we saw commits take minutes to complete and
the whole application looked frozen.

This new algorithm is much simpler. Instead of asking the oldest
transaction to somewhat restart (we used the "rebase" term because
the concept was similar to what git-rebase does), the storage gives
it priority and the newest is asked to relock (this request is ignored
if vote already happened, which means there was actually no deadlock).

testLocklessWriteDuringConflictResolution was initially more complex
because Transaction.written (client) ignored KeyError (which is not the
case anymore since commit 8ef1ddba).
parent d98b576c
...@@ -524,32 +524,11 @@ class Application(ThreadedApplication): ...@@ -524,32 +524,11 @@ class Application(ThreadedApplication):
while 1: while 1:
# We iterate over conflict_dict, and clear it, # We iterate over conflict_dict, and clear it,
# because new items may be added by calls to _store. # because new items may be added by calls to _store.
# This is also done atomically, to avoid race conditions
# with PrimaryNotificationsHandler.notifyDeadlock
try: try:
oid, serial = pop_conflict() oid, serial = pop_conflict()
except KeyError: except KeyError:
return return
try:
data, old_serial, _ = data_dict.pop(oid) data, old_serial, _ = data_dict.pop(oid)
except KeyError:
assert oid is None, (oid, serial)
# Storage refused us from taking object lock, to avoid a
# possible deadlock. TID is actually used for some kind of
# "locking priority": when a higher value has the lock,
# this means we stored objects "too late", and we would
# otherwise cause a deadlock.
# To recover, we must ask storages to release locks we
# hold (to let possibly-competing transactions acquire
# them), and requeue our already-sent store requests.
ttid = txn_context.ttid
logging.info('Deadlock avoidance triggered for TXN %s'
' with new locking TID %s', dump(ttid), dump(serial))
txn_context.locking_tid = serial
packet = Packets.AskRebaseTransaction(ttid, serial)
for uuid in txn_context.conn_dict:
self._askStorageForWrite(txn_context, uuid, packet)
else:
if data is CHECKED_SERIAL: if data is CHECKED_SERIAL:
raise ReadConflictError(oid=oid, raise ReadConflictError(oid=oid,
serials=(serial, old_serial)) serials=(serial, old_serial))
...@@ -569,7 +548,6 @@ class Application(ThreadedApplication): ...@@ -569,7 +548,6 @@ class Application(ThreadedApplication):
# in this case. # in this case.
raise ConflictError(oid=oid, serials=(serial, old_serial), raise ConflictError(oid=oid, serials=(serial, old_serial),
data=data or None) data=data or None)
else:
logging.info( logging.info(
'Conflict resolution succeeded for %s@%s with %s', 'Conflict resolution succeeded for %s@%s with %s',
dump(oid), dump(old_serial), dump(serial)) dump(oid), dump(old_serial), dump(serial))
...@@ -609,6 +587,7 @@ class Application(ThreadedApplication): ...@@ -609,6 +587,7 @@ class Application(ThreadedApplication):
"""Store current transaction.""" """Store current transaction."""
txn_context = self._txn_container.get(transaction) txn_context = self._txn_container.get(transaction)
self.waitStoreResponses(txn_context) self.waitStoreResponses(txn_context)
txn_context.stored = True
ttid = txn_context.ttid ttid = txn_context.ttid
ext = transaction._extension ext = transaction._extension
ext = dumps(ext, _protocol) if ext else '' ext = dumps(ext, _protocol) if ext else ''
...@@ -640,10 +619,8 @@ class Application(ThreadedApplication): ...@@ -640,10 +619,8 @@ class Application(ThreadedApplication):
if not (uuid in failed or uuid in uuid_set): if not (uuid in failed or uuid in uuid_set):
break break
else: else:
# Very unlikely. Instead of raising, we could recover # Very unlikely. Trying to recover
# the transaction by doing something similar to # is not worth the complexity.
# deadlock avoidance; that would be done before voting.
# But it's not worth the complexity.
raise NEOStorageError( raise NEOStorageError(
'partition %s not fully write-locked' % offset) 'partition %s not fully write-locked' % offset)
failed = [uuid for uuid, running in failed.iteritems() if running] failed = [uuid for uuid, running in failed.iteritems() if running]
......
...@@ -149,13 +149,6 @@ class PrimaryNotificationsHandler(MTEventHandler): ...@@ -149,13 +149,6 @@ class PrimaryNotificationsHandler(MTEventHandler):
if node and node.isConnected(): if node and node.isConnected():
node.getConnection().close() node.getConnection().close()
def notifyDeadlock(self, conn, ttid, locking_tid):
for txn_context in self.app.txn_contexts():
if txn_context.ttid == ttid:
txn_context.conflict_dict[None] = locking_tid
txn_context.wakeup(conn)
break
class PrimaryAnswersHandler(AnswerBaseHandler): class PrimaryAnswersHandler(AnswerBaseHandler):
""" Handle that process expected packets from the primary master """ """ Handle that process expected packets from the primary master """
......
...@@ -28,11 +28,24 @@ from ..transactions import Transaction ...@@ -28,11 +28,24 @@ from ..transactions import Transaction
from ..exception import NEOStorageError, NEOStorageNotFoundError from ..exception import NEOStorageError, NEOStorageNotFoundError
from ..exception import NEOStorageReadRetry, NEOStorageDoesNotExistError from ..exception import NEOStorageReadRetry, NEOStorageDoesNotExistError
@apply
class _DeadlockPacket(object):
handler_method_name = 'notifyDeadlock'
_args = ()
getId = int
class StorageEventHandler(MTEventHandler): class StorageEventHandler(MTEventHandler):
def _acceptIdentification(*args): def _acceptIdentification(*args):
pass pass
def notifyDeadlock(self, conn, ttid, oid):
for txn_context in self.app.txn_contexts():
if txn_context.ttid == ttid:
txn_context.queue.put((conn, _DeadlockPacket, {'oid': oid}))
break
class StorageBootstrapHandler(AnswerBaseHandler): class StorageBootstrapHandler(AnswerBaseHandler):
""" Handler used when connecting to a storage node """ """ Handler used when connecting to a storage node """
...@@ -72,22 +85,28 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -72,22 +85,28 @@ class StorageAnswersHandler(AnswerBaseHandler):
answerCheckCurrentSerial = answerStoreObject answerCheckCurrentSerial = answerStoreObject
def answerRebaseTransaction(self, conn, oid_list): def notifyDeadlock(self, conn, oid):
# To avoid a possible deadlock, storage nodes are waiting for our
# lock to be cancelled, so that a transaction that started earlier
# can complete. This is done by acquiring the lock again.
txn_context = self.app.getHandlerData() txn_context = self.app.getHandlerData()
if txn_context.stored:
return
ttid = txn_context.ttid ttid = txn_context.ttid
queue = txn_context.queue logging.info('Deadlock avoidance triggered for %s:%s',
dump(oid), dump(ttid))
assert conn.getUUID() not in txn_context.data_dict.get(oid, ((),))[-1]
try: try:
for oid in oid_list:
# We could have an extra parameter to tell the storage if we # We could have an extra parameter to tell the storage if we
# still have the data, and in this case revert what was done # still have the data, and in this case revert what was done
# in Transaction.written. This would save bandwidth in case of # in Transaction.written. This would save bandwidth in case of
# conflict. # conflict.
conn.ask(Packets.AskRebaseObject(ttid, oid), conn.ask(Packets.AskRelockObject(ttid, oid),
queue=queue, oid=oid) queue=txn_context.queue, oid=oid)
except ConnectionClosed: except ConnectionClosed:
txn_context.conn_dict[conn.getUUID()] = None txn_context.conn_dict[conn.getUUID()] = None
def answerRebaseObject(self, conn, conflict, oid): def answerRelockObject(self, conn, conflict, oid):
if conflict: if conflict:
txn_context = self.app.getHandlerData() txn_context = self.app.getHandlerData()
serial, conflict, data = conflict serial, conflict, data = conflict
...@@ -105,7 +124,7 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -105,7 +124,7 @@ class StorageAnswersHandler(AnswerBaseHandler):
assert oid in txn_context.data_dict assert oid in txn_context.data_dict
if serial <= txn_context.conflict_dict.get(oid, ''): if serial <= txn_context.conflict_dict.get(oid, ''):
# Another node already reported this conflict or a newer, # Another node already reported this conflict or a newer,
# by answering to this rebase or to the previous store. # by answering to this relock or to the previous store.
return return
# A node has not answered yet to a previous store. Do not wait # A node has not answered yet to a previous store. Do not wait
# it to report the conflict because it may fail before. # it to report the conflict because it may fail before.
...@@ -119,8 +138,8 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -119,8 +138,8 @@ class StorageAnswersHandler(AnswerBaseHandler):
compression, checksum, data = data compression, checksum, data = data
if checksum != makeChecksum(data): if checksum != makeChecksum(data):
raise NEOStorageError( raise NEOStorageError(
'wrong checksum while getting back data for' 'wrong checksum while getting back data for %s:%s'
' object %s during rebase of transaction %s' ' (deadlock avoidance)'
% (dump(oid), dump(txn_context.ttid))) % (dump(oid), dump(txn_context.ttid)))
data = decompress_list[compression](data) data = decompress_list[compression](data)
size = len(data) size = len(data)
......
...@@ -22,19 +22,12 @@ from neo.lib.protocol import Packets ...@@ -22,19 +22,12 @@ from neo.lib.protocol import Packets
from neo.lib.util import dump from neo.lib.util import dump
from .exception import NEOStorageError from .exception import NEOStorageError
@apply
class _WakeupPacket(object):
handler_method_name = 'pong'
_args = ()
getId = int
class Transaction(object): class Transaction(object):
cache_size = 0 # size of data in cache_dict cache_size = 0 # size of data in cache_dict
data_size = 0 # size of data in data_dict data_size = 0 # size of data in data_dict
error = None error = None
locking_tid = None stored = False
voted = False voted = False
ttid = None # XXX: useless, except for testBackupReadOnlyAccess ttid = None # XXX: useless, except for testBackupReadOnlyAccess
lockless_dict = None # {partition: {uuid}} lockless_dict = None # {partition: {uuid}}
...@@ -55,16 +48,13 @@ class Transaction(object): ...@@ -55,16 +48,13 @@ class Transaction(object):
def __repr__(self): def __repr__(self):
error = self.error error = self.error
return ("<%s ttid=%s locking_tid=%s voted=%u" return ("<%s ttid=%s voted=%u"
" #queue=%s #writing=%s #written=%s%s>") % ( " #queue=%s #writing=%s #written=%s%s>") % (
self.__class__.__name__, self.__class__.__name__,
dump(self.ttid), dump(self.locking_tid), self.voted, dump(self.ttid), self.voted,
len(self.queue._queue), len(self.data_dict), len(self.cache_dict), len(self.queue._queue), len(self.data_dict), len(self.cache_dict),
' error=%r' % error if error else '') ' error=%r' % error if error else '')
def wakeup(self, conn):
self.queue.put((conn, _WakeupPacket, {}))
def write(self, app, packet, object_id, **kw): def write(self, app, packet, object_id, **kw):
uuid_list = [] uuid_list = []
pt = app.pt pt = app.pt
...@@ -78,14 +68,6 @@ class Transaction(object): ...@@ -78,14 +68,6 @@ class Transaction(object):
conn = conn_dict[uuid] conn = conn_dict[uuid]
except KeyError: except KeyError:
conn = conn_dict[uuid] = app.getStorageConnection(node) conn = conn_dict[uuid] = app.getStorageConnection(node)
if self.locking_tid and 'oid' in kw:
# A deadlock happened but this node is not aware of it.
# Tell it to write-lock with the same locking tid as
# for the other nodes. The condition on kw is to
# distinguish whether we're writing an oid or
# transaction metadata.
conn.ask(Packets.AskRebaseTransaction(
self.ttid, self.locking_tid), queue=self.queue)
conn.ask(packet, queue=self.queue, **kw) conn.ask(packet, queue=self.queue, **kw)
uuid_list.append(uuid) uuid_list.append(uuid)
except AttributeError: except AttributeError:
...@@ -128,8 +110,8 @@ class Transaction(object): ...@@ -128,8 +110,8 @@ class Transaction(object):
lockless = self.lockless_dict = defaultdict(set) lockless = self.lockless_dict = defaultdict(set)
lockless[app.pt.getPartition(oid)].add(uuid) lockless[app.pt.getPartition(oid)].add(uuid)
if oid in self.conflict_dict: if oid in self.conflict_dict:
# In the case of a rebase, uuid_list may not contain the id # In case of deadlock avoidance, uuid_list may not contain the
# of the node reporting a conflict. # id of the node reporting a conflict.
return return
if uuid_list: if uuid_list:
return return
......
...@@ -322,8 +322,8 @@ class EventQueue(object): ...@@ -322,8 +322,8 @@ class EventQueue(object):
# Stable sort when 2 keys are equal. # Stable sort when 2 keys are equal.
# XXX: Is it really useful to keep events with same key ordered # XXX: Is it really useful to keep events with same key ordered
# chronologically ? The caller could use more specific keys. For # chronologically ? The caller could use more specific keys.
# write-locks (by the storage node), the locking tid seems enough. # For write-locks (by the storage node), the ttid seems enough.
sortQueuedEvents = (lambda key=itemgetter(0): lambda self: sortQueuedEvents = (lambda key=itemgetter(0): lambda self:
self._event_queue.sort(key=key))() self._event_queue.sort(key=key))()
...@@ -334,14 +334,6 @@ class EventQueue(object): ...@@ -334,14 +334,6 @@ class EventQueue(object):
if key is not None: if key is not None:
self.sortQueuedEvents() self.sortQueuedEvents()
def sortAndExecuteQueuedEvents(self):
if self._executing_event < 0:
self.sortQueuedEvents()
self.executeQueuedEvents()
else:
# We can't sort events when they're being processed.
self._executing_event = 1
def executeQueuedEvents(self): def executeQueuedEvents(self):
# Not reentrant. When processing a queued event, calling this method # Not reentrant. When processing a queued event, calling this method
# only tells the caller to retry all events from the beginning, because # only tells the caller to retry all events from the beginning, because
...@@ -369,10 +361,6 @@ class EventQueue(object): ...@@ -369,10 +361,6 @@ class EventQueue(object):
while done: while done:
del queue[done.pop()] del queue[done.pop()]
self._executing_event = 0 self._executing_event = 0
# What sortAndExecuteQueuedEvents could not do immediately
# is done here:
if event[0] is not None:
self.sortQueuedEvents()
self._executing_event = -1 self._executing_event = -1
def logQueuedEvents(self): def logQueuedEvents(self):
......
...@@ -20,7 +20,7 @@ from msgpack import packb ...@@ -20,7 +20,7 @@ from msgpack import packb
# The protocol version must be increased whenever upgrading a node may require # The protocol version must be increased whenever upgrading a node may require
# to upgrade other nodes. # to upgrade other nodes.
PROTOCOL_VERSION = 1 PROTOCOL_VERSION = 2
# By encoding the handshake packet with msgpack, the whole NEO stream can be # By encoding the handshake packet with msgpack, the whole NEO stream can be
# decoded with msgpack. The first byte is 0x92, which is different from TLS # decoded with msgpack. The first byte is 0x92, which is different from TLS
# Handshake (0x16). # Handshake (0x16).
...@@ -532,28 +532,17 @@ class Packets(dict): ...@@ -532,28 +532,17 @@ class Packets(dict):
""") """)
NotifyDeadlock = notify(""" NotifyDeadlock = notify("""
Ask master to generate a new TTID that will be used by the client to A client acquired a write-lock before another one that has a smaller
solve a deadlock by rebasing the transaction on top of concurrent TTID, leading to a possible deadlock. In order to solve it, this asks
changes. the client with the greatest TTID to lock again if it can't vote.
:nodes: S -> M -> C :nodes: S -> C
""") """)
AskRebaseTransaction, AnswerRebaseTransaction = request(""" AskRelockObject, AnswerRelockObject = request("""
Rebase a transaction to solve a deadlock. Relock an object change to solve a deadlock.
:nodes: C -> S :nodes: C -> S
""")
AskRebaseObject, AnswerRebaseObject = request("""
Rebase an object change to solve a deadlock.
:nodes: C -> S
XXX: It is a request packet to simplify the implementation. For more
efficiency, this should be turned into a notification, and the
RebaseTransaction should answered once all objects are rebased
(so that the client can still wait on something).
""", data_path=(1, 0, 2, 0)) """, data_path=(1, 0, 2, 0))
AskStoreObject, AnswerStoreObject = request(""" AskStoreObject, AnswerStoreObject = request("""
......
...@@ -68,9 +68,6 @@ class StorageServiceHandler(BaseServiceHandler): ...@@ -68,9 +68,6 @@ class StorageServiceHandler(BaseServiceHandler):
conn.answer(p) conn.answer(p)
app.pt.updatable(conn.getUUID(), offset_list) app.pt.updatable(conn.getUUID(), offset_list)
def notifyDeadlock(self, conn, *args):
self.app.tm.deadlock(conn.getUUID(), *args)
def answerInformationLocked(self, conn, ttid): def answerInformationLocked(self, conn, ttid):
self.app.tm.lock(ttid, conn.getUUID()) self.app.tm.lock(ttid, conn.getUUID())
......
...@@ -19,15 +19,13 @@ from time import time ...@@ -19,15 +19,13 @@ from time import time
from struct import pack, unpack from struct import pack, unpack
from neo.lib import logging from neo.lib import logging
from neo.lib.handler import DelayEvent, EventQueue from neo.lib.handler import DelayEvent, EventQueue
from neo.lib.protocol import Packets, ProtocolError, uuid_str, \ from neo.lib.protocol import ProtocolError, uuid_str, ZERO_OID, ZERO_TID
ZERO_OID, ZERO_TID
from neo.lib.util import dump, u64, addTID, tidFromTime from neo.lib.util import dump, u64, addTID, tidFromTime
class Transaction(object): class Transaction(object):
""" """
A pending transaction A pending transaction
""" """
locking_tid = ZERO_TID
_tid = None _tid = None
_msg_id = None _msg_id = None
_oid_list = None _oid_list = None
...@@ -292,19 +290,6 @@ class TransactionManager(EventQueue): ...@@ -292,19 +290,6 @@ class TransactionManager(EventQueue):
logging.debug('Begin %s', txn) logging.debug('Begin %s', txn)
return tid return tid
def deadlock(self, storage_id, ttid, locking_tid):
try:
txn = self._ttid_dict[ttid]
except KeyError:
return
if txn.locking_tid <= locking_tid:
client = txn.getNode()
txn.locking_tid = locking_tid = self._nextTID()
logging.info('Deadlock avoidance triggered by %s for %s:'
' new locking tid for TXN %s is %s', uuid_str(storage_id),
uuid_str(client.getUUID()), dump(ttid), dump(locking_tid))
client.send(Packets.NotifyDeadlock(ttid, locking_tid))
def vote(self, app, ttid, uuid_list): def vote(self, app, ttid, uuid_list):
""" """
Check that the transaction can be voted Check that the transaction can be voted
......
...@@ -133,25 +133,23 @@ class ClientOperationHandler(BaseHandler): ...@@ -133,25 +133,23 @@ class ClientOperationHandler(BaseHandler):
compression, checksum, data, data_serial, ttid, time.time()), compression, checksum, data, data_serial, ttid, time.time()),
*e.args) *e.args)
def askRebaseTransaction(self, conn, *args): def askRelockObject(self, conn, ttid, oid):
conn.answer(Packets.AnswerRebaseTransaction(
self.app.tm.rebase(conn, *args)))
def askRebaseObject(self, conn, ttid, oid):
try: try:
self._askRebaseObject(conn, ttid, oid, None) self.app.tm.relockObject(ttid, oid, True)
except DelayEvent, e: except DelayEvent, e:
# locked by a previous transaction, retry later # locked by a previous transaction, retry later
self.app.tm.queueEvent(self._askRebaseObject, self.app.tm.queueEvent(self._askRelockObject,
conn, (ttid, oid, time.time()), *e.args) conn, (ttid, oid, time.time()), *e.args)
else:
conn.answer(Packets.AnswerRelockObject(None))
def _askRebaseObject(self, conn, ttid, oid, request_time): def _askRelockObject(self, conn, ttid, oid, request_time):
conflict = self.app.tm.rebaseObject(ttid, oid) conflict = self.app.tm.relockObject(ttid, oid, False)
if request_time and SLOW_STORE is not None: if request_time and SLOW_STORE is not None:
duration = time.time() - request_time duration = time.time() - request_time
if duration > SLOW_STORE: if duration > SLOW_STORE:
logging.info('RebaseObject delay: %.02fs', duration) logging.info('RelockObject delay: %.02fs', duration)
conn.answer(Packets.AnswerRebaseObject(conflict)) conn.answer(Packets.AnswerRelockObject(conflict))
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition): def askTIDsFrom(self, conn, min_tid, max_tid, length, partition):
conn.answer(Packets.AnswerTIDsFrom(self.app.dm.getReplicationTIDList( conn.answer(Packets.AnswerTIDsFrom(self.app.dm.getReplicationTIDList(
...@@ -251,8 +249,7 @@ class ClientReadOnlyOperationHandler(ClientOperationHandler): ...@@ -251,8 +249,7 @@ class ClientReadOnlyOperationHandler(ClientOperationHandler):
askVoteTransaction = _readOnly askVoteTransaction = _readOnly
askStoreObject = _readOnly askStoreObject = _readOnly
askFinalTID = _readOnly askFinalTID = _readOnly
askRebaseObject = _readOnly askRelockObject = _readOnly
askRebaseTransaction = _readOnly
# takes write lock & is only used when going to commit # takes write lock & is only used when going to commit
askCheckCurrentSerial = _readOnly askCheckCurrentSerial = _readOnly
......
This diff is collapsed.
...@@ -26,9 +26,8 @@ AnswerPack(bool) ...@@ -26,9 +26,8 @@ AnswerPack(bool)
AnswerPartitionList(int,int,[[(int,CellStates)]]) AnswerPartitionList(int,int,[[(int,CellStates)]])
AnswerPartitionTable(int,int,[[(int,CellStates)]]) AnswerPartitionTable(int,int,[[(int,CellStates)]])
AnswerPrimary(int) AnswerPrimary(int)
AnswerRebaseObject(?(p64,p64,?(int,bin,bin)))
AnswerRebaseTransaction([p64])
AnswerRecovery(?int,?p64,?p64) AnswerRecovery(?int,?p64,?p64)
AnswerRelockObject(?(p64,p64,?(int,bin,bin)))
AnswerStoreObject(?p64) AnswerStoreObject(?p64)
AnswerStoreTransaction() AnswerStoreTransaction()
AnswerTIDs([p64]) AnswerTIDs([p64])
...@@ -61,9 +60,8 @@ AskPack(p64) ...@@ -61,9 +60,8 @@ AskPack(p64)
AskPartitionList(int,int,?) AskPartitionList(int,int,?)
AskPartitionTable() AskPartitionTable()
AskPrimary() AskPrimary()
AskRebaseObject(p64,p64)
AskRebaseTransaction(p64,p64)
AskRecovery() AskRecovery()
AskRelockObject(p64,p64)
AskStoreObject(p64,p64,int,bin,bin,?p64,?p64) AskStoreObject(p64,p64,int,bin,bin,?p64,?p64)
AskStoreTransaction(p64,bin,bin,bin,[p64]) AskStoreTransaction(p64,bin,bin,bin,[p64])
AskTIDs(int,int,int) AskTIDs(int,int,int)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment