Commit 661c955c authored by Vincent Pelletier's avatar Vincent Pelletier

Solve store deadlocks.

Store deadlocks might happen when the order in which storage nodes receive
object data (and hence, grant write lock for those objects) is not
consistent among clients. So when a store timeouts, it might be because
another transaction got the store lock on this storage and object, while
it might wait for locks we got on other storage nodes or objects.
In doubt, report a conflict to abort current transaction in hope for the
other transaction to succeed.

git-svn-id: https://svn.erp5.org/repos/neo/trunk@2109 71dcc9de-d417-0410-9af5-da40c76e7ee4
parent fc514154
...@@ -34,7 +34,7 @@ from neo.protocol import NodeTypes, Packets, INVALID_PARTITION ...@@ -34,7 +34,7 @@ from neo.protocol import NodeTypes, Packets, INVALID_PARTITION
from neo.event import EventManager from neo.event import EventManager
from neo.util import makeChecksum as real_makeChecksum, dump from neo.util import makeChecksum as real_makeChecksum, dump
from neo.locking import Lock from neo.locking import Lock
from neo.connection import MTClientConnection from neo.connection import MTClientConnection, OnTimeout
from neo.node import NodeManager from neo.node import NodeManager
from neo.connector import getConnectorHandler from neo.connector import getConnectorHandler
from neo.client.exception import NEOStorageError from neo.client.exception import NEOStorageError
...@@ -581,6 +581,7 @@ class Application(object): ...@@ -581,6 +581,7 @@ class Application(object):
checksum = makeChecksum(compressed_data) checksum = makeChecksum(compressed_data)
p = Packets.AskStoreObject(oid, serial, compression, p = Packets.AskStoreObject(oid, serial, compression,
checksum, compressed_data, self.local_var.tid) checksum, compressed_data, self.local_var.tid)
on_timeout = OnTimeout(self.onStoreTimeout, oid)
# Store object in tmp cache # Store object in tmp cache
self.local_var.data_dict[oid] = data self.local_var.data_dict[oid] = data
# Store data on each node # Store data on each node
...@@ -592,13 +593,21 @@ class Application(object): ...@@ -592,13 +593,21 @@ class Application(object):
if conn is None: if conn is None:
continue continue
try: try:
conn.ask(p) conn.ask(p, on_timeout=on_timeout)
except ConnectionClosed: except ConnectionClosed:
continue continue
self._waitAnyMessage(False) self._waitAnyMessage(False)
return None return None
def onStoreTimeout(self, conn, msg_id, oid):
# Ask the storage if someone locks the object.
# Shorten timeout to react earlier to an unresponding storage.
conn.ask(Packets.AskHasLock(oid), timeout=5)
# Stop expecting the timed-out store request.
self.dispatcher.forget(conn, msg_id)
return True
@profiler_decorator @profiler_decorator
def _handleConflicts(self, tryToResolveConflict): def _handleConflicts(self, tryToResolveConflict):
result = [] result = []
......
...@@ -16,10 +16,11 @@ ...@@ -16,10 +16,11 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from ZODB.TimeStamp import TimeStamp from ZODB.TimeStamp import TimeStamp
from ZODB.POSException import ConflictError
from neo import logging from neo import logging
from neo.client.handlers import BaseHandler, AnswerBaseHandler from neo.client.handlers import BaseHandler, AnswerBaseHandler
from neo.protocol import NodeTypes, ProtocolError from neo.protocol import NodeTypes, ProtocolError, LockState
from neo.util import dump from neo.util import dump
from neo.client.exception import NEOStorageError from neo.client.exception import NEOStorageError
...@@ -132,3 +133,20 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -132,3 +133,20 @@ class StorageAnswersHandler(AnswerBaseHandler):
for oid in oid_list: for oid in oid_list:
data_dict[oid] = '' data_dict[oid] = ''
def answerHasLock(self, conn, oid, status):
if status == LockState.GRANTED_TO_OTHER:
# Object is locked by another transaction, and we have waited until
# timeout. To avoid a deadlock, abort current transaction (we might
# be locking objects the other transaction is waiting for).
raise ConflictError, 'Lock wait timeout for oid %s on %r' % (
dump(oid), conn.getNode())
elif status == LockState.GRANTED:
logging.info('Store of oid %s was successful, but after timeout.',
dump(oid))
# XXX: Not sure what to do in this case yet, for now do nothing.
else:
# Nobody has the lock, although we asked storage to lock. This
# means there is a software bug somewhere.
# XXX: Not sure what to do in this case yet
raise NotImplementedError
...@@ -332,6 +332,12 @@ class EventHandler(object): ...@@ -332,6 +332,12 @@ class EventHandler(object):
def answerUndoTransaction(self, conn, oid_list, error_oid_list, conflict_oid_list): def answerUndoTransaction(self, conn, oid_list, error_oid_list, conflict_oid_list):
raise UnexpectedPacketError raise UnexpectedPacketError
def askHasLock(self, tid, oid):
raise UnexpectedPacketError
def answerHasLock(self, oid, status):
raise UnexpectedPacketError
# Error packet handlers. # Error packet handlers.
def error(self, conn, code, message): def error(self, conn, code, message):
...@@ -437,6 +443,8 @@ class EventHandler(object): ...@@ -437,6 +443,8 @@ class EventHandler(object):
d[Packets.NotifyReplicationDone] = self.notifyReplicationDone d[Packets.NotifyReplicationDone] = self.notifyReplicationDone
d[Packets.AskUndoTransaction] = self.askUndoTransaction d[Packets.AskUndoTransaction] = self.askUndoTransaction
d[Packets.AnswerUndoTransaction] = self.answerUndoTransaction d[Packets.AnswerUndoTransaction] = self.answerUndoTransaction
d[Packets.AskHasLock] = self.askHasLock
d[Packets.AnswerHasLock] = self.answerHasLock
return d return d
......
...@@ -77,6 +77,12 @@ class CellStates(Enum): ...@@ -77,6 +77,12 @@ class CellStates(Enum):
DISCARDED = Enum.Item(4) DISCARDED = Enum.Item(4)
CellStates = CellStates() CellStates = CellStates()
class LockState(Enum):
NOT_LOCKED = Enum.Item(1)
GRANTED = Enum.Item(2)
GRANTED_TO_OTHER = Enum.Item(3)
LockState = LockState()
# used for logging # used for logging
node_state_prefix_dict = { node_state_prefix_dict = {
NodeStates.RUNNING: 'R', NodeStates.RUNNING: 'R',
...@@ -159,6 +165,13 @@ def _decodeErrorCode(original_error_code): ...@@ -159,6 +165,13 @@ def _decodeErrorCode(original_error_code):
original_error_code) original_error_code)
return error_code return error_code
def _decodeLockState(original_lock_state):
lock_state = LockState.get(original_lock_state)
if lock_state is None:
raise PacketMalformedError('invalid lock state %d' % (
original_lock_state, ))
return lock_state
def _decodeAddress(address): def _decodeAddress(address):
if address == '\0' * 6: if address == '\0' * 6:
return None return None
...@@ -1547,6 +1560,30 @@ class AnswerUndoTransaction(Packet): ...@@ -1547,6 +1560,30 @@ class AnswerUndoTransaction(Packet):
append(read(OID_LEN)) append(read(OID_LEN))
return (oid_list, error_oid_list, conflict_oid_list) return (oid_list, error_oid_list, conflict_oid_list)
class AskHasLock(Packet):
"""
Ask a storage is oid is locked by another transaction.
C -> S
"""
def _encode(self, tid, oid):
return _encodeTID(tid) + _encodeTID(oid)
def _decode(self, body):
return (_decodeTID(body[:8]), _decodeTID(body[8:]))
class AnswerHasLock(Packet):
"""
Answer whether a transaction holds the write lock for requested object.
"""
_header_format = '!8sH'
def _encode(self, oid, state):
return pack(self._header_format, oid, state)
def _decode(self, body):
oid, state = unpack(self._header_format, body)
return (oid, _decodeLockState(state))
class Error(Packet): class Error(Packet):
""" """
Error is a special type of message, because this can be sent against Error is a special type of message, because this can be sent against
...@@ -1770,6 +1807,10 @@ class PacketRegistry(dict): ...@@ -1770,6 +1807,10 @@ class PacketRegistry(dict):
0x0033, 0x0033,
AskUndoTransaction, AskUndoTransaction,
AnswerUndoTransaction) AnswerUndoTransaction)
AskHasLock, AnswerHasLock = register(
0x0034,
AskHasLock,
AnswerHasLock)
# build a "singleton" # build a "singleton"
Packets = PacketRegistry() Packets = PacketRegistry()
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from neo import protocol from neo import protocol
from neo.protocol import Packets from neo.protocol import Packets, LockState
from neo.storage.handlers import BaseClientAndStorageOperationHandler from neo.storage.handlers import BaseClientAndStorageOperationHandler
from neo.storage.transactions import ConflictError, DelayedError from neo.storage.transactions import ConflictError, DelayedError
import time import time
...@@ -124,3 +124,13 @@ class ClientOperationHandler(BaseClientAndStorageOperationHandler): ...@@ -124,3 +124,13 @@ class ClientOperationHandler(BaseClientAndStorageOperationHandler):
conn.answer(Packets.AnswerUndoTransaction(oid_list, error_oid_list, conn.answer(Packets.AnswerUndoTransaction(oid_list, error_oid_list,
conflict_oid_list)) conflict_oid_list))
def askHasLock(self, conn, tid, oid):
locking_tid = self.app.tm.getLockingTID(oid)
if locking_tid is None:
state = LockState.NOT_LOCKED
elif locking_tid is tid:
state = LockState.GRANTED
else:
state = LockState.GRANTED_TO_OTHER
conn.answer(Packets.AnswerHasLock(oid, state))
...@@ -18,10 +18,11 @@ ...@@ -18,10 +18,11 @@
import unittest import unittest
from mock import Mock from mock import Mock
from neo.tests import NeoTestBase from neo.tests import NeoTestBase
from neo.protocol import NodeTypes from neo.protocol import NodeTypes, LockState
from neo.client.handlers.storage import StorageBootstrapHandler, \ from neo.client.handlers.storage import StorageBootstrapHandler, \
StorageAnswersHandler StorageAnswersHandler
from neo.client.exception import NEOStorageError from neo.client.exception import NEOStorageError
from ZODB.POSException import ConflictError
MARKER = [] MARKER = []
...@@ -257,6 +258,17 @@ class StorageAnswerHandlerTests(NeoTestBase): ...@@ -257,6 +258,17 @@ class StorageAnswerHandlerTests(NeoTestBase):
self.assertEqual(undo_error_oid_list, [oid_2]) self.assertEqual(undo_error_oid_list, [oid_2])
self.assertEqual(data_dict, {oid_1: ''}) self.assertEqual(data_dict, {oid_1: ''})
def test_answerHasLock(self):
uuid = self.getNewUUID()
conn = self.getFakeConnection(uuid=uuid)
oid = self.getOID(0)
self.assertRaises(ConflictError, self.handler.answerHasLock, conn, oid,
LockState.GRANTED_TO_OTHER)
# XXX: Just check that this doesn't raise for the moment.
self.handler.answerHasLock(conn, oid, LockState.GRANTED)
# TODO: Test LockState.NOT_LOCKED case when implemented.
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -24,7 +24,7 @@ from neo.storage.transactions import ConflictError, DelayedError ...@@ -24,7 +24,7 @@ from neo.storage.transactions import ConflictError, DelayedError
from neo.storage.handlers.client import ClientOperationHandler from neo.storage.handlers.client import ClientOperationHandler
from neo.protocol import INVALID_PARTITION from neo.protocol import INVALID_PARTITION
from neo.protocol import INVALID_TID, INVALID_OID, INVALID_SERIAL from neo.protocol import INVALID_TID, INVALID_OID, INVALID_SERIAL
from neo.protocol import Packets from neo.protocol import Packets, LockState
class StorageClientHandlerTests(NeoTestBase): class StorageClientHandlerTests(NeoTestBase):
...@@ -277,5 +277,24 @@ class StorageClientHandlerTests(NeoTestBase): ...@@ -277,5 +277,24 @@ class StorageClientHandlerTests(NeoTestBase):
self.assertEqual(oid_list_2, [oid_2]) self.assertEqual(oid_list_2, [oid_2])
self.assertEqual(oid_list_3, [oid_3]) self.assertEqual(oid_list_3, [oid_3])
def test_askHasLock(self):
tid_1 = self.getNextTID()
tid_2 = self.getNextTID()
oid = self.getNextTID()
def getLockingTID(oid):
return locking_tid
self.app.tm.getLockingTID = getLockingTID
for locking_tid, status in (
(None, LockState.NOT_LOCKED),
(tid_1, LockState.GRANTED),
(tid_2, LockState.GRANTED_TO_OTHER),
):
conn = self._getConnection()
self.operation.askHasLock(conn, tid_1, oid)
p_oid, p_status = self.checkAnswerPacket(conn,
Packets.AnswerHasLock, decode=True)
self.assertEqual(oid, p_oid)
self.assertEqual(status, p_status)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import unittest import unittest
from neo.protocol import NodeTypes, NodeStates, CellStates, ClusterStates from neo.protocol import NodeTypes, NodeStates, CellStates, ClusterStates
from neo.protocol import ErrorCodes, Packets, Errors from neo.protocol import ErrorCodes, Packets, Errors, LockState
from neo.protocol import INVALID_TID from neo.protocol import INVALID_TID
from neo.tests import NeoTestBase from neo.tests import NeoTestBase
...@@ -589,6 +589,17 @@ class ProtocolTests(NeoTestBase): ...@@ -589,6 +589,17 @@ class ProtocolTests(NeoTestBase):
p = Packets.AnswerPartitionList(ptid, row_list) p = Packets.AnswerPartitionList(ptid, row_list)
self.assertEqual(p.decode(), (ptid, row_list)) self.assertEqual(p.decode(), (ptid, row_list))
def test_AskHasLock(self):
tid = self.getNextTID()
oid = self.getNextTID()
p = Packets.AskHasLock(tid, oid)
self.assertEqual(p.decode(), (tid, oid))
def test_AnswerHasLock(self):
oid = self.getNextTID()
for lock_state in LockState.itervalues():
p = Packets.AnswerHasLock(oid, lock_state)
self.assertEqual(p.decode(), (oid, lock_state))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment