Commit d752aadb authored by Julien Muchembled

Give new ids to clients whose ids were already reallocated

Although the change applies to any node with a temporary id (all node types but
storage), only clients have no address and are therefore not recognizable.

After a client is disconnected from the master and before it reconnects, another
client may join the cluster and "steal" the id of the first client. This issue
leads to stuck clients, failing in a loop with exceptions like the following one:

    ERROR ZODB.Connection Couldn't load state for 0x0251
    Traceback (most recent call last):
      File "ZODB/Connection.py", line 860, in setstate
        self._setstate(obj)
      File "ZODB/Connection.py", line 901, in _setstate
        p, serial = self._storage.load(obj._p_oid, '')
      File "neo/client/Storage.py", line 82, in load
        return self.app.load(oid)[:2]
      File "neo/client/app.py", line 353, in load
        data, tid, next_tid, _ = self._loadFromStorage(oid, tid, before_tid)
      File "neo/client/app.py", line 373, in _loadFromStorage
        for node, conn in self.cp.iterateForObject(oid, readable=True):
      File "neo/client/pool.py", line 91, in iterateForObject
        pt = self.app.pt
      File "neo/client/app.py", line 145, in __getattr__
        self._getMasterConnection()
      File "neo/client/app.py", line 214, in _getMasterConnection
        result = self.master_conn = self._connectToPrimaryNode()
      File "neo/client/app.py", line 246, in _connectToPrimaryNode
        handler=handler)
      File "neo/lib/threaded_app.py", line 154, in _ask
        _handlePacket(qconn, qpacket, kw, handler)
      File "neo/lib/threaded_app.py", line 135, in _handlePacket
        handler.dispatch(conn, packet, kw)
      File "neo/lib/handler.py", line 66, in dispatch
        method(conn, *args, **kw)
      File "neo/lib/handler.py", line 188, in error
        getattr(self, Errors[code])(conn, message)
      File "neo/client/handlers/__init__.py", line 23, in protocolError
        raise StorageError("protocol error: %s" % message)
    StorageError: protocol error: already connected
parent b62b8dc3
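
The race is easy to see in isolation. Below is a minimal, hypothetical sketch
(not NEO code; IdAllocator and its methods are invented for illustration) of
how a freed temporary id can be handed to a newcomer before its previous owner
reconnects. As the "uuid > 0" test in the diff below suggests, temporary ids
are not positive:

    # Hypothetical sketch of the id-reallocation race; not NEO code.
    class IdAllocator(object):
        def __init__(self):
            self._free = [-1, -2, -3]  # pool of temporary (negative) ids
            self._assigned = {}        # id -> owner

        def allocate(self, owner):
            uuid = self._free.pop(0)
            self._assigned[uuid] = owner
            return uuid

        def release(self, uuid):
            del self._assigned[uuid]
            self._free.insert(0, uuid)

    alloc = IdAllocator()
    uuid1 = alloc.allocate('client-1')  # client-1 gets -1
    alloc.release(uuid1)                # client-1 loses its master connection
    uuid2 = alloc.allocate('client-2')  # client-2 "steals" -1
    assert uuid1 == uuid2
    # When client-1 reconnects and presents -1, the master finds a running
    # node registered under that id; before this commit, it answered
    # 'already connected' forever, producing the loop shown above.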
@@ -32,8 +32,12 @@ class IdentificationHandler(MasterHandler):
         app = self.app
         if node:
             if node.isRunning():
-                # cloned/evil/buggy node connecting to us
-                raise ProtocolError('already connected')
+                if uuid > 0:
+                    # cloned/evil/buggy node connecting to us
+                    raise ProtocolError('already connected')
+                # The peer wants a temporary id that's already assigned.
+                # Let's give it another one.
+                node = uuid = None
             else:
                 assert not node.isConnected()
                 node.setAddress(address)
...
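
In other words, the handler now distinguishes permanent ids from temporary ones
before rejecting a peer. A simplified, standalone rendering of that decision
(assuming, as the diff suggests, that positive ids are permanent; the identify
function and its signature are illustrative, not the real API):

    # Simplified sketch of the patched decision; not the real handler.
    class ProtocolError(Exception):
        pass

    def identify(node, uuid):
        """Return the (node, uuid) pair the master should keep using.

        node is the node already registered under uuid, or None.
        """
        if node is not None and node.isRunning():
            if uuid > 0:
                # A permanent id claimed by a live node: the peer is a
                # clone or is buggy, so keep rejecting it.
                raise ProtocolError('already connected')
            # A temporary id that was reallocated while the peer was
            # away: drop the stale claim so a fresh id gets assigned.
            node = uuid = None
        return node, uuid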
@@ -1290,6 +1290,8 @@ class Test(NEOThreadedTest):
         m2c, = cluster.master.getConnectionList(cluster.client)
         cluster.client._cache.clear()
         c.cacheMinimize()
+        # Make the master disconnect the client when the latter is about
+        # to send an AskObject packet to the storage node.
         with cluster.client.filterConnection(cluster.storage) as c2s:
             c2s.add(disconnect)
             # Storages are currently notified of clients that get
@@ -1297,9 +1299,23 @@ class Test(NEOThreadedTest):
             # Should it change, the clients would have to disconnect on
             # their own.
             self.assertRaises(TransientError, getattr, c, "root")
-            with Patch(ClientOperationHandler,
-                    askObject=lambda orig, self, conn, *args: conn.close()):
-                self.assertRaises(NEOStorageError, getattr, c, "root")
+            uuid = cluster.client.uuid
+            # Let's use a second client to steal the node id of the first one.
+            client = cluster.newClient()
+            try:
+                client.sync()
+                self.assertEqual(uuid, client.uuid)
+                # The client reconnects successfully to the master and storage,
+                # with a different node id. This time, we get a different error
+                # if it's only disconnected from the storage.
+                with Patch(ClientOperationHandler,
+                        askObject=lambda orig, self, conn, *args: conn.close()):
+                    self.assertRaises(NEOStorageError, getattr, c, "root")
+                self.assertNotEqual(uuid, cluster.client.uuid)
+                # Second reconnection, for a successful load.
+                c.root
+            finally:
+                client.close()
         finally:
             cluster.stop()
...
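
The test drives the scenario with NEO's Patch helper, which temporarily
overrides ClientOperationHandler.askObject. As a rough idea of what such a
helper does, here is a sketch only, assuming it passes the original attribute
as the first argument of the replacement and restores everything on exit (the
real implementation lives in NEO's test framework and may differ):

    # Sketch of a Patch-like context manager; the real one may differ.
    class Patch(object):
        def __init__(self, obj, **attrs):
            self._obj = obj
            self._attrs = attrs
            self._saved = {}

        def __enter__(self):
            def make_wrapper(new, orig):
                # The replacement receives the original callable first,
                # matching the test's lambda signature.
                def wrapper(*args, **kw):
                    return new(orig, *args, **kw)
                return wrapper
            for name, new in self._attrs.items():
                orig = self._saved[name] = getattr(self._obj, name)
                setattr(self._obj, name, make_wrapper(new, orig))
            return self

        def __exit__(self, *exc_info):
            for name, orig in self._saved.items():
                setattr(self._obj, name, orig)

Under that assumption, askObject=lambda orig, self, conn, *args: conn.close()
makes the storage close the connection instead of answering, which is how the
test forces the second disconnection and then checks that the client comes
back with a new node id.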