In STOPPING cluster state, really wait for all transactions to be finished

9e433594 · Julien Muchembled · 35737c9b · 9e433594 · 9e433594 · 9e433594
Commit 9e433594 authored Mar 15, 2017 by Julien Muchembled
4 changed files
--- a/neo/client/handlers/master.py
+++ b/neo/client/handlers/master.py
@@ -158,6 +158,10 @@ class PrimaryNotificationsHandler(MTEventHandler):
    def stopOperation(self, conn):
        logging.critical("master node ask to stop operation")

+    def notifyClusterInformation(self, conn, state):
+        # TODO: on shutdown, abort any transaction that is not voted
+        logging.info("cluster switching to %s state", state)
+
    def invalidateObjects(self, conn, tid, oid_list):
        app = self.app
        if app.ignore_invalidations:

--- a/neo/lib/handler.py
+++ b/neo/lib/handler.py
@@ -276,6 +276,7 @@ class AnswerBaseHandler(EventHandler):


 class _DelayedConnectionEvent(EventHandler):
+    # WARNING: This assumes that the connection handler does not change.

    handler_method_name = '_func'
    __new__ = object.__new__

--- a/neo/master/app.py
+++ b/neo/master/app.py
@@ -460,6 +460,7 @@ class Application(BaseApplication):
                elif state == ClusterStates.BACKINGUP:
                    handler = self.client_ro_service_handler
                else:
+                    if state != ClusterStates.STOPPING:
                        conn.abort()
                    continue
            elif node.isStorage() and storage_handler:
@@ -489,15 +490,18 @@ class Application(BaseApplication):
    def shutdown(self):
        """Close all connections and exit"""
        self.changeClusterState(ClusterStates.STOPPING)
-        self.listening_conn.close()
-        for conn in self.em.getConnectionList():
-            node = self.nm.getByUUID(conn.getUUID())
-            if node is None or not node.isIdentified():
-                conn.close()
-        # No need to change handlers in order to reject RequestIdentification
-        # & AskBeginTransaction packets because they won't be any:
-        # the only remaining connected peers are identified non-clients
-        # and we don't accept new connections anymore.
+        # Marking a fictional storage node as starting operation blocks any
+        # request to start a new transaction. Doing so has 2 advantages:
+        # - It's simpler than changing the handler of all clients,
+        #   which is anyway not supported by EventQueue.
+        # - Returning an error code would cause activity on client side for
+        #   nothing.
+        # What's important is to not abort during the second phase of commits
+        # and for this, clients must even be able to reconnect, in case of
+        # failure during tpc_finish.
+        # We're rarely involved in vote, so we have to trust clients that they
+        # abort any transaction that is still in the first phase.
+        self.storage_starting_set.add(None)
        try:
            # wait for all transaction to be finished
            while self.tm.hasPending():
@@ -506,13 +510,13 @@ class Application(BaseApplication):
            logging.critical('No longer operational')

        logging.info("asking remaining nodes to shutdown")
+        self.listening_conn.close()
        handler = EventHandler(self)
-        now = monotonic_time()
        for node in self.nm.getConnectedList():
            conn = node.getConnection()
            if node.isStorage():
                conn.setHandler(handler)
-                conn.send(Packets.NotifyNodeInformation(now, ((
+                conn.send(Packets.NotifyNodeInformation(monotonic_time(), ((
                    node.getType(), node.getAddress(), node.getUUID(),
                    NodeStates.TEMPORARILY_DOWN, None),)))
                conn.abort()

--- a/neo/tests/threaded/test.py
+++ b/neo/tests/threaded/test.py
@@ -29,7 +29,8 @@ from transaction.interfaces import TransientError
 from ZODB import DB, POSException
 from ZODB.DB import TransactionalUndo
 from neo.storage.transactions import TransactionManager, ConflictError
-from neo.lib.connection import ServerConnection, MTClientConnection
+from neo.lib.connection import ConnectionClosed, \
+    ServerConnection, MTClientConnection
 from neo.lib.exception import DatabaseFailure, StoppedOperation
 from neo.lib.handler import DelayEvent
 from neo.lib import logging
@@ -824,27 +825,46 @@ class Test(NEOThreadedTest):
        self._testShutdown(cluster)

    def _testShutdown(self, cluster):
-        if 1:
-            # fill DB a little
-            t, c = cluster.getTransaction()
-            c.root()[''] = ''
-            t.commit()
+        def before_finish(_):
            # tell admin to shutdown the cluster
            cluster.neoctl.setClusterState(ClusterStates.STOPPING)
+            self.tic()
+            l = threading.Lock(); l.acquire()
+            with ConnectionFilter() as f:
+                # Make sure that we send t2/BeginTransaction
+                # before t1/AskFinishTransaction
+                @f.delayAskBeginTransaction
+                def delay(_):
+                    l.release()
+                    return False
+                t2.start()
+                l.acquire()
+        t1, c1 = cluster.getTransaction()
+        ob = c1.root()['1'] = PCounter()
+        t1.commit()
+        ob.value += 1
+        TransactionalResource(t1, 0, tpc_finish=before_finish)
+        t2, c2 = cluster.getTransaction()
+        c2.root()['2'] = None
+        t2 = self.newPausedThread(t2.commit)
+        with Patch(cluster.client, _connectToPrimaryNode=lambda *_:
+                self.fail("unexpected reconnection to master")):
+            t1.commit()
+        self.assertRaises(ConnectionClosed, t2.join)
        # all nodes except clients should exit
        cluster.join(cluster.master_list
                   + cluster.storage_list
                   + cluster.admin_list)
        cluster.stop() # stop and reopen DB to check partition tables
-        dm = cluster.storage_list[0].dm
-        self.assertEqual(1, dm.getPTID())
-        pt = list(dm.getPartitionTable())
-        self.assertEqual(20, len(pt))
-        for _, _, state in pt:
-            self.assertEqual(state, CellStates.UP_TO_DATE)
-        for s in cluster.storage_list[1:]:
-            self.assertEqual(s.dm.getPTID(), 1)
-            self.assertEqual(list(s.dm.getPartitionTable()), pt)
+        cluster.start()
+        pt = cluster.admin.pt
+        self.assertEqual(1, pt.getID())
+        for row in pt.partition_list:
+            for cell in row:
+                self.assertEqual(cell.getState(), CellStates.UP_TO_DATE)
+        t, c = cluster.getTransaction()
+        self.assertEqual(c.root()['1'].value, 1)
+        self.assertNotIn('2', c.root())

    @with_cluster()
    def testInternalInvalidation(self, cluster):