From afd801c8dff58edcbfc6ee931bb78c2333a5b266 Mon Sep 17 00:00:00 2001
From: Julien Muchembled <jm@nexedi.com>
Date: Tue, 17 Jan 2017 12:22:56 +0100
Subject: [PATCH] qa: fix random RuntimeError when starting cluster in
 testClusterStartup
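
The master refuses to leave the Recovery state while a storage node is
still entering the cluster: the administration handler rejects the
request with a ProtocolError ("Cannot exit recovery now: node %r is
entering cluster"), which the test sees as a RuntimeError. Since the
test has no way to know when this pending activity is over, retry until
the request is accepted. The retry loop is factored into a new
NEOCluster.startCluster helper built on expectCondition, and
testClusterStartup now calls it instead of using neoctl directly.

For reference, here is a minimal sketch of the polling contract assumed
by the new helper: the callback is invoked with the opaque value
returned by the previous attempt and must return a (done, opaque)
tuple. wait_until below is illustrative only; the real loop is
NEOCluster.expectCondition.

    import time

    def wait_until(condition, timeout):
        # Poll `condition` until it reports success or `timeout` expires.
        end = time.time() + timeout
        last = None
        while True:
            done, last = condition(last)
            if done:
                return
            if time.time() > end:
                raise AssertionError('timeout, last result: %r' % (last,))
            time.sleep(0.5)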

---
 neo/master/handlers/administration.py |  3 +++
 neo/tests/functional/__init__.py      | 15 +++++++++++++++
 neo/tests/functional/testCluster.py   |  5 ++---
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/neo/master/handlers/administration.py b/neo/master/handlers/administration.py
index 71b72c6a..5311c380 100644
--- a/neo/master/handlers/administration.py
+++ b/neo/master/handlers/administration.py
@@ -64,6 +64,9 @@ class AdministrationHandler(MasterHandler):
             for node in storage_list:
                 assert node.isPending(), node
                 if node.getConnection().isPending():
+                    # XXX: It's wrong to use ProtocolError here. We must reply
+                    #      less aggressively because the admin has no way to
+                    #      know that there's still pending activity.
                     raise ProtocolError('Cannot exit recovery now: node %r is '
                         'entering cluster' % (node, ))
             app._startup_allowed = True
diff --git a/neo/tests/functional/__init__.py b/neo/tests/functional/__init__.py
index 6ce35e74..4217b438 100644
--- a/neo/tests/functional/__init__.py
+++ b/neo/tests/functional/__init__.py
@@ -416,6 +416,21 @@ class NEOCluster(object):
         if not pdb.wait(test, MAX_START_TIME):
             raise AssertionError('Timeout when starting cluster')
 
+    def startCluster(self):
+        # Even if the storage nodes are in the expected state, there may still
+        # be activity between them and the master, preventing the cluster
+        # from starting.
+        def start(last_try):
+            try:
+                self.neoctl.startCluster()
+            except (NotReadyException, RuntimeError) as e:
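+                # The master may refuse to leave recovery (e.g. a storage
+                # node is still entering the cluster); let expectCondition
+                # retry.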
+                return False, e
+            return True, None
+        self.expectCondition(start)
+
     def stop(self, clients=True):
         # Suspend all processes to kill before actually killing them, so that
         # nodes don't log errors because they get disconnected from other nodes:
diff --git a/neo/tests/functional/testCluster.py b/neo/tests/functional/testCluster.py
index 1db98fd8..cc5112eb 100644
--- a/neo/tests/functional/testCluster.py
+++ b/neo/tests/functional/testCluster.py
@@ -31,7 +31,6 @@ class ClusterTests(NEOFunctionalTest):
     def testClusterStartup(self):
         neo = self.neo = NEOCluster(['test_neo1', 'test_neo2'], replicas=1,
                          temp_dir=self.getTempDirectory())
-        neoctl = neo.neoctl
         neo.run()
         # Running a new cluster doesn't exit Recovery state.
         s1, s2 = neo.getStorageProcessList()
@@ -40,7 +39,7 @@ class ClusterTests(NEOFunctionalTest):
         neo.expectClusterRecovering()
         # When the cluster is allowed to exit Recovery, it reaches the Running
         # state and all present storage nodes reach the running state.
-        neoctl.startCluster()
+        neo.startCluster()
         neo.expectRunning(s1)
         neo.expectRunning(s2)
         neo.expectClusterRunning()
@@ -64,7 +63,7 @@ class ClusterTests(NEOFunctionalTest):
         neo.expectPending(s1)
         neo.expectUnknown(s2)
         neo.expectClusterRecovering()
-        neoctl.startCluster()
+        neo.startCluster()
         neo.expectRunning(s1)
         neo.expectUnknown(s2)
         neo.expectClusterRunning()
-- 
2.30.9