master: add --backup to initialize in BACKINGUP

This applies only to an empty cluster. It makes the cluster transition from the initial state RECOVERING to BACKINGUP without passing through state RUNNING:

    (empty) -> RECOVERING -> VERIFYING -> STARTING_BACKUP -> BACKINGUP

This is the only way to reach BACKINGUP without having to pass through RUNNING first and manually transition to STARTING_BACKUP. It is the only way to reach state BACKINGUP fully automatically.

See merge request !25

master: add --backup to initialize in BACKINGUP
This applies only to an empty cluster. It makes the cluster transition from the initial state RECOVERING to BACKINGUP without passing through state RUNNING:

    (empty) -> RECOVERING -> VERIFYING -> STARTING_BACKUP -> BACKINGUP

This is the only way to reach BACKINGUP without having to pass through RUNNING first and manually transition to STARTING_BACKUP. It is the only way to reach state BACKINGUP fully automatically.

See merge request !25
13ef5f15 · Xavier Thompson · Julien Muchembled · 3d435f55 · 13ef5f15 · 13ef5f15
Commit 13ef5f15 authored Dec 09, 2024 by Xavier Thompson Committed by Julien Muchembled Dec 16, 2024
Showing with 70 additions and 12 deletions

neo/master/app.py neo/master/app.py +10 -1

neo/tests/threaded/__init__.py neo/tests/threaded/__init__.py +2 -1

neo/tests/threaded/testReplication.py neo/tests/threaded/testReplication.py +58 -10

No files found.
--- a/neo/master/app.py
+++ b/neo/master/app.py
@@ -92,6 +92,10 @@ class Application(BaseApplication):
            help='the name of cluster to backup')
        _('M', 'upstream-masters', parse=util.parseMasterList,
            help='list of master nodes in the cluster to backup')
+        _.bool('B', 'backup',
+            help="transition automatically toward BACKINGUP instead of RUNNING"
+                 " - without passing through RUNNING - if the cluster is empty"
+                 " (this requires --upstream-cluster and --upstream-masters)")
        _.int('i', 'nid',
            help="specify an NID to use for this process (testing purpose)")

@@ -144,6 +148,7 @@ class Application(BaseApplication):
                                 " different from cluster name")
            self.backup_app = BackupApplication(self, upstream_cluster,
                                                config['upstream_masters'])
+        self.backup_initially = config.get('backup')

        self.administration_handler = administration.AdministrationHandler(
            self)
@@ -308,7 +313,11 @@ class Application(BaseApplication):
                self.runManager(RecoveryManager)
                try:
                    self.runManager(VerificationManager)
-                    if not self.backup_tid:
+                    if (self.backup_initially and
+                        self.getLastTransaction() == ZERO_TID):
+                        self.pt.setBackupTidDict({}) # {} <=> all ZERO_TID
+                        self.backup_tid = ZERO_TID
+                    elif not self.backup_tid:
                        self.provideService()
                        # self.provideService only returns without raising
                        # when switching to backup mode.

--- a/neo/tests/threaded/__init__.py
+++ b/neo/tests/threaded/__init__.py
@@ -764,7 +764,7 @@ class NEOCluster(object):
    def __init__(self, master_count=1, partitions=1, replicas=0, upstream=None,
                       adapter=os.getenv('NEO_TESTS_ADAPTER', 'SQLite'),
                       storage_count=None, db_list=None, clear_databases=True,
-                       compress=True, backup_count=0,
+                       compress=True, backup_count=0, backup_initially=False,
                       importer=None, autostart=None, dedup=False, name=None):
        self.name = name or self._allocateName()
        self.backup_list = [self._allocateName() for x in xrange(backup_count)]
@@ -782,6 +782,7 @@ class NEOCluster(object):
            kw.update(upstream_cluster=upstream.name,
                upstream_masters=parseMasterList(upstream.master_nodes))
        self.master_list = [MasterApplication(autostart=autostart,
+                                              backup=backup_initially,
                                              address=x, **kw)
                            for x in master_list]
        if db_list is None:

--- a/neo/tests/threaded/testReplication.py
+++ b/neo/tests/threaded/testReplication.py
@@ -32,7 +32,7 @@ from neo.lib.connector import SocketConnector
 from neo.lib.connection import ClientConnection
 from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \
    ZERO_OID, ZERO_TID, MAX_TID, uuid_str
-from neo.lib.util import add64, p64, u64
+from neo.lib.util import add64, p64, u64, parseMasterList
 from .. import Patch, TransactionalResource
 from . import ConnectionFilter, LockLock, NEOCluster, NEOThreadedTest, \
    predictable_random, with_cluster
@@ -93,11 +93,22 @@ class ReplicationTests(NEOThreadedTest):
                    tid, upstream_name, source_dict = packet._args
                    return not upstream_name and all(source_dict.itervalues())
            with NEOCluster(partitions=np, replicas=nr-1, storage_count=5,
-                            upstream=upstream) as backup:
-                backup.start()
-                # Initialize & catch up.
-                backup.neoctl.setClusterState(ClusterStates.STARTING_BACKUP)
-                self.tic()
+                            upstream=upstream, backup_initially=True) as backup:
+                state_list = []
+                def changeClusterState(orig, state):
+                    state_list.append(state)
+                    orig(state)
+                with Patch(backup.master, changeClusterState=changeClusterState):
+                    # Initialize & catch up.
+                    backup.start()
+                    self.tic()
+                # Check that backup cluster goes straight to BACKINGUP.
+                self.assertEqual(state_list, [
+                    ClusterStates.RECOVERING,
+                    ClusterStates.VERIFYING,
+                    ClusterStates.STARTING_BACKUP,
+                    ClusterStates.BACKINGUP])
+
                self.assertEqual(np*nr, self.checkBackup(backup))
                # Normal case, following upstream cluster closely.
                importZODB(17)
@@ -229,11 +240,48 @@ class ReplicationTests(NEOThreadedTest):
            # Do not start with an empty DB so that 'primary_dict' below is not
            # empty on the first iteration.
            importZODB(1)
+
+            # --- ASIDE ---
+            # Check that master crashes when started with --backup but without
+            # upstream (-C,--upstream-cluster and -M,--upstream-masters) info.
            with NEOCluster(partitions=np, replicas=2, storage_count=4,
-                            upstream=upstream) as backup:
-                backup.start()
-                backup.neoctl.setClusterState(ClusterStates.STARTING_BACKUP)
-                self.tic()
+                            backup_initially=True) as backup:
+                exitmsg = []
+                def exit(orig, msg):
+                    exitmsg.append(msg)
+                    orig(msg)
+                state_list = []
+                def changeClusterState(orig, state):
+                    state_list.append(state)
+                    orig(state)
+                m = backup.master
+                with Patch(sys, exit=exit), Patch(
+                        m, changeClusterState=changeClusterState):
+                    self.assertRaises(AssertionError, backup.start)
+                backup.join((m,))
+                self.assertEqual(exitmsg, [m.no_upstream_msg])
+                self.assertEqual(state_list, [
+                    ClusterStates.RECOVERING,
+                    ClusterStates.VERIFYING])
+                del state_list[:]
+                # Now check that restarting the master with upstream info and
+                # with --backup makes the cluster go to BACKINGUP.
+                m.resetNode(
+                    upstream_cluster=upstream.name,
+                    upstream_masters=parseMasterList(upstream.master_nodes))
+                backup.upstream = upstream
+                with Patch(m, changeClusterState=changeClusterState):
+                    # Initialize & catch up.
+                    m.start()
+                    self.tic()
+                # Check that backup cluster goes straight to BACKINGUP.
+                self.assertEqual(state_list, [
+                    ClusterStates.RECOVERING,
+                    ClusterStates.VERIFYING,
+                    ClusterStates.STARTING_BACKUP,
+                    ClusterStates.BACKINGUP])
+                # --- END ASIDE ---
+
                storage_list = [x.uuid for x in backup.storage_list]
                slave = set(xrange(len(storage_list))).difference
                for event in xrange(10):