slapgrid: Fix an upgrade bug in offline processing

When upgrading from a version of slapos.core that does not support processing instances without a connection to the master to one that does, the following edge case could occur: (1) some partition is stopped and processed with the old version; (2) slapos.core is upgraded; (3) the connection to the master is lost; (4) offline processing wrongly starts the services of the stopped partition. This was because the new version removes the supervisord file for a partition when the partition is stopped (in online mode), so that offline mode can just start all the existing supervisord files. But since the partition was never processed in online mode with the new version, this file was never removed. To fix this, we use the (now no longer used) .requested_state file of the previous version to determine the state of the partition in offline mode, then we remove both the .requested_state file and the supervisord file, fixing the discrepancy.

slapgrid: Fix an upgrade bug in offline processing
When upgrading from a version of slapos.core that does not support processing instances without a connection to the master to one that does, the following edge case could occur: (1) some partition is stopped and processed with the old version; (2) slapos.core is upgraded; (3) the connection to the master is lost; (4) offline processing wrongly starts the services of the stopped partition. This was because the new version removes the supervisord file for a partition when the partition is stopped (in online mode), so that offline mode can just start all the existing supervisord files. But since the partition was never processed in online mode with the new version, this file was never removed. To fix this, we use the (now no longer used) .requested_state file of the previous version to determine the state of the partition in offline mode, then we remove both the .requested_state file and the supervisord file, fixing the discrepancy.
c9b394c1 · Xavier Thompson · 14745358 · c9b394c1 · c9b394c1
Commit c9b394c1 authored Sep 05, 2023 by Xavier Thompson
Hide whitespace changes
Inline Side-by-side

Showing with 99 additions and 0 deletions

slapos/grid/slapgrid.py slapos/grid/slapgrid.py +34 -0

slapos/tests/test_slapgrid.py slapos/tests/test_slapgrid.py +65 -0

No files found.
--- a/slapos/grid/slapgrid.py
+++ b/slapos/grid/slapgrid.py
@@ -1518,6 +1518,40 @@ stderr_logfile_backups=1

  def processComputerPartitionListOffline(self):
    self.logger.info('Processing computer partitions offline...')
+    # Backwards compatibility: remove stopped services
+    for name in os.listdir(self.instance_root):
+      instance_path = os.path.join(self.instance_root, name)
+      state_path = os.path.join(instance_path, '.requested_state')
+      try:
+        with open(state_path) as f:
+          requested_state = f.read()
+        os.remove(state_path)
+      except (IOError, OSError) as e:
+        if e.errno != errno.ENOENT and e.errno != errno.ENOTDIR:
+          raise
+        requested_state = None
+      if requested_state == 'stopped':
+        local_partition = Partition(
+          software_path=None,
+          instance_path=instance_path,
+          shared_part_list='',
+          supervisord_partition_configuration_dir=(
+            _getSupervisordConfigurationDirectory(self.instance_root)),
+          supervisord_socket=self.supervisord_socket,
+          computer_partition=None,
+          computer_id=self.computer_id,
+          partition_id=name,
+          server_url=self.master_url,
+          software_release_url='toto',
+          certificate_repository_path=self.certificate_repository_path,
+          buildout=self.buildout,
+          buildout_debug=self.buildout_debug,
+          logger=self.logger,
+          instance_storage_home=self.instance_storage_home,
+          ipv4_global_network=self.ipv4_global_network,
+        )
+        local_partition.stop()
+    # Offline: start all existing services
    try:
      supervisord_socket_path = _getSupervisordSocketPath(
        self.instance_root,

--- a/slapos/tests/test_slapgrid.py
+++ b/slapos/tests/test_slapgrid.py
@@ -1089,6 +1089,71 @@ exit 1
        '/getComputerPartitionCertificate' # /getFullComputerInformation is cached
      ])

+  def test_stopped_partition_remains_stopped_after_master_connection_loss(self):
+    computer = self.getTestComputerClass()(
+      self.software_root, self.instance_root, instance_amount=2)
+
+    for i in range(2):
+      partition = computer.instance_list[i]
+      partition.requested_state = 'started'
+      partition.software.setBuildout()
+      run_path = os.path.join(partition.partition_path, 'etc', 'run')
+      os.makedirs(run_path)
+      with open(os.path.join(run_path, 'runner'), 'w') as f:
+        f.write("#!/bin/sh\necho 'Working'\ntouch 'runner_worked'")
+        os.fchmod(f.fileno(), 0o755)
+
+    control_partition = computer.instance_list[0]
+    test_partition = computer.instance_list[1]
+
+    control_file = os.path.join(control_partition.partition_path, 'runner_worked')
+    test_file = os.path.join(test_partition.partition_path, 'runner_worked')
+
+    def assertRunnerWorked(path):
+      for _ in range(50):
+        if os.path.exists(path):
+          break
+        time.sleep(0.1)
+      else:
+        self.assertTrue(os.path.exists(path))
+
+    with httmock.HTTMock(computer.request_handler):
+      self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_SUCCESS)
+      self.assertInstanceDirectoryListEqual(['0', '1'])
+      assertRunnerWorked(control_file)
+      assertRunnerWorked(test_file)
+      for i in range(2):
+        six.assertCountEqual(self, os.listdir(computer.instance_list[i].partition_path),
+                              ['.slapgrid', '.%d_runner.log' % i, 'buildout.cfg', 'etc',
+                              'runner_worked', 'software_release', 'worked',
+                              '.slapos-retention-lock-delay'])
+      self.assertEqual(control_partition.state, 'started')
+      self.assertEqual(test_partition.state, 'started')
+
+    # simulate stopping the partition with old version
+    test_partition.state = 'stopped'
+    state_path = os.path.join(test_partition.partition_path, '.requested_state')
+    with open(state_path, 'w') as f:
+      f.write('stopped')
+
+    computer.status_code = 503 # connection loss
+    os.unlink(control_file)
+    os.unlink(test_file)
+
+    with httmock.HTTMock(computer.request_handler):
+      self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_OFFLINE_SUCCESS)
+      self.assertInstanceDirectoryListEqual(['0', '1'])
+      assertRunnerWorked(control_file)
+      self.assertFalse(os.path.exists(test_file))
+      self.assertEqual(computer.sequence, [
+        '/getFullComputerInformation',
+        '/getComputerPartitionCertificate',
+        '/startedComputerPartition',
+        '/getComputerPartitionCertificate',
+        '/startedComputerPartition',
+        '/getComputerPartitionCertificate' # /getFullComputerInformation is cached
+      ])
+
 class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):

  def setUp(self):