Commit c9b394c1 authored by Xavier Thompson's avatar Xavier Thompson

slapgrid: Fix an upgrade bug in offline processing

When upgrading from a version of slapos.core that does not support
processing instances without a connection to master to one that does,
the following edge case could occur:

1. some partition is stopped and processed with old version
2. slapos.core is upgraded
3. connection to master is lost
4. offline processing wrongly starts the services of stopped partition

This was because the new version removes the supervisord file for a
partition when the partition is stopped (in online mode) so that the
offline mode can just start all the existing supervisord files. But
since the partition was never processed in online mode with the new
version, this file was never removed.

To fix this, we use the (otherwise no longer used) .requested_state file
of the previous version to determine the state of the partition in
offline mode, then we remove both the .requested_state file and the
supervisord file, fixing the discrepancy.
parent 14745358
...@@ -1518,6 +1518,40 @@ stderr_logfile_backups=1 ...@@ -1518,6 +1518,40 @@ stderr_logfile_backups=1
def processComputerPartitionListOffline(self): def processComputerPartitionListOffline(self):
self.logger.info('Processing computer partitions offline...') self.logger.info('Processing computer partitions offline...')
# Backwards compatibility: remove stopped services
for name in os.listdir(self.instance_root):
instance_path = os.path.join(self.instance_root, name)
state_path = os.path.join(instance_path, '.requested_state')
try:
with open(state_path) as f:
requested_state = f.read()
os.remove(state_path)
except (IOError, OSError) as e:
if e.errno != errno.ENOENT and e.errno != errno.ENOTDIR:
raise
requested_state = None
if requested_state == 'stopped':
local_partition = Partition(
software_path=None,
instance_path=instance_path,
shared_part_list='',
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=None,
computer_id=self.computer_id,
partition_id=name,
server_url=self.master_url,
software_release_url='toto',
certificate_repository_path=self.certificate_repository_path,
buildout=self.buildout,
buildout_debug=self.buildout_debug,
logger=self.logger,
instance_storage_home=self.instance_storage_home,
ipv4_global_network=self.ipv4_global_network,
)
local_partition.stop()
# Offline: start all existing services
try: try:
supervisord_socket_path = _getSupervisordSocketPath( supervisord_socket_path = _getSupervisordSocketPath(
self.instance_root, self.instance_root,
......
...@@ -1089,6 +1089,71 @@ exit 1 ...@@ -1089,6 +1089,71 @@ exit 1
'/getComputerPartitionCertificate' # /getFullComputerInformation is cached '/getComputerPartitionCertificate' # /getFullComputerInformation is cached
]) ])
def test_stopped_partition_remains_stopped_after_master_connection_loss(self):
    """Offline processing must not run the services of a stopped partition.

    Two partitions are first processed normally in the 'started' state.
    A '.requested_state' file containing 'stopped' is then written into
    the second partition, the test computer is switched to answer with
    status code 503, and processing is run again. The control partition's
    runner is expected to run again, the test partition's runner is not.
    """
    computer = self.getTestComputerClass()(
        self.software_root, self.instance_root, instance_amount=2)

    # Request both partitions as 'started' and give each one a runner
    # script that touches 'runner_worked' when it is executed.
    for index in range(2):
        instance = computer.instance_list[index]
        instance.requested_state = 'started'
        instance.software.setBuildout()
        run_dir = os.path.join(instance.partition_path, 'etc', 'run')
        os.makedirs(run_dir)
        runner_path = os.path.join(run_dir, 'runner')
        with open(runner_path, 'w') as script:
            script.write("#!/bin/sh\necho 'Working'\ntouch 'runner_worked'")
            os.fchmod(script.fileno(), 0o755)

    control_partition = computer.instance_list[0]
    test_partition = computer.instance_list[1]
    control_file = os.path.join(
        control_partition.partition_path, 'runner_worked')
    test_file = os.path.join(test_partition.partition_path, 'runner_worked')

    def assertRunnerWorked(path):
        # Poll for the marker file: up to 50 checks spaced 0.1s apart,
        # followed by one final asserting check.
        attempts_left = 50
        while attempts_left > 0:
            if os.path.exists(path):
                return
            time.sleep(0.1)
            attempts_left -= 1
        self.assertTrue(os.path.exists(path))

    # First pass: regular processing, both runners must have run.
    with httmock.HTTMock(computer.request_handler):
        online_status = self.grid.processComputerPartitionList()
        self.assertEqual(online_status, slapgrid.SLAPGRID_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0', '1'])
        assertRunnerWorked(control_file)
        assertRunnerWorked(test_file)
        for index in range(2):
            actual_entries = os.listdir(
                computer.instance_list[index].partition_path)
            expected_entries = [
                '.slapgrid', '.%d_runner.log' % index, 'buildout.cfg', 'etc',
                'runner_worked', 'software_release', 'worked',
                '.slapos-retention-lock-delay',
            ]
            six.assertCountEqual(self, actual_entries, expected_entries)
        self.assertEqual(control_partition.state, 'started')
        self.assertEqual(test_partition.state, 'started')

    # simulate stopping the partition with old version
    test_partition.state = 'stopped'
    state_path = os.path.join(
        test_partition.partition_path, '.requested_state')
    with open(state_path, 'w') as state_file:
        state_file.write('stopped')

    computer.status_code = 503  # connection loss

    # Drop the marker files so a new run of each runner is detectable.
    os.unlink(control_file)
    os.unlink(test_file)

    # Second pass: offline processing, only the control runner may run.
    with httmock.HTTMock(computer.request_handler):
        offline_status = self.grid.processComputerPartitionList()
        self.assertEqual(offline_status, slapgrid.SLAPGRID_OFFLINE_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0', '1'])
        assertRunnerWorked(control_file)
        self.assertFalse(os.path.exists(test_file))
        expected_sequence = [
            '/getFullComputerInformation',
            '/getComputerPartitionCertificate',
            '/startedComputerPartition',
            '/getComputerPartitionCertificate',
            '/startedComputerPartition',
            '/getComputerPartitionCertificate'  # /getFullComputerInformation is cached
        ]
        self.assertEqual(computer.sequence, expected_sequence)
class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase): class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):
def setUp(self): def setUp(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment