Commit 4e44bf95 authored by Xavier Thompson

slapgrid: Fix connectionless instance processing

See merge request nexedi/slapos.core!572
parents be11fd4f c9b394c1
Pipeline #29855 failed with stage
in 0 seconds
......@@ -59,6 +59,7 @@ from requests.exceptions import RequestException
from lxml import etree
from slapos import manager as slapmanager
from slapos.slap.exception import ConnectionError
from slapos.slap.slap import NotFoundError
from slapos.slap.slap import ServerError
from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME
......@@ -1425,7 +1426,7 @@ stderr_logfile_backups=1
def processComputerPartitionList(self):
  """Process the computer partitions, falling back to offline processing.

  Runs processComputerPartitionListOnline() and returns its result. If that
  call raises RequestException or ConnectionError, the result of
  processComputerPartitionListOffline() is returned instead.
  """
  try:
    return self.processComputerPartitionListOnline()
  # A single handler covers both exception types; a body-less
  # `except RequestException:` directly followed by another `except`
  # clause is not valid Python.
  except (RequestException, ConnectionError):
    return self.processComputerPartitionListOffline()
def processComputerPartitionListOnline(self):
......@@ -1456,7 +1457,7 @@ stderr_logfile_backups=1
self.processComputerPartition(computer_partition)
# Handle connection loss at the next level
except RequestException:
except (RequestException, ConnectionError):
raise
# Send log before exiting
......@@ -1517,6 +1518,40 @@ stderr_logfile_backups=1
def processComputerPartitionListOffline(self):
self.logger.info('Processing computer partitions offline...')
# Backwards compatibility: remove stopped services
for name in os.listdir(self.instance_root):
instance_path = os.path.join(self.instance_root, name)
state_path = os.path.join(instance_path, '.requested_state')
try:
with open(state_path) as f:
requested_state = f.read()
os.remove(state_path)
except (IOError, OSError) as e:
if e.errno != errno.ENOENT and e.errno != errno.ENOTDIR:
raise
requested_state = None
if requested_state == 'stopped':
local_partition = Partition(
software_path=None,
instance_path=instance_path,
shared_part_list='',
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=None,
computer_id=self.computer_id,
partition_id=name,
server_url=self.master_url,
software_release_url='toto',
certificate_repository_path=self.certificate_repository_path,
buildout=self.buildout,
buildout_debug=self.buildout_debug,
logger=self.logger,
instance_storage_home=self.instance_storage_home,
ipv4_global_network=self.ipv4_global_network,
)
local_partition.stop()
# Offline: start all existing services
try:
supervisord_socket_path = _getSupervisordSocketPath(
self.instance_root,
......
......@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase):
def test_no_master(self):
  """Check the result of partition processing with empty roots.

  Only the software and instance root directories are created before
  calling processComputerPartitionList(); the call is expected to return
  slapgrid.SLAPGRID_OFFLINE_SUCCESS.
  """
  os.mkdir(self.software_root)
  os.mkdir(self.instance_root)
  # The call is expected to return the offline-success code rather than
  # raise, so asserting that ConnectionError propagates would contradict
  # the expectation below.
  self.assertEqual(
    self.grid.processComputerPartitionList(),
    slapgrid.SLAPGRID_OFFLINE_SUCCESS)
def test_environment_variable_HOME(self):
# When running instance, $HOME is set to the partition path
......@@ -1087,6 +1089,71 @@ exit 1
'/getComputerPartitionCertificate' # /getFullComputerInformation is cached
])
def test_stopped_partition_remains_stopped_after_master_connection_loss(self):
  """A partition flagged as stopped must stay stopped in offline mode.

  Two partitions are started through a normal run. One of them is then
  marked stopped through a '.requested_state' file, the computer is switched
  to answer with status 503, and processing is run again: it must report
  SLAPGRID_OFFLINE_SUCCESS, the control partition's runner must run again
  and the stopped partition's runner must not.
  """
  computer = self.getTestComputerClass()(
    self.software_root, self.instance_root, instance_amount=2)
  control_partition = computer.instance_list[0]
  test_partition = computer.instance_list[1]
  both_partitions = (control_partition, test_partition)

  # Give each partition a runner script that leaves a marker file behind.
  for part in both_partitions:
    part.requested_state = 'started'
    part.software.setBuildout()
    run_dir = os.path.join(part.partition_path, 'etc', 'run')
    os.makedirs(run_dir)
    runner_path = os.path.join(run_dir, 'runner')
    with open(runner_path, 'w') as script:
      script.write("#!/bin/sh\necho 'Working'\ntouch 'runner_worked'")
      os.fchmod(script.fileno(), 0o755)

  control_file = os.path.join(control_partition.partition_path, 'runner_worked')
  test_file = os.path.join(test_partition.partition_path, 'runner_worked')

  def assertRunnerWorked(path):
    # Poll up to 50 times, 0.1s apart; assert only once polling is exhausted.
    attempts = 0
    while attempts < 50 and not os.path.exists(path):
      time.sleep(0.1)
      attempts += 1
    if attempts == 50:
      self.assertTrue(os.path.exists(path))

  # First run: both partitions get started.
  with httmock.HTTMock(computer.request_handler):
    first_result = self.grid.processComputerPartitionList()
    self.assertEqual(first_result, slapgrid.SLAPGRID_SUCCESS)
    self.assertInstanceDirectoryListEqual(['0', '1'])
    assertRunnerWorked(control_file)
    assertRunnerWorked(test_file)
    for index, part in enumerate(both_partitions):
      expected_entries = [
        '.slapgrid', '.%d_runner.log' % index, 'buildout.cfg', 'etc',
        'runner_worked', 'software_release', 'worked',
        '.slapos-retention-lock-delay']
      six.assertCountEqual(
        self, os.listdir(part.partition_path), expected_entries)
    self.assertEqual(control_partition.state, 'started')
    self.assertEqual(test_partition.state, 'started')

  # Simulate a partition stopped by an older version: mark it stopped and
  # leave a '.requested_state' file saying so.
  test_partition.state = 'stopped'
  state_path = os.path.join(test_partition.partition_path, '.requested_state')
  with open(state_path, 'w') as state_file:
    state_file.write('stopped')

  computer.status_code = 503 # connection loss
  # Drop the markers so that a fresh runner execution can be detected.
  os.unlink(control_file)
  os.unlink(test_file)

  # Second run: only the control partition may run again.
  with httmock.HTTMock(computer.request_handler):
    second_result = self.grid.processComputerPartitionList()
    self.assertEqual(second_result, slapgrid.SLAPGRID_OFFLINE_SUCCESS)
    self.assertInstanceDirectoryListEqual(['0', '1'])
    assertRunnerWorked(control_file)
    self.assertFalse(os.path.exists(test_file))
    self.assertEqual(computer.sequence, [
      '/getFullComputerInformation',
      '/getComputerPartitionCertificate',
      '/startedComputerPartition',
      '/getComputerPartitionCertificate',
      '/startedComputerPartition',
      '/getComputerPartitionCertificate' # /getFullComputerInformation is cached
    ])
class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):
def setUp(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment