Commit 036f320b authored by Alain Takoudjou's avatar Alain Takoudjou

slapgrid: start partitions services if network issue while processing instances

Partition.stop: when called, it now removes the partition's supervisord configuration file and updates supervisord.
Only started partitions have supervisord configuration files.

slapgrid: If `getRequiredComputerPartitionList` fails because of a network issue,
buildout cannot process instances. Slapgrid will then start every partition that has a supervisord configuration
file and exit. This feature is required for `slapos node boot` when the computer is offline.

cli.format: the `--ignore_network_errors` parameter can be set to ignore connection errors raised while
posting the xml file to the master.

cli.boot: now calls `slapos node format` with `--ignore_network_errors`, then removes the
partition timestamps and lets cron launch `slapos node instance`. Partitions will still be
started if there is a network issue.

manager.prerm: because of the new behavior of Partition.stop, the rm script
would be killed each time `slapos node report` runs. So the prerm script is now
generated in a custom supervisord configuration file. The same applies to `manager.portredir`.
parent cb9334d4
...@@ -44,9 +44,6 @@ from slapos.cli.config import ConfigCommand ...@@ -44,9 +44,6 @@ from slapos.cli.config import ConfigCommand
from slapos.format import isGlobalScopeAddress from slapos.format import isGlobalScopeAddress
from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME, from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME,
COMPUTER_PARTITION_STARTED_STATE) COMPUTER_PARTITION_STARTED_STATE)
from slapos.grid.svcbackend import (_getSupervisordSocketPath,
getSupervisorRPC,
launchSupervisord)
from slapos.util import string_to_boolean from slapos.util import string_to_boolean
import argparse import argparse
import logging import logging
...@@ -65,49 +62,6 @@ def _removeTimestamp(instancehome, partition_base_name): ...@@ -65,49 +62,6 @@ def _removeTimestamp(instancehome, partition_base_name):
logger.info("Removing %s", timestamp_path) logger.info("Removing %s", timestamp_path)
os.remove(timestamp_path) os.remove(timestamp_path)
def _startComputerPartition(partition_id, supervisord_socket):
  """
  Ask supervisord to start the process group of one deployed partition.

  partition_id is used as the supervisord process group name and
  supervisord_socket is handed to getSupervisorRPC to reach supervisord.

  A 'BAD_NAME:' fault is only logged as "nothing to start"; any other
  xmlrpclib.Fault is propagated to the caller.
  """
  try:
    with getSupervisorRPC(supervisord_socket) as rpc:
      # Second argument is False -- presumably supervisord's `wait` flag,
      # i.e. do not block until the processes are up (TODO confirm).
      rpc.startProcessGroup(partition_id, False)
  except xmlrpclib.Fault as fault:
    if not fault.faultString.startswith('BAD_NAME:'):
      raise
    logger.info("Nothing to start on %s...", partition_id)
    return
  logger.info("Requested start of %s...", partition_id)
def _startComputerPartitionList(instance_root, partition_base_name):
  """
  Start the services of every partition whose requested state is 'started'.

  Partitions are the entries of instance_root whose name begins with
  partition_base_name. supervisord is launched first; then, for each
  partition, the requested-state file is read and, when its content equals
  COMPUTER_PARTITION_STARTED_STATE, the partition's process group is started.
  Partitions without a requested-state file are skipped.
  """
  partition_pattern = os.path.join(instance_root, "%s*" % partition_base_name)
  launchSupervisord(instance_root=instance_root, logger=logger)

  for partition_dir in glob.glob(partition_pattern):
    state_file = os.path.join(partition_dir,
                              COMPUTER_PARTITION_REQUESTED_STATE_FILENAME)
    socket_path = _getSupervisordSocketPath(instance_root, logger)

    if not os.path.exists(state_file):
      continue

    with open(state_file) as state_fd:
      requested_state = state_fd.read()
    if requested_state != COMPUTER_PARTITION_STARTED_STATE:
      continue

    # The partition id is the last path component of the partition directory.
    partition_id = os.path.basename(partition_dir.rstrip('/'))
    _startComputerPartition(partition_id, socket_path)
def _runBang(app): def _runBang(app):
""" """
Launch slapos node format. Launch slapos node format.
...@@ -124,9 +78,9 @@ def _runFormat(app): ...@@ -124,9 +78,9 @@ def _runFormat(app):
Launch slapos node format. Launch slapos node format.
""" """
logger.info("[BOOT] Invoking slapos node format...") logger.info("[BOOT] Invoking slapos node format...")
# '--local' parameter is to prevent node format command to post data to # '--ignore_network_errors' parameter is to prevent node format command fail if
# master, so this command can work without internet and setup partitions IP. # master offline, so this command can work without internet and setup partitions IP.
result = app.run(['node', 'format', '--now', '--local', '--verbose']) result = app.run(['node', 'format', '--now', '--ignore_network_errors', '--verbose'])
if result == 1: if result == 1:
return 0 return 0
return 1 return 1
...@@ -251,9 +205,7 @@ class BootCommand(ConfigCommand): ...@@ -251,9 +205,7 @@ class BootCommand(ConfigCommand):
while not _runFormat(app): while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...") logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15) sleep(15)
_removeTimestamp(instance_root, partition_base_name)
# Start computer partition services
_startComputerPartitionList(instance_root, partition_base_name)
# Check that node can ping master # Check that node can ping master
if valid_ipv4(master_hostname): if valid_ipv4(master_hostname):
...@@ -268,5 +220,3 @@ class BootCommand(ConfigCommand): ...@@ -268,5 +220,3 @@ class BootCommand(ConfigCommand):
while not _runBang(app): while not _runBang(app):
logger.error("[BOOT] Fail to bang, try again in 15 seconds...") logger.error("[BOOT] Fail to bang, try again in 15 seconds...")
sleep(15) sleep(15)
_removeTimestamp(instance_root, partition_base_name)
...@@ -83,10 +83,10 @@ class FormatCommand(ConfigCommand): ...@@ -83,10 +83,10 @@ class FormatCommand(ConfigCommand):
help='Launch slapformat without delay' help='Launch slapformat without delay'
' (default: %(default)s)') ' (default: %(default)s)')
ap.add_argument('--local', ap.add_argument('--ignore_network_errors',
default=False, # can have a default as it is not in .cfg default=False, # can have a default as it is not in .cfg
action="store_true", action="store_true",
help='Keep format data locally, do not post xml to master' help='Ignore network errors while connecting to slapos master.'
' (default: %(default)s)') ' (default: %(default)s)')
ap.add_argument('-n', '--dry_run', ap.add_argument('-n', '--dry_run',
......
...@@ -1408,9 +1408,13 @@ def do_format(conf): ...@@ -1408,9 +1408,13 @@ def do_format(conf):
computer.dump(path_to_xml=conf.computer_xml, computer.dump(path_to_xml=conf.computer_xml,
path_to_json=conf.computer_json, path_to_json=conf.computer_json,
logger=conf.logger) logger=conf.logger)
if not conf.local:
conf.logger.info('Posting information to %r' % conf.master_url) conf.logger.info('Posting information to %r' % conf.master_url)
try:
computer.send(conf) computer.send(conf)
except slap.exception.ConnectionError as e:
if not conf.ignore_network_errors:
raise
conf.logger.warn('Failed to send information to master: %s' % str(e))
conf.logger.info('slapos successfully prepared the computer.') conf.logger.info('slapos successfully prepared the computer.')
......
...@@ -753,7 +753,6 @@ class Partition(object): ...@@ -753,7 +753,6 @@ class Partition(object):
updateFile(self.supervisord_partition_configuration_path, updateFile(self.supervisord_partition_configuration_path,
self.supervisor_configuration_group + self.supervisor_configuration_group +
self.partition_supervisor_configuration) self.partition_supervisor_configuration)
self.updateSupervisor()
def generateSupervisorConfigurationFile(self): def generateSupervisorConfigurationFile(self):
""" """
...@@ -767,6 +766,7 @@ class Partition(object): ...@@ -767,6 +766,7 @@ class Partition(object):
installed, we install it. installed, we install it.
""" """
partition_id = self.computer_partition.getId() partition_id = self.computer_partition.getId()
self.updateSupervisor()
try: try:
with self.getSupervisorRPC() as supervisor: with self.getSupervisorRPC() as supervisor:
supervisor.startProcessGroup(partition_id, False) supervisor.startProcessGroup(partition_id, False)
...@@ -780,17 +780,10 @@ class Partition(object): ...@@ -780,17 +780,10 @@ class Partition(object):
self.logger.info("Requested start of %s..." % self.computer_partition.getId()) self.logger.info("Requested start of %s..." % self.computer_partition.getId())
def stop(self): def stop(self):
"""Asks supervisord to stop the instance.""" """Remove configuration file and asks supervisord to stop the instance."""
partition_id = self.computer_partition.getId() if os.path.exists(self.supervisord_partition_configuration_path):
try: os.unlink(self.supervisord_partition_configuration_path)
with self.getSupervisorRPC() as supervisor: self.updateSupervisor()
supervisor.stopProcessGroup(partition_id, False)
except xmlrpclib.Fault as exc:
if exc.faultString.startswith('BAD_NAME:'):
self.logger.info('Partition %s not known in supervisord, ignoring' % partition_id)
else:
raise
else:
self.logger.info("Requested stop of %s..." % self.computer_partition.getId()) self.logger.info("Requested stop of %s..." % self.computer_partition.getId())
def destroy(self): def destroy(self):
......
...@@ -62,7 +62,8 @@ from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME ...@@ -62,7 +62,8 @@ from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME
from slapos.util import mkdir_p, chownDirectory, string_to_boolean, listifdir from slapos.util import mkdir_p, chownDirectory, string_to_boolean, listifdir
from slapos.grid.exception import BuildoutFailedError from slapos.grid.exception import BuildoutFailedError
from slapos.grid.SlapObject import Software, Partition from slapos.grid.SlapObject import Software, Partition
from slapos.grid.svcbackend import (launchSupervisord, from slapos.grid.svcbackend import (getSupervisorRPC,
launchSupervisord,
createSupervisordConfiguration, createSupervisordConfiguration,
_getSupervisordConfigurationDirectory, _getSupervisordConfigurationDirectory,
_getSupervisordSocketPath) _getSupervisordSocketPath)
...@@ -553,6 +554,24 @@ stderr_logfile_backups=1 ...@@ -553,6 +554,24 @@ stderr_logfile_backups=1
if not self.forbid_supervisord_automatic_launch: if not self.forbid_supervisord_automatic_launch:
launchSupervisord(instance_root=self.instance_root, logger=self.logger) launchSupervisord(instance_root=self.instance_root, logger=self.logger)
def _startComputerPartitionList(self):
  """
  Start the supervisord process group of every configuration file found
  in the supervisord configuration directory.

  The process group name is the configuration file name without its
  '.conf' extension. This is used when the partition list cannot be
  fetched: whatever already has a supervisord configuration is started.

  Failures are never raised from the loop: a 'BAD_NAME:' fault is logged
  as "nothing to start", any other xmlrpclib.Fault is logged as an error
  and the next configuration file is processed.
  """
  conf_suffix = '.conf'
  supervisor_conf_dir = _getSupervisordConfigurationDirectory(self.instance_root)
  with getSupervisorRPC(self.supervisord_socket) as supervisor:
    for config_filename in os.listdir(supervisor_conf_dir):
      # Remove the exact '.conf' suffix. str.rstrip('.conf') would instead
      # strip any trailing run of the characters '.', 'c', 'o', 'n', 'f',
      # turning e.g. 'slappartc.conf' into 'slappart'.
      if config_filename.endswith(conf_suffix):
        partition_id = config_filename[:-len(conf_suffix)]
      else:
        partition_id = config_filename
      try:
        supervisor.startProcessGroup(partition_id, False)
      except xmlrpclib.Fault as exc:
        if exc.faultString.startswith('BAD_NAME:'):
          self.logger.info("Nothing to start on %s...", partition_id)
        else:
          # Keep going: one failing group must not prevent the others
          # from being started.
          self.logger.error("Failed to start %s: %s", partition_id, exc)
      else:
        self.logger.info("Requested start of %s...", partition_id)
def getComputerPartitionList(self): def getComputerPartitionList(self):
try: try:
return self.computer.getComputerPartitionList() return self.computer.getComputerPartitionList()
...@@ -1429,7 +1448,14 @@ stderr_logfile_backups=1 ...@@ -1429,7 +1448,14 @@ stderr_logfile_backups=1
# Boolean to know if every promises correctly passed # Boolean to know if every promises correctly passed
clean_run_promise = True clean_run_promise = True
try:
computer_partition_list = self.getRequiredComputerPartitionList() computer_partition_list = self.getRequiredComputerPartitionList()
except slapos.slap.exception.ConnectionError:
# Network issue, we log exception start partitions and exit
self.logger.error(traceback.format_exc())
self._startComputerPartitionList()
self.logger.info('Finished computer partitions.')
return SLAPGRID_FAIL
process_error_partition_list = [] process_error_partition_list = []
promise_error_partition_list = [] promise_error_partition_list = []
......
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
from .interface import IManager from .interface import IManager
from six.moves import filter from six.moves import filter
from zope.interface import implementer from zope.interface import implementer
from slapos.grid.utils import updateFile
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -164,7 +165,15 @@ class Manager(object): ...@@ -164,7 +165,15 @@ class Manager(object):
program['command'], program['command'],
as_user=program['as_user']) as_user=program['as_user'])
partition.writeSupervisorConfigurationFile() # Configuration is generated in a different config file so call
# of partition.stop will not delete this file
supervisord_config = os.path.splitext(
partition.supervisord_partition_configuration_path)[0] + \
'-portredir.conf'
updateFile(supervisord_config,
partition.supervisor_configuration_group +
partition.partition_supervisor_configuration)
partition.updateSupervisor()
# Start processes # Start processes
with partition.getSupervisorRPC() as supervisor: with partition.getSupervisorRPC() as supervisor:
......
...@@ -6,6 +6,7 @@ import subprocess ...@@ -6,6 +6,7 @@ import subprocess
from zope.interface import implementer from zope.interface import implementer
from slapos.manager import interface from slapos.manager import interface
from slapos.grid.utils import updateFile
from slapos.grid.slapgrid import COMPUTER_PARTITION_WAIT_LIST_FILENAME from slapos.grid.slapgrid import COMPUTER_PARTITION_WAIT_LIST_FILENAME
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -55,6 +56,17 @@ class Manager(object): ...@@ -55,6 +56,17 @@ class Manager(object):
""" """
pass pass
def writeCustomSupervisorConfigurationFile(self, partition):
  """
  Write the prerm supervisord configuration of `partition` to its own file.

  The file sits next to the regular partition configuration file and is
  named like it, with the extension replaced by '-prerm.conf'. Its content
  is the partition's supervisor group configuration followed by its
  program configuration.

  This method only writes the file; it does not ask supervisord to reload
  its configuration.
  """
  base_path = os.path.splitext(
    partition.supervisord_partition_configuration_path)[0]
  prerm_config_path = base_path + '-prerm.conf'
  prerm_config = (partition.supervisor_configuration_group +
                  partition.partition_supervisor_configuration)
  updateFile(prerm_config_path, prerm_config)
def report(self, partition): def report(self, partition):
"""Method called at `slapos node report` phase.""" """Method called at `slapos node report` phase."""
...@@ -77,7 +89,8 @@ class Manager(object): ...@@ -77,7 +89,8 @@ class Manager(object):
partition_id, partition_id,
wrapper_list, wrapper_list,
partition.prerm_path) partition.prerm_path)
partition.writeSupervisorConfigurationFile() self.writeCustomSupervisorConfigurationFile(partition)
partition.updateSupervisor()
# check the state of all process, if the process is not started yes, start it # check the state of all process, if the process is not started yes, start it
with partition.getSupervisorRPC() as supervisor: with partition.getSupervisorRPC() as supervisor:
......
...@@ -447,21 +447,17 @@ class TestCliBoot(CliMixin): ...@@ -447,21 +447,17 @@ class TestCliBoot(CliMixin):
patch( patch(
'slapos.cli.boot.netifaces.ifaddresses', 'slapos.cli.boot.netifaces.ifaddresses',
return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\ return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\
patch('slapos.cli.boot._startComputerPartition', return_value=None) as start_partition,\
patch('slapos.cli.boot.launchSupervisord', return_value=None),\
patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname: patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname:
app.run(('node', 'boot')) app.run(('node', 'boot'))
# boot command runs as root # boot command runs as root
check_root_user.assert_called_once() check_root_user.assert_called_once()
# Computer partition was started during boot
start_partition.assert_called_once()
# it waits for interface to have an IPv6 address # it waits for interface to have an IPv6 address
ifaddresses.assert_called_once_with('interface_name_from_config') ifaddresses.assert_called_once_with('interface_name_from_config')
# then ping master hostname to wait for connectivity # then ping master hostname to wait for connectivity
_ping_hostname.assert_called_once_with('slap.vifib.com') _ping_hostname.assert_called_once_with('slap.vifib.com')
# then format and bang # then format and bang
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--local', '--verbose']) SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--ignore_network_errors', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot']) SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot'])
# timestamp files have been removed # timestamp files have been removed
...@@ -483,7 +479,6 @@ class TestCliBoot(CliMixin): ...@@ -483,7 +479,6 @@ class TestCliBoot(CliMixin):
patch('slapos.cli.boot.netifaces.ifaddresses', patch('slapos.cli.boot.netifaces.ifaddresses',
side_effect=[net1, net2, net3]),\ side_effect=[net1, net2, net3]),\
patch('slapos.cli.boot._ping_hostname', return_value=0),\ patch('slapos.cli.boot._ping_hostname', return_value=0),\
patch('slapos.cli.boot._startComputerPartitionList', return_value=None) as start_partition,\
patch('slapos.cli.format.check_root_user', return_value=True),\ patch('slapos.cli.format.check_root_user', return_value=True),\
patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\ patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\
patch('slapos.cli.bang.check_root_user', return_value=True),\ patch('slapos.cli.bang.check_root_user', return_value=True),\
...@@ -493,7 +488,6 @@ class TestCliBoot(CliMixin): ...@@ -493,7 +488,6 @@ class TestCliBoot(CliMixin):
app.run(('node', 'boot')) app.run(('node', 'boot'))
check_root_user.assert_called_once() check_root_user.assert_called_once()
start_partition.assert_called_once()
self.assertEqual(do_format.call_count, 3) self.assertEqual(do_format.call_count, 3)
self.assertEqual(do_bang.call_count, 3) self.assertEqual(do_bang.call_count, 3)
......
...@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase): ...@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase):
def test_no_master(self): def test_no_master(self):
os.mkdir(self.software_root) os.mkdir(self.software_root)
os.mkdir(self.instance_root) os.mkdir(self.instance_root)
self.assertRaises(ConnectionError, self.grid.processComputerPartitionList) with patch.object(slapos.grid.slapgrid.Slapgrid, '_startComputerPartitionList', return_value=None) as start_cp:
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_FAIL)
self.assertTrue(start_cp.called)
def test_environment_variable_HOME(self): def test_environment_variable_HOME(self):
# When running instance, $HOME is set to the partition path # When running instance, $HOME is set to the partition path
...@@ -3032,7 +3034,7 @@ exit 0 ...@@ -3032,7 +3034,7 @@ exit 0
gid = stat_info.st_gid gid = stat_info.st_gid
supervisor_conf_file = os.path.join(self.instance_root, supervisor_conf_file = os.path.join(self.instance_root,
'etc/supervisord.conf.d', 'etc/supervisord.conf.d',
'%s.conf' % partition.name) '%s-prerm.conf' % partition.name)
self.assertTrue(os.path.exists(supervisor_conf_file)) self.assertTrue(os.path.exists(supervisor_conf_file))
regex_user = r"user=(\d+)" regex_user = r"user=(\d+)"
regex_group = r"group=(\d+)" regex_group = r"group=(\d+)"
...@@ -3114,7 +3116,7 @@ class TestSlapgridWithPortRedirection(MasterMixin, unittest.TestCase): ...@@ -3114,7 +3116,7 @@ class TestSlapgridWithPortRedirection(MasterMixin, unittest.TestCase):
self.computer = self.getTestComputerClass()(self.software_root, self.instance_root) self.computer = self.getTestComputerClass()(self.software_root, self.instance_root)
self.partition = self.computer.instance_list[0] self.partition = self.computer.instance_list[0]
self.instance_supervisord_config_path = os.path.join( self.instance_supervisord_config_path = os.path.join(
self.instance_root, 'etc/supervisord.conf.d/0.conf') self.instance_root, 'etc/supervisord.conf.d/0-portredir.conf')
self.port_redirect_path = os.path.join(self.partition.partition_path, self.port_redirect_path = os.path.join(self.partition.partition_path,
slapmanager.portredir.Manager.port_redirect_filename) slapmanager.portredir.Manager.port_redirect_filename)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment