Commit 036f320b authored by Alain Takoudjou's avatar Alain Takoudjou

slapgrid: start partitions services if network issue while processing instances

Partition.stop: When called, it removes the partition's supervisord configuration file and updates supervisord.
Only started partitions have supervisord configuration files.

slapgrid: If `getRequiredComputerPartitionList` fails because of a network issue,
buildout cannot process instances. Slapgrid will then start every partition that has a supervisord configuration
file and exit. This feature is required for `slapos node boot` when the computer is offline.

cli.format: the parameter `--ignore_network_errors` can be set to ignore exceptions raised while
posting the xml file to the master.

cli.boot: Now calls `slapos node format` with `--ignore_network_errors`, then removes
partition timestamps and lets cron launch `slapos node instance`. Partitions will be
started even if there is a network issue.

manager.prerm: because of the new behavior of Partition.stop, the rm script
would be killed each time `slapos node report` runs. So the prerm script is
now generated in a custom supervisord configuration file. The same applies to `manager.portredir`.
parent cb9334d4
......@@ -44,9 +44,6 @@ from slapos.cli.config import ConfigCommand
from slapos.format import isGlobalScopeAddress
from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME,
COMPUTER_PARTITION_STARTED_STATE)
from slapos.grid.svcbackend import (_getSupervisordSocketPath,
getSupervisorRPC,
launchSupervisord)
from slapos.util import string_to_boolean
import argparse
import logging
......@@ -65,49 +62,6 @@ def _removeTimestamp(instancehome, partition_base_name):
logger.info("Removing %s", timestamp_path)
os.remove(timestamp_path)
def _startComputerPartition(partition_id, supervisord_socket):
  """
  Ask supervisord to start the process group named after the partition.

  partition_id -- name of the supervisord process group to start.
  supervisord_socket -- value handed to getSupervisorRPC to reach supervisord.

  A 'BAD_NAME:' fault is only logged; any other xmlrpclib.Fault is re-raised.
  """
  try:
    with getSupervisorRPC(supervisord_socket) as rpc:
      rpc.startProcessGroup(partition_id, False)
  except xmlrpclib.Fault as fault:
    # Every fault other than BAD_NAME is left to the caller.
    if not fault.faultString.startswith('BAD_NAME:'):
      raise
    logger.info("Nothing to start on %s...", partition_id)
    return
  logger.info("Requested start of %s...", partition_id)
def _startComputerPartitionList(instance_root, partition_base_name):
  """
  Start the services of every partition whose requested state is 'started'.

  Partition directories are those under instance_root whose name begins with
  partition_base_name. Supervisord is launched first; then, for each
  directory, the requested-state file is read and the partition is started
  only when its content equals COMPUTER_PARTITION_STARTED_STATE.
  """
  partition_pattern = os.path.join(instance_root, "%s*" % partition_base_name)
  launchSupervisord(instance_root=instance_root, logger=logger)

  for partition_dir in glob.glob(partition_pattern):
    state_file = os.path.join(
      partition_dir, COMPUTER_PARTITION_REQUESTED_STATE_FILENAME)
    socket_path = _getSupervisordSocketPath(instance_root, logger)

    # A partition without a requested-state file is left alone.
    if not os.path.exists(state_file):
      continue

    with open(state_file) as state_stream:
      requested_state = state_stream.read()
    if requested_state != COMPUTER_PARTITION_STARTED_STATE:
      continue

    # The directory name (without any trailing slash) is the group name.
    group_name = os.path.basename(partition_dir.rstrip('/'))
    _startComputerPartition(group_name, socket_path)
def _runBang(app):
"""
Launch slapos node format.
......@@ -124,9 +78,9 @@ def _runFormat(app):
Launch slapos node format.
"""
logger.info("[BOOT] Invoking slapos node format...")
# '--local' parameter is to prevent node format command to post data to
# master, so this command can work without internet and setup partitions IP.
result = app.run(['node', 'format', '--now', '--local', '--verbose'])
# The '--ignore_network_errors' parameter prevents the node format command from failing if
# the master is offline, so this command can work without internet and set up partition IPs.
result = app.run(['node', 'format', '--now', '--ignore_network_errors', '--verbose'])
if result == 1:
return 0
return 1
......@@ -251,9 +205,7 @@ class BootCommand(ConfigCommand):
while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
# Start computer partition services
_startComputerPartitionList(instance_root, partition_base_name)
_removeTimestamp(instance_root, partition_base_name)
# Check that node can ping master
if valid_ipv4(master_hostname):
......@@ -268,5 +220,3 @@ class BootCommand(ConfigCommand):
while not _runBang(app):
logger.error("[BOOT] Fail to bang, try again in 15 seconds...")
sleep(15)
_removeTimestamp(instance_root, partition_base_name)
......@@ -83,10 +83,10 @@ class FormatCommand(ConfigCommand):
help='Launch slapformat without delay'
' (default: %(default)s)')
ap.add_argument('--local',
ap.add_argument('--ignore_network_errors',
default=False, # can have a default as it is not in .cfg
action="store_true",
help='Keep format data locally, do not post xml to master'
help='Ignore network errors while connecting to slapos master.'
' (default: %(default)s)')
ap.add_argument('-n', '--dry_run',
......
......@@ -1408,9 +1408,13 @@ def do_format(conf):
computer.dump(path_to_xml=conf.computer_xml,
path_to_json=conf.computer_json,
logger=conf.logger)
if not conf.local:
conf.logger.info('Posting information to %r' % conf.master_url)
try:
computer.send(conf)
except slap.exception.ConnectionError as e:
if not conf.ignore_network_errors:
raise
conf.logger.warn('Failed to send information to master: %s' % str(e))
conf.logger.info('slapos successfully prepared the computer.')
......
......@@ -753,7 +753,6 @@ class Partition(object):
updateFile(self.supervisord_partition_configuration_path,
self.supervisor_configuration_group +
self.partition_supervisor_configuration)
self.updateSupervisor()
def generateSupervisorConfigurationFile(self):
"""
......@@ -767,6 +766,7 @@ class Partition(object):
installed, we install it.
"""
partition_id = self.computer_partition.getId()
self.updateSupervisor()
try:
with self.getSupervisorRPC() as supervisor:
supervisor.startProcessGroup(partition_id, False)
......@@ -780,17 +780,10 @@ class Partition(object):
self.logger.info("Requested start of %s..." % self.computer_partition.getId())
def stop(self):
"""Asks supervisord to stop the instance."""
partition_id = self.computer_partition.getId()
try:
with self.getSupervisorRPC() as supervisor:
supervisor.stopProcessGroup(partition_id, False)
except xmlrpclib.Fault as exc:
if exc.faultString.startswith('BAD_NAME:'):
self.logger.info('Partition %s not known in supervisord, ignoring' % partition_id)
else:
raise
else:
"""Remove configuration file and asks supervisord to stop the instance."""
if os.path.exists(self.supervisord_partition_configuration_path):
os.unlink(self.supervisord_partition_configuration_path)
self.updateSupervisor()
self.logger.info("Requested stop of %s..." % self.computer_partition.getId())
def destroy(self):
......
......@@ -62,7 +62,8 @@ from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME
from slapos.util import mkdir_p, chownDirectory, string_to_boolean, listifdir
from slapos.grid.exception import BuildoutFailedError
from slapos.grid.SlapObject import Software, Partition
from slapos.grid.svcbackend import (launchSupervisord,
from slapos.grid.svcbackend import (getSupervisorRPC,
launchSupervisord,
createSupervisordConfiguration,
_getSupervisordConfigurationDirectory,
_getSupervisordSocketPath)
......@@ -553,6 +554,24 @@ stderr_logfile_backups=1
if not self.forbid_supervisord_automatic_launch:
launchSupervisord(instance_root=self.instance_root, logger=self.logger)
def _startComputerPartitionList(self):
  """
  Ask supervisord to start one process group per file found in the
  supervisord configuration directory.

  The process group name is the file name with its '.conf' extension
  removed. An xmlrpclib.Fault raised for one group is logged (at info level
  for 'BAD_NAME:', at error level otherwise) and the loop carries on with
  the next file.
  """
  supervisor_conf_dir = _getSupervisordConfigurationDirectory(self.instance_root)
  conf_suffix = '.conf'
  with getSupervisorRPC(self.supervisord_socket) as supervisor:
    for config_filename in os.listdir(supervisor_conf_dir):
      # str.rstrip('.conf') strips any trailing '.', 'c', 'o', 'n' or 'f'
      # characters rather than the suffix (e.g. 'foo.conf' -> ''), so the
      # exact extension is removed instead.
      if config_filename.endswith(conf_suffix):
        partition_id = config_filename[:-len(conf_suffix)]
      else:
        partition_id = config_filename
      try:
        supervisor.startProcessGroup(partition_id, False)
      except xmlrpclib.Fault as exc:
        if exc.faultString.startswith('BAD_NAME:'):
          self.logger.info("Nothing to start on %s...", partition_id)
        else:
          # Keep going: one failing group must not block the others.
          self.logger.error("Failed to start %s: %s", partition_id, exc)
      else:
        self.logger.info("Requested start of %s...", partition_id)
def getComputerPartitionList(self):
try:
return self.computer.getComputerPartitionList()
......@@ -1429,7 +1448,14 @@ stderr_logfile_backups=1
# Boolean to know if every promises correctly passed
clean_run_promise = True
try:
computer_partition_list = self.getRequiredComputerPartitionList()
except slapos.slap.exception.ConnectionError:
# Network issue: log the exception, start the partitions and exit
self.logger.error(traceback.format_exc())
self._startComputerPartitionList()
self.logger.info('Finished computer partitions.')
return SLAPGRID_FAIL
process_error_partition_list = []
promise_error_partition_list = []
......
......@@ -7,6 +7,7 @@ import os
from .interface import IManager
from six.moves import filter
from zope.interface import implementer
from slapos.grid.utils import updateFile
logger = logging.getLogger(__name__)
......@@ -164,7 +165,15 @@ class Manager(object):
program['command'],
as_user=program['as_user'])
partition.writeSupervisorConfigurationFile()
# Configuration is generated in a different config file so call
# of partition.stop will not delete this file
supervisord_config = os.path.splitext(
partition.supervisord_partition_configuration_path)[0] + \
'-portredir.conf'
updateFile(supervisord_config,
partition.supervisor_configuration_group +
partition.partition_supervisor_configuration)
partition.updateSupervisor()
# Start processes
with partition.getSupervisorRPC() as supervisor:
......
......@@ -6,6 +6,7 @@ import subprocess
from zope.interface import implementer
from slapos.manager import interface
from slapos.grid.utils import updateFile
from slapos.grid.slapgrid import COMPUTER_PARTITION_WAIT_LIST_FILENAME
logger = logging.getLogger(__name__)
......@@ -55,6 +56,17 @@ class Manager(object):
"""
pass
def writeCustomSupervisorConfigurationFile(self, partition):
  """
  Write the partition's supervisord configuration to a dedicated
  '-prerm.conf' file.

  The target path is the partition's regular supervisord configuration path
  with its extension replaced by '-prerm.conf'. This method only writes the
  file; it does not call partition.updateSupervisor().
  """
  base_path, _extension = os.path.splitext(
    partition.supervisord_partition_configuration_path)
  prerm_config_path = base_path + '-prerm.conf'
  prerm_config_content = (partition.supervisor_configuration_group +
                          partition.partition_supervisor_configuration)
  updateFile(prerm_config_path, prerm_config_content)
def report(self, partition):
"""Method called at `slapos node report` phase."""
......@@ -77,7 +89,8 @@ class Manager(object):
partition_id,
wrapper_list,
partition.prerm_path)
partition.writeSupervisorConfigurationFile()
self.writeCustomSupervisorConfigurationFile(partition)
partition.updateSupervisor()
# check the state of every process; if a process is not started yet, start it
with partition.getSupervisorRPC() as supervisor:
......
......@@ -447,21 +447,17 @@ class TestCliBoot(CliMixin):
patch(
'slapos.cli.boot.netifaces.ifaddresses',
return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\
patch('slapos.cli.boot._startComputerPartition', return_value=None) as start_partition,\
patch('slapos.cli.boot.launchSupervisord', return_value=None),\
patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname:
app.run(('node', 'boot'))
# boot command runs as root
check_root_user.assert_called_once()
# Computer partition was started during boot
start_partition.assert_called_once()
# it waits for interface to have an IPv6 address
ifaddresses.assert_called_once_with('interface_name_from_config')
# then ping master hostname to wait for connectivity
_ping_hostname.assert_called_once_with('slap.vifib.com')
# then format and bang
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--local', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--ignore_network_errors', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot'])
# timestamp files have been removed
......@@ -483,7 +479,6 @@ class TestCliBoot(CliMixin):
patch('slapos.cli.boot.netifaces.ifaddresses',
side_effect=[net1, net2, net3]),\
patch('slapos.cli.boot._ping_hostname', return_value=0),\
patch('slapos.cli.boot._startComputerPartitionList', return_value=None) as start_partition,\
patch('slapos.cli.format.check_root_user', return_value=True),\
patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\
patch('slapos.cli.bang.check_root_user', return_value=True),\
......@@ -493,7 +488,6 @@ class TestCliBoot(CliMixin):
app.run(('node', 'boot'))
check_root_user.assert_called_once()
start_partition.assert_called_once()
self.assertEqual(do_format.call_count, 3)
self.assertEqual(do_bang.call_count, 3)
......
......@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase):
def test_no_master(self):
os.mkdir(self.software_root)
os.mkdir(self.instance_root)
self.assertRaises(ConnectionError, self.grid.processComputerPartitionList)
with patch.object(slapos.grid.slapgrid.Slapgrid, '_startComputerPartitionList', return_value=None) as start_cp:
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_FAIL)
self.assertTrue(start_cp.called)
def test_environment_variable_HOME(self):
# When running instance, $HOME is set to the partition path
......@@ -3032,7 +3034,7 @@ exit 0
gid = stat_info.st_gid
supervisor_conf_file = os.path.join(self.instance_root,
'etc/supervisord.conf.d',
'%s.conf' % partition.name)
'%s-prerm.conf' % partition.name)
self.assertTrue(os.path.exists(supervisor_conf_file))
regex_user = r"user=(\d+)"
regex_group = r"group=(\d+)"
......@@ -3114,7 +3116,7 @@ class TestSlapgridWithPortRedirection(MasterMixin, unittest.TestCase):
self.computer = self.getTestComputerClass()(self.software_root, self.instance_root)
self.partition = self.computer.instance_list[0]
self.instance_supervisord_config_path = os.path.join(
self.instance_root, 'etc/supervisord.conf.d/0.conf')
self.instance_root, 'etc/supervisord.conf.d/0-portredir.conf')
self.port_redirect_path = os.path.join(self.partition.partition_path,
slapmanager.portredir.Manager.port_redirect_filename)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment