diff --git a/setup.py b/setup.py
index 5eb7ff1946dd10c5ec84755cdd8f95118249e366..1358dbad04f264310d19a6ab833852fc6e11c30c 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,7 @@ setup(name=name,
           'slapproxy = slapos.proxy:main',
           'bang = slapos.bang:main',
           'slapos = slapos.entry:main',
-          'watchdog = slapos.grid.watchdog:main',
+          'slapos-watchdog = slapos.grid.watchdog:main',
         ]
       },
       test_suite="slapos.tests",
diff --git a/slapos/tests/slapgrid.py b/slapos/tests/slapgrid.py
index 8e89dd1dc05afcb84d94bff5aa6dca87b7e87023..fa434244bdefafd511f9cabcf8d5f0fc87fa6f35 100644
--- a/slapos/tests/slapgrid.py
+++ b/slapos/tests/slapgrid.py
@@ -32,6 +32,7 @@ import os
 import shutil
 import signal
 import slapos.slap.slap
+from slapos.grid.watchdog import Watchdog, getWatchdogID
 import socket
 import sys
 import tempfile
@@ -40,6 +41,33 @@ import unittest
 import urlparse
 import xml_marshaller
 
+
+WATCHDOG_TEMPLATE = """#!%(python_path)s -S
+
+import sys
+sys.path=%(sys_path)s
+import slapos.slap.slap
+import slapos.grid.watchdog
+
+def setBang():
+  def getBang():
+    def bang(self_partition,message):
+      report = ""
+      for key in self_partition.__dict__:
+        report += (key + ': ' + str(self_partition.__dict__[key]) + ' ')
+        if key == '_connection_helper':
+          for el in self_partition.__dict__[key].__dict__:
+            report += (' ' + el +': ' +
+                       str(self_partition.__dict__[key].__dict__[el]) + ' ')
+      report += message
+      open('%(watchdog_banged)s','w').write(report)
+    return bang
+  slapos.slap.ComputerPartition.bang = getBang()
+
+setBang()
+slapos.grid.watchdog.main()
+"""
+
 WRAPPER_CONTENT = """#!/bin/sh
 touch worked &&
 mkdir -p etc/run &&
@@ -48,6 +76,19 @@ echo "while :; do echo "Working\\nWorking\\n" ; sleep 0.1; done" >> etc/run/wrap
 chmod 755 etc/run/wrapper
 """
 
+DAEMON_CONTENT = """#!/bin/sh
+mkdir -p etc/service &&
+echo "#!/bin/sh" > etc/service/daemon &&
+echo "touch launched
+if [ -f ./crashed ]; then
+while :; do echo "Working\\nWorking\\n" ; sleep 0.1; done
+else
+touch ./crashed; echo "Failing\\nFailing\\n"; sleep 1; return 111;
+fi" >> etc/service/daemon &&
+chmod 755 etc/service/daemon &&
+touch worked
+"""
+
 class BasicMixin:
   def assertSortedListEqual(self, list1, list2, msg=None):
     self.assertListEqual(sorted(list1), sorted(list2), msg)
@@ -241,6 +282,8 @@ class ComputerForTest:
         if parsed_url.path == 'destroyedComputerPartition':
           instance.state = 'destroyed'
           return (200, {}, '')
+        if parsed_url.path == 'softwareInstanceBang':
+          return (200, {}, '')
         if parsed_url.path == 'softwareInstanceError':
           instance.error_log = '\n'.join([line for line \
                                    in parsed_qs['error_log'][0].splitlines()
@@ -569,6 +612,185 @@ chmod 755 etc/run/wrapper
     self.assertEqual('stopped', instance.state)
 
 
+class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):
+  """Check that watchdog bangs only failing services it is watching."""
+
+  def test_one_failing_daemon_in_service_will_bang_with_watchdog(self):
+    """
+    Check that a failing service watched by watchdog triggers bang
+    1.Prepare computer and set a service named daemon in etc/service
+       (to be watched by watchdog). This daemon will fail.
+    2.Prepare file for supervisord to call watchdog
+       -Set sys.path
+       -Monkeypatch computer partition bang
+    3.Check daemon is launched
+    4.Wait for it to fail
+    5.Wait for file generated by monkeypatched bang to appear
+    """
+    computer = ComputerForTest(self.software_root,self.instance_root)
+    partition = computer.instance_list[0]
+    partition.requested_state = 'started'
+    partition.software.setBuildout(DAEMON_CONTENT)
+    # Prepare watchdog
+    watchdog_path = os.path.join(self._tempdir,'watchdog')
+    watchdog_banged = os.path.join(self._tempdir,'watchdog_banged')
+    # Close the script explicitly instead of relying on garbage collection.
+    with open(watchdog_path,'w') as watchdog_file:
+      watchdog_file.write(
+        WATCHDOG_TEMPLATE % dict(python_path=sys.executable,
+                                 sys_path=sys.path,
+                                 watchdog_banged=watchdog_banged))
+    os.chmod(watchdog_path,0o755)
+    self.grid.watchdog_path = watchdog_path
+    self.assertTrue(self.grid.processComputerPartitionList())
+    self.assertSortedListEqual(os.listdir(self.instance_root), ['0', 'etc',
+      'var'])
+    self.assertSortedListEqual(os.listdir(partition.partition_path),
+                               ['.0_daemon.log','worked', 'buildout.cfg', 'etc'])
+    tries = 10
+    daemon_log = os.path.join(partition.partition_path, '.0_daemon.log')
+    while tries > 0:
+      tries -= 1
+      if os.path.getsize(daemon_log) > 0:
+        break
+      time.sleep(0.2)
+    self.assertTrue('Failing' in open(daemon_log, 'r').read())
+    tries = 25
+    while tries > 0:
+      tries -= 1
+      if os.path.exists(watchdog_banged):
+        break
+      time.sleep(0.2)
+    self.assertTrue(os.path.exists(watchdog_banged))
+    self.assertTrue('daemon' in open(watchdog_banged,'r').read())
+
+  RUN_CONTENT = """#!/bin/sh
+mkdir -p etc/run &&
+echo "#!/bin/sh" > etc/run/daemon &&
+echo "touch launched
+touch ./crashed; echo "Failing\\nFailing\\n"; sleep 1; return 111;
+" >> etc/run/daemon &&
+chmod 755 etc/run/daemon &&
+touch worked
+"""
+
+  def test_one_failing_daemon_in_run_will_not_bang_with_watchdog(self):
+    """
+    Check that a failing service not watched by watchdog does not trigger bang
+    1.Prepare computer and set a service named daemon in etc/run
+       (not watched by watchdog). This daemon will fail.
+    2.Prepare file for supervisord to call watchdog
+       -Set sys.path
+       -Monkeypatch computer partition bang
+    3.Check daemon is launched
+    4.Wait for it to fail
+    5.Check that file generated by monkeypatched bang does not appear
+    """
+    computer = ComputerForTest(self.software_root,self.instance_root)
+    partition = computer.instance_list[0]
+    partition.requested_state = 'started'
+    partition.software.setBuildout(self.RUN_CONTENT)
+    # Prepare watchdog
+    watchdog_path = os.path.join(self._tempdir,'watchdog')
+    watchdog_banged = os.path.join(self._tempdir,'watchdog_banged')
+    # Close the script explicitly instead of relying on garbage collection.
+    with open(watchdog_path,'w') as watchdog_file:
+      watchdog_file.write(
+        WATCHDOG_TEMPLATE % dict(python_path=sys.executable,
+                                 sys_path=sys.path,
+                                 watchdog_banged=watchdog_banged))
+    os.chmod(watchdog_path,0o755)
+    self.grid.watchdog_path = watchdog_path
+    self.assertTrue(self.grid.processComputerPartitionList())
+    self.assertSortedListEqual(os.listdir(self.instance_root), ['0', 'etc',
+      'var'])
+    self.assertSortedListEqual(os.listdir(partition.partition_path),
+                               ['.0_daemon.log','worked', 'buildout.cfg', 'etc'])
+    tries = 10
+    daemon_log = os.path.join(partition.partition_path, '.0_daemon.log')
+    while tries > 0:
+      tries -= 1
+      if os.path.getsize(daemon_log) > 0:
+        break
+      time.sleep(0.2)
+    self.assertTrue('Failing' in open(daemon_log, 'r').read())
+    # Poll for the whole window: the bang file must never show up.
+    tries = 25
+    while tries > 0:
+      tries -= 1
+      if os.path.exists(watchdog_banged):
+        break
+      time.sleep(0.2)
+    self.assertFalse(os.path.exists(watchdog_banged))
+
+
+  def test_watched_by_watchdog_bang(self):
+    """
+    Test that a process going to fatal or exited mode in supervisord
+    is banged if watched by watchdog
+    (ie: watchdog id in process name)
+    """
+    computer = ComputerForTest(self.software_root,self.instance_root)
+    instance = computer.instance_list[0]
+
+    watchdog = Watchdog(dict(master_url=self.master_url,
+                             computer_id=self.computer_id,
+                             key_file=None,
+                             cert_file=None))
+    for event in watchdog.process_state_events:
+      instance.sequence = []
+      headers = dict(eventname=event)
+      payload = "processname:%s groupname:%s from_state:RUNNING"\
+          % ('daemon'+getWatchdogID(),instance.name)
+      watchdog.handle_event(headers,payload)
+      self.assertEqual(instance.sequence,['softwareInstanceBang'])
+
+  def test_unwanted_events_will_not_bang(self):
+    """
+    Test that a process going to a mode not watched by watchdog
+    in supervisord is not banged if watched by watchdog
+    """
+    computer = ComputerForTest(self.software_root,self.instance_root)
+    instance = computer.instance_list[0]
+
+    watchdog = Watchdog(dict(master_url=self.master_url,
+                             computer_id=self.computer_id,
+                             key_file=None,
+                             cert_file=None))
+    for event in ['EVENT', 'PROCESS_STATE', 'PROCESS_STATE_RUNNING',
+                  'PROCESS_STATE_BACKOFF', 'PROCESS_STATE_STOPPED']:
+      # Bang requests are recorded on the instance, so reset and check it.
+      instance.sequence = []
+      headers = dict(eventname=event)
+      payload = "processname:%s groupname:%s from_state:RUNNING"\
+          % ('daemon'+getWatchdogID(),instance.name)
+      watchdog.handle_event(headers,payload)
+      self.assertEqual(instance.sequence,[])
+
+
+  def test_not_watched_by_watchdog_do_not_bang(self):
+    """
+    Test that a process going to fatal or exited mode in supervisord
+    is not banged if not watched by watchdog
+    (ie: no watchdog id in process name)
+    """
+    computer = ComputerForTest(self.software_root,self.instance_root)
+    instance = computer.instance_list[0]
+
+    watchdog = Watchdog(dict(master_url=self.master_url,
+                             computer_id=self.computer_id,
+                             key_file=None,
+                             cert_file=None))
+    for event in watchdog.process_state_events:
+      # Bang requests are recorded on the instance, so reset and check it.
+      instance.sequence = []
+      headers = dict(eventname=event)
+      payload = "processname:%s groupname:%s from_state:RUNNING"\
+          % ('daemon',instance.name)
+      watchdog.handle_event(headers,payload)
+      self.assertEqual(instance.sequence,[])
+
+
 class TestSlapgridCPPartitionProcessing (MasterMixin, unittest.TestCase):
 
   def test_partition_timestamp(self):