Commit cd4d47d8 authored by Boxiang Sun's avatar Boxiang Sun

monitor: Allow httpd to restart after the server reboots from a power-off

If a server reboots abruptly, a stale httpd PID file is left behind
in the file system, and the OS may reuse the PID recorded in that
file, which causes the httpd service to fail to start even though
no such httpd service is running.

This commit implements the following logic:
0. If there is no existing PID file, run the httpd service directly.
1. If the PID file exists and contains a PID used by another process,
we delete the PID file, then start the httpd service.
2. If the PID file exists and contains the PID of a running httpd
service, we let httpd report the "already running" error
normally.
parent adec8b28
...@@ -30,11 +30,13 @@ import glob ...@@ -30,11 +30,13 @@ import glob
import hashlib import hashlib
import json import json
import os import os
import psutil
import re import re
import requests import requests
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
import time
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from cryptography import x509 from cryptography import x509
from cryptography.hazmat.backends import default_backend from cryptography.hazmat.backends import default_backend
...@@ -75,6 +77,141 @@ class ServicesTestCase(SlapOSInstanceTestCase): ...@@ -75,6 +77,141 @@ class ServicesTestCase(SlapOSInstanceTestCase):
self.assertIn(expected_process_name, process_names) self.assertIn(expected_process_name, process_names)
def test_monitor_httpd_normal_reboot(self):
    """Re-running the monitor-httpd service script while httpd is up must
    print "already running" and return instead of starting a second httpd.

    The test makes sure the supervisord-managed monitor-httpd process is
    RUNNING, runs the partition's ``etc/service/monitor-httpd*`` script by
    hand, checks its output, and finally stops the supervisord process.
    """

    def find_monitor_httpd(supervisor):
        # Exactly one supervisord process is expected to match; the
        # single-element unpacking raises ValueError otherwise.
        info, = [
            i for i in supervisor.getAllProcessInfo()
            if ('monitor-httpd' in i['name']) and ('on-watch' in i['name'])
        ]
        return info

    # Start the monitor-httpd service
    with self.slap.instance_supervisor_rpc as supervisor:
        info = find_monitor_httpd(supervisor)
        partition = info['group']
        if info['statename'] != "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.startProcess(monitor_httpd_process_name)
            # Poll once per second, for at most 20 seconds.
            for _retries in range(20):
                time.sleep(1)
                info = find_monitor_httpd(supervisor)
                if info['statename'] == "RUNNING":
                    break
            else:
                self.fail(f"the supervisord service '{monitor_httpd_process_name}' is not running")

    # Get the partition path: the first partition that ships a
    # monitor-httpd.conf
    for partition_path in glob.glob(os.path.join(self.slap.instance_directory, '*')):
        if os.path.exists(os.path.join(partition_path, 'etc', 'monitor-httpd.conf')):
            self.partition_path = partition_path
            break
    else:
        self.fail("No partition containing etc/monitor-httpd.conf was found")

    # Make sure we are focusing the same httpd service
    self.assertIn(partition, self.partition_path)

    # Get the monitor-httpd-service
    monitor_httpd_service_paths = glob.glob(os.path.join(
        self.partition_path, 'etc', 'service', 'monitor-httpd*'
    ))
    self.assertTrue(
        monitor_httpd_service_paths,
        "No monitor-httpd service script found in the partition")
    monitor_httpd_service_path = monitor_httpd_service_paths[0]

    try:
        output = subprocess.check_output(
            [monitor_httpd_service_path],
            timeout=10, stderr=subprocess.STDOUT, text=True)
    except subprocess.CalledProcessError as e:
        self.logger.debug("Unexpected error when running the monitor-httpd service: %s", e)
        self.fail("Unexpected error when running the monitor-httpd service")
    except subprocess.TimeoutExpired as e:
        # A timeout means the script kept running, i.e. a second httpd was
        # started. This is not the expected behaviour.
        self.logger.debug(
            "Unexpected behaviour: We are not suppose to be able to run the httpd service in the test: %s",
            e)
        # Kill the process that we started manually, using the PID that is
        # recorded in the PID file.
        monitor_httpd_pid_file = os.path.join(self.partition_path, 'var', 'run', 'monitor-httpd.pid')
        if os.path.exists(monitor_httpd_pid_file):
            with open(monitor_httpd_pid_file, "r") as pid_file:
                pid_to_kill = pid_file.read().strip()
            # An empty PID file gives nothing to kill.
            if pid_to_kill:
                try:
                    subprocess.run(["kill", "-9", pid_to_kill], check=True)
                    self.logger.debug("Process with PID %s killed.", pid_to_kill)
                except subprocess.CalledProcessError as kill_error:
                    self.logger.debug(
                        "Error killing process with PID %s: %s", pid_to_kill, kill_error)
        self.fail("Unexpected behaviour: We are not suppose to be able to run the httpd service in the test")
    else:
        # The httpd-monitor service is running and monitor-httpd.pid holds
        # its PID, so running the service script again must report the
        # "already running" error.
        self.assertIn("already running", output)

    # Stop the supervisord-managed monitor-httpd process if it is running.
    with self.slap.instance_supervisor_rpc as supervisor:
        info = find_monitor_httpd(supervisor)
        if info['statename'] == "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.stopProcess(monitor_httpd_process_name)
def test_monitor_httpd_crash_reboot(self):
    """The monitor-httpd service script must start httpd even when the PID
    file holds the PID of an unrelated, running process.

    The test stops the supervisord-managed monitor-httpd process, writes
    the test runner's own PID into ``var/run/monitor-httpd.pid``, then runs
    the partition's ``etc/service/monitor-httpd*`` script. The script is
    considered started when it is still running after 3 seconds; it is
    then terminated together with its child processes.
    """
    # Get the partition path: the first partition that ships a
    # monitor-httpd.conf
    for partition_path in glob.glob(os.path.join(self.slap.instance_directory, '*')):
        if os.path.exists(os.path.join(partition_path, 'etc', 'monitor-httpd.conf')):
            self.partition_path = partition_path
            break
    else:
        self.fail("No partition containing etc/monitor-httpd.conf was found")

    # Get the pid file
    monitor_httpd_pid_file = os.path.join(self.partition_path, 'var', 'run', 'monitor-httpd.pid')

    with self.slap.instance_supervisor_rpc as supervisor:
        # Exactly one supervisord process is expected to match; the
        # single-element unpacking raises ValueError otherwise.
        info, = [
            i for i in supervisor.getAllProcessInfo()
            if ('monitor-httpd' in i['name']) and ('on-watch' in i['name'])
        ]
        if info['statename'] == "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.stopProcess(monitor_httpd_process_name)

    # Write the PID of this test process to the pid file: it is alive for
    # the whole test and is not an httpd.
    with open(monitor_httpd_pid_file, "w") as file:
        file.write(str(os.getpid()))

    # Get the monitor-httpd-service
    monitor_httpd_service_paths = glob.glob(os.path.join(
        self.partition_path, 'etc', 'service', 'monitor-httpd*'
    ))
    self.assertTrue(
        monitor_httpd_service_paths,
        "No monitor-httpd service script found in the partition")
    monitor_httpd_service_path = monitor_httpd_service_paths[0]

    monitor_httpd_service_is_running = False
    stdout = ''

    # Create the subprocess. The context manager closes both pipes and
    # reaps the child when the block is left.
    self.logger.debug("Ready to run the process in crash reboot")
    with subprocess.Popen(
            monitor_httpd_service_path,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) as process:
        try:
            # Wait for the process to finish, but with a timeout
            stdout, _stderr = process.communicate(timeout=3)
            self.logger.debug("Communicated!")
        except subprocess.TimeoutExpired:
            # Still running after 3 seconds: the service started.
            monitor_httpd_service_is_running = True
            # Terminate the script and every process it spawned.
            try:
                main_process = psutil.Process(process.pid)
                process_tree = main_process.children(recursive=True) + [main_process]
            except psutil.NoSuchProcess:
                self.logger.debug("No process found with PID: %s", process.pid)
                process_tree = []
            for running_process in process_tree:
                try:
                    running_process.terminate()
                except psutil.NoSuchProcess:
                    # Already gone; keep terminating the rest of the tree.
                    self.logger.debug(
                        "No process found with PID: %s", running_process.pid)
            psutil.wait_procs(process_tree)
            self.logger.debug(
                "Processes with PID %s and its subprocesses terminated.", process.pid)

    # "httpd (pid 21934) already running" means we start httpd failed
    if "already running" in stdout:
        self.fail("Unexepected output from the monitor-httpd process: %s" % stdout)

    self.assertTrue(monitor_httpd_service_is_running)
class MonitorTestMixin: class MonitorTestMixin:
monitor_setup_url_key = 'monitor-setup-url' monitor_setup_url_key = 'monitor-setup-url'
......
...@@ -51,6 +51,10 @@ filename = monitor.conf.in ...@@ -51,6 +51,10 @@ filename = monitor.conf.in
[monitor-httpd-cors] [monitor-httpd-cors]
<= monitor-download-base <= monitor-download-base
filename = httpd-cors.cfg.in filename = httpd-cors.cfg.in
[template-monitor-httpd-wrapper]
<= monitor-download-base
filename = template-monitor-httpd-wrapper.sh.in
# End templates files # End templates files
[monitor-template] [monitor-template]
...@@ -82,6 +86,7 @@ context = ...@@ -82,6 +86,7 @@ context =
raw python_executable ${buildout:executable} raw python_executable ${buildout:executable}
raw python_with_eggs ${buildout:bin-directory}/${monitor-eggs:interpreter} raw python_with_eggs ${buildout:bin-directory}/${monitor-eggs:interpreter}
raw template_wrapper ${monitor-template-wrapper:location}/${monitor-template-wrapper:filename} raw template_wrapper ${monitor-template-wrapper:location}/${monitor-template-wrapper:filename}
raw template_monitor_httpd_wrapper ${template-monitor-httpd-wrapper:location}/${template-monitor-httpd-wrapper:filename}
raw check_disk_space ${buildout:bin-directory}/check-free-disk raw check_disk_space ${buildout:bin-directory}/check-free-disk
raw bin_directory ${buildout:directory}/bin raw bin_directory ${buildout:directory}/bin
......
...@@ -14,12 +14,16 @@ ...@@ -14,12 +14,16 @@
# not need these here). # not need these here).
[monitor2-template] [monitor2-template]
filename = instance-monitor.cfg.jinja2.in filename = instance-monitor.cfg.jinja2.in
md5sum = 255b4f5f2d960ec958899114cef4cfd9 md5sum = dda9b2355134517dae601cc20709685a
[monitor-httpd-conf] [monitor-httpd-conf]
_update_hash_filename_ = templates/monitor-httpd.conf.in _update_hash_filename_ = templates/monitor-httpd.conf.in
md5sum = 0540fc5cc439a06079e9e724a5a55a70 md5sum = 0540fc5cc439a06079e9e724a5a55a70
[template-monitor-httpd-wrapper]
_update_hash_filename_ = templates/template-monitor-httpd-wrapper.sh.in
md5sum = 45929a22527b71620555326f4dd78c34
[monitor-template-wrapper] [monitor-template-wrapper]
_update_hash_filename_ = templates/wrapper.in _update_hash_filename_ = templates/wrapper.in
md5sum = e8566c00b28f6f86adde11b6b6371403 md5sum = e8566c00b28f6f86adde11b6b6371403
......
...@@ -67,9 +67,22 @@ hash-existing-files = ${buildout:directory}/software_release/buildout.cfg ...@@ -67,9 +67,22 @@ hash-existing-files = ${buildout:directory}/software_release/buildout.cfg
recipe = slapos.cookbook:certificate_authority.request recipe = slapos.cookbook:certificate_authority.request
key-file = ${monitor-httpd-conf-parameter:key-file} key-file = ${monitor-httpd-conf-parameter:key-file}
cert-file = ${monitor-httpd-conf-parameter:cert-file} cert-file = ${monitor-httpd-conf-parameter:cert-file}
executable = ${monitor-httpd-wrapper:wrapper-path} executable = ${monitor-httpd-service-wrapper:output}
wrapper = ${directory:bin}/ca-monitor-httpd wrapper = ${directory:bin}/ca-monitor-httpd
[monitor-httpd-service-wrapper]
recipe = slapos.recipe.template:jinja2
url = {{ template_monitor_httpd_wrapper }}
output = ${directory:bin}/monitor-httpd-service-wrapper
pid-file = ${monitor-httpd-conf-parameter:pid-file}
monitor-httpd-wrapper-path = ${monitor-httpd-wrapper:wrapper-path}
monitor-httpd-conf = ${monitor-httpd-conf:output}
context =
key pid_file :pid-file
key monitor_httpd_wrapper_path :monitor-httpd-wrapper-path
key monitor_httpd_conf :monitor-httpd-conf
raw dash_binary {{ dash_executable_location }}
[ca-monitor-httpd-service] [ca-monitor-httpd-service]
recipe = slapos.cookbook:wrapper recipe = slapos.cookbook:wrapper
command-line = ${ca-monitor-httpd:wrapper} command-line = ${ca-monitor-httpd:wrapper}
......
#!{{ dash_binary }}
# BEWARE: This file is operated by slapos node
# BEWARE: It will be overwritten automatically

# Remove a stale monitor-httpd PID file before handing over to the real
# monitor-httpd wrapper. The PID file is removed when the PID it records is
# missing/invalid, or belongs to a running process whose command line does
# not mention our monitor-httpd configuration file. In every other case the
# file is left alone and httpd decides by itself what to report.

pid_file="{{ pid_file }}"
monitor_httpd_conf_file="{{ monitor_httpd_conf }}"

if [ -f "$pid_file" ]; then
  pid=$(cat "$pid_file")
  case "$pid" in
    ''|*[!0-9]*)
      # Empty or non-numeric content cannot designate a running httpd.
      echo "The pid file does not contain a valid pid"
      rm -f "$pid_file"
      ;;
    *)
      # The process with the specified PID is running
      if [ -d "/proc/$pid" ]; then
        echo "there is a process running with the same pid"
        # Get the command line of the process and replace null characters
        # with spaces. The process may exit in between, in which case the
        # command line is empty and the PID file is treated as stale.
        cmdline=$(cat "/proc/$pid/cmdline" 2>/dev/null | tr '\0' ' ')
        # Literal substring match: the configuration path must not be
        # interpreted as a regular expression.
        case "$cmdline" in
          *"$monitor_httpd_conf_file"*)
            ;;
          *)
            # There is a process running with the pid,
            # but it is not one using our monitor-httpd.conf
            echo "The process is not running with the monitor_httpd_conf"
            rm -f "$pid_file"
            ;;
        esac
      fi
      ;;
  esac
fi

exec "{{ monitor_httpd_wrapper_path }}" "$@"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment