Commit 445f95a5 authored by Boxiang Sun's avatar Boxiang Sun

monitor: Allow httpd to restart after the server reboots from a power-off

If a server reboots abruptly, a stale httpd PID file is left behind
on the file system, and the OS may reuse the PID recorded in that
file, which causes the httpd service to fail to start even though
no such httpd service is running.

This commit implemented the following logic:
0. If there is no existing PID file, then run the httpd service directly
1. If the PID file exists and contains a PID now used by another (non-httpd)
process, we delete the PID file, then start the httpd service
2. If the PID file exists and contains the PID of another running
httpd service, we let httpd report the "already running" error
normally
parent adec8b28
......@@ -30,11 +30,13 @@ import glob
import hashlib
import json
import os
import psutil
import re
import requests
import shutil
import subprocess
import tempfile
import time
import xml.etree.ElementTree as ET
from cryptography import x509
from cryptography.hazmat.backends import default_backend
......@@ -75,6 +77,145 @@ class ServicesTestCase(SlapOSInstanceTestCase):
self.assertIn(expected_process_name, process_names)
def test_monitor_httpd_normal_reboot(self):
    """Run the monitor-httpd service script while the service is already up.

    The supervised monitor-httpd "on-watch" process is started if needed,
    then the service script found under the partition's ``etc/service`` is
    executed by hand. Its combined stdout/stderr is expected to contain
    "already running". A non-zero exit or a timeout (meaning a second httpd
    actually started) fails the test. The supervised process is stopped
    again at the end of a successful run.
    """

    def find_monitor_httpd(supervisor):
        # Exactly one supervised process is expected to match; the tuple
        # unpacking raises ValueError otherwise.
        info, = [
            i for i in supervisor.getAllProcessInfo()
            if ('monitor-httpd' in i['name']) and ('on-watch' in i['name'])
        ]
        return info

    # Start the monitor-httpd service
    monitor_httpd_process_name = ''
    with self.slap.instance_supervisor_rpc as supervisor:
        info = find_monitor_httpd(supervisor)
        partition = info['group']
        if info['statename'] != "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.startProcess(monitor_httpd_process_name)
            # Poll once per second, for at most 20 seconds.
            for _retries in range(20):
                time.sleep(1)
                info = find_monitor_httpd(supervisor)
                if info['statename'] == "RUNNING":
                    break
            else:
                self.fail(f"the supervisord service '{monitor_httpd_process_name}' is not running")

    # Get the partition path
    partition_path_list = glob.glob(os.path.join(self.slap.instance_directory, '*'))
    for partition_path in partition_path_list:
        if os.path.exists(os.path.join(partition_path, 'etc', 'monitor-httpd.conf')):
            self.partition_path = partition_path
            break

    # Make sure we are focusing the same httpd service
    self.assertIn(partition, self.partition_path)

    # Get the monitor-httpd-service
    monitor_httpd_service_path = glob.glob(os.path.join(
        self.partition_path, 'etc', 'service', 'monitor-httpd*'
    ))[0]

    try:
        output = subprocess.check_output(
            [monitor_httpd_service_path],
            timeout=10,
            stderr=subprocess.STDOUT,
            text=True,
        )
        # If the httpd-monitor service is running
        # and the monitor-httpd.pid contains the identical PID as the service,
        # running the monitor-httpd service again must report "already running"
        self.assertIn("already running", output)
    except subprocess.CalledProcessError as e:
        self.logger.debug(e.output)
        # The exception is passed as a logging argument, so the message
        # needs a placeholder for it.
        self.logger.debug("Unexpected error when running the monitor-httpd service: %s", e)
        self.fail("Unexpected error when running the monitor-httpd service")
    except subprocess.TimeoutExpired as e:
        # Timeout means we ran the httpd service correctly
        # This is not the expected behaviour
        self.logger.debug("Unexpected behaviour: We are not suppose to be able to run the httpd service in the test: %s", e)
        # Kill the process that we started manually
        # Get the pid of the monitor_httpd from the PID file
        monitor_httpd_pid_file = os.path.join(self.partition_path, 'var', 'run', 'monitor-httpd.pid')
        monitor_httpd_pid = ""
        if os.path.exists(monitor_httpd_pid_file):
            with open(monitor_httpd_pid_file, "r") as pid_file:
                monitor_httpd_pid = pid_file.read()
        pid_to_kill = monitor_httpd_pid.strip()
        try:
            subprocess.run(["kill", "-9", pid_to_kill], check=True)
            self.logger.debug(f"Process with PID {pid_to_kill} killed.")
        except subprocess.CalledProcessError as e:
            self.logger.debug(f"Error killing process with PID {pid_to_kill}: {e}")
        self.fail("Unexpected behaviour: We are not suppose to be able to run the httpd service in the test")

    # Stop the supervised monitor-httpd process again.
    with self.slap.instance_supervisor_rpc as supervisor:
        info = find_monitor_httpd(supervisor)
        if info['statename'] == "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.stopProcess(monitor_httpd_process_name)
def test_monitor_httpd_crash_reboot(self):
    """Run the monitor-httpd service script with a stale, reused PID file.

    The supervised monitor-httpd "on-watch" process is stopped, then the
    PID of the current test process is written to ``monitor-httpd.pid`` so
    the file names a live process that is not httpd. The service script is
    then executed by hand: if it is still running after 3 seconds it is
    considered started and is terminated together with its children. The
    test fails if the script reports "already running" or exits within the
    timeout.
    """
    # Get the partition path
    partition_path_list = glob.glob(os.path.join(self.slap.instance_directory, '*'))
    for partition_path in partition_path_list:
        if os.path.exists(os.path.join(partition_path, 'etc', 'monitor-httpd.conf')):
            self.partition_path = partition_path
            break

    # Get the pid file
    monitor_httpd_pid_file = os.path.join(self.partition_path, 'var', 'run', 'monitor-httpd.pid')

    with self.slap.instance_supervisor_rpc as supervisor:
        info, = [
            i for i in supervisor.getAllProcessInfo()
            if ('monitor-httpd' in i['name']) and ('on-watch' in i['name'])
        ]
        if info['statename'] == "RUNNING":
            monitor_httpd_process_name = f"{info['group']}:{info['name']}"
            supervisor.stopProcess(monitor_httpd_process_name)

    # Write the PID of this (live, non-httpd) test process to the pid file.
    with open(monitor_httpd_pid_file, "w") as file:
        file.write(str(os.getpid()))

    # Get the monitor-httpd-service
    monitor_httpd_service_path = glob.glob(os.path.join(
        self.partition_path, 'etc', 'service', 'monitor-httpd*'
    ))[0]

    monitor_httpd_service_is_running = False
    stdout, stderr = '', ''

    # Create the subprocess
    self.logger.debug("Ready to run the process in crash reboot")
    service_process = subprocess.Popen(
        monitor_httpd_service_path,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    try:
        # Wait for the process to finish, but with a timeout
        stdout, stderr = service_process.communicate(timeout=3)
        self.logger.debug("Communicated!")
    except subprocess.TimeoutExpired:
        # The process did not exit within 3 seconds, this means everything is fine.
        monitor_httpd_service_is_running = True
        # Terminate the process we started, together with its children
        try:
            main_process = psutil.Process(service_process.pid)
            child_processes = main_process.children(recursive=True)
            all_processes = child_processes + [main_process]
            for running_process in all_processes:
                running_process.terminate()
            psutil.wait_procs(all_processes)
            self.logger.debug(f"Processes with PID {service_process.pid} and its subprocesses terminated.")
        except psutil.NoSuchProcess as e:
            # This log will generate a ResourceWarning but it is normal in Python 3
            # See https://github.com/giampaolo/psutil/blob/master/psutil/tests/test_process.py#L1526
            self.logger.debug("No process found with PID: %s", e.pid)

    # "httpd (pid 21934) already running" means we failed to start httpd.
    # stderr is captured separately from stdout, so look at both streams.
    if "already running" in stdout or "already running" in stderr:
        self.fail("Unexepected output from the monitor-httpd process: %s" % (stdout or stderr))

    self.assertTrue(monitor_httpd_service_is_running)
class MonitorTestMixin:
monitor_setup_url_key = 'monitor-setup-url'
......
......@@ -51,6 +51,10 @@ filename = monitor.conf.in
[monitor-httpd-cors]
<= monitor-download-base
filename = httpd-cors.cfg.in
[template-monitor-httpd-wrapper]
<= monitor-download-base
filename = template-monitor-httpd-wrapper.sh.in
# End templates files
[monitor-template]
......@@ -82,6 +86,7 @@ context =
raw python_executable ${buildout:executable}
raw python_with_eggs ${buildout:bin-directory}/${monitor-eggs:interpreter}
raw template_wrapper ${monitor-template-wrapper:location}/${monitor-template-wrapper:filename}
raw template_monitor_httpd_wrapper ${template-monitor-httpd-wrapper:location}/${template-monitor-httpd-wrapper:filename}
raw check_disk_space ${buildout:bin-directory}/check-free-disk
raw bin_directory ${buildout:directory}/bin
......
......@@ -14,12 +14,16 @@
# not need these here).
[monitor2-template]
filename = instance-monitor.cfg.jinja2.in
md5sum = 255b4f5f2d960ec958899114cef4cfd9
md5sum = dda9b2355134517dae601cc20709685a
[monitor-httpd-conf]
_update_hash_filename_ = templates/monitor-httpd.conf.in
md5sum = 0540fc5cc439a06079e9e724a5a55a70
[template-monitor-httpd-wrapper]
_update_hash_filename_ = templates/template-monitor-httpd-wrapper.sh.in
md5sum = 45929a22527b71620555326f4dd78c34
[monitor-template-wrapper]
_update_hash_filename_ = templates/wrapper.in
md5sum = e8566c00b28f6f86adde11b6b6371403
......
......@@ -67,9 +67,22 @@ hash-existing-files = ${buildout:directory}/software_release/buildout.cfg
recipe = slapos.cookbook:certificate_authority.request
key-file = ${monitor-httpd-conf-parameter:key-file}
cert-file = ${monitor-httpd-conf-parameter:cert-file}
executable = ${monitor-httpd-wrapper:wrapper-path}
executable = ${monitor-httpd-service-wrapper:output}
wrapper = ${directory:bin}/ca-monitor-httpd
# Render the monitor-httpd service wrapper script from its shell template.
# The rendered script is written to monitor-httpd-service-wrapper under the
# directory:bin location. Its context provides the pid file path, the path of
# the monitor-httpd wrapper, the monitor-httpd configuration path and the
# dash interpreter path.
[monitor-httpd-service-wrapper]
recipe = slapos.recipe.template:jinja2
url = {{ template_monitor_httpd_wrapper }}
output = ${directory:bin}/monitor-httpd-service-wrapper
pid-file = ${monitor-httpd-conf-parameter:pid-file}
monitor-httpd-wrapper-path = ${monitor-httpd-wrapper:wrapper-path}
monitor-httpd-conf = ${monitor-httpd-conf:output}
context =
key pid_file :pid-file
key monitor_httpd_wrapper_path :monitor-httpd-wrapper-path
key monitor_httpd_conf :monitor-httpd-conf
raw dash_binary {{ dash_executable_location }}
[ca-monitor-httpd-service]
recipe = slapos.cookbook:wrapper
command-line = ${ca-monitor-httpd:wrapper}
......
#!{{ dash_binary }}
# BEWARE: This file is operated by slapos node
# BEWARE: It will be overwritten automatically

# Pre-start wrapper for monitor-httpd.
# If the pid file names a live process whose command line does not mention
# our monitor-httpd configuration file, the pid file is removed before
# handing over to the monitor-httpd wrapper. Otherwise the pid file is left
# untouched. All arguments are passed through to the monitor-httpd wrapper.

pid_file="{{ pid_file }}"
monitor_httpd_conf_file="{{ monitor_httpd_conf }}"

if [ -f "$pid_file" ]; then
  pid=$(cat "$pid_file")
  result=$(ps aux | grep "^\S*\s*$pid\s")
  # The process with the specified PID is running
  if [ -n "$result" ]; then
    echo "there is a process running with the same pid"
    # Get the command line of the process and replace null characters with spaces
    cmdline=$(tr '\0' ' ' < "/proc/$pid/cmdline")
    # Match the configuration path literally (the quoted part of a case
    # pattern is not interpreted as a pattern).
    case "$cmdline" in
      *"$monitor_httpd_conf_file"*)
        # The process is using our monitor-httpd.conf: keep the pid file.
        ;;
      *)
        # There is a process running with the pid,
        # but it is not one using our monitor-httpd.conf
        echo "The process is not running with the monitor_httpd_conf"
        rm -f "$pid_file"
        ;;
    esac
  fi
fi

exec "{{ monitor_httpd_wrapper_path }}" "$@"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment