From 5f92ce8a74e0affec0f0b54b5398a2022b02458f Mon Sep 17 00:00:00 2001 From: Rafael Monnerat <rafael@nexedi.com> Date: Mon, 29 May 2017 19:15:03 +0200 Subject: [PATCH] apache-mpm_watchdog: Initial commit Implement a watchdog to kill processes which are held in a deadlocked state after a graceful reload. --- .gitignore | 1 + setup.py | 1 + .../promise/apache_mpm_watchdog/__init__.py | 85 ++++++++++++++++ slapos/test/promise/data/corrupted_db.json | 0 slapos/test/promise/data/server_status.html | 86 +++++++++++++++++ slapos/test/promise/data/test_db.json | 1 + .../test/promise/test_apache_mpm_watchdog.py | 96 +++++++++++++++++++ 7 files changed, 270 insertions(+) create mode 100644 slapos/promise/apache_mpm_watchdog/__init__.py create mode 100644 slapos/test/promise/data/corrupted_db.json create mode 100644 slapos/test/promise/data/server_status.html create mode 100644 slapos/test/promise/data/test_db.json create mode 100644 slapos/test/promise/test_apache_mpm_watchdog.py diff --git a/.gitignore b/.gitignore index 61ba889..ebc0657 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ .eggs/ config.json slapos/test/promise/data/SOFTINST-0_* +slapos/test/promise/data/write_db.json diff --git a/setup.py b/setup.py index 8fe54ec..b10e3e2 100644 --- a/setup.py +++ b/setup.py @@ -69,6 +69,7 @@ setup(name=name, entry_points={ 'console_scripts': [ 'agent = slapos.agent.agent:main', + 'apache-mpm-watchdog = slapos.promise.apache_mpm_watchdog:main', 'check-web-page-http-cache-hit = slapos.promise.check_web_page_http_cache_hit:main', 'check-feed-as-promise = slapos.checkfeedaspromise:main', 'check-error-on-apache-log = slapos.promise.check_error_on_apache_log:main', diff --git a/slapos/promise/apache_mpm_watchdog/__init__.py b/slapos/promise/apache_mpm_watchdog/__init__.py new file mode 100644 index 0000000..731c006 --- /dev/null +++ b/slapos/promise/apache_mpm_watchdog/__init__.py @@ -0,0 +1,85 @@ +import requests +import re +import signal +import os +import psutil +import 
import argparse
import json
import time

# Pattern applied to the HTML of Apache's server-status page. It captures the
# content of the table cell that immediately precedes a cell reading
# "yes (old gen)", i.e. the PID of a process that is stopping but belongs to
# a previous configuration generation.
search_pid_regex = r"</td><td.*?>(.+?)</td><td>yes \(old gen\)</td>"

# Seconds an old-generation process is tolerated before it gets killed.
DEFAULT_TIMEOUT = 600


def loadJSONFile(db_path):
  """Load the PID -> deadline mapping stored at ``db_path``.

  Returns an empty dict when ``db_path`` is None, when the file does not
  exist, when it does not contain valid JSON, or when the JSON document is
  not an object. The watchdog then simply starts from a clean state.
  """
  # ``--db`` is optional, so None must be accepted here exactly like it is
  # accepted by writeJSONFile().
  if db_path is None or not os.path.exists(db_path):
    return {}

  with open(db_path) as json_file:
    try:
      content = json.load(json_file)
    except ValueError:
      # Empty or corrupted database.
      return {}

  if not isinstance(content, dict):
    return {}
  return content


def writeJSONFile(pid_dict, db_path):
  """Persist ``pid_dict`` as JSON into ``db_path``.

  Entries whose PID no longer designates a running process are removed from
  ``pid_dict`` (in place) before writing. Nothing happens when ``db_path``
  is None.
  """
  if db_path is None:
    # No place to save
    return

  # Iterate over a snapshot of the keys because entries are deleted while
  # looping.
  for pid in list(pid_dict):
    try:
      psutil.Process(int(pid))
    except psutil.NoSuchProcess:
      del pid_dict[pid]

  with open(db_path, "w") as f:
    f.write(json.dumps(pid_dict))


def getServerStatus(url, user, password, request_timeout=60):
  """Fetch the server-status page and return its text.

  ``user``/``password`` are sent as HTTP basic authentication when ``user``
  is not None. ``request_timeout`` (seconds) bounds the HTTP request so the
  watchdog cannot hang on the server it is supposed to watch.

  Returns None when the request fails or the answer is not a 200.
  """
  auth = (user, password) if user is not None else None
  try:
    response = requests.get(url, auth=auth, timeout=request_timeout)
  except requests.exceptions.RequestException:
    # Covers connection errors as well as the timeouts introduced by
    # ``request_timeout``.
    return None

  if response.status_code == 200:
    return response.text
  return None


def watchServerStatus(pid_dict, server_status, timeout=DEFAULT_TIMEOUT):
  """Record old-generation httpd processes and kill the expired ones.

  ``pid_dict`` maps a PID (as string) to the timestamp after which the
  process may be killed. Every old-generation PID found in
  ``server_status`` that is not known yet is registered with a deadline of
  now + ``timeout`` seconds; known PIDs whose deadline has passed receive a
  SIGKILL.

  ``pid_dict`` is updated in place and returned, so the caller can persist
  the deadlines for the next run.
  """
  now = time.time()
  for pid in re.findall(search_pid_regex, server_status):
    try:
      process = psutil.Process(int(pid))
      cmdline = process.cmdline()
    except (ValueError, psutil.Error):
      # Not a number, already gone, or not inspectable: nothing to watch.
      continue

    # Ensure the process is actually an apache
    if not cmdline or not cmdline[0].endswith("/httpd"):
      continue

    deadline = pid_dict.setdefault(pid, now + timeout)
    if deadline < now:
      print("Sending signal -%d to %s" % (int(signal.SIGKILL), pid))
      try:
        process.kill()
      except psutil.NoSuchProcess:
        # The process exited between the lookup and the kill.
        pass

  return pid_dict


def main():
  """Command line entry point of the apache-mpm-watchdog script."""
  parser = argparse.ArgumentParser()
  parser.add_argument("-u", "--url", required=True,
                      help="URL of the Apache server-status page")
  # "-u" is already taken by --url; registering it twice makes argparse
  # fail at start-up.
  parser.add_argument("-l", "--user",
                      help="user name for HTTP basic authentication")
  parser.add_argument("-p", "--password",
                      help="password for HTTP basic authentication")
  parser.add_argument("-d", "--db",
                      help="JSON file used to remember the watched PIDs")
  parser.add_argument("-t", "--timeout", type=float, default=DEFAULT_TIMEOUT,
                      help="seconds before an old generation process is "
                           "killed")
  args = parser.parse_args()

  pid_dict = loadJSONFile(args.db)

  server_status = getServerStatus(
      args.url, args.user, args.password)

  if server_status is None:
    raise ValueError("Couldn't connect to server status page")

  pid_dict = watchServerStatus(pid_dict, server_status, args.timeout)

  writeJSONFile(pid_dict, args.db)
a/slapos/test/promise/data/corrupted_db.json b/slapos/test/promise/data/corrupted_db.json new file mode 100644 index 0000000..e69de29 diff --git a/slapos/test/promise/data/server_status.html b/slapos/test/promise/data/server_status.html new file mode 100644 index 0000000..02ba9d8 --- /dev/null +++ b/slapos/test/promise/data/server_status.html @@ -0,0 +1,86 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<html><head> +<title>Apache Status</title> +</head><body> +<h1>Apache Server Status </h1> + +<dl><dt>Server Version: Apache/2.4.25 (Unix) OpenSSL/1.0.2k mod_antiloris/0.5.1</dt> +<dt>Server MPM: event</dt> +<dt>Server Built: Apr 19 2017 12:41:47 +</dt></dl><hr /><dl> +<dt>Current Time: Wednesday, 31-May-2017 14:57:52 CEST</dt> +<dt>Restart Time: Friday, 26-May-2017 00:24:30 CEST</dt> +<dt>Parent Server Config. Generation: 498</dt> +<dt>Parent Server MPM Generation: 497</dt> +<dt>Server uptime: 5 days 14 hours 33 minutes 22 seconds</dt> +<dt>Server load: 2.32 1.98 1.94</dt> +<dt>Total accesses: 26754245 - Total Traffic: 249.0 GB</dt> +<dt>CPU Usage: u7129.39 s1016.88 cu0 cs0 - 1.68% CPU load</dt> +<dt>55.2 requests/sec - 0.5 MB/second - 9.8 kB/request</dt> +<dt>8 requests currently being processed, 117 idle workers</dt> +</dl> + +<table rules="all" cellpadding="1%"> +<tr><th rowspan="2">Slot</th><th rowspan="2">PID</th><th rowspan="2">Stopping</th><th colspan="2">Connections</th> +<th colspan="2">Threads</th><th colspan="3">Async connections</th></tr> +<tr><th>total</th><th>accepting</th><th>busy</th><th>idle</th><th>writing</th><th>keep-alive</th><th>closing</th></tr> +<tr><td>0</td><td>12345</td><td>yes (old gen)</td><td>3</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr> +<tr><td>1</td><td>12346</td><td>yes (old gen)</td><td>3</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr> +<tr><td>2</td><td>24443</td><td>no</td><td>8</td><td>yes</td><td>1</td><td>24</td><td>0</td><td>1</td><td>7</td></tr> 
+<tr><td>5</td><td>23019</td><td>no</td><td>11</td><td>yes</td><td>1</td><td>24</td><td>0</td><td>3</td><td>7</td></tr> +<tr><td>8</td><td>12348</td><td>yes (old gen)</td><td>1</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr> +<tr><td>9</td><td>23032</td><td>no</td><td>18</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>2</td><td>14</td></tr> +<tr><td>10</td><td>23053</td><td>no</td><td>17</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>2</td><td>14</td></tr> +<tr><td>11</td><td>23118</td><td>no</td><td>13</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>7</td><td>3</td></tr> +<tr><td>Sum</td><td>8</td><td>3</td><td>74</td><td> </td><td>8</td><td>117</td><td>0</td><td>15</td><td>45</td></tr> +</table> +<pre>..............G..G.G.............G.......G.......G___R__________ +___________.................................................._R_ +______________________.......................................... +...........G.....................______________R_____R__________ +______________WC___________R________R_______.................... +................................................................ +................</pre> +<p>Scoreboard Key:<br /> +"<b><code>_</code></b>" Waiting for Connection, +"<b><code>S</code></b>" Starting up, +"<b><code>R</code></b>" Reading Request,<br /> +"<b><code>W</code></b>" Sending Reply, +"<b><code>K</code></b>" Keepalive (read), +"<b><code>D</code></b>" DNS Lookup,<br /> +"<b><code>C</code></b>" Closing connection, +"<b><code>L</code></b>" Logging, +"<b><code>G</code></b>" Gracefully finishing,<br /> +"<b><code>I</code></b>" Idle cleanup of worker, +"<b><code>.</code></b>" Open slot with no current process<br /> +</p> + + +<table border="0"><tr><th>Srv</th><th>PID</th><th>Acc</th><th>M</th><th>CPU +</th><th>SS</th><th>Req</th><th>Conn</th><th>Child</th><th>Slot</th><th>Client</th><th>Protocol</th><th>VHost</th><th>Request</th></tr> + +<tr><td><b>0-495</b></td><td>-</td><td>0/0/168016</td><td>. 
+</td><td>2094.10</td><td>8672</td><td>8</td><td>0.0</td><td>0.00</td><td>1344.54 +</td><td>163.172.65.117</td><td>http/1.1</td><td nowrap></td><td nowrap></td></tr> + +</table> + <hr /> <table> + <tr><th>Srv</th><td>Child Server number - generation</td></tr> + <tr><th>PID</th><td>OS process ID</td></tr> + <tr><th>Acc</th><td>Number of accesses this connection / this child / this slot</td></tr> + <tr><th>M</th><td>Mode of operation</td></tr> +<tr><th>CPU</th><td>CPU usage, number of seconds</td></tr> +<tr><th>SS</th><td>Seconds since beginning of most recent request</td></tr> + <tr><th>Req</th><td>Milliseconds required to process most recent request</td></tr> + <tr><th>Conn</th><td>Kilobytes transferred this connection</td></tr> + <tr><th>Child</th><td>Megabytes transferred this child</td></tr> + <tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr> + </table> +<hr> +<table cellspacing=0 cellpadding=0> +<tr><td bgcolor="#000000"> +<b><font color="#ffffff" face="Arial,Helvetica">SSL/TLS Session Cache Status:</font></b> </td></tr> +<tr><td bgcolor="#ffffff"> +cache type: <b>SHMCB</b>, shared memory: <b>512000</b> bytes, current entries: <b>642</b><br>subcaches: <b>32</b>, indexes per subcache: <b>88</b><br>time left on oldest entries' objects: avg: <b>15</b> seconds, (range: 0...45)<br>index usage: <b>22%</b>, cache usage: <b>28%</b><br>total entries stored since starting: <b>16275</b><br>total entries replaced since starting: <b>0</b><br>total entries expired since starting: <b>15633</b><br>total (pre-expiry) entries scrolled out of the cache: <b>0</b><br>total retrieves since starting: <b>3285</b> hit, <b>607</b> miss<br>total removes since starting: <b>0</b> hit, <b>0</b> miss<br></td></tr> +</table> +</body></html> diff --git a/slapos/test/promise/data/test_db.json b/slapos/test/promise/data/test_db.json new file mode 100644 index 0000000..b3e3b34 --- /dev/null +++ b/slapos/test/promise/data/test_db.json @@ -0,0 +1 @@ +{"1234": 1496161635.514768, 
"4321": 1496161635.514768} diff --git a/slapos/test/promise/test_apache_mpm_watchdog.py b/slapos/test/promise/test_apache_mpm_watchdog.py new file mode 100644 index 0000000..fdb15ea --- /dev/null +++ b/slapos/test/promise/test_apache_mpm_watchdog.py @@ -0,0 +1,96 @@ +############################################################################## +# +# Copyright (c) 2017 Vifib SARL and Contributors. All Rights Reserved. +# +# WARNING: This program as such is intended to be used by professional +# programmers who take the whole responsibility of assessing all potential +# consequences resulting from its eventual inadequacies and bugs +# End users who are looking for a ready-to-use solution with commercial +# guarantees and support are strongly adviced to contract a Free Software +# Service Company +# +# This program is Free Software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 3 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
##############################################################################

import unittest
import os.path
import re
import socket
import time
import psutil
from slapos.promise.apache_mpm_watchdog import watchServerStatus, \
  loadJSONFile, writeJSONFile, getServerStatus, search_pid_regex

from slapos.test.promise import data


class TestApacheMPMWatchdog(unittest.TestCase):
  """Tests for the helpers of slapos.promise.apache_mpm_watchdog."""

  def setUp(self):
    # Fixtures live in the directory of the ``data`` package.
    self.base_path = os.path.dirname(data.__file__)

  def test_searchPidRegex(self):
    # The method must be named ``test_*`` to be collected by unittest.
    with open(os.path.join(self.base_path, "server_status.html")) as f:
      server_status = f.read()

    # The fixture lists three processes flagged "yes (old gen)".
    self.assertEqual(['12345', '12346', '12348'],
      re.findall(search_pid_regex, server_status))

  def test_loadJSONFile(self):
    # Missing file
    self.assertEqual({},
      loadJSONFile("couscous"))

    self.assertEqual(
      {"1234": 1496161635.514768, "4321": 1496161635.514768},
      loadJSONFile(os.path.join(self.base_path, "test_db.json")))

    # Empty file, which is not valid JSON
    self.assertEqual(
      {},
      loadJSONFile(os.path.join(self.base_path, "corrupted_db.json")))

  def test_writeJSONFile(self):
    # Check if don't raise.
    self.assertEqual(None,
      writeJSONFile({}, None))

    current_pid = os.getpid()
    # Pick a PID that is guaranteed not to be running instead of hoping
    # that a hard-coded one is free on this machine.
    dead_pid = 123482
    while psutil.pid_exists(dead_pid):
      dead_pid += 1

    db_path = os.path.join(self.base_path, "write_db.json")
    self.assertEqual(None,
      writeJSONFile({str(dead_pid): 123, str(current_pid): 124}, db_path))

    with open(db_path) as f:
      json_content = f.read()

    # Only the entry of the still running process is kept.
    self.assertEqual(json_content,
      '{"%s": 124}' % current_pid)

  def test_watchServerStatus_without_old_generation(self):
    # Nothing matches the regex, so the known deadlines are left untouched.
    self.assertEqual({"1234": 1496161635.514768},
      watchServerStatus({"1234": 1496161635.514768},
                        "<html><body>Apache Status</body></html>"))

  def test_getServerStatus(self):
    # NOTE(review): these assertions depend on the environment: the first two
    # assume that http://localhost/ does not answer with a 200, the last one
    # needs Internet access.
    self.assertEqual(None,
      getServerStatus("http://localhost/", None, None))
    self.assertEqual(None,
      getServerStatus("http://localhost/",
                      "user", "password"))
    self.assertNotEqual(None,
      getServerStatus("https://www.erp5.com/", None, None))


if __name__ == '__main__':
  unittest.main()