From 5f92ce8a74e0affec0f0b54b5398a2022b02458f Mon Sep 17 00:00:00 2001
From: Rafael Monnerat <rafael@nexedi.com>
Date: Mon, 29 May 2017 19:15:03 +0200
Subject: [PATCH] apache-mpm_watchdog: Initial commit

  Implement a watchdog to kill processes which are stuck in a deadlocked
  state after a graceful reload.
---
 .gitignore                                    |  1 +
 setup.py                                      |  1 +
 .../promise/apache_mpm_watchdog/__init__.py   | 85 ++++++++++++++++
 slapos/test/promise/data/corrupted_db.json    |  0
 slapos/test/promise/data/server_status.html   | 86 +++++++++++++++++
 slapos/test/promise/data/test_db.json         |  1 +
 .../test/promise/test_apache_mpm_watchdog.py  | 96 +++++++++++++++++++
 7 files changed, 270 insertions(+)
 create mode 100644 slapos/promise/apache_mpm_watchdog/__init__.py
 create mode 100644 slapos/test/promise/data/corrupted_db.json
 create mode 100644 slapos/test/promise/data/server_status.html
 create mode 100644 slapos/test/promise/data/test_db.json
 create mode 100644 slapos/test/promise/test_apache_mpm_watchdog.py

diff --git a/.gitignore b/.gitignore
index 61ba889..ebc0657 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@
 .eggs/
 config.json
 slapos/test/promise/data/SOFTINST-0_*
+slapos/test/promise/data/write_db.json
diff --git a/setup.py b/setup.py
index 8fe54ec..b10e3e2 100644
--- a/setup.py
+++ b/setup.py
@@ -69,6 +69,7 @@ setup(name=name,
       entry_points={
         'console_scripts': [
           'agent = slapos.agent.agent:main',
+          'apache-mpm-watchdog = slapos.promise.apache_mpm_watchdog:main',
           'check-web-page-http-cache-hit = slapos.promise.check_web_page_http_cache_hit:main',
           'check-feed-as-promise = slapos.checkfeedaspromise:main',
           'check-error-on-apache-log = slapos.promise.check_error_on_apache_log:main',
diff --git a/slapos/promise/apache_mpm_watchdog/__init__.py b/slapos/promise/apache_mpm_watchdog/__init__.py
new file mode 100644
index 0000000..731c006
--- /dev/null
+++ b/slapos/promise/apache_mpm_watchdog/__init__.py
@@ -0,0 +1,118 @@
+import argparse
+import requests
+import re
+import signal
+import os
+import psutil
+import json
+import time
+
+search_pid_regex = r"</td><td.*?>(.+?)</td><td>yes \(old gen\)</td>"
+
+def loadJSONFile(db_path):
+  """Return the dict stored as JSON in db_path.
+
+  An empty dict is returned when db_path is None, when the file does not
+  exist, or when its content is not a JSON object.
+  """
+  if db_path is None or not os.path.exists(db_path):
+    return {}
+  with open(db_path) as json_file:
+    try:
+      pid_dict = json.load(json_file)
+    except ValueError:
+      # Empty or corrupted file: start again without any known pid.
+      return {}
+  if not isinstance(pid_dict, dict):
+    return {}
+  return pid_dict
+
+def writeJSONFile(pid_dict, db_path):
+  """Save pid_dict as JSON in db_path, without the pids no longer running.
+
+  The stale entries are removed from pid_dict itself. Nothing is done when
+  db_path is None.
+  """
+  if db_path is None:
+    # No place to save
+    return
+  for pid in pid_dict.copy():
+    try:
+      psutil.Process(int(pid))
+    except psutil.NoSuchProcess:
+      del pid_dict[pid]
+
+  with open(db_path, "w") as f:
+    f.write(json.dumps(pid_dict))
+
+def getServerStatus(url, user, password, timeout=30):
+  """Return the body of the page at url, or None if it can't be fetched.
+
+  HTTP basic authentication is used when user is not None. None is returned
+  on connection error, when no answer comes within timeout seconds, or when
+  the status code is not 200.
+  """
+  auth = None if user is None else (user, password)
+  try:
+    # requests waits forever by default, which would hang the watchdog.
+    r = requests.get(url, auth=auth, timeout=timeout)
+  except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
+    return None
+  if r.status_code == 200:
+    return r.text
+  return None
+
+def watchServerStatus(pid_dict, server_status, timeout=600):
+  """Kill the old generation httpd processes which outlived their deadline.
+
+  pid_dict maps a pid (as a string) to the time after which the process is
+  killed; a pid seen for the first time gets time.time() + timeout.
+  server_status is the HTML of the server-status page. Returns an updated
+  copy of pid_dict; pid_dict itself is left untouched.
+  """
+  _pid_dict = pid_dict.copy()
+  for i in re.findall(search_pid_regex, server_status):
+    try:
+      cmdline = psutil.Process(int(i)).cmdline()
+    except (ValueError, psutil.Error):
+      # Not a pid, or the process is gone or can't be inspected.
+      continue
+
+    # Ensure the process is actually an apache
+    if cmdline and cmdline[0].endswith("/httpd"):
+      _pid_dict.setdefault(i, time.time() + timeout)
+      if _pid_dict[i] < time.time():
+        print("Sending signal -%s to %s" % (signal.SIGKILL, i))
+        try:
+          os.kill(int(i), signal.SIGKILL)
+        except OSError as e:
+          # The process may have exited on its own in the meantime.
+          print("Could not kill %s: %s" % (i, e))
+
+  return _pid_dict
+
+def main():
+  parser = argparse.ArgumentParser()
+  # URL of the apache server-status page
+  parser.add_argument("-u", "--url", required=True)
+  # Credentials for the status page ("-u" is already taken by --url)
+  parser.add_argument("-U", "--user")
+  parser.add_argument("-p", "--password")
+  # JSON file remembering the deadline of each old generation pid
+  parser.add_argument("-d", "--db")
+  # Seconds an old generation process may stay alive once it is seen
+  parser.add_argument("-t", "--timeout", type=int, default=600)
+  args = parser.parse_args()
+
+  pid_dict = loadJSONFile(args.db)
+
+  server_status = getServerStatus(
+    args.url, args.user, args.password)
+
+  if server_status is None:
+    raise ValueError("Couldn't connect to server status page")
+
+  pid_dict = watchServerStatus(pid_dict, server_status, args.timeout)
+
+  writeJSONFile(pid_dict, args.db)
+
diff --git a/slapos/test/promise/data/corrupted_db.json b/slapos/test/promise/data/corrupted_db.json
new file mode 100644
index 0000000..e69de29
diff --git a/slapos/test/promise/data/server_status.html b/slapos/test/promise/data/server_status.html
new file mode 100644
index 0000000..02ba9d8
--- /dev/null
+++ b/slapos/test/promise/data/server_status.html
@@ -0,0 +1,86 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html><head>
+<title>Apache Status</title>
+</head><body>
+<h1>Apache Server Status </h1>
+
+<dl><dt>Server Version: Apache/2.4.25 (Unix) OpenSSL/1.0.2k mod_antiloris/0.5.1</dt>
+<dt>Server MPM: event</dt>
+<dt>Server Built: Apr 19 2017 12:41:47
+</dt></dl><hr /><dl>
+<dt>Current Time: Wednesday, 31-May-2017 14:57:52 CEST</dt>
+<dt>Restart Time: Friday, 26-May-2017 00:24:30 CEST</dt>
+<dt>Parent Server Config. Generation: 498</dt>
+<dt>Parent Server MPM Generation: 497</dt>
+<dt>Server uptime:  5 days 14 hours 33 minutes 22 seconds</dt>
+<dt>Server load: 2.32 1.98 1.94</dt>
+<dt>Total accesses: 26754245 - Total Traffic: 249.0 GB</dt>
+<dt>CPU Usage: u7129.39 s1016.88 cu0 cs0 - 1.68% CPU load</dt>
+<dt>55.2 requests/sec - 0.5 MB/second - 9.8 kB/request</dt>
+<dt>8 requests currently being processed, 117 idle workers</dt>
+</dl>
+
+<table rules="all" cellpadding="1%">
+<tr><th rowspan="2">Slot</th><th rowspan="2">PID</th><th rowspan="2">Stopping</th><th colspan="2">Connections</th>
+<th colspan="2">Threads</th><th colspan="3">Async connections</th></tr>
+<tr><th>total</th><th>accepting</th><th>busy</th><th>idle</th><th>writing</th><th>keep-alive</th><th>closing</th></tr>
+<tr><td>0</td><td>12345</td><td>yes (old gen)</td><td>3</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
+<tr><td>1</td><td>12346</td><td>yes (old gen)</td><td>3</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
+<tr><td>2</td><td>24443</td><td>no</td><td>8</td><td>yes</td><td>1</td><td>24</td><td>0</td><td>1</td><td>7</td></tr>
+<tr><td>5</td><td>23019</td><td>no</td><td>11</td><td>yes</td><td>1</td><td>24</td><td>0</td><td>3</td><td>7</td></tr>
+<tr><td>8</td><td>12348</td><td>yes (old gen)</td><td>1</td><td>no</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
+<tr><td>9</td><td>23032</td><td>no</td><td>18</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>2</td><td>14</td></tr>
+<tr><td>10</td><td>23053</td><td>no</td><td>17</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>2</td><td>14</td></tr>
+<tr><td>11</td><td>23118</td><td>no</td><td>13</td><td>yes</td><td>2</td><td>23</td><td>0</td><td>7</td><td>3</td></tr>
+<tr><td>Sum</td><td>8</td><td>3</td><td>74</td><td>&nbsp;</td><td>8</td><td>117</td><td>0</td><td>15</td><td>45</td></tr>
+</table>
+<pre>..............G..G.G.............G.......G.......G___R__________
+___________.................................................._R_
+______________________..........................................
+...........G.....................______________R_____R__________
+______________WC___________R________R_______....................
+................................................................
+................</pre>
+<p>Scoreboard Key:<br />
+"<b><code>_</code></b>" Waiting for Connection, 
+"<b><code>S</code></b>" Starting up, 
+"<b><code>R</code></b>" Reading Request,<br />
+"<b><code>W</code></b>" Sending Reply, 
+"<b><code>K</code></b>" Keepalive (read), 
+"<b><code>D</code></b>" DNS Lookup,<br />
+"<b><code>C</code></b>" Closing connection, 
+"<b><code>L</code></b>" Logging, 
+"<b><code>G</code></b>" Gracefully finishing,<br /> 
+"<b><code>I</code></b>" Idle cleanup of worker, 
+"<b><code>.</code></b>" Open slot with no current process<br />
+</p>
+
+
+<table border="0"><tr><th>Srv</th><th>PID</th><th>Acc</th><th>M</th><th>CPU
+</th><th>SS</th><th>Req</th><th>Conn</th><th>Child</th><th>Slot</th><th>Client</th><th>Protocol</th><th>VHost</th><th>Request</th></tr>
+
+<tr><td><b>0-495</b></td><td>-</td><td>0/0/168016</td><td>.
+</td><td>2094.10</td><td>8672</td><td>8</td><td>0.0</td><td>0.00</td><td>1344.54
+</td><td>163.172.65.117</td><td>http/1.1</td><td nowrap></td><td nowrap></td></tr>
+
+</table>
+ <hr /> <table>
+ <tr><th>Srv</th><td>Child Server number - generation</td></tr>
+ <tr><th>PID</th><td>OS process ID</td></tr>
+ <tr><th>Acc</th><td>Number of accesses this connection / this child / this slot</td></tr>
+ <tr><th>M</th><td>Mode of operation</td></tr>
+<tr><th>CPU</th><td>CPU usage, number of seconds</td></tr>
+<tr><th>SS</th><td>Seconds since beginning of most recent request</td></tr>
+ <tr><th>Req</th><td>Milliseconds required to process most recent request</td></tr>
+ <tr><th>Conn</th><td>Kilobytes transferred this connection</td></tr>
+ <tr><th>Child</th><td>Megabytes transferred this child</td></tr>
+ <tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr>
+ </table>
+<hr>
+<table cellspacing=0 cellpadding=0>
+<tr><td bgcolor="#000000">
+<b><font color="#ffffff" face="Arial,Helvetica">SSL/TLS Session Cache Status:</font></b>
</td></tr>
+<tr><td bgcolor="#ffffff">
+cache type: <b>SHMCB</b>, shared memory: <b>512000</b> bytes, current entries: <b>642</b><br>subcaches: <b>32</b>, indexes per subcache: <b>88</b><br>time left on oldest entries' objects: avg: <b>15</b> seconds, (range: 0...45)<br>index usage: <b>22%</b>, cache usage: <b>28%</b><br>total entries stored since starting: <b>16275</b><br>total entries replaced since starting: <b>0</b><br>total entries expired since starting: <b>15633</b><br>total (pre-expiry) entries scrolled out of the cache: <b>0</b><br>total retrieves since starting: <b>3285</b> hit, <b>607</b> miss<br>total removes since starting: <b>0</b> hit, <b>0</b> miss<br></td></tr>
+</table>
+</body></html>
diff --git a/slapos/test/promise/data/test_db.json b/slapos/test/promise/data/test_db.json
new file mode 100644
index 0000000..b3e3b34
--- /dev/null
+++ b/slapos/test/promise/data/test_db.json
@@ -0,0 +1 @@
+{"1234": 1496161635.514768, "4321": 1496161635.514768}
diff --git a/slapos/test/promise/test_apache_mpm_watchdog.py b/slapos/test/promise/test_apache_mpm_watchdog.py
new file mode 100644
index 0000000..fdb15ea
--- /dev/null
+++ b/slapos/test/promise/test_apache_mpm_watchdog.py
@@ -0,0 +1,98 @@
+##############################################################################
+#
+# Copyright (c) 2017 Vifib SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+import unittest
+import os.path
+import re
+import socket
+import time
+import psutil
+from slapos.promise.apache_mpm_watchdog import watchServerStatus, \
+  loadJSONFile, writeJSONFile, getServerStatus, search_pid_regex
+
+from slapos.test.promise import data
+
+class TestApacheMPMWatchdog(unittest.TestCase):
+
+  def setUp(self):
+    self.base_path = "/".join(data.__file__.split("/")[:-1])
+
+  def test_searchPidRegex(self):
+    with open(self.base_path + "/server_status.html") as f:
+      server_status = f.read()
+
+    # The three workers flagged "yes (old gen)" in the fixture.
+    self.assertEqual(['12345', '12346', '12348'],
+      re.findall(search_pid_regex, server_status))
+
+  def test_loadJSONFile(self):
+    self.assertEqual({},
+       loadJSONFile("couscous"))
+
+    self.assertEqual(
+      {"1234": 1496161635.514768 , "4321": 1496161635.514768},
+      loadJSONFile(os.path.join(self.base_path, "test_db.json")))
+
+    self.assertEqual(
+      {},
+      loadJSONFile(os.path.join(self.base_path, "corrupted_db.json")))
+
+  def test_writeJSONFile(self):
+    # Check if don't raise.
+    self.assertEqual(None,
+      writeJSONFile({}, None))
+
+    current_pid = os.getpid()
+    self.assertEqual(None,
+      writeJSONFile({"123482": 123, current_pid: 124},
+          os.path.join(self.base_path, "write_db.json")))
+
+    with open(os.path.join(self.base_path, "write_db.json")) as f:
+      json_content = f.read()
+
+    self.assertEqual(json_content,
+      '{"%s": 124}' % current_pid)
+
+  def test_watchServerStatus(self):
+    # No "old gen" worker in the page: the known pids are returned as is.
+    pid_dict = {"1234": 1496161635.514768}
+    self.assertEqual(pid_dict, watchServerStatus(pid_dict, ""))
+
+  def test_getServerStatus(self):
+    # NOTE: needs internet access, and localhost:80 must not answer with a 200.
+    self.assertEqual(None,
+        getServerStatus("http://localhost/", None, None))
+    self.assertEqual(None,
+        getServerStatus("http://localhost/",
+                           "user", "password"))
+    self.assertNotEqual(None,
+        getServerStatus("https://www.erp5.com/", None, None))
+
+
+if __name__ == '__main__':
+  unittest.main()
+
-- 
2.30.9