Commit 9fe0637c authored by Xavier Thompson

[wip] promise: Move promise plugins here

parent 99cf4bfd
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
from slapos.grid.promise.generic import TestResult
import re
import sys
import pytz
from os.path import isfile, getmtime
from datetime import datetime
from croniter import croniter
from dateutil.parser import parse
from tzlocal import get_localzone
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# check backup ran OK every 5 minutes
self.setPeriodicity(minute=5)
def sense(self):
"""
    backupserver runs rdiff-backup and logs everything to a text file.
At the beginning of the backup, we have "backup running" printed in the text file.
At the end of the backup, we can have one of the following printed in the text file:
* "backup failed" -> backup failed
* "backup success" -> backup succeeded
    A backup is valid only if both of the following conditions hold:
* we can grep "backup running" in the text file
* we can't grep "backup failed" in the text file
"""
script = self.getConfig('script_fullpath')
status = self.getConfig('status_fullpath')
local_tz = get_localzone()
prev_cron = croniter(self.getConfig('cron_frequency'), datetime.now()).get_prev(datetime) # date of the previous time cron launched
prev_cron = local_tz.localize(prev_cron)
status_url = "{}/private/{}/{}".format(self.getConfig("monitor_url"), self.getConfig("status_dirbasename"), self.getConfig("status_name"))
statistic_url = "{}/private/{}/{}".format(self.getConfig("monitor_url"), self.getConfig("statistic_dirbasename"), self.getConfig("statistic_name"))
# If log file is not present, it can be OK if we launched the instance after the last cron due date
if not isfile(status):
if pytz.utc.localize(datetime.utcfromtimestamp(getmtime(script))) < prev_cron:
self.logger.error("Backup status file is not present")
else:
self.logger.info("Backup was never launched")
return
# First, parse the log file
backup_started = False
backup_ended = False
with open(status, 'r') as f:
for line in f:
m = re.match(r"(.*), (.*), (.*), backup (.*)$", line)
if m:
if m.group(4) == "running":
backup_started = True
backup_start = parse(m.group(1))
elif m.group(4) == "failed":
backup_ended = True
backup_failed = True
backup_end = parse(m.group(1))
elif m.group(4) == "success":
backup_ended = True
backup_failed = False
backup_end = parse(m.group(1))
# Then check result
if backup_ended and backup_failed:
self.logger.error("Backup FAILED at {} (see {} ).".format(backup_end, status_url))
elif not backup_started:
self.logger.error("Can't find backup start date. Is there a problem with status file? (see {} ).".format(status_url))
elif backup_start < prev_cron:
self.logger.error("Backup didn't start at correct time: it started at {} but should have started after {}. (see {} ).".format(backup_start, prev_cron, status_url))
elif not backup_ended:
self.logger.info("Backup currently running, started at {} (see {} ).".format(backup_start, status_url))
else:
self.logger.info("Backup OK, started at {} and lasted {} (see full stats at {} and status at {} ).".format(
backup_start,
backup_end - backup_start,
statistic_url,
status_url
))
def test(self):
"""
    This is the default test function. It could be left out.
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
"""
Anomaly returns a TestResult instead of AnomalyResult because we don't
want to call bang when there is a problem. Usually the problem won't be
in the deployment of this instance but rather in the instance we are
    backing up. This will need human intervention.
"""
return self._test(result_count=1, failure_amount=1)
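As a reading aid, here is a minimal standalone sketch of the cron/status logic used above, outside the promise framework. The status line is invented, and tzlocal is assumed to return a pytz-style timezone (tzlocal < 3), as the plugin itself assumes:

import re
from datetime import datetime

from croniter import croniter
from dateutil.parser import parse
from tzlocal import get_localzone

def previous_cron_due_date(cron_frequency):
  # date at which cron was last due to launch the backup, localized
  # the same way as in RunPromise.sense() above
  prev_cron = croniter(cron_frequency, datetime.now()).get_prev(datetime)
  return get_localzone().localize(prev_cron)

def parse_status_line(line):
  # status lines look like "<date>, <...>, <...>, backup <state>";
  # only the date and the final state are used by the promise
  m = re.match(r"(.*), (.*), (.*), backup (.*)$", line)
  if m:
    return parse(m.group(1)), m.group(4)
  return None, None

print(previous_cron_due_date("0 3 * * *"))
print(parse_status_line("2020-06-04 03:00:01, dummy, dummy, backup running"))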
[buildout]
parts =
${download-promise-plugins:modules}
[download-promise-plugins]
directory = ${__init__:promise-folder}
modules =
${__init__:module}
${backupserver_check_backup:module}
${check_certificate:module}
${check_command_execute:module}
${check_error_on_http_log:module}
${check_error_on_zope_longrequest_log:module}
${check_file_state:module}
${check_free_disk_space:module}
${check_icmp_packet_lost:module}
${check_neo_health:module}
${check_partition_deployment_state:module}
${check_re6st_optimal_status:module}
${check_server_cpu_load:module}
${check_service_state:module}
${check_socket_listening:module}
${check_surykatka_json:module}
${check_url_available:module}
${monitor_bootstrap_status:module}
${trafficserver_cache_availability:module}
${util:module}
${validate_frontend_configuration:module}
[plugin-download-base]
recipe = hexagonit.recipe.download
ignore-existing = true
download-only = true
url = ${:_profile_base_location_}/${:module}.py
module = ${:_buildout_section_name_}
filename = ${:module}.py
promise-folder = ${buildout:parts-directory}/promise
destination = ${:promise-folder}/plugin
mode = 0644
[__init__]
<= plugin-download-base
[backupserver_check_backup]
<= plugin-download-base
[check_certificate]
<= plugin-download-base
[check_command_execute]
<= plugin-download-base
[check_error_on_http_log]
<= plugin-download-base
[check_error_on_zope_longrequest_log]
<= plugin-download-base
[check_file_state]
<= plugin-download-base
[check_free_disk_space]
<= plugin-download-base
[check_icmp_packet_lost]
<= plugin-download-base
[check_neo_health]
<= plugin-download-base
[check_partition_deployment_state]
<= plugin-download-base
[check_re6st_optimal_status]
<= plugin-download-base
[check_server_cpu_load]
<= plugin-download-base
[check_service_state]
<= plugin-download-base
[check_socket_listening]
<= plugin-download-base
[check_surykatka_json]
<= plugin-download-base
[check_url_available]
<= plugin-download-base
[monitor_bootstrap_status]
<= plugin-download-base
[trafficserver_cache_availability]
<= plugin-download-base
[util]
<= plugin-download-base
[validate_frontend_configuration]
<= plugin-download-base
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
from slapos.util import str2bytes
from zope.interface import implementer
import datetime
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def sense(self):
"""
Check the certificate
"""
certificate_file = self.getConfig('certificate')
key_file = self.getConfig('key')
try:
certificate_expiration_days = int(
self.getConfig('certificate-expiration-days', '15'))
except ValueError:
self.logger.error('ERROR certificate-expiration-days is wrong: %r' % (
self.getConfig('certificate-expiration-days')))
return
try:
with open(certificate_file, 'r') as fh:
certificate = x509.load_pem_x509_certificate(
str2bytes(fh.read()), default_backend())
except Exception as e:
self.logger.error(
'ERROR Problem loading certificate %r, error: %s' % (
certificate_file, e))
return
try:
with open(key_file, 'r') as fh:
key = serialization.load_pem_private_key(
str2bytes(fh.read()), None, default_backend())
except Exception as e:
self.logger.error(
'ERROR Problem loading key %r, error: %s' % (key_file, e))
return
if certificate.public_key().public_numbers() != \
key.public_key().public_numbers():
self.logger.error(
'ERROR Certificate %r does not match key %r' % (
certificate_file, key_file))
return
if certificate.not_valid_after - datetime.timedelta(
days=certificate_expiration_days) < datetime.datetime.utcnow():
self.logger.error(
'ERROR Certificate %r will expire in less than %s days' % (
certificate_file, certificate_expiration_days))
return
self.logger.info(
'OK Certificate %r and key %r are ok' % (certificate_file, key_file))
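For reference, the same checks condensed into a standalone function; this only mirrors the logic above, with placeholder paths:

import datetime

from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

def check_certificate(certificate_path, key_path, expiration_days=15):
  with open(certificate_path, 'rb') as fh:
    certificate = x509.load_pem_x509_certificate(fh.read(), default_backend())
  with open(key_path, 'rb') as fh:
    key = serialization.load_pem_private_key(fh.read(), None, default_backend())
  # the key must belong to the certificate
  if certificate.public_key().public_numbers() != key.public_key().public_numbers():
    return 'certificate does not match key'
  # the certificate must stay valid for at least `expiration_days`
  if certificate.not_valid_after - datetime.timedelta(days=expiration_days) \
      < datetime.datetime.utcnow():
    return 'certificate expires in less than %d days' % expiration_days
  return 'ok'

# e.g. print(check_certificate('/path/to/cert.pem', '/path/to/key.pem'))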
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import subprocess
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# SR can set custom periodicity
self.setPeriodicity(float(self.getConfig('frequency', 2)))
self.result_count = int(self.getConfig('result_count', '1'))
self.failure_amount = int(self.getConfig('failure_amount', '1'))
def sense(self):
"""
Check result of the executed command
"""
command = self.getConfig('command')
try:
out = subprocess.check_output(
command,
shell=True,
stderr=subprocess.STDOUT)
status = 0
except subprocess.CalledProcessError as e:
out = e.output
status = e.returncode
except Exception as e:
self.logger.error(
"ERROR %r during running command %r" % (e, command))
return
if status != 0:
self.logger.error(
'ERROR %r run with failure, output: %r' % (command, out))
else:
self.logger.info("OK %r run with success" % (command,))
def anomaly(self):
return self._anomaly(result_count=self.result_count, failure_amount=self.failure_amount)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise, TestResult
import re
import time
import os
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# set periodicity to run the promise twice per day
self.custom_frequency = int(self.getConfig('frequency', 720))
self.setPeriodicity(self.custom_frequency)
# Skip test check on this promise
self.setTestLess()
def sense(self):
"""
Check if http log file contain errors
"""
log_file = self.getConfig('log-file')
maximum_delay = int(self.getConfig('maximum-delay', 0))
if not log_file:
raise ValueError("log file was not set in promise parameters.")
regex = re.compile(br"^(\[[^\]]+\]) (\[[^\]]+\]) (.*)$")
error_amount = 0
no_route_error = 0
network_is_unreachable = 0
timeout = 0
parsing_failure = 0
if not os.path.exists(log_file):
      # file doesn't exist, nothing to check
self.logger.info("OK")
return
with open(log_file, "rb") as f:
f.seek(0, 2)
block_end_byte = f.tell()
f.seek(-min(block_end_byte, 4096), 1)
data = f.read()
for line in reversed(data.splitlines()):
m = regex.match(line)
if m is None:
continue
dt, level, msg = m.groups()
try:
try:
t = time.strptime(dt[1:-1].decode('utf-8'), "%a %b %d %H:%M:%S %Y")
except ValueError:
            # Failed to parse with the first format, try a different one.
t = time.strptime(dt[1:-1].decode('utf-8'), "%a %b %d %H:%M:%S.%f %Y")
except ValueError:
          # It probably failed to parse
if parsing_failure < 3:
            # Accept up to 2 failures, as the line may actually be
            # cut in the middle.
parsing_failure += 1
continue
raise
if maximum_delay and (time.time()-time.mktime(t)) > maximum_delay:
          # entry is older than maximum-delay: stop scanning
break
if level != b"[error]":
continue
# Classify the types of errors
if b"(113)No route to host" in msg:
no_route_error += 1
elif b"(101)Network is unreachable" in msg:
network_is_unreachable += 1
elif b"(110)Connection timed out" in msg:
timeout += 1
error_amount += 1
if error_amount:
self.logger.error("ERROR=%s (NOROUTE=%s, UNREACHABLENET=%s, TIMEOUT=%s)" % (
error_amount, no_route_error, network_is_unreachable, timeout))
else:
self.logger.info("OK")
def anomaly(self):
    # only check the result of the two latest sense calls
return self._test(result_count=2, failure_amount=2, latest_minute=self.custom_frequency*3)
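The tail-and-classify approach above, reduced to a standalone helper (the log path is a placeholder; the line format is the Apache-style one matched by the regexp in sense()):

import os
import re

ERROR_LINE = re.compile(br"^(\[[^\]]+\]) (\[[^\]]+\]) (.*)$")

def count_recent_errors(log_file, block_size=4096):
  # only look at the last `block_size` bytes, like the promise does
  if not os.path.exists(log_file):
    return 0
  with open(log_file, "rb") as f:
    f.seek(0, 2)
    f.seek(-min(f.tell(), block_size), 1)
    data = f.read()
  error_amount = 0
  for line in reversed(data.splitlines()):
    m = ERROR_LINE.match(line)
    if m is not None and m.group(2) == b"[error]":
      error_amount += 1
  return error_amount

print(count_recent_errors('/path/to/apache-error.log'))  # -> 0 if the file is missing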
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import time
import os
import sys
import re
r = re.compile(br"^([0-9]+\-[0-9]+\-[0-9]+ [0-9]+\:[0-9]+\:[0-9]+)(\,[0-9]+) - ([A-z]+) (.*)$")
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=10)
def sense(self):
log_file = self.getConfig('log-file')
    # getConfig returns strings (or None): cast before comparing below
    error_threshold = int(self.getConfig('error-threshold') or 0)
    maximum_delay = float(self.getConfig('maximum-delay') or 0)
error_amount = 0
if not os.path.exists(log_file):
      # file doesn't exist, nothing to check
self.logger.info("log file does not exist: log check skipped")
return 0
with open(log_file, "rb") as f:
f.seek(0, 2)
block_end_byte = f.tell()
f.seek(-min(block_end_byte, 4096*10), 1)
data = f.read()
for line in reversed(data.splitlines()):
m = r.match(line)
if m is None:
continue
dt, _, level, msg = m.groups()
try:
t = time.strptime(dt.decode('utf-8'), "%Y-%m-%d %H:%M:%S")
except ValueError:
continue
if maximum_delay and (time.time()-time.mktime(t)) > maximum_delay:
          # entry is older than maximum-delay: stop scanning
break
error_amount += 1
if error_amount > error_threshold:
self.logger.error('ERROR: Site has %s long request' % error_amount)
else:
self.logger.info('INFO: Site has %s long request' % error_amount)
def test(self):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# SR can set custom periodicity
self.setPeriodicity(float(self.getConfig('frequency', 2)))
def sense(self):
"""
Check state of the filename
state can be empty or not-empty
"""
filename = self.getConfig('filename')
state = self.getConfig('state')
url = self.getConfig('url').strip()
try:
with open(filename) as f:
result = f.read()
except Exception as e:
self.logger.error(
"ERROR %r during opening and reading file %r" % (e, filename))
return
if state == 'empty' and result != '':
message_list = ['ERROR %r not empty' % (filename,)]
if url:
message_list.append(', content available at %s' % (url,))
self.logger.error(''.join(message_list))
elif state == 'not-empty' and result == '':
self.logger.error(
"ERROR %r empty" % (filename,))
else:
self.logger.info("OK %r state %r" % (filename, state))
def anomaly(self):
return self._anomaly(result_count=3, failure_amount=3)
from __future__ import division
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
import sys
import sqlite3
import argparse
import datetime
import psutil
from slapos.collect.db import Database
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# check disk space at least every 3 minutes
self.setPeriodicity(minute=3)
def getDiskSize(self, disk_partition, database):
database = Database(database, create=False, timeout=10)
try:
# fetch disk size
database.connect()
where_query = "partition='%s'" % (disk_partition)
order = "datetime(date || ' ' || time) DESC"
query_result = database.select("disk", columns="free+used", where=where_query, order=order, limit=1)
result = query_result.fetchone()
if not result or not result[0]:
return None
disk_size = result[0]
except sqlite3.OperationalError as e:
      # if the database is still locked after timeout expiration (another process is using it)
      # we print a warning message and retry the promise at the next run, up to the max warning count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
return disk_size
def getFreeSpace(self, disk_partition, database, date, time):
database = Database(database, create=False, timeout=10)
try:
# fetch free disk space
database.connect()
where_query = "time between '%s:00' and '%s:30' and partition='%s'" % (time, time, disk_partition)
query_result = database.select("disk", date, "free", where=where_query)
result = query_result.fetchone()
if not result or not result[0]:
self.logger.info("No result from collector database: disk check skipped")
return 0
disk_free = result[0]
except sqlite3.OperationalError as e:
      # if the database is still locked after timeout expiration (another process is using it)
      # we print a warning message and retry the promise at the next run, up to the max warning count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return 0
raise
finally:
try:
database.close()
except Exception:
pass
return int(disk_free)
def raiseOnDatabaseLocked(self, locked_message):
max_warn = 10
latest_result_list = self.getLastPromiseResultList(result_count=max_warn)
warning_count = 0
if len(latest_result_list) < max_warn:
return False
for result in latest_result_list[0]:
if result['status'] == "ERROR" and locked_message in result["message"]:
return True
for result_list in latest_result_list:
found = False
for result in result_list:
if result['status'] == "WARNING" and locked_message in result["message"]:
found = True
warning_count += 1
break
if not found:
break
if warning_count == max_warn:
      # too many warnings about the locked database, now fail.
return True
self.logger.warn("collector database is locked by another process")
return False
@staticmethod
def _checkInodeUsage(path):
stat = os.statvfs(path)
total_inode = stat.f_files
if total_inode:
usage = 100 * (total_inode - stat.f_ffree) / total_inode
if usage >= 98:
return "Disk Inodes usage is really high: %.4f%%" % usage
def getInodeUsage(self, path):
return (self._checkInodeUsage(path) or
os.path.ismount('/tmp') and self._checkInodeUsage('/tmp') or
"")
def sense(self):
# find if a disk is mounted on the path
disk_partition = ""
db_path = self.getConfig('collectordb')
check_date = self.getConfig('test-check-date')
path = os.path.join(self.getPartitionFolder(), "") + "extrafolder"
partitions = psutil.disk_partitions()
    while path != '/':
if not disk_partition:
path = os.path.dirname(path)
else:
break
for p in partitions:
if p.mountpoint == path:
disk_partition = p.device
break
if not disk_partition:
self.logger.error("Couldn't find disk partition")
return
if db_path.endswith("collector.db"):
db_path=db_path[:-len("collector.db")]
if check_date:
# testing mode
currentdate = check_date
currenttime = self.getConfig('test-check-time', '09:17')
disk_partition = self.getConfig('test-disk-partition', '/dev/sda1')
else:
# get last minute
now = datetime.datetime.now()
currentdate = now.strftime('%Y-%m-%d')
currenttime = now - datetime.timedelta(minutes=1)
currenttime = currenttime.time().strftime('%H:%M')
disk_size = self.getDiskSize(disk_partition, db_path)
default_threshold = None
if disk_size is not None:
default_threshold = round(disk_size/(1024*1024*1024) * 0.05, 2)
threshold = float(self.getConfig('threshold', default_threshold) or 2.0)
threshold_days = float(self.getConfig('threshold-days', '30'))
free_space = self.getFreeSpace(disk_partition, db_path, currentdate,
currenttime)
if free_space == 0:
return
elif free_space > threshold*1024*1024*1024:
inode_usage = self.getInodeUsage(self.getPartitionFolder())
if inode_usage:
self.logger.error(inode_usage)
else:
self.logger.info("Disk usage: OK")
return
free_space = round(free_space/(1024*1024*1024), 2)
self.logger.error('Free disk space low: remaining %s G (threshold: %s G)' % (
free_space, threshold))
def test(self):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)
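The collector-database queries above, approximated here with sqlite3 directly; the "disk" table layout (partition, free, used, date, time columns) is inferred from the Database.select() calls, and the paths are placeholders:

import sqlite3

def get_disk_size(collector_db_path, disk_partition):
  # latest known size (free + used) of the partition, in bytes
  connection = sqlite3.connect(collector_db_path, timeout=10)
  try:
    row = connection.execute(
      "SELECT free + used FROM disk WHERE partition = ?"
      " ORDER BY datetime(date || ' ' || time) DESC LIMIT 1",
      (disk_partition,)).fetchone()
    return row[0] if row else None
  finally:
    connection.close()

# The default threshold is then 5% of that size, expressed in gigabytes:
#   threshold = round(disk_size / (1024 * 1024 * 1024) * 0.05, 2)
# e.g. get_disk_size('/path/to/collector.db', '/dev/sda1')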
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise, TestResult
import re
import time
from slapos.networkbench.ping import ping, ping6
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# set periodicity to run the promise twice per day
self.custom_frequency = int(self.getConfig('frequency', 720))
self.setPeriodicity(self.custom_frequency)
# Skip test check on this promise
self.setTestLess()
def sense(self):
"""
    Check for ICMP packet loss on the given address
"""
# Address to ping to
address = self.getConfig('address')
if not address:
raise ValueError("'address' was not set in promise parameters.")
    # Force usage of the IPv4 protocol?
ipv4 = self.getConfig('ipv4') in ('True', 'true', '1')
count = int(self.getConfig('count', 10))
threshold = int(self.getConfig('threshold', 0))
if threshold < 0:
      raise ValueError("'threshold' value must not be negative.")
if ipv4:
result = ping(address, count=count)
else:
result = ping6(address, count=count)
message = "%s host=%s code=%s, result=%s, packet_lost_ratio=%s msg=%s" % result
packet_lost_ratio = int(result[4])
if packet_lost_ratio == -1 or packet_lost_ratio > threshold:
      # Packet loss occurred
self.logger.error(message)
else:
self.logger.info(message)
def anomaly(self):
    # only check the result of the two latest sense calls
return self._test(result_count=2, failure_amount=2, latest_minute=self.custom_frequency*3)
import json
import subprocess
import six
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import \
AnomalyResult, GenericPromise, TestResult
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
config.setdefault('periodicity', 10)
super(RunPromise, self).__init__(config)
self.setTestLess()
def sense(self):
try:
summary = subprocess.check_output(
(self.getConfig('neoctl'), 'print', 'summary'),
universal_newlines=True,
).splitlines()
severities = json.loads(summary[0][1:])
if severities:
cluster_list = sum(six.itervalues(severities), [])
try:
cluster_list.remove(None)
except ValueError:
summary = []
else:
summary = ['main']
if cluster_list:
cluster_list.sort()
summary.append('backup: ' + ', '.join(cluster_list))
(self.logger.error if 'problem' in severities else
self.logger.warning)('; '.join(summary))
else:
self.logger.info(summary[1])
except Exception as e:
self.logger.critical(str(e))
def anomaly(self):
latest_result_list = self.getLastPromiseResultList()
if latest_result_list:
for result in latest_result_list[0]:
status = result['status']
message = result['message']
if status == 'CRITICAL':
return AnomalyResult(True, message)
if status != 'INFO':
result_class = TestResult
if status == 'ERROR':
status = 'PROBLEM'
# XXX: Allow the user not to bang in this case, because this is
# counter-productive as long as we haven't implemented the
# ability to ignore the state (started or stopped) of services.
if self.getConfig('bang-on-problem', True):
result_class = AnomalyResult
return result_class(True, '%s (%s)' % (status, message))
else:
message = "No result found!"
return AnomalyResult(False, message)
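To illustrate how the `neoctl print summary` output is consumed above, here is a toy parse of an invented two-line output (the real format may differ; only the structure used by sense() is reproduced):

import json

example_output = (
  '# {"warning": [null], "problem": ["backup1"]}\n'
  'The cluster is operational.\n'
)
summary = example_output.splitlines()
severities = json.loads(summary[0][1:])      # drop the leading marker character
cluster_list = sum(severities.values(), [])  # affected clusters; None = main cluster
print('problem' in severities, cluster_list) # -> True [None, 'backup1']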
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
from datetime import datetime
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=1)
def sense(self):
"""
Run the promise code and store the result
raise error, log error message, ... for failure
"""
partition_folder = self.getPartitionFolder()
log_folder = os.path.join(partition_folder, 'var/log')
log_name = 'slapgrid-%s-error.log' % self.getConfig('partition-id')
slapgrid_error_log_file = os.path.join(partition_folder, '.%s' % log_name)
link_file = os.path.join(log_folder, log_name)
monitor_url = self.getConfig('monitor-url')
message = ''
if os.path.exists(slapgrid_error_log_file) and \
os.stat(slapgrid_error_log_file).st_size:
message = 'Buildout failed to process %s.' % self.getConfig('partition-id')
if monitor_url:
message += '\nSee %s/log/%s for more information.' % (monitor_url, log_name)
if not os.path.exists(link_file):
os.symlink(slapgrid_error_log_file, link_file)
else:
if os.path.exists(link_file):
os.unlink(link_file)
if message:
self.logger.error(message)
else:
self.logger.info("buildout is OK")
def test(self):
"""
Test promise and say if problem is detected or not
Return TestResult object
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=2, failure_amount=2)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise, TestResult
import re
import time
from slapos.networkbench.ping import ping, ping6
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# set periodicity to run the promise twice per day
self.custom_frequency = int(self.getConfig('frequency', 720))
self.setPeriodicity(self.custom_frequency)
# Skip test check on this promise
self.setTestLess()
def sense(self):
"""
Check re6st optimal status
"""
    # ipv6 and ipv4 addresses to compare.
ipv4 = self.getConfig('ipv4')
ipv6 = self.getConfig('ipv6')
count = int(self.getConfig('count', 10))
if not ipv4:
raise ValueError("'ipv4' was not set in promise parameters.")
if not ipv6:
raise ValueError("'ipv6' was not set in promise parameters.")
result_ipv4 = ping(ipv4, count=count)
result_ipv6 = ping6(ipv6, count=count)
    # push the results into the log file
self.logger.info("%s host=%s code=%s, result=%s, packet_lost_ratio=%s msg=%s" % result_ipv4)
self.logger.info("%s host=%s code=%s, result=%s, packet_lost_ratio=%s msg=%s" % result_ipv6)
if result_ipv4[3] == "failed" and result_ipv6[3] != "failed":
      # IPv4 is unreachable
self.logger.info("OK: IPv4 unreachable, IPv6 reachable")
return
if result_ipv6[3] == "failed":
      # IPv6 is unreachable
self.logger.error("FAILED: IPv4 reachable, IPv6 unreachable")
return
latency4 = float(result_ipv4[3])
latency6 = float(result_ipv6[3])
    # We consider that at worst 1ms is added to the
    # ipv4 response, due to the usage of openvpn.
acceptable_delay = int(self.getConfig('acceptable-delay', 1))
    # We accept a certain relative increase in latency,
    # in case the link is a bit congested.
    # So 10% is reasonable enough.
    acceptable_lost = float(self.getConfig('acceptable-lost', 0.10))
# Increase latency with the value.
latency4 += acceptable_delay + latency4 * acceptable_lost
if latency4 < latency6:
      self.logger.error("FAIL %s (latency6) > %s (adjusted latency4)" % (latency6, latency4))
else:
      # Both reachable and IPv6 latency within the acceptable margin
self.logger.info("OK: IPv4 reachable, IPv6 reachable")
def anomaly(self):
    # only check the result of the two latest sense calls
return self._test(result_count=2, failure_amount=2, latest_minute=self.custom_frequency*3)
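A worked example of the tolerance formula above, with invented latencies:

latency4 = 10.0          # ms measured over plain IPv4
latency6 = 11.5          # ms measured over re6st (IPv6)
acceptable_delay = 1     # fixed allowance (ms) for the openvpn overhead
acceptable_lost = 0.10   # 10% relative allowance for a congested link

adjusted4 = latency4 + acceptable_delay + latency4 * acceptable_lost  # 12.0 ms
print("OK" if adjusted4 >= latency6 else "FAIL")  # -> OK, IPv6 is not too slow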
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import subprocess
import os
import psutil
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# test load every 3 minutes
self.setPeriodicity(minute=3)
def checkCPULoad(self, tolerance=2.2):
    # tolerance=1.5 => accept a load average of up to 1.5 per core (i.e. 150%)
load, load5, long_load = psutil.getloadavg()
core_count = psutil.cpu_count()
max_load = core_count * tolerance
if long_load > max_load:
# display top statistics
top_result = subprocess.check_output(('top', '-n', '1', '-b'),
universal_newlines=True)
message = "CPU load is high: %s %s %s\n\n" % (load, load5, long_load)
i = 0
result_list = top_result.split('\n')
# display first 5 lines
while i < len(result_list) and i < 5:
message += "\n%s" % result_list[i]
i += 1
self.logger.error(message)
else:
self.logger.info("CPU load is OK")
def sense(self):
load_threshold = self.getConfig('cpu-load-threshold')
threshold = 0
if load_threshold is not None:
try:
threshold = float(load_threshold)
except ValueError as e:
self.logger.error("CPU load threshold %r is not valid: %s" % (load_threshold, e))
return
self.checkCPULoad(threshold or 2.2)
def test(self):
# fail if load is higher than the threshold for more than 30 minutes
return self._test(result_count=10, failure_amount=10)
def anomaly(self):
# fail if load is higher than the threshold for more than 30 minutes
return self._test(result_count=10, failure_amount=10)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
from slapos.grid.svcbackend import getSupervisorRPC
from slapos.grid.svcbackend import _getSupervisordSocketPath
import os
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# SR can set custom periodicity
self.setPeriodicity(float(self.getConfig('frequency', 2)))
self.result_count = int(self.getConfig('result-count', 3))
self.failure_amount = int(self.getConfig('failure-amount', 3))
def sense(self):
"""
Check if a service is in the expected state
"""
service = self.getConfig('service')
expect = self.getConfig('expect', '')
if expect not in ("running", "stopped"):
self.logger.info("OK service %r is allowed to be in any state (expect = %r)", service, expect)
return
if not service:
self.logger.error("ERROR no service is specified")
return
run_directory = self.getConfig('run-directory', os.path.join(self.getPartitionFolder(), "srv", "runner", "var", "run"))
try:
supervisor_rpc = getSupervisorRPC(_getSupervisordSocketPath(run_directory, self.logger))
with supervisor_rpc as supervisor:
state = supervisor.getProcessInfo(service)['statename'].lower()
if state != expect:
self.logger.error("ERROR service %r is in state %r (expected %r)", service, state, expect)
else:
self.logger.info("OK service %r is in expected state %r", service, state)
except Exception as e:
self.logger.error("ERROR %r", e, exc_info=True)
def anomaly(self):
return self._anomaly(result_count=self.result_count, failure_amount=self.failure_amount)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import socket
ADDRESS_USAGE = (
"Address must be specified in 1 of the following 3 forms:"
" (host, port), path or abstract")
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=2)
self.result_count = int(self.getConfig('result-count', 3))
self.failure_amount = int(self.getConfig('failure-amount', 3))
def sense(self):
"""
Check the state of a socket.
"""
host = self.getConfig('host')
port = self.getConfig('port')
path = self.getConfig('pathname')
abstract = self.getConfig('abstract')
if host:
if path or abstract or not port:
self.logger.error(ADDRESS_USAGE)
return
# type of port must be int or str, unicode is not accepted.
family, _, _, _, addr = socket.getaddrinfo(host, int(port))[0]
else:
if bool(path) == bool(abstract):
self.logger.error(ADDRESS_USAGE)
return
family = socket.AF_UNIX
addr = path or '\0' + abstract
s = socket.socket(family, socket.SOCK_STREAM)
try:
s.connect(addr)
except socket.error as e:
self.logger.error('%s: %s', type(e).__name__, e)
else:
self.logger.info("socket connection OK %r", addr)
finally:
s.close()
def anomaly(self):
"""
By default, there is an anomaly if last 3 senses were bad.
"""
return self._anomaly(result_count=self.result_count, failure_amount=self.failure_amount)
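The three addressing forms accepted by this promise, shown with the plain socket module (addresses are placeholders):

import socket

def can_connect(family, addr):
  s = socket.socket(family, socket.SOCK_STREAM)
  try:
    s.connect(addr)
  except socket.error:
    return False
  else:
    return True
  finally:
    s.close()

# (host, port): resolved first, exactly like the promise does
family, _, _, _, addr = socket.getaddrinfo('127.0.0.1', 8080)[0]
print(can_connect(family, addr))
# UNIX socket given by filesystem path, or Linux abstract socket (leading NUL)
print(can_connect(socket.AF_UNIX, '/tmp/example.sock'))
print(can_connect(socket.AF_UNIX, '\0example-abstract'))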
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import datetime
import email.utils
import json
import os
import time
from six.moves.urllib.parse import urlparse
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
EXTENDED_STATUS_CODE_MAPPING = {
'520': 'Too many redirects',
'523': 'Connection error',
'524': 'Connection timeout',
'526': 'SSL Error',
}
def __init__(self, config):
super(RunPromise, self).__init__(config)
    # Set a frequency compatible with the default surykatka interval of 2 minutes
self.setPeriodicity(float(self.getConfig('frequency', 2)))
self.failure_amount = int(
self.getConfig('failure-amount', self.getConfig('failure_amount', 1)))
self.result_count = self.failure_amount
self.error_list = []
self.info_list = []
    # Make promise test-less, as its result is not important for instantiation
self.setTestLess()
def appendErrorMessage(self, message):
self.error_list.append(message)
def appendInfoMessage(self, message):
self.info_list.append(message)
def emitLog(self):
if len(self.error_list) > 0:
emit = self.logger.error
else:
emit = self.logger.info
message_list = self.error_list + self.info_list
url = self.getConfig('url')
if url:
message_list.insert(0, '%s :' % (url,))
emit(' '.join(message_list))
def senseBotStatus(self):
key = 'bot_status'
def appendError(msg, *args):
self.appendErrorMessage(key + ': ERROR ' + msg % args)
if key not in self.surykatka_json:
appendError("%r not in %r", key, self.json_file)
return
bot_status_list = self.surykatka_json[key]
if len(bot_status_list) == 0:
appendError("%r empty in %r", key, self.json_file)
return
bot_status = bot_status_list[0]
if bot_status.get('text') != 'loop':
appendError(
"bot_status is %r instead of 'loop' in %r",
str(bot_status.get('text')), self.json_file)
return
timetuple = email.utils.parsedate(bot_status['date'])
last_bot_datetime = datetime.datetime.fromtimestamp(time.mktime(timetuple))
last_bot_datetime_string = email.utils.formatdate(time.mktime(timetuple))
delta = self.utcnow - last_bot_datetime
# sanity check
if delta < datetime.timedelta(minutes=0):
appendError('Last bot datetime is in future')
return
if delta > datetime.timedelta(minutes=15):
appendError('Last bot datetime is more than 15 minutes old')
return
self.appendInfoMessage('%s: OK Last bot status' % (key,))
def senseSslCertificate(self):
key = 'ssl_certificate'
def appendError(msg, *args):
self.appendErrorMessage(key + ': ERROR ' + msg % args)
url = self.getConfig('url')
parsed_url = urlparse(url)
if parsed_url.scheme == 'https':
hostname = parsed_url.netloc
ssl_check = True
certificate_expiration_days = self.getConfig(
'certificate-expiration-days', '15')
try:
certificate_expiration_days = int(certificate_expiration_days)
except ValueError:
certificate_expiration_days = None
else:
ssl_check = False
certificate_expiration_days = None
if not ssl_check:
return
if certificate_expiration_days is None:
appendError(
'certificate-expiration-days %r is incorrect',
self.getConfig('certificate-expiration-days'))
return
if not hostname:
appendError('url is incorrect')
return
if key not in self.surykatka_json:
appendError(
        'No key %r. If the error persists, please update surykatka.' % (key,))
return
entry_list = [
q for q in self.surykatka_json[key] if q['hostname'] == hostname]
if len(entry_list) == 0:
appendError('No data')
return
for entry in entry_list:
timetuple = email.utils.parsedate(entry['not_after'])
if timetuple is None:
appendError('No certificate information for %s' % (entry['ip']))
else:
certificate_expiration_time = datetime.datetime.fromtimestamp(
time.mktime(timetuple))
if certificate_expiration_time - datetime.timedelta(
days=certificate_expiration_days) < self.utcnow:
appendError(
'Certificate on %s will expire on %s, which is less than %s days',
entry['ip'], entry['not_after'], certificate_expiration_days)
return
else:
self.appendInfoMessage(
'%s: OK Certificate on %s will expire on %s, which is more than '
'%s days' % (
key, entry['ip'], entry['not_after'],
certificate_expiration_days))
return
def senseHttpQuery(self):
key = 'http_query'
error = False
def appendError(msg, *args):
self.appendErrorMessage(key + ': ERROR ' + msg % args)
if key not in self.surykatka_json:
appendError("%r not in %r", key, self.json_file)
return
url = self.getConfig('url')
status_code = self.getConfig('status-code')
ip_list = self.getConfig('ip-list', '').split()
http_header_dict = json.loads(self.getConfig('http-header-dict', '{}'))
entry_list = [q for q in self.surykatka_json[key] if q['url'] == url]
if len(entry_list) == 0:
appendError('No data')
return
for entry in entry_list:
entry_status_code = str(entry['status_code'])
if entry_status_code != status_code:
status_code_explanation = self.EXTENDED_STATUS_CODE_MAPPING.get(
entry_status_code)
if status_code_explanation:
status_code_explanation = '%s (%s)' % (
entry_status_code, status_code_explanation)
else:
status_code_explanation = entry_status_code
appendError(
'IP %s got status code %s instead of %s' % (
entry['ip'], status_code_explanation, status_code))
error = True
if http_header_dict and http_header_dict != entry['http_header_dict']:
appendError(
'HTTP Header dict was %s instead of %s' % (
json.dumps(entry['http_header_dict'], sort_keys=True),
json.dumps(http_header_dict, sort_keys=True),
))
error = True
db_ip_list = [q['ip'] for q in entry_list]
if len(ip_list):
if set(ip_list) != set(db_ip_list):
appendError(
          'expected IPs %s differ from received %s' % (
' '.join(ip_list), ' '.join(db_ip_list)))
error = True
if error:
return
info_message = '%s: OK with status code %s' % (key, status_code)
if http_header_dict:
info_message += ' and HTTP Header dict %s' % (
json.dumps(http_header_dict, sort_keys=True),
)
if len(ip_list) > 0:
info_message += ' on IPs %s' % (' '.join(ip_list))
self.appendInfoMessage(info_message)
def senseElapsedTime(self):
key = 'elapsed_time'
surykatka_key = 'http_query'
def appendError(msg, *args):
self.appendErrorMessage(key + ': ERROR ' + msg % args)
if surykatka_key not in self.surykatka_json:
appendError(
        'No key %r. If the error persists, please update surykatka.' % (
surykatka_key,))
return
url = self.getConfig('url')
maximum_elapsed_time = self.getConfig('maximum-elapsed-time')
entry_list = [
q for q in self.surykatka_json[surykatka_key] if q['url'] == url]
if len(entry_list) == 0:
appendError('No data')
return
for entry in entry_list:
if maximum_elapsed_time:
if 'total_seconds' in entry:
maximum_elapsed_time = float(maximum_elapsed_time)
if entry['total_seconds'] == 0.:
appendError('IP %s failed to reply' % (entry['ip']))
elif entry['total_seconds'] > maximum_elapsed_time:
appendError(
'IP %s replied in more time than maximum %.2fs' %
(entry['ip'], maximum_elapsed_time))
else:
self.appendInfoMessage(
'%s: OK IP %s replied in less time than maximum %.2fs' % (
key, entry['ip'], maximum_elapsed_time))
def sense(self):
"""
Check if frontend URL is available
"""
test_utcnow = self.getConfig('test-utcnow')
if test_utcnow:
self.utcnow = datetime.datetime.fromtimestamp(
time.mktime(email.utils.parsedate(test_utcnow)))
else:
self.utcnow = datetime.datetime.utcnow()
self.json_file = self.getConfig('json-file', '')
if not os.path.exists(self.json_file):
      self.appendErrorMessage('ERROR File %r does not exist' % self.json_file)
else:
with open(self.json_file) as fh:
try:
self.surykatka_json = json.load(fh)
except Exception:
self.appendErrorMessage(
"ERROR loading JSON from %r" % self.json_file)
else:
report = self.getConfig('report')
if report == 'bot_status':
self.senseBotStatus()
elif report == 'http_query':
self.senseHttpQuery()
self.senseSslCertificate()
self.senseElapsedTime()
else:
self.appendErrorMessage(
"ERROR Report %r is not supported" % report)
self.emitLog()
def anomaly(self):
return self._test(
result_count=self.result_count, failure_amount=self.failure_amount)
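For reference, the shape of the surykatka JSON file these checks expect, as inferred from the code above (all values are illustrative):

surykatka_json = {
  "bot_status": [
    {"text": "loop", "date": "Wed, 01 Jan 2020 00:00:00 -0000"},
  ],
  "ssl_certificate": [
    {"hostname": "www.example.com", "ip": "203.0.113.1",
     "not_after": "Fri, 01 Jan 2021 00:00:00 -0000"},
  ],
  "http_query": [
    {"url": "https://www.example.com/", "ip": "203.0.113.1",
     "status_code": 200, "http_header_dict": {"Vary": "Accept-Encoding"},
     "total_seconds": 1.2},
  ],
}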
"""
Some notable parameters:
promise-timeout:
Optional timeout (in seconds) for promise.
timeout:
Optional timeout (in seconds) for HTTP request.
verify, ca-cert-file, cert-file, key-file:
Optional SSL information. (See Python requests documentation.)
http-code:
(default 200) The expected response HTTP code.
ignore-code:
(default 0) If set to 1, ignore the response HTTP code.
username, password:
If supplied, enables basic HTTP authentication.
"""
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import requests
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# SR can set custom periodicity
self.setPeriodicity(float(self.getConfig('frequency', 2)))
def sense(self):
"""
Check if frontend URL is available.
"""
url = self.getConfig('url')
    # make the default timeout at most 5 seconds, a bit smaller than
    # promise-timeout, and at the same time at least 1 second
default_timeout = max(
1, min(5, int(self.getConfig('promise-timeout', 20)) - 1))
expected_http_code = int(self.getConfig('http-code', 200))
ca_cert_file = self.getConfig('ca-cert-file')
cert_file = self.getConfig('cert-file')
key_file = self.getConfig('key-file')
verify = int(self.getConfig('verify', 0))
username = self.getConfig('username')
password = self.getConfig('password')
if int(self.getConfig('ignore-code', 0)) == 1:
ignore_code = True
else:
ignore_code = False
if ca_cert_file:
verify = ca_cert_file
elif verify:
verify = True
else:
verify = False
if key_file and cert_file:
cert = (cert_file, key_file)
else:
cert = None
if username and password:
credentials = (username, password)
request_type = "authenticated"
else:
credentials = None
request_type = "non-authenticated"
request_options = {
'allow_redirects': True,
'timeout': int(self.getConfig('timeout', default_timeout)),
'verify': verify,
'cert': cert,
'auth': credentials,
}
try:
response = requests.get(url, **request_options)
except requests.exceptions.SSLError as e:
if 'certificate verify failed' in str(e):
self.logger.error(
"ERROR SSL verify failed while accessing %r", url)
else:
self.logger.error(
"ERROR Unknown SSL error %r while accessing %r", e, url)
except requests.ConnectionError as e:
self.logger.error(
"ERROR connection not possible while accessing %r", url)
except Exception as e:
self.logger.error("ERROR: %s", e)
else:
# Log a sensible message, depending on the request/response
# parameters.
if ignore_code:
log = self.logger.info
result = "succeeded"
message = "return code ignored"
elif response.status_code == expected_http_code:
log = self.logger.info
result = "succeeded"
message = "returned expected code %d" % expected_http_code
else:
log = self.logger.error
result = "failed"
message = "returned %d, expected %d" % (response.status_code,
expected_http_code)
log("%s request to %r %s (%s)", request_type, url, result, message)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
import time
import psutil
from .util import tail_file
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=2)
def sense(self):
process_pid_file = self.getConfig('process-pid-file')
if not os.path.exists(process_pid_file):
self.logger.error("Bootstrap didn't run!")
return
with open(process_pid_file) as f:
try:
pid = int(f.read())
except ValueError as e:
raise ValueError("%r is empty or doesn't contain a valid pid number: %s" % (
process_pid_file, str(e)))
try:
process = psutil.Process(pid)
command_string = ' '.join(process.cmdline())
if "monitor.bootstrap" in command_string and \
self.getPartitionFolder() in command_string:
for i in range(0, 15):
if process.is_running():
time.sleep(1)
else:
break
else:
self.logger.error("Monitor bootstrap is running for more than 15 seconds!")
return
except psutil.NoSuchProcess:
# process exited
pass
status_file = self.getConfig('status-file')
if os.path.exists(status_file) and not os.stat(status_file).st_size:
self.logger.info("Bootstrap OK")
return
message = "Monitor bootstrap exited with error."
log_file = os.path.join(self.getPartitionFolder(), ".%s_%s.log" % (
self.getConfig('partition-id'),
self.getConfig('process-name')))
if os.path.exists(log_file):
      message += "\n ---- Latest monitor-bootstrap.log ----\n"
message += tail_file(log_file, 4)
self.logger.error(message)
def test(self):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
    # bang if we have 3 errors in a row
return self._anomaly(result_count=3, failure_amount=3)
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
try:
import subprocess32 as subprocess
except ImportError:
import subprocess
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=int(self.getConfig('frequency', 5)))
def sense(self):
"""
Check trafficserver cache availability
"""
wrapper = self.getConfig('wrapper-path')
if 'traffic_line' in wrapper:
args = [wrapper, '-r', 'proxy.node.cache.percent_free']
message = "Cache not available, availability: %s"
elif 'traffic_ctl' in wrapper:
args = [wrapper, 'metric', 'get', 'proxy.process.cache.percent_full']
message = "Cache not available, occupation: %s"
else:
self.logger.error("Wrapper %r not supported." % (wrapper,))
return
try:
subprocess.check_output(args, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
result = e.output.strip()
self.logger.error(message, result if str is bytes else
result.decode('utf-8', 'replace'))
else:
self.logger.info("OK")
def anomaly(self):
"""
There is an anomaly if last 3 senses were bad.
"""
return self._anomaly(result_count=3, failure_amount=3)
\ No newline at end of file
def tail_file(file_path, line_count=10):
"""
  Return the last line_count lines of the file as a single string.
"""
line_list = []
with open(file_path) as f:
BUFSIZ = 1024
f.seek(0, 2)
bytes = f.tell()
size = line_count + 1
block = -1
while size > 0 and bytes > 0:
if bytes - BUFSIZ > 0:
# Seek back one whole BUFSIZ
f.seek(block * BUFSIZ, 2)
line_list.insert(0, f.read(BUFSIZ))
else:
f.seek(0, 0)
# only read what was not read
line_list.insert(0, f.read(bytes))
line_len = line_list[0].count('\n')
size -= line_len
bytes -= BUFSIZ
block -= 1
return '\n'.join(''.join(line_list).splitlines()[-line_count:])
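A minimal usage sketch for this helper (monitor_bootstrap_status above imports it as `from .util import tail_file`):

if __name__ == '__main__':
  # quick self-check: write a small file and print its last 4 lines
  import tempfile
  with tempfile.NamedTemporaryFile('w', suffix='.log', delete=False) as f:
    f.write('\n'.join('line %d' % i for i in range(10)))
  print(tail_file(f.name, 4))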
\ No newline at end of file
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
try:
import subprocess32 as subprocess
except ImportError:
import subprocess
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# check configuration every 5 minutes (only for anomaly)
self.setPeriodicity(minute=int(self.getConfig('frequency', 5)))
def sense(self):
"""
    Run the frontend validation script
"""
validate_script = self.getConfig('verification-script')
if not validate_script:
raise ValueError("'verification-script' was not set in promise parameters.")
try:
subprocess.check_output(validate_script, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
message = e.output
self.logger.error(message if str is bytes else
message.decode('utf-8', 'replace'))
else:
self.logger.info("OK")
def anomaly(self):
return self._anomaly(result_count=1, failure_amount=1)
@@ -2,6 +2,7 @@
extends =
buildout.hash.cfg
../../promise/plugin/buildout.cfg
../../component/apache/buildout.cfg
../../component/curl/buildout.cfg
../../component/dash/buildout.cfg
@@ -16,6 +17,19 @@ parts =
monitor-eggs
monitor2-template
###
[slapos-cookbook-develop]
egg = slapos.cookbook
[slapos-cookbook]
eggs = ${slapos-cookbook-develop:egg}
[slapos.cookbook-repository]
repository = https://lab.nexedi.com/xavier_thompson/slapos.git
branch = plugin_recipe_extra_paths
develop = true
###
[monitor-download-base]
recipe = hexagonit.recipe.download
ignore-existing = true
@@ -93,6 +107,7 @@ context =
raw template_wrapper ${monitor-template-wrapper:location}/${monitor-template-wrapper:filename}
raw check_disk_space ${buildout:bin-directory}/check-free-disk
raw bin_directory ${buildout:directory}/bin
raw plugins_directory ${download-promise-plugins:directory}
[versions]
cns.recipe.symlink = 0.2.3
@@ -14,7 +14,7 @@
# not need these here).
[monitor2-template]
filename = instance-monitor.cfg.jinja2.in
md5sum = d4185c191e8b9df20e1f98cd8c556b1d
md5sum = 234289899a0cd1f04cb196a632a713ab
[monitor-httpd-conf]
_update_hash_filename_ = templates/monitor-httpd.conf.in
@@ -301,9 +301,13 @@ pre = {{ monitor_statistic }} --history_folder ${monitor-directory:public}
[monitor-promise-base]
recipe = slapos.cookbook:promise.plugin
eggs =
slapos.toolbox
cryptography
pytz
tzlocal
extra-paths =
{{ plugins_directory }}
content =
from slapos.promise.plugin.${:module} import RunPromise
from plugin.${:module} import RunPromise
mode = 600
output = ${directory:plugins}/${:name}
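With extra-paths pointing at the downloaded plugins directory, a generated plugin file presumably reduces to a one-line import of the shared module; for example (module name chosen arbitrarily):

from plugin.check_url_available import RunPromise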