Commit 76f0a0bd authored by Justin's avatar Justin

General alarms for node monitor

See merge request !105
parents 3b7d673f 3b0c79df
...@@ -11,7 +11,8 @@ for f in sorted(glob.glob(os.path.join('slapos', 'README.*.rst'))): ...@@ -11,7 +11,8 @@ for f in sorted(glob.glob(os.path.join('slapos', 'README.*.rst'))):
long_description += open("CHANGES.txt").read() + "\n" long_description += open("CHANGES.txt").read() + "\n"
test_require = ['mock', 'cryptography',] prediction_require = ['statsmodels', 'scipy', 'pandas']
test_require = ['mock', 'cryptography',] + prediction_require
setup(name=name, setup(name=name,
version=version, version=version,
...@@ -61,6 +62,8 @@ setup(name=name, ...@@ -61,6 +62,8 @@ setup(name=name,
'lampconfigure': ["mysqlclient"], #needed for MySQL Database access 'lampconfigure': ["mysqlclient"], #needed for MySQL Database access
'zodbpack': ['ZODB3'], # needed to play with ZODB 'zodbpack': ['ZODB3'], # needed to play with ZODB
'flask_auth' : ["Flask-Auth"], 'flask_auth' : ["Flask-Auth"],
'pandas' : ['pandas'], # needed to monitor_partition_space promise
'prediction' : prediction_require, # needed to use ARIMA in check_free_disk_space
'test': test_require, 'test': test_require,
}, },
tests_require=test_require, tests_require=test_require,
......
...@@ -36,23 +36,12 @@ import time ...@@ -36,23 +36,12 @@ import time
import json import json
import argparse import argparse
import psutil import psutil
from time import strftime, gmtime from time import strftime
from datetime import datetime, timedelta from datetime import datetime, timedelta
from slapos.collect.db import Database from slapos.collect.db import Database
from slapos.collect.reporter import ConsumptionReportBase from slapos.collect.reporter import ConsumptionReportBase
def get_date_scope():
return strftime('%Y-%m-%d', gmtime())
def get_min_time():
gm = gmtime()
return '{}:{}:00'.format(gm.tm_hour, gm.tm_min - 1)
def get_max_time():
gm = gmtime()
return '{}:{}:59'.format(gm.tm_hour, gm.tm_min - 1)
def parseArguments(): def parseArguments():
""" """
Parse arguments for monitor collector instance. Parse arguments for monitor collector instance.
...@@ -114,11 +103,11 @@ class ResourceCollect: ...@@ -114,11 +103,11 @@ class ResourceCollect:
if where != "": if where != "":
where = "and %s" % where where = "and %s" % where
if not date_scope: if not date_scope:
date_scope = get_date_scope() date_scope = datetime.now().strftime('%Y-%m-%d')
if not min_time: if not min_time:
min_time = get_min_time() min_time = (datetime.now() - timedelta(minutes=1)).strftime('%H:%M:00')
if not max_time: if not max_time:
max_time = get_max_time() max_time = (datetime.now() - timedelta(minutes=1)).strftime('%H:%M:59')
columns = """count(pid), SUM(cpu_percent) as cpu_result, SUM(cpu_time), columns = """count(pid), SUM(cpu_percent) as cpu_result, SUM(cpu_time),
MAX(cpu_num_threads), SUM(memory_percent), SUM(memory_rss), pid, SUM(io_rw_counter), MAX(cpu_num_threads), SUM(memory_percent), SUM(memory_rss), pid, SUM(io_rw_counter),
...@@ -159,11 +148,11 @@ class ResourceCollect: ...@@ -159,11 +148,11 @@ class ResourceCollect:
if where != "": if where != "":
where = " and %s" % where where = " and %s" % where
if not date_scope: if not date_scope:
date_scope = get_date_scope() date_scope = datetime.now().strftime('%Y-%m-%d')
if not min_time: if not min_time:
min_time = get_min_time() min_time = (datetime.now() - timedelta(minutes=1)).strftime('%H:%M:00')
if not max_time: if not max_time:
max_time = get_max_time() max_time = (datetime.now() - timedelta(minutes=1)).strftime('%H:%M:59')
colums = """count(pid), SUM(cpu_percent), SUM(cpu_time), SUM(cpu_num_threads), SUM(memory_percent), colums = """count(pid), SUM(cpu_percent), SUM(cpu_time), SUM(cpu_num_threads), SUM(memory_percent),
SUM(memory_rss), SUM(io_rw_counter), SUM(io_cycles_counter)""" SUM(memory_rss), SUM(io_rw_counter), SUM(io_cycles_counter)"""
......
import json
import os
import psutil
import time
from .util import JSONPromise
from zope.interface import implementer
from slapos.grid.promise import interface
@implementer(interface.IPromise)
class RunPromise(JSONPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
self.setPeriodicity(float(self.getConfig('frequency', 2)))
self.avg_flag_file = self.getConfig('last-avg-computation-file', 'last_avg')
self.max_spot_temp = float(self.getConfig('max-spot-temp', 90)) # °C
self.max_avg_temp = float(self.getConfig('max-avg-temp', 80)) # °C
self.avg_temp_duration = int(self.getConfig('avg-temp-duration', 600)) # secondes
def sense(self):
success = True
# Get current temperature
try:
cpu_temp = psutil.sensors_temperatures()['coretemp'][0][1]
except (KeyError, IndexError) as e:
# Put logger.info to avoid errors when sensors are not
# supported by OS (ex: VM)
self.logger.info("Could not read core temperature on VM")
return
# Check spot temperature
if cpu_temp > self.max_spot_temp:
success = False
self.logger.error(
"Temperature reached critical threshold: %s °C"
" (threshold is %s °C)",
cpu_temp, self.max_spot_temp)
# Log temperature
data = json.dumps({'cpu_temperature': cpu_temp})
self.json_logger.info("Temperature data", extra={'data': data})
# TODO: promise should compute average only with logs between interval
# Computer average temperature
avg_computation_period = self.avg_temp_duration / 4
try:
t = os.path.getmtime(self.avg_flag_file)
except OSError:
t = 0
if (time.time() - t) > avg_computation_period:
open(self.avg_flag_file, 'w').close()
temp_list = self.getJsonLogDataInterval(self.avg_temp_duration)
if temp_list:
avg_temp = sum(x['cpu_temperature'] for x in temp_list) / len(temp_list)
if avg_temp > self.max_avg_temp:
success = False
self.logger.error(
"Average temperature over the last %ds reached threshold: %s °C"
" (threshold is %s °C)",
self.avg_temp_duration, avg_temp, self.max_avg_temp)
else:
success = False
self.logger.error("Couldn't read temperature from log")
if success:
self.logger.info("Temperature OK (%s °C)", cpu_temp)
def test(self):
"""
Called after sense() if the instance is still converging.
Returns success or failure based on sense results.
In this case, fail if the previous sensor result is negative.
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
"""
Called after sense() if the instance has finished converging.
Returns success or failure based on sense results.
Failure signals the instance has diverged.
In this case, fail if two out of the last three results are negative.
"""
return self._anomaly(result_count=3, failure_amount=2)
...@@ -11,21 +11,33 @@ import sqlite3 ...@@ -11,21 +11,33 @@ import sqlite3
import argparse import argparse
import datetime import datetime
import psutil import psutil
import itertools
import warnings
import pkgutil
from slapos.collect.db import Database from slapos.collect.db import Database
from contextlib import closing
# install pandas, numpy and statsmodels for ARIMA prediction
try:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
except ImportError:
pass
@implementer(interface.IPromise) @implementer(interface.IPromise)
class RunPromise(GenericPromise): class RunPromise(GenericPromise):
def __init__(self, config): def __init__(self, config):
super(RunPromise, self).__init__(config) super(RunPromise, self).__init__(config)
# check disk space at least every 3 minutes # check disk space at least every hours (heavy in computation)
self.setPeriodicity(minute=3) self.setPeriodicity(float(self.getConfig('frequency', 60)))
def getDiskSize(self, disk_partition, database): def getDiskSize(self, disk_partition, database):
database = Database(database, create=False, timeout=10) database = Database(database, create=False, timeout=10)
# by using contextlib.closing, we don't need to close the database explicitly
with closing(database):
try: try:
# fetch disk size # fetch disk size
database.connect() database.connect()
...@@ -44,15 +56,11 @@ class RunPromise(GenericPromise): ...@@ -44,15 +56,11 @@ class RunPromise(GenericPromise):
not self.raiseOnDatabaseLocked(locked_message): not self.raiseOnDatabaseLocked(locked_message):
return None return None
raise raise
finally:
try:
database.close()
except Exception:
pass
return disk_size return disk_size
def getFreeSpace(self, disk_partition, database, date, time): def getFreeSpace(self, disk_partition, database, date, time):
database = Database(database, create=False, timeout=10) database = Database(database, create=False, timeout=10)
with closing(database):
try: try:
# fetch free disk space # fetch free disk space
database.connect() database.connect()
...@@ -71,12 +79,143 @@ class RunPromise(GenericPromise): ...@@ -71,12 +79,143 @@ class RunPromise(GenericPromise):
not self.raiseOnDatabaseLocked(locked_message): not self.raiseOnDatabaseLocked(locked_message):
return 0 return 0
raise raise
finally: return int(disk_free)
def getBiggestPartitions(self, database, date, time):
# displays the 3 biggest partitions thanks to disk usage
limit = 3
database = Database(database, create=False, timeout=10)
with closing(database):
try: try:
database.close() database.connect()
date_time = date + ' ' + time
# gets the data recorded between the current date (date_time) and 24 hours earlier
where_query = "datetime(date || ' ' || time) >= datetime('%s', '-1 days') AND datetime(date || ' ' || time) <= datetime('%s')"
# gets only the most recent data for each partition
result = database.select(
"folder",
columns = "partition, disk_used*1024, max(datetime(date || ' ' || time))",
where = where_query % (date_time, date_time),
group = "partition",
order = "disk_used DESC",
limit = limit).fetchall()
if not result or not result[0]:
self.logger.info("No result from collector database in table folder: skipped")
return None
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
return result
def evaluateArimaModel(self, X, arima_order):
"""
Evaluate an ARIMA model for a given order (p,d,q) with the MSE which
measures the average of the squares of the errors.
"""
# take 66% of the data for training and 33% for testing
train_size = int(len(X) * 0.66)
train, test = X[0:train_size], X[train_size:]
history = [x for x in train]
# make predictions
predictions = list()
for t in range(len(test)):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
model = ARIMA(history, order=arima_order)
model_fit = model.fit(disp=-1)
yhat = model_fit.forecast()[0]
predictions.append(yhat)
history.append(test[t])
# calculate out of sample error
rmse = (np.square(np.subtract(test.values, np.hstack(predictions))).mean())**0.5
return rmse
def evaluateModels(self, dataset, p_values, d_values, q_values):
"""
Evaluate combinations of p, d and q values for an ARIMA model
"""
dataset = dataset.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
for d in d_values:
for q in q_values:
order = (p,d,q)
try:
rmse = self.evaluateArimaModel(dataset, order)
if rmse < best_score:
best_score, best_cfg = rmse, order
except Exception: except Exception:
pass pass
return int(disk_free) return best_cfg
def diskSpacePrediction(self, disk_partition, database, date, time, day_range):
"""
Returns an estimation of free disk space left depending on
the day_range parameter.
It uses Arima in order to predict data thanks to the 15 days before.
"""
database = Database(database, create=False, timeout=10)
with closing(database):
try:
database.connect()
# get one data per day, where each data is at the same time
where_query = "time between '%s:00' and '%s:30' and partition='%s'" % (
time, time, disk_partition)
result = database.select(
"disk",
columns = "free, datetime(date || ' ' || time)",
where = where_query,
order = "datetime(date || ' ' || time) ASC").fetchall()
# checks that there are at least 14 days of data
if (not result) or (len(result) < 14):
self.logger.info("No or not enough results from collector database in table disk: no prediction")
return None
# put the list in pandas dataframe format and set the right types
df = pd.DataFrame(data=result, columns=['free', 'date'])
df.loc[:,'date'] = pd.to_datetime(df.date)
df = df.astype({'free': np.float})
df = df.set_index('date')
# find the best configuration by trying different combinations
p_values = d_values = q_values = range(0, 3)
best_cfg = self.evaluateModels(df.free, p_values, d_values, q_values)
# set the days to be predicted
max_date_predicted = day_range+1
future_index_date = pd.date_range(df.index[-1], freq='24H', periods=max_date_predicted)
try:
# disabling warnings during the ARIMA calculation
with warnings.catch_warnings():
warnings.simplefilter("ignore")
model_arima = ARIMA(df, order=best_cfg)
# disp < 0 means no output about convergence information
model_arima_fit = model_arima.fit(disp=-1)
# save ARIMA predictions
fcast, _, conf = model_arima_fit.forecast(max_date_predicted, alpha=0.05)
# pass the same index as the others
fcast = pd.Series(fcast, index=future_index_date)
if fcast.empty:
self.logger.info("Arima prediction: none. Skipped prediction")
return None
except Exception:
self.logger.info("Arima prediction error: skipped prediction")
return None
# get results with 95% confidence
lower_series = pd.Series(conf[:, 0], index=future_index_date)
upper_series = pd.Series(conf[:, 1], index=future_index_date)
return fcast, lower_series, upper_series
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
def raiseOnDatabaseLocked(self, locked_message): def raiseOnDatabaseLocked(self, locked_message):
max_warn = 10 max_warn = 10
...@@ -149,7 +288,7 @@ class RunPromise(GenericPromise): ...@@ -149,7 +288,7 @@ class RunPromise(GenericPromise):
disk_partition = self.getConfig('test-disk-partition', '/dev/sda1') disk_partition = self.getConfig('test-disk-partition', '/dev/sda1')
else: else:
# get last minute # get last minute
now = datetime.datetime.now() now = datetime.datetime.utcnow()
currentdate = now.strftime('%Y-%m-%d') currentdate = now.strftime('%Y-%m-%d')
currenttime = now - datetime.timedelta(minutes=1) currenttime = now - datetime.timedelta(minutes=1)
currenttime = currenttime.time().strftime('%H:%M') currenttime = currenttime.time().strftime('%H:%M')
...@@ -158,8 +297,7 @@ class RunPromise(GenericPromise): ...@@ -158,8 +297,7 @@ class RunPromise(GenericPromise):
default_threshold = None default_threshold = None
if disk_size is not None: if disk_size is not None:
default_threshold = round(disk_size/(1024*1024*1024) * 0.05, 2) default_threshold = round(disk_size/(1024*1024*1024) * 0.05, 2)
threshold = float(self.getConfig('threshold', default_threshold) or 2.0) threshold = float(self.getConfig('threshold', default_threshold) or default_threshold)
threshold_days = float(self.getConfig('threshold-days', '30'))
free_space = self.getFreeSpace(disk_partition, db_path, currentdate, free_space = self.getFreeSpace(disk_partition, db_path, currentdate,
currenttime) currenttime)
...@@ -170,12 +308,57 @@ class RunPromise(GenericPromise): ...@@ -170,12 +308,57 @@ class RunPromise(GenericPromise):
if inode_usage: if inode_usage:
self.logger.error(inode_usage) self.logger.error(inode_usage)
else: else:
self.logger.info("Disk usage: OK") self.logger.info("Current disk usage: OK")
# if the option is enabled and the current disk size is large enough,
# we check the predicted remaining disk space
display_prediction = bool(int(self.getConfig('display-prediction', 0) or 0))
self.logger.info("Enable to display disk space predictions: %s" % display_prediction)
if display_prediction:
# check that the libraries are installed from the slapos.toolbox extra requires
pandas_found = pkgutil.find_loader("pandas")
numpy_found = pkgutil.find_loader("numpy")
statsmodels_found = pkgutil.find_loader("statsmodels")
# if one module isn't installed
if pandas_found is None or numpy_found is None or statsmodels_found is None:
self.logger.warning("Trying to use statsmodels and pandas " \
"but at least one module is not installed. Prediction skipped.")
return return
nb_days_predicted = int(self.getConfig('nb-days-predicted', 10) or 10)
disk_space_prediction_tuple = self.diskSpacePrediction(
disk_partition, db_path, currentdate, currenttime, nb_days_predicted)
if disk_space_prediction_tuple is not None:
fcast, lower_series, upper_series = disk_space_prediction_tuple
space_left_predicted = fcast.iloc[-1]
last_date_predicted = datetime.datetime.strptime(str(fcast.index[-1]),
"%Y-%m-%d %H:%M:%S")
delta_days = (last_date_predicted.date() - \
datetime.datetime.strptime(currentdate, "%Y-%m-%d").date()).days
self.logger.info("Prediction: there will be %.2f G left on %s (%s days)." % (
space_left_predicted/(1024*1024*1024), last_date_predicted, delta_days))
if space_left_predicted <= threshold*1024*1024*1024:
self.logger.warning("The free disk space will be too low. " \
"(disk size: %.2f G, threshold: %s G)" % (
disk_size/(1024*1024*1024), threshold))
return
message = "Free disk space low: remaining %.2f G (disk size: %.0f G, threshold: %.0f G)." % (
free_space/(1024*1024*1024), disk_size/(1024*1024*1024), threshold)
free_space = round(free_space/(1024*1024*1024), 2) display_partition = bool(int(self.getConfig('display-partition', 0) or 0))
self.logger.error('Free disk space low: remaining %s G (threshold: %s G)' % ( self.logger.info("Enable to display the 3 biggest partitions: %s" % display_partition)
free_space, threshold)) if display_partition:
# display the 3 partitions that have the most storage capacity on the disk
big_partitions = self.getBiggestPartitions(db_path, currentdate, currenttime)
if big_partitions is not None:
for partition in big_partitions:
user_name, size_partition, date_checked = partition
partition_id = self.getConfig('partition-id', 'slappart')
# get the name of each partition by adding the user's number to the general name of the partition
partition_name = ''.join(x for x in partition_id if not x.isdigit()) + ''.join(filter(str.isdigit, user_name))
message += " The partition %s uses %.2f G (date checked: %s)." % (
partition_name, size_partition/(1024*1024*1024), date_checked)
# display the final error message
self.logger.error(message)
def test(self): def test(self):
return self._test(result_count=1, failure_amount=1) return self._test(result_count=1, failure_amount=1)
......
import psutil
import math
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# Get reference values
self.setPeriodicity(float(self.getConfig('frequency', 5)))
self.max_lost_packets = int(self.getConfig('max-lost-packets-per-MB', 100))
self.max_error_messages = int(self.getConfig('max-error-messages-per-MB', 100))
def sense(self):
promise_success = True
# Get current Network statistics
network_data = psutil.net_io_counters()
# Get total number of bytes recv and sent in MB (if > 1MB)
if (network_data.bytes_recv + network_data.bytes_sent) > 1e6:
total_MB = (network_data.bytes_recv + network_data.bytes_sent)/1e6
else:
total_MB = 1
# Get sum of errors and dropped packets
total_dropped = network_data.dropin + network_data.dropout
total_errors = network_data.errin + network_data.errout
# Check for network dropped packets
if total_dropped/total_MB >= self.max_lost_packets:
self.logger.error("Network packets lost reached critical threshold: %s "\
" (threshold is %s per MB)" % (math.ceil(total_dropped/total_MB), self.max_lost_packets))
promise_success = False
# Check for network errors
if total_errors/total_MB >= self.max_error_messages:
self.logger.error("Network errors reached critical threshold: %s "\
" (threshold is %s per MB)" % (math.ceil(total_errors/total_MB), self.max_error_messages))
promise_success = False
if promise_success:
self.logger.info("Network statistics OK")
def test(self):
"""
Called after sense() if the instance is still converging.
Returns success or failure based on sense results.
In this case, fail if the previous sensor result is negative.
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
"""
Called after sense() if the instance has finished converging.
Returns success or failure based on sense results.
Failure signals the instance has diverged.
In this case, fail if two out of the last three results are negative.
"""
return self._anomaly(result_count=3, failure_amount=2)
import json
import os
import psutil
import time
from psutil._common import bytes2human
from .util import JSONPromise
from zope.interface import implementer
from slapos.grid.promise import interface
@implementer(interface.IPromise)
class RunPromise(JSONPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# Get reference values
self.setPeriodicity(float(self.getConfig('frequency', 1)))
self.last_transit_file = self.getConfig('last-transit-file', 'last_transit')
self.max_data_amount = float(self.getConfig('max-data-amount', 10e3))*1048576 # MB converted into bytes
self.min_data_amount = float(self.getConfig('min-data-amount', 0.1))*1048576 # MB converted into bytes
self.transit_period = int(self.getConfig('transit-period', 600)) # secondes
def sense(self):
promise_success = True
# Get current network statistics, see https://psutil.readthedocs.io/en/latest/#network
network_data = psutil.net_io_counters(nowrap=True)
data_amount = network_data.bytes_recv + network_data.bytes_sent
# Log data amount
data = json.dumps({'network_data_amount': data_amount})
self.json_logger.info("Network data amount", extra={'data': data})
# Get last timestamp (i.e. last modification) of log file
try:
t = os.path.getmtime(self.last_transit_file)
except OSError:
t = 0
# We recalculate every quarter of transit_period since calculate over periodicity
# can be heavy in computation
if (time.time() - t) > self.transit_period / 4:
open(self.last_transit_file, 'w').close()
temp_list = self.getJsonLogDataInterval(self.transit_period)
if temp_list:
# If no previous data in log
if len(temp_list) == 1:
pass
else:
data_diff = temp_list[0]['network_data_amount'] - temp_list[-1]['network_data_amount']
if data_diff <= self.min_data_amount:
self.logger.error("Network congested, data amount over the last %s seconds "\
"reached minimum threshold: %7s (threshold is %7s)"
% (self.transit_period, bytes2human(data_diff), bytes2human(self.min_data_amount)))
promise_success = False
if data_diff >= self.max_data_amount:
self.logger.error("Network congested, data amount over the last %s seconds "\
"reached maximum threshold: %7s (threshold is %7s)"
% (self.transit_period, bytes2human(data_diff), bytes2human(self.max_data_amount)))
promise_success = False
else:
self.logger.error("Couldn't read network data from log")
promise_success = False
if promise_success:
self.logger.info("Network transit OK")
def test(self):
"""
Called after sense() if the instance is still converging.
Returns success or failure based on sense results.
In this case, fail if the previous sensor result is negative.
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
"""
Called after sense() if the instance has finished converging.
Returns success or failure based on sense results.
Failure signals the instance has diverged.
In this case, fail if two out of the last three results are negative.
"""
return self._anomaly(result_count=3, failure_amount=2)
import json
import os
import psutil
import time
from psutil._common import bytes2human
from .util import JSONPromise
from zope.interface import implementer
from slapos.grid.promise import interface
@implementer(interface.IPromise)
class RunPromise(JSONPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# Get reference values
self.setPeriodicity(float(self.getConfig('frequency', 2)))
self.last_avg_ram_file = self.getConfig('last-avg-ram-file', 'last_avg')
self.min_threshold_ram = float(self.getConfig('min-threshold-ram', 500))*1048576 # MB converted into bytes
self.min_avg_ram = float(self.getConfig('min-avg-ram', 1e3))*1048576 # MB converted into bytes
self.avg_ram_period = int(self.getConfig('avg-ram-period', 600)) # secondes
def sense(self):
promise_success = True
# Get current RAM usage
ram_data = psutil.virtual_memory()
# Check with min threshold and log error if below it
if ram_data.available <= self.min_threshold_ram:
self.logger.error("RAM usage reached critical threshold: %7s "\
" (threshold is %7s)" % (bytes2human(ram_data.available), bytes2human(self.min_threshold_ram)))
promise_success = False
# Log RAM usage
data = json.dumps({'available_ram': ram_data.available})
self.json_logger.info("RAM data", extra={'data': data})
# Get last timestamp (i.e. last modification) of log file
try:
t = os.path.getmtime(self.last_avg_ram_file)
except OSError:
t = 0
# Get last available RAM from log file since avg_ram_period / 4
if (time.time() - t) > self.avg_ram_period / 4:
open(self.last_avg_ram_file, 'w').close()
temp_list = self.getJsonLogDataInterval(self.avg_ram_period)
if temp_list:
avg_ram = sum(map(lambda x: x['available_ram'], temp_list)) / len(temp_list)
if avg_ram < self.min_avg_ram:
self.logger.error("Average RAM usage over the last %s seconds "\
"reached threshold: %7s (threshold is %7s)"
% (self.avg_ram_period, bytes2human(avg_ram), bytes2human(self.min_avg_ram)))
promise_success = False
else:
self.logger.error("Couldn't read available RAM from log")
promise_success = False
if promise_success:
self.logger.info("RAM usage OK")
def test(self):
"""
Called after sense() if the instance is still converging.
Returns success or failure based on sense results.
In this case, fail if the previous sensor result is negative.
"""
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
"""
Called after sense() if the instance has finished converging.
Returns success or failure based on sense results.
Failure signals the instance has diverged.
In this case, fail if two out of the last three results are negative.
"""
return self._anomaly(result_count=3, failure_amount=2)
...@@ -12,7 +12,7 @@ class RunPromise(GenericPromise): ...@@ -12,7 +12,7 @@ class RunPromise(GenericPromise):
def __init__(self, config): def __init__(self, config):
super(RunPromise, self).__init__(config) super(RunPromise, self).__init__(config)
# test load every 3 minutes # test load every 3 minutes
self.setPeriodicity(minute=3) self.setPeriodicity(float(self.getConfig('frequency', 3)))
def checkCPULoad(self, tolerance=2.2): def checkCPULoad(self, tolerance=2.2):
# tolerance=1.5 => accept CPU load up to 1.5 =150% # tolerance=1.5 => accept CPU load up to 1.5 =150%
......
from __future__ import division
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
import sys
import pwd
import sqlite3
import argparse
import datetime
import psutil
import math
import pkgutil
# try to install pandas and numpy
try:
import pandas as pd
import numpy as np
except ImportError:
pass
from slapos.collect.db import Database
from contextlib import closing
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# at least every hours (heavy in computation)
self.setPeriodicity(float(self.getConfig('frequency', 60)))
def getDiskSize(self, disk_partition, db_path):
database = Database(db_path, create=False, timeout=10)
# by using contextlib.closing, we don't need to close the database explicitly
with closing(database):
try:
database.connect()
where_query = "partition='%s'" % (disk_partition)
order = "datetime(date || ' ' || time) DESC"
result = database.select(
"disk",
columns="free+used",
where=where_query,
order=order,
limit=1).fetchone()
if not result or not result[0]:
return None
disk_size = result[0]
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
return disk_size
def getPartitionSize(self, disk_partition, db_path):
database = Database(db_path, create=False, timeout=10)
with closing(database):
try:
database.connect()
where_query = "partition='%s'" % (disk_partition)
order = "datetime(date || ' ' || time) DESC"
result = database.select(
"folder",
columns="disk_used*1024",
where=where_query,
order=order,
limit=1).fetchone()
if not result or not result[0]:
return None
partition_size = result[0]
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
return partition_size
def getAnomaly(self, disk_partition, db_path, user, date, time):
database = Database(db_path, create=False, timeout=10)
with closing(database):
try:
disk_size = self.getDiskSize(disk_partition, db_path)
if disk_size is None:
return None
database.connect()
result = database.select(
"folder",
columns = "%s-disk_used*1024, disk_used*1024, datetime(date || ' ' || time)" % disk_size,
where = "partition='%s'" % (user),
order = "date ASC, time ASC"
).fetchall()
if not result or not result[0]:
self.logger.info("No result from collector database for the user %s: skipped", user)
return None
datetime_now = datetime.datetime.strptime(date + ' ' + time, "%Y-%m-%d %H:%M:%S")
# check that the last data is less than 24 hours old
last_date = datetime.datetime.strptime(result[-1][2], "%Y-%m-%d %H:%M:%S")
if (datetime_now - last_date) > datetime.timedelta(days=1):
self.logger.info("Not enough recent data to detect anomalies: skipped")
return None
# check that the first data is at least 13 days old
first_date = datetime.datetime.strptime(result[0][2], "%Y-%m-%d %H:%M:%S")
if (datetime_now - first_date) < datetime.timedelta(days=13):
self.logger.info("Not enough data to detect anomalies: skipped")
return None
df = pd.DataFrame(result, columns=["free", "used", "date"])
df.loc[:,'date'] = pd.to_datetime(df.date)
# keep a sample every 5 minutes, set NaN when there is no information
freq = 5
df = df.resample(str(freq)+"min", on='date').mean()
# estimate the missing information
df['free'] = df.free.astype(float).interpolate(method='linear')
df['used'] = df.used.astype(float).interpolate(method='linear')
# calculate the median for the element-wise absolute value
# of the difference between each x and the median of x
df = df.reset_index()
x = df['date']
y = df['free']
mad = lambda x: np.median(np.fabs(x - np.median(x)))
# threshold is set at 8% of the disk size by default
threshold_ratio = float(self.getConfig('threshold-ratio', 0.08) or 0.08)
threshold = threshold_ratio*disk_size
# use a 1-day window
minutes_per_day = 60*24/freq
rolling_window = int(minutes_per_day*1)
rolling_mad = y.rolling(window=rolling_window, center=False).median() + \
y.rolling(window=rolling_window, center=False).apply(mad)
rolling_mad_upper = rolling_mad + threshold
rolling_mad_lower = rolling_mad - threshold
# create Pandas DataFrame and rename columns
data = pd.concat([x, y, df['used'], rolling_mad, rolling_mad_upper, rolling_mad_lower], axis=1)
data.columns = ["date", "free", "used", "mad", "upper_mad", "lower_mad"]
# drop initial values (outside rolling window)
data.dropna(subset=["mad"], inplace=True)
# determine anomalies and display their number
data["is_anomaly"] = ~(data["free"].between(data["lower_mad"], data["upper_mad"]))
data = data.set_index("date")
if (len(data)==0):
self.logger.info("No result from anomaly detection")
return None
self.logger.info("There were %s anomalies in the last 15 days " \
"(1 data every %s minutes, threshold: %s %% of the disk size)" % (
len(data[data['is_anomaly'] == True]), freq, threshold_ratio*100))
return data
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
@staticmethod
def _checkInodeUsage(path):
stat = os.statvfs(path)
total_inode = stat.f_files
if total_inode:
usage = 100 * (total_inode - stat.f_ffree) / total_inode
if usage >= 98:
return "Disk Inodes usage is really high: %.4f%%" % usage
def getInodeUsage(self, path):
return (self._checkInodeUsage(path) or
os.path.ismount('/tmp') and self._checkInodeUsage('/tmp') or
"")
def sense(self):
# check that the libraries are installed from the slapos.toolbox extra requires
pandas_found = pkgutil.find_loader("pandas")
numpy_found = pkgutil.find_loader("numpy")
if pandas_found is None or numpy_found is None:
self.logger.warning("Trying to use pandas but the module is not installed. Promise skipped.")
return
# find if a disk is mounted on the path
disk_partition = ""
db_path = self.getConfig('collectordb')
check_date = self.getConfig('test-check-date')
path = os.path.join(self.getPartitionFolder(), "") + "extrafolder"
partitions = psutil.disk_partitions()
while path is not '/':
if not disk_partition:
path = os.path.dirname(path)
else:
break
for p in partitions:
if p.mountpoint == path:
disk_partition = p.device
break
if not disk_partition:
self.logger.error("Couldn't find disk partition")
return
if db_path.endswith("collector.db"):
db_path=db_path[:-len("collector.db")]
if check_date:
# testing mode
currentdate = check_date
currenttime = self.getConfig('test-check-time', '09:30:30')
user = self.getConfig('test-partition', 'slapuser0')
else:
# get the user name of the partition
user = pwd.getpwuid(os.getuid()).pw_name
# get last minute
now = datetime.datetime.utcnow()
currentdate = now.strftime('%Y-%m-%d')
currenttime = now - datetime.timedelta(minutes=1)
currenttime = currenttime.time().strftime('%H:%M:%S')
partition_size = self.getPartitionSize(user, db_path)
data = self.getAnomaly(disk_partition, db_path, user, currentdate, currenttime)
if data is None:
return
last_data = data.iloc[-1]
last_date = data.index[-1]
if last_data.is_anomaly:
self.logger.error("Anomaly detected on %s. Space used by %s: %.2f G." % (
last_date, user, partition_size/(1024*1024*1024)))
else:
self.logger.info("No anomaly detected (last date checked: %s)" % (last_date))
def test(self):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)
import itertools
import json
import logging
import os
import textwrap
from dateutil import parser as dateparser
from datetime import datetime
from slapos.grid.promise.generic import GenericPromise
def iter_reverse_lines(f):
"""
Read lines from the end of the file
"""
f.seek(0, os.SEEK_END)
while True:
try:
while f.seek(-2, os.SEEK_CUR) and f.read(1) != b'\n':
pass
except OSError:
return
pos = f.tell()
yield f.readline()
f.seek(pos, os.SEEK_SET)
def iter_logrotate_file_handle(path, mode='r'):
"""
Yield successive file handles for rotated logs
(XX.log, XX.log.1, XX.log.2, ...)
"""
for i in itertools.count():
path_i = path + str(i or '')
try:
with open(path_i, mode) as f:
yield f
except OSError:
break
class JSONPromise(GenericPromise):
def __init__(self, config):
self.__name = config.get('name', None)
self.__log_folder = config.get('log-folder', None)
super(JSONPromise, self).__init__(config)
json_log_name = os.path.splitext(self.__name)[0] + '.json.log'
self.__json_log_file = os.path.join(self.__log_folder, json_log_name)
self.json_logger = self.__makeJsonLogger(self.__json_log_file)
def __makeJsonLogger(self, json_log_file):
logger = logging.getLogger('json-logger')
logger.setLevel(logging.INFO)
handler = logging.FileHandler(json_log_file)
formatter = logging.Formatter(
'{"time": "%(asctime)s", "log_level": "%(levelname)s"'
', "message": "%(message)s", "data": %(data)s}'
)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
def getJsonLogDataInterval(self, interval):
"""
Get all data in the last "interval" seconds from JSON log
Reads rotated logs too (XX.log, XX.log.1, XX.log.2, ...)
"""
current_time = datetime.now()
data_list = []
for f in iter_logrotate_file_handle(self.__json_log_file, 'rb'):
for line in iter_reverse_lines(f):
l = json.loads(line.decode().replace("'", '"'))
timestamp = dateparser.parse(l['time'])
if (current_time - timestamp).total_seconds() > interval:
return data_list
data_list.append(l['data'])
return data_list
def getJsonLogLatestTimestamp(log):
"""
Get latest timestamp from JSON log
Reads rotated logs too (XX.log, XX.log.1, XX.log.2, ...)
"""
for f in iter_logrotate_file_handle(self.__json_log_file, 'rb'):
for line in iter_reverse_lines(f):
l = json.loads(line.decode().replace("'", '"'))
return dateparser.parse(l['time'])
return 0
from dateutil import parser
from slapos.grid.promise.generic import GenericPromise
def tail_file(file_path, line_count=10): def tail_file(file_path, line_count=10):
""" """
Returns the last lines of file. Returns the last lines of file.
""" """
line_list = [] line_list = []
with open(file_path) as f: with open(file_path) as f:
BUFSIZ = 1024 BUFSIZ = 1024
...@@ -25,5 +115,4 @@ def tail_file(file_path, line_count=10): ...@@ -25,5 +115,4 @@ def tail_file(file_path, line_count=10):
size -= line_len size -= line_len
bytes -= BUFSIZ bytes -= BUFSIZ
block -= 1 block -= 1
return '\n'.join(''.join(line_list).splitlines()[-line_count:]) return '\n'.join(''.join(line_list).splitlines()[-line_count:])
BEGIN TRANSACTION; BEGIN TRANSACTION;
CREATE TABLE disk (partition text, used text, free text, mountpoint text, date text, time text, reported integer NULL DEFAULT 0); CREATE TABLE disk (partition text, used text, free text, mountpoint text, date text, time text, reported integer NULL DEFAULT 0);
CREATE TABLE folder (partition text, disk_used real, date text, time text, reported integer NULL DEFAULT 0);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-18','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-19','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-20','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-21','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-22','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-23','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-24','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-25','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-26','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-27','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-28','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-29','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-09-30','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-10-01','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-10-02','09:17:01',1); INSERT INTO "disk" VALUES('/dev/sda1','159220666368','288948396032','/','2017-10-02','09:17:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159237537792','288931524608','/','2017-10-02','09:18:01',1); INSERT INTO "disk" VALUES('/dev/sda1','159237537792','288931524608','/','2017-10-02','09:18:01',1);
INSERT INTO "disk" VALUES('/dev/sda1','159238090752','288930971648','/','2017-10-02','09:19:02',1); INSERT INTO "disk" VALUES('/dev/sda1','159238090752','288930971648','/','2017-10-02','09:19:02',1);
...@@ -14,4 +29,7 @@ INSERT INTO "disk" VALUES('/dev/sda1','159429677056','288739385344','/','2017-10 ...@@ -14,4 +29,7 @@ INSERT INTO "disk" VALUES('/dev/sda1','159429677056','288739385344','/','2017-10
INSERT INTO "disk" VALUES('/dev/sda1','159444549632','288724512768','/','2017-10-02','09:28:03',1); INSERT INTO "disk" VALUES('/dev/sda1','159444549632','288724512768','/','2017-10-02','09:28:03',1);
INSERT INTO "disk" VALUES('/dev/sda1','159472902144','288696160256','/','2017-10-02','09:29:02',1); INSERT INTO "disk" VALUES('/dev/sda1','159472902144','288696160256','/','2017-10-02','09:29:02',1);
INSERT INTO "disk" VALUES('/dev/sda1','159476805632','288692256768','/','2017-10-02','09:30:03',1); INSERT INTO "disk" VALUES('/dev/sda1','159476805632','288692256768','/','2017-10-02','09:30:03',1);
INSERT INTO "folder" VALUES('slapuser0',87533020.0,'2017-10-02','09:17:00',1);
INSERT INTO "folder" VALUES('slapuser1',21883255.0,'2017-10-02','09:17:00',1);
INSERT INTO "folder" VALUES('slapuser2',43766510.0,'2017-10-02','09:17:00',1);
COMMIT; COMMIT;
BEGIN TRANSACTION;
CREATE TABLE disk (partition text, used text, free text, mountpoint text, date text, time text, reported integer NULL DEFAULT 0);
CREATE TABLE folder (partition text, disk_used real, date text, time text, reported integer NULL DEFAULT 0);
INSERT INTO "disk" VALUES('disk_partition_name','159220666368','288948396032','/','2017-10-02','09:17:01',1);
INSERT INTO "disk" VALUES('disk_partition_name','159237537792','288931524608','/','2017-10-02','09:18:01',1);
INSERT INTO "disk" VALUES('disk_partition_name','159238090752','288930971648','/','2017-10-02','09:19:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159241568256','288927494144','/','2017-10-02','09:20:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159242719232','288926343168','/','2017-10-02','09:21:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159196176384','288972886016','/','2017-10-02','09:22:03',1);
INSERT INTO "disk" VALUES('disk_partition_name','159300747264','288868315136','/','2017-10-02','09:23:03',1);
INSERT INTO "disk" VALUES('disk_partition_name','159294308352','288874754048','/','2017-10-02','09:24:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159328468992','288840593408','/','2017-10-02','09:25:03',1);
INSERT INTO "disk" VALUES('disk_partition_name','159384883200','288784179200','/','2017-10-02','09:26:03',1);
INSERT INTO "disk" VALUES('disk_partition_name','159429677056','288739385344','/','2017-10-02','09:27:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159444549632','288724512768','/','2017-10-02','09:28:03',1);
INSERT INTO "disk" VALUES('disk_partition_name','159472902144','288696160256','/','2017-10-02','09:29:02',1);
INSERT INTO "disk" VALUES('disk_partition_name','159476805632','288692256768','/','2017-10-02','09:30:03',1);
INSERT INTO "folder" VALUES('slapuser0',21883255.0,'2017-09-18','09:30:20',1);
INSERT INTO "folder" VALUES('slapuser0',24071580.5,'2017-09-19','09:30:11',1);
INSERT INTO "folder" VALUES('slapuser0',23196250.3,'2017-09-20','09:30:08',1);
INSERT INTO "folder" VALUES('slapuser0',23415082.8,'2017-09-21','09:30:21',1);
INSERT INTO "folder" VALUES('slapuser0',23633915.4,'2017-09-22','09:30:09',1);
INSERT INTO "folder" VALUES('slapuser0',30636557.0,'2017-09-23','09:30:06',1);
INSERT INTO "folder" VALUES('slapuser0',28448231.5,'2017-09-24','09:30:19',1);
INSERT INTO "folder" VALUES('slapuser0',27572901.3,'2017-09-25','09:30:13',1);
INSERT INTO "folder" VALUES('slapuser0',29761226.8,'2017-09-26','09:30:15',1);
INSERT INTO "folder" VALUES('slapuser0',30855389.5,'2017-09-27','09:30:06',1);
INSERT INTO "folder" VALUES('slapuser0',30636557.0,'2017-09-28','09:30:11',1);
INSERT INTO "folder" VALUES('slapuser0',31074222.0,'2017-09-29','09:30:08',1);
INSERT INTO "folder" VALUES('slapuser0',31096105.3,'2017-09-30','09:30:05',1);
INSERT INTO "folder" VALUES('slapuser0',31949552.2,'2017-10-01','09:30:10',1);
INSERT INTO "folder" VALUES('slapuser0',87533020.0,'2017-10-02','09:30:02',1);
COMMIT;
\ No newline at end of file
# -*- coding: utf-8 -*-
##############################################################################
# Copyright (c) 2018 Vifib SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################
import mock
import os
import time
from slapos.grid.promise import PromiseError
from slapos.promise.plugin.check_cpu_temperature import RunPromise
from . import TestPromisePluginMixin
class TestCheckCpuTemperature(TestPromisePluginMixin):
promise_name = "monitor-cpu-temperature.py"
def setUp(self):
super(TestCheckCpuTemperature, self).setUp()
def writePromise(self, **kw):
super(TestCheckCpuTemperature, self).writePromise(self.promise_name,
"from %s import %s\nextra_config_dict = %r\n"
% (RunPromise.__module__, RunPromise.__name__, kw))
def runPromise(self, summary, failed=False):
self.configureLauncher(enable_anomaly=True, force=True)
with mock.patch('psutil.sensors_temperatures', return_value=summary):
if failed:
self.assertRaises(PromiseError, self.launcher.run)
else:
self.launcher.run()
result = self.getPromiseResult(self.promise_name)['result']
self.assertEqual(result['failed'], failed)
return result['message']
def test_temp_ok(self):
message = "Temperature OK (50 °C)"
self.writePromise(**{
'last-avg-computation-file':'last_avg_computation_file',
'max-spot-temp': 80,
'max-avg-temp': 100,
})
self.assertEqual(message, self.runPromise({'coretemp': [[0, 50]]}))
def test_spot_critical(self):
message = "Temperature reached critical threshold: 90 °C (threshold is 80.0 °C)"
self.writePromise(**{
'last-avg-computation-file':'last_avg_computation_file',
'max-spot-temp': 80,
'max-avg-temp': 100,
})
self.assertEqual(message, self.runPromise({'coretemp': [[0, 90]]}))
def test_avg_critical(self):
message = "Average temperature over the last 1s reached threshold: 45.0 °C (threshold is 40.0 °C)"
self.writePromise(**{
'last-avg-computation-file':'last_avg_computation_file',
'max-spot-temp': 99999,
'max-avg-temp': 40,
'avg-temp-duration': 1,
})
m = self.runPromise({'coretemp': [[0, 0]]})
time.sleep(0.6)
m = self.runPromise({'coretemp': [[0, 0]]})
time.sleep(0.5)
self.assertEqual(message, self.runPromise({'coretemp': [[0, 90]]}))
if __name__ == '__main__':
unittest.main()
...@@ -74,18 +74,19 @@ extra_config_dict = { ...@@ -74,18 +74,19 @@ extra_config_dict = {
""" % {'collectordb': self.db_file} """ % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content) self.writePromise(self.promise_name, content)
self.configureLauncher() self.configureLauncher(timeout=20)
self.launcher.run() self.launcher.run()
result = self.getPromiseResult(self.promise_name) result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False) self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'], "No result from collector database: disk check skipped") self.assertEqual(result['result']['message'], "No result from collector database: disk check skipped")
def test_disk_space_ok(self): def test_disk_space_ok(self):
self.configureLauncher() self.configureLauncher(timeout=20)
self.launcher.run() self.launcher.run()
result = self.getPromiseResult(self.promise_name) result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False) self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'], "Disk usage: OK") message = "Current disk usage: OK\nEnable to display disk space predictions: False"
self.assertEqual(result['result']['message'], message)
def test_disk_space_nok(self): def test_disk_space_nok(self):
content = """from slapos.promise.plugin.check_free_disk_space import RunPromise content = """from slapos.promise.plugin.check_free_disk_space import RunPromise
...@@ -98,29 +99,62 @@ extra_config_dict = { ...@@ -98,29 +99,62 @@ extra_config_dict = {
""" % {'collectordb': self.db_file} """ % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content) self.writePromise(self.promise_name, content)
self.configureLauncher() self.configureLauncher(timeout=20)
with self.assertRaises(PromiseError): with self.assertRaises(PromiseError):
self.launcher.run() self.launcher.run()
result = self.getPromiseResult(self.promise_name) result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], True) self.assertEqual(result['result']['failed'], True)
self.assertEqual(result['result']['message'], message = "Free disk space low: remaining 269.10 G (disk size: 417 G, threshold: 278 G)."
"Free disk space low: remaining 269.1 G (threshold: 278.0 G)") self.assertIn(message, result['result']['message'])
self.configureLauncher() def test_display_partition(self):
content = """from slapos.promise.plugin.check_free_disk_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-02',
'threshold': '278',
'display-partition' : '1',
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(timeout=20)
with self.assertRaises(PromiseError): with self.assertRaises(PromiseError):
self.launcher.run() self.launcher.run()
result = self.getPromiseResult(self.promise_name) result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], True) self.assertEqual(result['result']['failed'], True)
self.assertEqual(result['result']['message'], "Free disk space low: remaining 269.1 G (threshold: 278.0 G)") message = "Free disk space low: remaining 269.10 G (disk size: 417 G, threshold: 278 G). " \
"The partition slappart0 uses 83.48 G (date checked: 2017-10-02 09:17:00). " \
"The partition slappart2 uses 41.74 G (date checked: 2017-10-02 09:17:00). " \
"The partition slappart1 uses 20.87 G (date checked: 2017-10-02 09:17:00)."
self.assertIn(message, result['result']['message'])
def test_display_prediction(self):
content = """from slapos.promise.plugin.check_free_disk_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-02',
'display-prediction' : '1',
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(timeout=20)
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False)
self.assertIn("Prediction:", result['result']['message'])
def test_check_free_disk_with_unicode_string_path(self): def test_check_free_disk_with_unicode_string_path(self):
# set path unicode # set path unicode
self.partition_dir = u'%s' % self.partition_dir self.partition_dir = u'%s' % self.partition_dir
self.configureLauncher() self.configureLauncher(timeout=20)
self.launcher.run() self.launcher.run()
result = self.getPromiseResult(self.promise_name) result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False) self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'], "Disk usage: OK") self.assertIn("Current disk usage: OK", result['result']['message'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# -*- coding: utf-8 -*-
##############################################################################
# Copyright (c) 2022 Vifib SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################
import mock
from collections import namedtuple
from slapos.grid.promise import PromiseError
from slapos.promise.plugin.check_network_errors_packets import RunPromise
from . import TestPromisePluginMixin
class TestCheckNetwork(TestPromisePluginMixin):
promise_name = "monitor-network.py"
def setUp(self):
super(TestCheckNetwork, self).setUp()
self.network_data = namedtuple('network_data',
['bytes_recv', 'bytes_sent','errin', 'errout', 'dropin', 'dropout'])
def writePromise(self, **kw):
super(TestCheckNetwork, self).writePromise(self.promise_name,
"from %s import %s\nextra_config_dict = %r\n"
% (RunPromise.__module__, RunPromise.__name__, kw))
def runPromise(self, summary, failed=False):
self.configureLauncher(enable_anomaly=True, force=True)
with mock.patch('psutil.net_io_counters', return_value=summary):
if failed:
self.assertRaises(PromiseError, self.launcher.run)
else:
self.launcher.run()
result = self.getPromiseResult(self.promise_name)['result']
self.assertEqual(result['failed'], failed)
return result['message']
def test_network_ok(self):
message = "Network statistics OK"
mock_stats = {'bytes_recv':0, 'bytes_sent':0,'errin':0, 'errout':0, 'dropin':0, 'dropout':0}
self.writePromise(**{
'max-lost-packets-per-MB': 5,
'max-error-messages-per-MB': 5,
})
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_dropped_packets_nok(self):
message = "Network packets lost reached critical threshold: 10 (threshold is 5 per MB)"
mock_stats = {'bytes_recv':0, 'bytes_sent':0,'errin':0, 'errout':0, 'dropin':5, 'dropout':5}
self.writePromise(**{
'max-lost-packets-per-MB': 5,
'max-error-messages-per-MB': 5,
})
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_errors_nok(self):
message = "Network errors reached critical threshold: 10 (threshold is 5 per MB)"
mock_stats = {'bytes_recv':0, 'bytes_sent':0,'errin':5, 'errout':5, 'dropin':0, 'dropout':0}
self.writePromise(**{
'max-lost-packets-per-MB': 5,
'max-error-messages-per-MB': 5,
})
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_nok(self):
message = "Network packets lost reached critical threshold: 10 (threshold is 5 per MB)"\
"\nNetwork errors reached critical threshold: 10 (threshold is 5 per MB)"
mock_stats = {'bytes_recv':0, 'bytes_sent':0, 'errin':5, 'errout':5, 'dropin':5, 'dropout':5}
self.writePromise(**{
'max-lost-packets-per-MB': 5,
'max-error-messages-per-MB': 5,
})
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
if __name__ == '__main__':
unittest.main()
# -*- coding: utf-8 -*-
##############################################################################
# Copyright (c) 2018 Vifib SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################
import mock
import os
import time
from collections import namedtuple
from slapos.grid.promise import PromiseError
from slapos.promise.plugin.check_network_transit import RunPromise
from . import TestPromisePluginMixin
class TestCheckNetworkTransit(TestPromisePluginMixin):
promise_name = "monitor-network-transit.py"
def setUp(self):
super(TestCheckNetworkTransit, self).setUp()
self.network_data = namedtuple('network_data', ['bytes_recv', 'bytes_sent'])
def writePromise(self, **kw):
super(TestCheckNetworkTransit, self).writePromise(self.promise_name,
"from %s import %s\nextra_config_dict = %r\n"
% (RunPromise.__module__, RunPromise.__name__, kw))
def runPromise(self, summary=None, failed=False):
self.configureLauncher(enable_anomaly=True, force=True)
with mock.patch('psutil.net_io_counters', return_value=summary):
if failed:
self.assertRaises(PromiseError, self.launcher.run)
else:
self.launcher.run()
result = self.getPromiseResult(self.promise_name)['result']
self.assertEqual(result['failed'], failed)
return result['message']
def test_network_transit_ok(self):
message = "Network transit OK"
mock_stats = {'bytes_recv':1e6, 'bytes_sent':1e6}
self.writePromise(**{
'transit-period': 1,
})
self.runPromise(self.network_data(**{'bytes_recv':1e3, 'bytes_sent':1e3}))
time.sleep(0.5)
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_min_nok(self):
message = "Network congested, data amount over the last 1 seconds"\
" reached minimum threshold: 1.4K (threshold is 102.4K)"
mock_stats = {'bytes_recv':1e3, 'bytes_sent':1e3}
self.writePromise(**{
'min-data-amount': 0.1, # MB
'transit-period': 1,
})
self.runPromise(self.network_data(**{'bytes_recv':300, 'bytes_sent':300}))
time.sleep(0.5)
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_max_nok(self):
message = "Network congested, data amount over the last 1 seconds"\
" reached maximum threshold: 1.1M (threshold is 1.0M)"
mock_stats = {'bytes_recv':0.7e6, 'bytes_sent':0.5e6}
self.writePromise(**{
'max-data-amount': 1, # MB
'transit-period': 1,
})
self.runPromise(self.network_data(**{'bytes_recv':300, 'bytes_sent':300}))
time.sleep(0.5)
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
def test_network_transit_nok(self):
message = "Network congested, data amount over the last 1 seconds reached minimum threshold: 0.0B (threshold is 0.0B)\n"\
"Network congested, data amount over the last 1 seconds reached maximum threshold: 0.0B (threshold is 0.0B)"
mock_stats = {'bytes_recv':1e6, 'bytes_sent':1e6}
self.writePromise(**{
'last_transit_file':'last_transit_file',
'max-data-amount': 0,
'min-data-amount': 0,
'transit-period': 1,
})
self.runPromise(self.network_data(**{'bytes_recv':1e6, 'bytes_sent':1e6}))
time.sleep(0.5)
self.assertEqual(message, self.runPromise(self.network_data(**mock_stats)))
if __name__ == '__main__':
unittest.main()
# -*- coding: utf-8 -*-
##############################################################################
# Copyright (c) 2018 Vifib SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################
import mock
import os
import time
from collections import namedtuple
from slapos.grid.promise import PromiseError
from slapos.promise.plugin.check_ram_usage import RunPromise
from . import TestPromisePluginMixin
class TestCheckRamUsage(TestPromisePluginMixin):
promise_name = "monitor-ram-usage.py"
def setUp(self):
super(TestCheckRamUsage, self).setUp()
self.ram_data = namedtuple('ram_data', ['available'])
def writePromise(self, **kw):
super(TestCheckRamUsage, self).writePromise(self.promise_name,
"from %s import %s\nextra_config_dict = %r\n"
% (RunPromise.__module__, RunPromise.__name__, kw))
def runPromise(self, summary, failed=False):
self.configureLauncher(enable_anomaly=True, force=True)
with mock.patch('psutil.virtual_memory', return_value=summary):
if failed:
self.assertRaises(PromiseError, self.launcher.run)
else:
self.launcher.run()
result = self.getPromiseResult(self.promise_name)['result']
self.assertEqual(result['failed'], failed)
return result['message']
def test_ram_ok(self):
message = "RAM usage OK"
available_ram = {'available':1e9}
self.writePromise(**{
'last-avg-ram-file':'last_avg_ram_file',
'min-threshold-ram': 500, # 500MB
'min-avg-ram': 100,
})
self.assertEqual(message, self.runPromise(self.ram_data(**available_ram)))
def test_ram_below_threshold_nok(self):
message = "RAM usage reached critical threshold: 190.7M (threshold is 500.0M)"
available_ram = {'available': 200e6}
self.writePromise(**{
'last-avg-ram-file':'last_avg_ram_file',
'min-threshold-ram': 500, # ≈500MB
'min-avg-ram': 100,
})
self.assertEqual(message, self.runPromise(self.ram_data(**available_ram)))
def test_ram_below_average_nok(self):
message = "Average RAM usage over the last 1 seconds reached threshold: 190.7M (threshold is 200.0M)"
available_ram = {'available': 200e6}
self.writePromise(**{
'last-avg-ram-file':'last_avg_ram_file',
'min-threshold-ram': 0,
'min-avg-ram': 200,
'avg-ram-period': 1,
})
m = self.runPromise(self.ram_data(**{'available': 300e6}))
m = self.runPromise(self.ram_data(**{'available': 200e6}))
time.sleep(1)
self.assertEqual(message, self.runPromise(self.ram_data(**available_ram)))
if __name__ == '__main__':
unittest.main()
##############################################################################
#
# Copyright (c) 2018 Vifib SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from slapos.test.promise.plugin import TestPromisePluginMixin
from slapos.grid.promise import PromiseError
import os
import sqlite3
import psutil
from slapos.grid.promise import PromiseError
class TestMonitorPartitionSpace(TestPromisePluginMixin):
def setUp(self):
TestPromisePluginMixin.setUp(self)
log_folder = os.path.join(self.partition_dir, 'var/log')
os.makedirs(log_folder)
# get disk partition name
disk_partition = ""
path = os.path.join(self.partition_dir, "") + "extrafolder"
partitions = psutil.disk_partitions()
while path is not '/':
if not disk_partition:
path = os.path.dirname(path)
else:
break
for p in partitions:
if p.mountpoint == path:
disk_partition = p.device
break
self.db_file = '/tmp/collector.db'
# populate db
self.conn = sqlite3.connect(self.db_file)
f = open(self.base_path+"/folder_disk_test.sql")
sql = f.read()
# replace every disk_partition_name string with disk partition name
sql = sql.replace('disk_partition_name', disk_partition)
self.conn.executescript(sql)
self.conn.close()
self.promise_name = "monitor-partition-space.py"
content = """from slapos.promise.plugin.monitor_partition_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-02',
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
def tearDown(self):
TestPromisePluginMixin.tearDown(self)
if os.path.exists(self.db_file):
os.remove(self.db_file)
def test_no_data_for_a_partition(self):
content = """from slapos.promise.plugin.monitor_partition_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-02',
'test-partition': 'slapuser1'
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(timeout=20)
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'],
"No result from collector database for the user slapuser1: skipped")
def test_no_recent_data(self):
content = """from slapos.promise.plugin.monitor_partition_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-04'
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(force=True, timeout=20)
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'], "Not enough recent data to detect anomalies: skipped")
def test_no_enough_data(self):
content = """from slapos.promise.plugin.monitor_partition_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-09-24'
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(force=True, timeout=20)
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False)
self.assertEqual(result['result']['message'],
"Not enough data to detect anomalies: skipped")
def test_no_anomalies(self):
content = """from slapos.promise.plugin.monitor_partition_space import RunPromise
extra_config_dict = {
'collectordb': '%(collectordb)s',
'test-check-date': '2017-10-02',
'threshold-ratio': '0.70'
}
""" % {'collectordb': self.db_file}
self.writePromise(self.promise_name, content)
self.configureLauncher(force=True, timeout=20)
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], False)
self.assertIn("No anomaly detected (last date checked: 2017-10-02 09:30:00)",
result['result']['message'])
def test_presence_of_anomalies(self):
self.configureLauncher(force=True, timeout=20)
with self.assertRaises(PromiseError):
self.launcher.run()
result = self.getPromiseResult(self.promise_name)
self.assertEqual(result['result']['failed'], True)
msg = "Anomaly detected on 2017-10-02 09:30:00. Space used by slapuser0: %.2f G."
self.assertIn(msg % (87533020.0/(1024*1024)), result['result']['message'])
if __name__ == '__main__':
unittest.main()
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment