Commit 59b3e4a2 authored by Justin, committed by lu.xu

software: add check_ram promise

parent 4c3b7a44
......@@ -16,12 +16,16 @@
[template]
filename = instance.cfg
md5sum = 23acb8fc6465bf0ccdfdd366c48a2688
md5sum = 2b1814759bda99be005bd94704ed897a
[template-default]
_update_hash_filename_ = instance-default.jinja2.cfg
md5sum = 97c4a1d7b1e72728ad66f3e710180b27
md5sum = 33bb6ef9a36392950b8b59a5fa035235
[cpu-promise]
_update_hash_filename_ = promise/check_cpu_temperature.py
md5sum = e8fc789eb7e7e3b054e921b0292c806f
md5sum = e61f99ed0ff58385c407035a37c2bd1d
[ram-usage-promise]
_update_hash_filename_ = promise/check_ram_usage.py
md5sum = 4ccd42ecb37fcee54f9c5e92f9a36135
......@@ -2,6 +2,7 @@
parts =
directory
cpu-promise
ram-usage-promise
publish-connection-information
extends = {{ monitor_template }}
......@@ -45,3 +46,13 @@ config-max-spot-temp = 90
config-max-avg-temp = 80
config-avg-temp-duration = 15
config-last-avg-computation-file = ${directory:var}/promise_cpu_last_avg_computation_file
[ram-usage-promise]
<= monitor-promise-base
promise = check_ram_usage
name = ${:_buildout_section_name_}.py
config-testing = {{ slapparameter_dict.get("testing", False) }}
config-min-threshold-ram = 500e6
config-min-avg-ram = 1e9
config-avg-ram-period = 15
config-last-avg-ram-file = ${directory:var}/promise_ram_last_avg_available_file
......@@ -40,3 +40,4 @@ extensions = jinja2.ext.do
extra-context =
raw monitor_template ${monitor2-template:output}
raw cpu_promise ${cpu-promise:target}
raw ram_usage_promise ${ram-usage-promise:target}
import json
import os
import psutil
import time
from psutil._common import bytes2human
from .util import get_data_interval_json_log
from .util import JSONRunPromise
from zope.interface import implementer
from slapos.grid.promise import interface
@implementer(interface.IPromise)
class RunPromise(JSONRunPromise):
    """Promise that watches the amount of RAM reported as available.

    Each run records the current available RAM in the JSON log and
    compares it with an instantaneous floor. Once per averaging period it
    also compares the mean of the logged samples with an average floor.
    """

    def __init__(self, config):
        """Initialise the promise from ``config``.

        Requests a periodicity of ``minute=2`` and remembers the path of
        the marker file whose modification time tells when the average was
        last evaluated (config key ``last-avg-ram-file``, default
        ``'last_avg'``).
        """
        super(RunPromise, self).__init__(config)
        self.setPeriodicity(minute=2)
        self.last_avg_ram_file = self.getConfig('last-avg-ram-file', 'last_avg')

    def sense(self):
        """Sample available RAM and log an error for every breached limit.

        Config keys read (with defaults):
          * ``min-threshold-ram`` (500e6): floor for the current sample.
          * ``min-avg-ram`` (1e9): floor for the averaged samples.
          * ``avg-ram-period-sec`` (0): averaging period in seconds; when
            it is zero, ``avg-ram-period`` (5) is used and multiplied by 60.

        Nothing is returned; the outcome is expressed only through
        ``self.logger`` (presumably interpreted by the base class --
        confirm against JSONRunPromise).
        """
        healthy = True

        # Reference values, read in a fixed order from the configuration.
        critical_floor = float(self.getConfig('min-threshold-ram', 500e6))
        average_floor = float(self.getConfig('min-avg-ram', 1e9))
        window = int(self.getConfig('avg-ram-period-sec', 0))
        if not window:
            # No explicit value in seconds: fall back to 'avg-ram-period'.
            window = 60 * int(self.getConfig('avg-ram-period', 5))

        # Current sample.
        available = psutil.virtual_memory().available

        # Instantaneous check against the critical floor.
        if available <= critical_floor:
            critical_msg = (
                "RAM usage reached critical threshold: %7s "
                " (threshold is %7s)"
            ) % (bytes2human(available), bytes2human(critical_floor))
            self.logger.error(critical_msg)
            healthy = False

        # Record the sample so later runs can average over it.
        payload = json.dumps({'available_ram': available})
        self.json_logger.info("RAM data", extra={'data': payload})

        # The marker file's mtime is the time of the last average check;
        # a missing or unreadable marker counts as "never checked".
        try:
            last_check = os.path.getmtime(self.last_avg_ram_file)
        except OSError:
            last_check = 0

        # Evaluate the average at most once per window.
        if (time.time() - last_check) > window:
            # Truncate/create the marker file, which refreshes its mtime.
            with open(self.last_avg_ram_file, 'w'):
                pass
            samples = get_data_interval_json_log(self.log_file, window)
            if not samples:
                self.logger.error("Couldn't read available RAM from log")
                healthy = False
            else:
                total = sum(entry['available_ram'] for entry in samples)
                mean_available = total / len(samples)
                if mean_available < average_floor:
                    average_msg = (
                        "Average RAM usage over the last %s seconds "
                        "reached threshold: %7s (threshold is %7s)"
                    ) % (
                        window,
                        bytes2human(mean_available),
                        bytes2human(average_floor),
                    )
                    self.logger.error(average_msg)
                    healthy = False

        if healthy:
            self.logger.info("RAM usage OK")

    def test(self):
        """Report the promise state while the instance is still converging.

        Runs after sense(). Delegates to ``_test`` with a window of one
        result and a tolerance of one failure, i.e. the promise fails as
        soon as the latest sense result is negative.
        """
        return self._test(result_count=1, failure_amount=1)

    def anomaly(self):
        """Report the promise state once the instance has converged.

        Runs after sense(). Delegates to ``_anomaly`` over the last three
        results; two negative results among them signal that the instance
        has diverged.
        """
        return self._anomaly(result_count=3, failure_amount=2)
......@@ -15,7 +15,7 @@ develop =
[slapos.toolbox-repository]
recipe = slapos.recipe.build:gitclone
repository = /srv/slapgrid/slappart72/srv/project/slapos.toolbox.git
repository = /srv/slapgrid/slappart86/srv/project/slapos.toolbox
branch = json-promise
git-executable = ${git:location}/bin/git
......@@ -33,3 +33,6 @@ url = ${:_profile_base_location_}/${:_update_hash_filename_}
[cpu-promise]
<= download-base
[ram-usage-promise]
<= download-base
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment