Commit 59b3e4a2 authored by Justin, committed by lu.xu

software: add check_ram promise

parent 4c3b7a44
...@@ -16,12 +16,16 @@ ...@@ -16,12 +16,16 @@
[template] [template]
filename = instance.cfg filename = instance.cfg
md5sum = 23acb8fc6465bf0ccdfdd366c48a2688 md5sum = 2b1814759bda99be005bd94704ed897a
[template-default] [template-default]
_update_hash_filename_ = instance-default.jinja2.cfg _update_hash_filename_ = instance-default.jinja2.cfg
md5sum = 97c4a1d7b1e72728ad66f3e710180b27 md5sum = 33bb6ef9a36392950b8b59a5fa035235
[cpu-promise] [cpu-promise]
_update_hash_filename_ = promise/check_cpu_temperature.py _update_hash_filename_ = promise/check_cpu_temperature.py
md5sum = e8fc789eb7e7e3b054e921b0292c806f md5sum = e61f99ed0ff58385c407035a37c2bd1d
[ram-usage-promise]
_update_hash_filename_ = promise/check_ram_usage.py
md5sum = 4ccd42ecb37fcee54f9c5e92f9a36135
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
parts = parts =
directory directory
cpu-promise cpu-promise
ram-usage-promise
publish-connection-information publish-connection-information
extends = {{ monitor_template }} extends = {{ monitor_template }}
...@@ -45,3 +46,13 @@ config-max-spot-temp = 90 ...@@ -45,3 +46,13 @@ config-max-spot-temp = 90
config-max-avg-temp = 80 config-max-avg-temp = 80
config-avg-temp-duration = 15 config-avg-temp-duration = 15
config-last-avg-computation-file = ${directory:var}/promise_cpu_last_avg_computation_file config-last-avg-computation-file = ${directory:var}/promise_cpu_last_avg_computation_file
[ram-usage-promise]
<= monitor-promise-base
promise = check_ram_usage
name = ${:_buildout_section_name_}.py
config-testing = {{ slapparameter_dict.get("testing", False) }}
config-min-threshold-ram = 500e6
config-min-avg-ram = 1e9
config-avg-ram-period = 15
config-last-avg-ram-file = ${directory:var}/promise_ram_last_avg_available_file
...@@ -40,3 +40,4 @@ extensions = jinja2.ext.do ...@@ -40,3 +40,4 @@ extensions = jinja2.ext.do
extra-context = extra-context =
raw monitor_template ${monitor2-template:output} raw monitor_template ${monitor2-template:output}
raw cpu_promise ${cpu-promise:target} raw cpu_promise ${cpu-promise:target}
raw ram_usage_promise ${ram-usage-promise:target}
import json
import os
import psutil
import time
from psutil._common import bytes2human
from .util import get_data_interval_json_log
from .util import JSONRunPromise
from zope.interface import implementer
from slapos.grid.promise import interface
@implementer(interface.IPromise)
class RunPromise(JSONRunPromise):
    """Promise that watches the amount of available RAM.

    Two checks are performed by ``sense()``:

    * an instantaneous check of ``psutil.virtual_memory().available``
      against the ``min-threshold-ram`` setting;
    * a periodic check of the mean of the ``available_ram`` values found
      in the JSON log against the ``min-avg-ram`` setting.

    NOTE(review): ``getConfig``, ``setPeriodicity``, ``logger``,
    ``json_logger``, ``log_file``, ``_test`` and ``_anomaly`` are not
    defined here; presumably they come from ``JSONRunPromise`` or its
    parents -- confirm against ``.util``.
    """

    def __init__(self, config):
        """Set the promise periodicity and remember the marker file path.

        The marker file path is read from the ``last-avg-ram-file``
        setting and falls back to ``'last_avg'``.
        """
        super(RunPromise, self).__init__(config)
        self.setPeriodicity(minute=2)
        self.last_avg_ram_file = self.getConfig(
            'last-avg-ram-file', 'last_avg')

    def sense(self):
        """Sample available RAM, log it, and report threshold violations.

        Errors are logged when the current available RAM is at or below
        ``min-threshold-ram``, when the averaged available RAM is below
        ``min-avg-ram``, or when no sample could be read back from the
        log. ``"RAM usage OK"`` is logged only when none of these happen.
        """
        failed = False

        # Reference values; both are compared against psutil's
        # ``available`` figure below.
        critical_floor = float(self.getConfig('min-threshold-ram', 500e6))
        average_floor = float(self.getConfig('min-avg-ram', 1e9))

        # A non-zero 'avg-ram-period-sec' is used directly; otherwise
        # 'avg-ram-period' is read and multiplied by 60.
        window = (int(self.getConfig('avg-ram-period-sec', 0))
                  or 60 * int(self.getConfig('avg-ram-period', 5)))

        # Instantaneous check.
        available_now = psutil.virtual_memory().available
        if available_now <= critical_floor:
            message = ("RAM usage reached critical threshold: %7s "
                       " (threshold is %7s)")
            self.logger.error(
                message % (bytes2human(available_now),
                           bytes2human(critical_floor)))
            failed = True

        # Record the current sample in the JSON log.
        payload = json.dumps({'available_ram': available_now})
        self.json_logger.info("RAM data", extra={'data': payload})

        # The marker file's modification time tells when the average was
        # last computed; a missing/unreadable file counts as "never".
        try:
            last_computation = os.path.getmtime(self.last_avg_ram_file)
        except OSError:
            last_computation = 0

        # Averaged check, performed at most once per window.
        if (time.time() - last_computation) > window:
            # Truncate (or create) the marker file to refresh its mtime.
            with open(self.last_avg_ram_file, 'w'):
                pass
            samples = get_data_interval_json_log(self.log_file, window)
            if not samples:
                self.logger.error("Couldn't read available RAM from log")
                failed = True
            else:
                total = sum(entry['available_ram'] for entry in samples)
                mean_available = total / len(samples)
                if mean_available < average_floor:
                    message = ("Average RAM usage over the last %s seconds "
                               "reached threshold: %7s (threshold is %7s)")
                    self.logger.error(
                        message % (window,
                                   bytes2human(mean_available),
                                   bytes2human(average_floor)))
                    failed = True

        if not failed:
            self.logger.info("RAM usage OK")

    def test(self):
        """Evaluate the promise while the instance is still converging.

        Delegates to ``_test`` over the single latest result, with one
        failure being enough to fail.
        """
        return self._test(result_count=1, failure_amount=1)

    def anomaly(self):
        """Evaluate the promise once the instance has finished converging.

        Delegates to ``_anomaly`` over the last three results, with two
        failures being enough to fail (i.e. the instance has diverged).
        """
        return self._anomaly(result_count=3, failure_amount=2)
...@@ -15,7 +15,7 @@ develop = ...@@ -15,7 +15,7 @@ develop =
[slapos.toolbox-repository] [slapos.toolbox-repository]
recipe = slapos.recipe.build:gitclone recipe = slapos.recipe.build:gitclone
repository = /srv/slapgrid/slappart72/srv/project/slapos.toolbox.git repository = /srv/slapgrid/slappart86/srv/project/slapos.toolbox
branch = json-promise branch = json-promise
git-executable = ${git:location}/bin/git git-executable = ${git:location}/bin/git
...@@ -33,3 +33,6 @@ url = ${:_profile_base_location_}/${:_update_hash_filename_} ...@@ -33,3 +33,6 @@ url = ${:_profile_base_location_}/${:_update_hash_filename_}
[cpu-promise] [cpu-promise]
<= download-base <= download-base
[ram-usage-promise]
<= download-base
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment