Commit 321734be authored by Alain Takoudjou

monitor: fix cpu-load-threshold parameter, make cpu load promise more robust

parent 42b8cba0
......@@ -89,11 +89,6 @@ command-line =
${monitor-directory:bin}/python {{ monitor_collect_csv_dump }} --output_folder ${monitor-directory:consumption}
wrapper-path = ${monitor-directory:reports}/monitor-collect-csv-dump
[monitor-check-cpu-usage]
recipe = slapos.cookbook:wrapper
command-line = ${monitor-directory:bin}/python {{ monitor_check_system_health }} cpu ${init-monitor-parameters:cpu-load-file}
wrapper-path = ${directory:promises}/system-CPU-load-check
[monitor-check-memory-usage]
recipe = slapos.cookbook:wrapper
command-line = {{ buildout_bin}}/check-computer-memory
......@@ -102,20 +97,22 @@ command-line = {{ buildout_bin}}/check-computer-memory
--unit percent
wrapper-path = ${directory:promises}/check-computer-memory-usage
[monitor-check-cpu-usage]
recipe = slapos.cookbook:promise.plugin
eggs =
slapos.toolbox
file = ${monitor-conf-parameters:promise-output-file}
content =
from slapos.promise.plugin.check_server_cpu_load import RunPromise
output = ${directory:plugins}/system-CPU-load-check.py
mode = 600
config-cpu-load-threshold = ${slap-parameter:cpu-load-threshold}
[publish-connection-information]
recipe = slapos.cookbook:publish
monitor-setup-url = https://monitor.app.officejs.com/#page=settings_configurator&url=${monitor-publish-parameters:monitor-url}&username=${monitor-publish-parameters:monitor-user}&password=${monitor-publish-parameters:monitor-password}
server_log_url = ${monitor-publish-parameters:monitor-base-url}/${slap-configuration:private-hash}/
[init-monitor-parameters]
recipe = plone.recipe.command
cpu-load-file = ${directory:monitor}/cpu-load-tolerance
mem-free-file = ${directory:monitor}/mem-free-limit
command =
if [ ! -s "${:cpu-load-file}" ]; then
echo ${slap-parameter:cpu-load-threshold} > ${:cpu-load-file}
fi
[slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
......@@ -127,5 +124,6 @@ private-hash = ${pwgen:passwd}${pwgen32:passwd}
frontend-domain =
[slap-parameter]
cpu-load-threshold = 2.0
# Max cpu load for one core on server
cpu-load-threshold = 3.0
memory-percent-threshold = 96
......@@ -22,7 +22,6 @@ context = key develop_eggs_directory buildout:develop-eggs-directory
raw monitor_template_output ${monitor-template:output}
raw network_benck_cfg_output ${network-bench-cfg:output}
raw monitor_collect_csv_dump ${monitor-collect-csv-dump:output}
raw monitor_check_system_health ${monitor-system-health:output}
mode = 0644
[instance-base-distributor]
......
#!/usr/bin/env python
import subprocess
import os
import re
import json
# Argument vectors of the external commands run by the checks in this script.

# One batch-mode iteration of top.
cpu_command_list = [
    'top',
    '-n', '1',
    '-b',
]

# Memory statistics reported in megabytes.
mem_command_list = [
    'free',
    '-m',
]

# Keeps only the first five lines of whatever is piped in.
head_command_list = [
    'head',
    '-n', '5',
]

# Prints the number of available processing units.
cpu_core_cmd_list = [
    'nproc',
]
def cpu_usage(tolerance=1.5):
    """Report a high CPU load, or return None when the load is acceptable.

    The last whitespace-separated field of the `uptime` output (the
    longest-window load average) is compared against
    ``tolerance * <number of cores reported by nproc>``.

    Args:
        tolerance: accepted load per core; 1.5 means up to 150% load per core.

    Returns:
        A message containing the three load figures followed by the first
        five lines of `top` when the load exceeds the threshold, otherwise
        None.

    Raises:
        subprocess.CalledProcessError: if `uptime`, `nproc` or `head` exits
            with a non-zero status.
        ValueError: if the load or core count cannot be parsed as a number.
    """
    uptime_result = subprocess.check_output(['uptime'])
    line = uptime_result.strip().split(' ')
    # The two first figures keep the trailing comma printed by uptime; they
    # are only used for display. The last one is the value being checked.
    load, load5, long_load = line[-3:]

    core_count = int(subprocess.check_output(cpu_core_cmd_list).strip())
    threshold = core_count * tolerance

    if float(long_load) > threshold:
        # Display top statistics: equivalent of `top -n 1 -b | head -n 5`.
        top = subprocess.Popen(cpu_command_list, stdout=subprocess.PIPE)
        try:
            result = subprocess.check_output(head_command_list,
                                             stdin=top.stdout)
        finally:
            # Close our end of the pipe and reap the child so that neither a
            # file descriptor nor a zombie process is left behind.
            top.stdout.close()
            top.wait()
        message = "CPU load is high: %s %s %s\n\n" % (load, load5, long_load)
        message += result
        return message
    return None
def check_last_result(file, last_value, threshold=7.0, elt_count=5):
    """Record a new sample and return the recent average when it is too low.

    Samples are stored in `file` as a single line of space-separated numbers.
    The new sample is appended, the history is trimmed to the `elt_count`
    most recent samples, and the file is rewritten.

    An average is only computed once the file already held at least
    `elt_count` samples before this call.

    Args:
        file: path of the history file; created if it does not exist.
        last_value: newest sample to record.
        threshold: the average is reported only when strictly below it.
        elt_count: number of most recent samples kept and averaged.

    Returns:
        The average of the kept samples rounded to 2 decimals when it is
        below `threshold`; 0.0 otherwise (not enough history yet, or the
        average is at or above the threshold).

    Raises:
        ValueError: if the history file contains a non-numeric entry.
    """
    mem_average = 0.0
    value_list = []
    if os.path.exists(file):
        with open(file) as f:
            # split() without argument ignores empty content and repeated
            # whitespace, so an empty file yields no bogus '' sample.
            value_list = f.read().split()

    history_size = len(value_list)
    value_list.append(str(last_value))

    if history_size >= elt_count:
        # Keep only the elt_count most recent samples.
        value_list = value_list[len(value_list) - elt_count:]
        if value_list:
            # Divide by the number of samples actually summed, not by the
            # length the history had before trimming.
            average = sum(float(v) for v in value_list) / len(value_list)
            if average < threshold:
                mem_average = round(average, 2)

    with open(file, 'w') as f:
        f.write(' '.join(value_list))
    return mem_average
def memory_usage(storage_file, threshold=7.0, elt_count=5):
    """Report low available memory or swap, or return None when all is fine.

    Runs the command in `mem_command_list`, computes the percentage of
    available memory, records it through `check_last_result` and raises an
    alert when the recent average is below `threshold`. Otherwise the swap
    is checked against ``threshold * 1.7``.

    NOTE(review): rows are picked by position (output lines 1, 2 and 3),
    which presumably matches the legacy `free -m` layout with the
    "Mem:", "-/+ buffers/cache:" and "Swap:" rows - confirm on the target
    system, newer `free` versions print a different layout.

    Args:
        storage_file: history file handed to `check_last_result`.
        threshold: minimum acceptable percentage of available memory.
        elt_count: number of samples averaged by `check_last_result`.

    Returns:
        A message followed by the raw command output when memory or swap
        is low, otherwise None.
    """
    mem_stats = subprocess.check_output(mem_command_list)
    result_list = mem_stats.split('\n')
    # Collapse runs of whitespace so that columns can be split on one space.
    usage = re.sub(r'\s+', ' ', result_list[1])
    usage_real = re.sub(r'\s+', ' ', result_list[2])
    usage_list = usage.split(' ')
    mem_total = float(usage_list[1])
    mem_free = float(usage_real.split(' ')[-1])
    if mem_free == 0.0:
        mem_available = 0.0
    else:
        mem_available = round(mem_free * 100 / (mem_total * 1.0), 2)

    average = check_last_result(
        storage_file,
        mem_available,
        threshold=threshold,
        elt_count=elt_count)
    # check_last_result returns 0.0 when there is nothing to report.
    if average != 0.0 and average < threshold:
        # mem used at (threshold)% at least
        message = "Memory usage is high. %s%% is available (%s%% for last %s minutes).\n\n" % (
            mem_available, average, elt_count)
        message += mem_stats
        return message

    swap_usage = re.sub(r'\s+', ' ', result_list[3])
    swap_usage_list = swap_usage.split(' ')
    swap_total = float(swap_usage_list[1])
    swap_free = float(swap_usage_list[3])
    # Skip the swap check when there is (almost) no swap configured.
    if swap_total > 1:
        if swap_free == 0.0:
            swap_available = 0.0
        else:
            # Percentage of free swap; scaled by 100 exactly once.
            swap_available = round(swap_free * 100 / (swap_total * 1.0), 2)
        if swap_available < threshold * 1.7:
            message = "Memory SWAP usage is high. %s%% is available.\n\n" % swap_available
            message += mem_stats
            return message
    return None
if __name__ == '__main__':
    # sys.argv is used below but sys is not among the file's imports;
    # import it here so the script does not fail with NameError.
    import sys

    # Exit codes: 0 = healthy, 1 = alert raised (message printed),
    # 2 = bad usage, 3 = unknown check type.
    if len(sys.argv) < 2:
        print("Usage: %s [cpu | mem] CONFIG_FILE [BASE_DIR]" % os.path.basename(sys.argv[0]))
        sys.exit(2)

    check_type = sys.argv[1]

    # Optional threshold read from CONFIG_FILE; stays None (so that the
    # per-check default applies) when the file is missing, is not a number
    # or is not strictly positive.
    threshold = None
    if len(sys.argv) >= 3:
        config_file = sys.argv[2]
        if os.path.exists(config_file):
            with open(config_file) as f:
                try:
                    threshold = float(f.read())
                    if not threshold > 0:
                        threshold = None
                except ValueError:
                    # Deliberate fallback to the default threshold.
                    pass

    if check_type == "cpu":
        result = cpu_usage(threshold or 1.5)
        if result:
            print(result)
            sys.exit(1)
    elif check_type == "mem":
        directory = ""
        if len(sys.argv) >= 4:
            directory = sys.argv[3]
        # Fall back to the current directory when BASE_DIR is absent or is
        # not an existing directory.
        if not os.path.isdir(directory):
            directory = os.getcwd()
        storage_file = os.path.join(directory, 'mem-usage.mo')
        result = memory_usage(storage_file, threshold=(threshold or 4.0), elt_count=10)
        if result:
            print(result)
            sys.exit(1)
    else:
        sys.exit(3)
    sys.exit(0)
\ No newline at end of file
......@@ -21,14 +21,14 @@ parts =
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/instance.cfg
output = ${buildout:directory}/template.cfg
md5sum = 641c5916739f78171c616af00fe974a2
md5sum = 1b7d2d097f208f6641bf98a17df079c8
mode = 0644
[template-monitor]
recipe = slapos.recipe.build:download
url = ${:_profile_base_location_}/instance-monitor.cfg.jinja2
destination = ${buildout:directory}/template-base-monitor.cfg
md5sum = 79125819f20f4f18a301b806daed2ceb
md5sum = ef3297619e1fc2a5a8d1b0546c1a0db2
mode = 0644
[template-monitor-distributor]
......@@ -59,13 +59,6 @@ filename = collect_csv_dump.py
output = ${:destination}/${:filename}
md5sum = cad2402bbd21907cfed6bc5af8c5d3ab
[monitor-system-health]
<= monitor-template-script
url = ${:_profile_base_location_}/script/${:filename}
filename = check_system_health.py
output = ${:destination}/${:filename}
md5sum = 7eb74a0be4995c6a1015a9a1eb6874c6
[extra-eggs]
<= monitor-eggs
interpreter = pythonwitheggs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment