Commit 4558d511 authored by Lisa Casino

promise: new promise

parent 6c8e637c
from __future__ import division
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
import sys
import pwd
import sqlite3
import argparse
import datetime
import psutil
from slapos.collect.db import Database
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
def __init__(self, config):
super(RunPromise, self).__init__(config)
# check disk space at least every 3 minutes
self.setPeriodicity(minute=3)
def biggestPartitions(self, db_path, date, time, limit=3):
database = Database(db_path, create=False, timeout=10)
try:
database.connect()
date_time = date + ' ' + time
where_query = "datetime(date || ' ' || time) >= datetime('%s', '-1 days') AND datetime(date || ' ' || time) <= datetime('%s')"
result = database.select(
"folder",
columns = "partition, disk_used*1024, max(datetime(date || ' ' || time))",
where = where_query % (date_time, date_time),
group = "partition",
order = "disk_used DESC",
limit = limit).fetchall()
if not result or not result[0]:
self.logger.info("No result from collector database in table folder: skipped")
return None
except sqlite3.OperationalError as e:
# if the database is still locked after the timeout expires (another process is using it),
# log a warning message and retry the promise at the next run, until the max warning count is reached
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
return result
def fastestPartitions(self, db_path, disk_partition, date, time, day_range, limit=3):
database = Database(db_path, create=False, timeout=10)
try:
database.connect()
# for each partition, we fetch two data points: the most recent size and
# the oldest one (within day_range) in order to compute the growth slope
date_time = date + ' ' + time
where_query = "datetime(date) >= datetime('%s', '-1 days') AND datetime(date) <= datetime('%s')"
result_max = database.select(
"folder",
columns = "partition, disk_used*1024, datetime(date || ' ' || time)",
where = where_query % (date_time, date_time),
group = "partition",
order = "partition").fetchall()
if not result_max or not result_max[0]:
self.logger.info("No result from collector database in table folder: skipped")
return None
result_min = database.select(
"folder",
columns = "partition, disk_used*1024, min(datetime(date || ' ' || time))",
where = "datetime(date || ' ' || time) >= datetime('%s', '-%s days')" % (result_max[0][2], day_range),
group = "partition",
order = "partition").fetchall()
if not result_min or (result_min == result_max):
self.logger.info("No result from collector database in table folder: skipped")
return None
timep = '%Y-%m-%d %H:%M:%S'
ranked_results = []
for i in range(len(result_max)):
timespan = datetime.datetime.strptime(result_max[i][2], timep) - \
datetime.datetime.strptime(result_min[i][2], timep)
delta_days = timespan.total_seconds() / (3600.*24)
# skip this partition if we don't have enough information or if both data points are identical
if (int(delta_days) <= 1) or (result_max[i] == result_min[i]):
continue
user, size_max, date_max = result_max[i]
user, size_min, date_min = result_min[i]
# slope/(1024*1024*1024) = number of giga per day
slope = (size_max - size_min)/delta_days
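# e.g. (hypothetical numbers) a partition growing from 2 GiB to 12 GiB over 5 days
# gives slope = 10*1024**3 / 5 bytes per day, reported below as 2.00 Giga per day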
ranked_results.append((user, slope, date_min, date_max, delta_days))
ranked_results = sorted(ranked_results, key=lambda tup: tup[1], reverse=True)
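# each entry is (partition, growth in bytes per day, oldest date, latest date, span in days),
# sorted with the fastest-growing partition first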
except sqlite3.OperationalError as e:
# if the database is still locked after the timeout expires (another process is using it),
# log a warning message and retry the promise at the next run, until the max warning count is reached
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
return ranked_results[:limit]
def getDaysUntilFull(self, disk_partition, database, date, time, day_range):
"""Returns estimation of days until the disk_partition would become full
It uses date and time in order to find current disk free percentage, then rewinds
day_range back in history and calculates average speed of losing free space, which
is assumed constant and used to predict in how many days the disk would become full.
"""
database = Database(database, create=False, timeout=10)
try:
database.connect()
result_max = database.select(
"disk",
date = date,
columns = "free*1.0/(used+free) AS percent, max(datetime(date || ' ' || time))",
where = "time between '%s:00' and '%s:30' and partition='%s'" % (time, time, disk_partition),
limit = 1).fetchone()
if not result_max or not result_max[1]:
return None
result_min = database.select(
"disk",
columns = "free*1.0/(used+free) AS percent, min(datetime(date || ' ' || time))",
where = "datetime(date || ' ' || time) >= datetime('%s', '-%s days') and partition='%s'" % (result_max[1], day_range, disk_partition),
limit = 1).fetchone()
if not result_min or not result_min[1] or result_min == result_max:
return None
change = result_max[0] - result_min[0]
if change > 0.:
return None
timep = '%Y-%m-%d %H:%M:%S'
timespan = datetime.datetime.strptime(
result_max[1], timep) - datetime.datetime.strptime(
result_min[1], timep)
delta_days = timespan.total_seconds() / (3600.*24)
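# the returned tuple is (days until full, oldest date, oldest free fraction,
# latest date, latest free fraction, span in days)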
try:
return (-result_max[0] / (change / delta_days), result_min[1], result_min[0], result_max[1], result_max[0], delta_days)
except ZeroDivisionError as e:
# no data
return None
except sqlite3.OperationalError as e:
# if the database is still locked after the timeout expires (another process is using it),
# log a warning message and retry the promise at the next run, until the max warning count is reached
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
def getDiskSize(self, disk_partition, database):
database = Database(database, create=False, timeout=10)
try:
# fetch disk size
database.connect()
where_query = "partition='%s'" % (disk_partition)
order = "datetime(date || ' ' || time) DESC"
result = database.select(
"disk",
columns="free+used",
where=where_query,
order=order,
limit=1).fetchone()
if not result or not result[0]:
return None
disk_size = result[0]
except sqlite3.OperationalError as e:
# if the database is still locked after the timeout expires (another process is using it),
# log a warning message and retry the promise at the next run, until the max warning count is reached
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
return disk_size
def getFreeSpace(self, disk_partition, database, date, time):
database = Database(database, create=False, timeout=10)
try:
# fetch free disk space
database.connect()
where_query = "time between '%s:00' and '%s:30' and partition='%s'" % (time, time, disk_partition)
result = database.select(
"disk",
date=date,
columns="free",
where=where_query).fetchone()
if not result or not result[0]:
self.logger.info("No result from collector database: disk check skipped")
return 0
disk_free = result[0]
except sqlite3.OperationalError as e:
# if the database is still locked after the timeout expires (another process is using it),
# log a warning message and retry the promise at the next run, until the max warning count is reached
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return 0
raise
finally:
try:
database.close()
except Exception:
pass
return int(disk_free)
def raiseOnDatabaseLocked(self, locked_message):
max_warn = 10
latest_result_list = self.getLastPromiseResultList(result_count=max_warn)
warning_count = 0
if len(latest_result_list) < max_warn:
return False
for result in latest_result_list[0]:
if result['status'] == "ERROR" and locked_message in result["message"]:
return True
for result_list in latest_result_list:
found = False
for result in result_list:
if result['status'] == "WARNING" and locked_message in result["message"]:
found = True
warning_count += 1
break
if not found:
break
if warning_count == max_warn:
# too many warnings about the locked database, now fail.
return True
self.logger.warn("collector database is locked by another process")
return False
@staticmethod
def _checkInodeUsage(path):
stat = os.statvfs(path)
total_inode = stat.f_files
if total_inode:
usage = 100 * (total_inode - stat.f_ffree) / total_inode
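# e.g. (hypothetical numbers) 1,000,000 total inodes with 10,000 free gives
# usage = 100 * 990000 / 1000000 = 99.0, which is above the 98% threshold below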
if usage >= 98:
return "Disk Inodes usage is really high: %.4f%%" % usage
def getInodeUsage(self, path):
return (self._checkInodeUsage(path) or
os.path.ismount('/tmp') and self._checkInodeUsage('/tmp') or
"")
def sense(self):
# find if a disk is mounted on the path
disk_partition = ""
db_path = self.getConfig('collectordb')
check_date = self.getConfig('test-check-date')
path = os.path.join(self.getPartitionFolder(), "") + "extrafolder"
partitions = psutil.disk_partitions()
while path != '/':
if not disk_partition:
path = os.path.dirname(path)
else:
break
for p in partitions:
if p.mountpoint == path:
disk_partition = p.device
break
if not disk_partition:
self.logger.error("Couldn't find disk partition")
return
if db_path.endswith("collector.db"):
db_path=db_path[:-len("collector.db")]
if check_date:
# testing mode
currentdate = check_date
currenttime = self.getConfig('test-check-time', '09:17')
disk_partition = self.getConfig('test-disk-partition', '/dev/sda1')
else:
# get last minute
now = datetime.datetime.utcnow()
currentdate = now.strftime('%Y-%m-%d')
currenttime = now - datetime.timedelta(minutes=1)
currenttime = currenttime.time().strftime('%H:%M')
disk_size = self.getDiskSize(disk_partition, db_path)
default_threshold = None
if disk_size is not None:
default_threshold = round(disk_size/(1024*1024*1024) * 0.05, 2)
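# e.g. (hypothetical size) a 500 GiB disk gives a default threshold of
# round(500 * 0.05, 2) = 25.0 GiB of free space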
threshold = float(self.getConfig('threshold', default_threshold) or 2.0)
threshold_days = float(self.getConfig('threshold-days', '20'))
free_space = self.getFreeSpace(disk_partition, db_path, currentdate, currenttime)
days_until_full_tuple = self.getDaysUntilFull(disk_partition, db_path, currentdate, currenttime, threshold_days/2)
if days_until_full_tuple is not None:
days_until_full, min_date, min_free, max_date, max_free, day_span = days_until_full_tuple
message = "Disk will become full in %.2f days (threshold: %.2f days), checked from %s to %s, %.2f days span" % (
days_until_full, threshold_days, min_date, max_date, day_span)
if days_until_full < threshold_days:
message += ', free space dropped from %.1f%% to %.1f%%: ERROR. ' % (min_free*100, max_free*100)
# display the 3 partitions whose usage grew fastest over the last few days (threshold_days/2)
fast_partitions = self.fastestPartitions(db_path, disk_partition, currentdate, currenttime, threshold_days/2)
if fast_partitions is not None:
for partition in fast_partitions:
user_name, slope, date_min, date_max, delta_days = partition
message += "The partition %s has used %s Giga per day for the last %s days (from %s to %s)" % (
user_name, slope/(1024*1024*1024), delta_days, date_min, date_max))
# display the final error message
self.logger.error(message)
else:
self.logger.info(message + ': OK')
if free_space == 0:
return
elif free_space > threshold*1024*1024*1024:
inode_usage = self.getInodeUsage(self.getPartitionFolder())
if inode_usage:
self.logger.error(inode_usage)
else:
self.logger.info("Disk usage: OK")
return
free_space = round(free_space/(1024*1024*1024), 2)
message = 'Free disk space low: remaining %s G (threshold: %s G).' % (
free_space, threshold)
# display the 3 partitions that use the most space on the disk
big_partitions = self.biggestPartitions(db_path, currentdate, currenttime)
if big_partitions is not None:
for partition in big_partitions:
user_name, size_partition, date_checked = partition
message += "The partition %s use %s Giga (date checked: %s)" % (
user_name, size_partition, date_checked))
# display the final error message
self.logger.error(message)
def test(self):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)