Commit abb6dca0 authored by Rafael Monnerat's avatar Rafael Monnerat

slapos_crm: Reimplement SupportRequest_recheckMonitoring

   Use *_getReportedErrorDict to fetch the error message, avoiding duplicating the code/logic in multiple locations.
   Drop batch_mode from *_getReportedErrorDict, since it is not required.
parent e3e3ae9e
Pipeline #37237 failed with stage
in 0 seconds
......@@ -16,7 +16,6 @@ error_dict = {
'issue_document_reference': None
}
if compute_node.getMonitorScope() == "disabled":
if batch_mode:
return None
......
......@@ -2,8 +2,9 @@ from DateTime import DateTime
tolerance = DateTime() - 0.5
software_installation = context
compute_node = software_installation.getAggregateValue()
reference = software_installation.getReference()
compute_node_title = software_installation.getAggregateTitle()
compute_node_title = compute_node.getTitle()
d = software_installation.getAccessStatus()
error_dict = {
......@@ -14,44 +15,52 @@ error_dict = {
'compute_node_title': compute_node_title,
'compute_node_id': reference,
'last_contact': None,
'message': None
}
if (software_installation.getCreationDate() > tolerance) or \
(software_installation.getSlapState() != 'start_requested') or \
(d.get("no_data", None) == 1) or \
(d.get("text").startswith("#access")):
if batch_mode:
return None
if software_installation.getCreationDate() > tolerance:
return error_dict
last_contact = DateTime(d.get('created_at'))
if d.get("text").startswith("#building"):
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
# All fine, we just provide a nice message but we don't notify
if compute_node.getMonitorScope() == "disabled":
error_dict['message'] = "Monitor is disabled on %s" % compute_node_title
return error_dict
error_dict['should_notify'] = True
error_dict['ticket_title'] = "%s is building for too long on %s" % (
reference, software_installation.getAggregateReference())
error_dict['ticket_description'] = "The software release %s is building for mode them 12 hours on %s, started on %s" % \
(software_installation.getUrlString(),
software_installation.getAggregateTitle(),
software_installation.getCreationDate())
if software_installation.getSlapState() != 'start_requested':
error_dict['message'] = "Software Installation is Destroyed or Stopped."
return error_dict
if d.get("text").startswith("#error"):
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
if d.get("no_data", None) == 1:
error_dict['message'] = "The software release %s did not started to build on %s since %s" % \
(context.getUrlString(), compute_node_title, context.getCreationDate())
return error_dict
access_status_text = d.get("text")
if access_status_text.startswith("#access"):
error_dict['message'] = "All OK, software built."
return error_dict
# An error occurred, we should notify
last_contact = DateTime(d.get('created_at'))
if access_status_text.startswith("#building") or \
access_status_text.startswith("#error"):
error_dict['last_contact'] = last_contact
error_dict['should_notify'] = True
error_dict['ticket_title'] = "%s is failing to build on %s" % (reference, software_installation.getAggregateReference())
error_dict['ticket_description'] = "The software release %s is failing to build for too long on %s, started on %s" % \
(software_installation.getUrlString(),
software_installation.getAggregateTitle(),
software_installation.getCreationDate())
error_dict['ticket_title'] = "%s is failing or taking too long to build on %s" % (
reference, compute_node.getReference())
message_list = (software_installation.getUrlString(),
compute_node_title,
software_installation.getCreationDate())
if access_status_text.startswith("#building"):
error_dict['ticket_description'] = \
"The software release %s is building for mode them 12 hours on %s, started on %s" % message_list
else:
error_dict['ticket_description'] = \
"The software release %s is failing to build for too long on %s, started on %s" % message_list
error_dict['message'] = error_dict['ticket_description']
return error_dict
if batch_mode:
return None
return error_dict
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>batch_mode=False</string> </value>
<value> <string></string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
......@@ -8,28 +8,24 @@ error_dict = {
'notification_message_reference': None,
'last_contact': None,
'since': None,
'error_text': None,
'message': None
}
# Nothing to do
if context.getSlapState() != "start_requested":
if batch_mode:
return
return error_dict
def updateErrorDictWithError(_error_dict):
_error_dict['should_notify'] = True
_error_dict['ticket_title'] = "Instance Tree %s is failing." % _error_dict['instance_tree_title']
description = "%s is unallocated, reporting errors or allocated on a closed compute node." % (
context.getTitle())
if _error_dict['message']:
description += "\n\nMessage: %s" % str(_error_dict['message'])
_error_dict['ticket_description'] = description
return _error_dict
compute_partition = context.getAggregateValue(portal_type="Compute Partition")
if compute_partition is None:
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-instance-allocation.notification'
error_dict['message'] = "%s is not allocated." % context.getTitle()
error_dict['ticket_description'] = error_dict['message']
return updateErrorDictWithError(error_dict)
compute_node = compute_partition.getParentValue()
......@@ -44,23 +40,17 @@ if compute_node.getPortalType() == "Compute Node" and \
if context.getPortalType() == 'Slave Instance':
# We skip if the slave is already allocated.
if batch_mode:
return
return error_dict
# Skip to check if monitor disabled on the compute node.
# Remote node has no state.
if compute_node.getPortalType() != "Compute Node":
if batch_mode:
return
portal_type = compute_partition.getParentValue().getPortalType()
error_dict['ticket_title'] = "Instance is allocated on a %s" % portal_type
error_dict['ticket_description'] = error_dict['ticket_title']
return error_dict
if compute_partition.getParentValue().getMonitorScope() != "enabled":
if batch_mode:
return
error_dict['ticket_title'] = "Monitor is disabled on the Compute Node"
error_dict['ticket_description'] = error_dict['ticket_title']
return error_dict
......@@ -68,22 +58,21 @@ if compute_partition.getParentValue().getMonitorScope() != "enabled":
d = context.getAccessStatus()
# Ignore if data isn't present.
if d.get("no_data", None) == 1:
if batch_mode:
return
error_dict['ticket_title'] = "Not possible to connect"
error_dict['ticket_description'] = "Not possible to connect"
return error_dict
error_dict['message'] = d['text']
error_dict['error_text'] = d['text']
error_dict['last_contact'] = DateTime(d.get('created_at'))
error_dict['since'] = DateTime(d.get('since'))
if error_dict['message'].startswith('#error '):
if error_dict['error_text'].startswith('#error '):
if ((DateTime()-error_dict['since'])*24*60) > tolerance:
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-instance-state.notification'
if batch_mode:
return True
description = "%s is reporting errors. \n\nMessage: %s" % (context.getTitle(), str(error_dict['error_text']))
error_dict['ticket_description'] = description
# Longer form for consistency.
error_dict['message'] = "%s has error (%s, %s at %s)" % (
context.getReference(), context.getTitle(), context.getUrlString(), compute_node.getReference())
return updateErrorDictWithError(error_dict)
if batch_mode:
return None
return error_dict
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>tolerance=0, batch_mode=False</string> </value>
<value> <string>tolerance=0</string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
from DateTime import DateTime
if context.getSimulationState() == "invalidated":
return "Closed Ticket"
if context.getPortalType() != "Support Request":
return "Not a Support Request"
now = DateTime()
portal = context.getPortalObject()
document = context.getAggregateValue()
if document is None:
return True
aggregate_portal_type = document.getPortalType()
if aggregate_portal_type == "Compute Node":
if document.getMonitorScope() == "disabled":
return "Monitor is disabled to the related %s." % document.getPortalType()
d = document.getAccessStatus()
if d.get("no_data", None) == 1:
return "No Contact Information"
last_contact = DateTime(d.get('created_at'))
if (now - last_contact) < 0.01:
ComputeNode_hasModifiedFile = getattr(
document, "ComputeNode_hasModifiedFile", None)
if ComputeNode_hasModifiedFile:
data_array = ComputeNode_hasModifiedFile()
if data_array:
return "Compute Node %s (%s) has modified file: %s" % (
document.getTitle(), document.getReference(), data_array.getReference())
# If server has no partitions skip
compute_partition_uid_list = [
x.getUid() for x in document.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy']
if compute_partition_uid_list:
is_instance_stalled = True
last_contact = None
instance_list = portal.portal_catalog(
portal_type='Software Instance',
default_aggregate_uid=compute_partition_uid_list)
for instance in instance_list:
instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None):
# Ignore if there isn't any data
continue
# At least one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
is_instance_stalled = False
break
if is_instance_stalled and len(instance_list):
if last_contact is None:
return "Process instance stalled"
return "Process instance stalled, last contact was %s" % last_contact
return "All OK, latest contact: %s " % last_contact
else:
return "Problem, latest contact: %s" % last_contact
if aggregate_portal_type == "Software Installation":
compute_node_title = document.getAggregateTitle()
if document.getAggregateValue().getMonitorScope() == "disabled":
return "Monitor is disabled to the related %s." % document.getPortalType()
if document.getSlapState() not in ["start_requested", "stop_requested"]:
return "Software Installation is Destroyed."
d = document.getAccessStatus()
if d.get("no_data", None) == 1:
return "The software release %s did not started to build on %s since %s" % \
(document.getUrlString(), compute_node_title, document.getCreationDate())
last_contact = DateTime(d.get('created_at'))
if d.get("text").startswith("building"):
return "The software release %s is building for mode them 12 hours on %s, started on %s" % \
(document.getUrlString(), compute_node_title, document.getCreationDate())
elif d.get("text").startswith("#access"):
return "All OK, software built."
elif d.get("text").startswith("#error"):
return "The software release %s is failing to build for too long on %s, started on %s" % \
(document.getUrlString(), compute_node_title, document.getCreationDate())
if aggregate_portal_type == "Instance Tree":
if document.getMonitorScope() == "disabled":
return "Monitor is disabled to the related %s." % document.getPortalType()
causality_portal_type_list = [
'Compute Node',
'Instance Tree',
'Software Installation'
]
if (context.getSimulationState() == "invalidated") or \
(context.getPortalType() != "Support Request") or \
(not context.getCausality(portal_type=causality_portal_type_list)):
# Nothing to check
return
document = context.getCausalityValue(portal_type=causality_portal_type_list)
causality_portal_type = document.getPortalType()
if causality_portal_type == "Compute Node":
error_dict = document.ComputeNode_getReportedErrorDict()
return error_dict['message']
if causality_portal_type == "Software Installation":
error_dict = document.SoftwareInstallation_getReportedErrorDict()
return error_dict['message']
if causality_portal_type == "Instance Tree":
message_list = []
instance_tree = document
......@@ -98,24 +30,8 @@ if aggregate_portal_type == "Instance Tree":
# Check if at least one software Instance is Allocated
for instance in software_instance_list:
if instance.getSlapState() not in ["start_requested", "stop_requested"]:
continue
if instance.getAggregate() is not None:
compute_node = instance.getAggregateValue().getParentValue()
if instance.getPortalType() == "Software Instance" and \
instance.getSlapState() == "start_requested" and \
instance.SoftwareInstance_hasReportedError():
message_list.append("%s has error (%s, %s at %s scope %s)" % (instance.getReference(), instance.getTitle(),
instance.getUrlString(), compute_node.getReference(),
compute_node.getAllocationScope()))
if instance.getPortalType() == "Software Instance" and \
compute_node.getAllocationScope() in ["closed/outdated"] and \
instance.getSlapState() == "start_requested" and \
instance.SoftwareInstance_hasReportedError():
message_list.append("%s on a %s compute_node" % (instance.getReference(), compute_node.getAllocationScope()) )
else:
message_list.append("%s is not allocated" % instance.getReference())
# Collect the reported error message for this instance, but only when the
# error dict says the problem is worth notifying about.
# NOTE(review): `instance` comes from software_instance_list, yet the
# SoftwareInstallation_* script is called here with a `tolerance` argument;
# confirm whether the instance-level *_getReportedErrorDict script was intended.
error_dict = instance.SoftwareInstallation_getReportedErrorDict(tolerance=30)
# The key written by the *_getReportedErrorDict scripts is 'should_notify';
# the previous spelling 'should_notitfy' does not match it and would raise
# KeyError on lookup.
if error_dict['should_notify']:
  message_list.append(error_dict['message'])
return ",".join(message_list)
return None
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment