Commit b39fd3d6 authored by Rafael Monnerat

slapos_crm: Update SupportRequest_recheckMonitoring

  Update script to also handle stalled cases
parent f21f0465
...@@ -54,26 +54,25 @@ if not should_notify: ...@@ -54,26 +54,25 @@ if not should_notify:
compute_partition_uid_list = [ compute_partition_uid_list = [
x.getUid() for x in context.contentValues(portal_type="Compute Partition") x.getUid() for x in context.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy'] if x.getSlapState() == 'busy']
if compute_partition_uid_list: if compute_partition_uid_list:
instance_list = portal.portal_catalog( instance_list = portal.portal_catalog(
portal_type='Software Instance', portal_type='Software Instance',
default_aggregate_uid=compute_partition_uid_list) default_aggregate_uid=compute_partition_uid_list)
if instance_list: if instance_list:
should_notify = True should_notify = True
description = "The Compute Node %s (%s) didnt process its instances for more them 24 hours" % ( description = "The Compute Node %s (%s) didnt process its instances for more them 24 hours, last contact: %s"
compute_node_title, reference)
for instance in instance_list: for instance in instance_list:
instance_access_status = instance.getAccessStatus() instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None): if instance_access_status.get('no_data', None):
# Ignore if there isnt any data # Ignore if there isnt any data
continue continue
# At lest one partition contacted in the last 24h30min. # At lest one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact) last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.01: if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
should_notify = False should_notify = False
description = "" description = ""
break break
...@@ -91,8 +90,7 @@ if not should_notify: ...@@ -91,8 +90,7 @@ if not should_notify:
if software_installation_list: if software_installation_list:
should_notify = True should_notify = True
description = "The Compute Node %s (%s) didnt process its software releases for more them 24 hours" % ( description = "The Compute Node %s (%s) didnt process its software releases for more them 24 hours, last contact %s"
compute_node_title, reference)
# Test if server didnt process the internal softwares releases for more them 24h # Test if server didnt process the internal softwares releases for more them 24h
for installation in software_installation_list: for installation in software_installation_list:
...@@ -100,7 +98,7 @@ if not should_notify: ...@@ -100,7 +98,7 @@ if not should_notify:
if installation_access_status.get('no_data', None): if installation_access_status.get('no_data', None):
# Ignore if there isnt any data on it # Ignore if there isnt any data on it
continue continue
last_contact = max(DateTime(installation_access_status.get('created_at')), last_contact) last_contact = max(DateTime(installation_access_status.get('created_at')), last_contact)
if (now - DateTime(installation_access_status.get('created_at'))) < 1.01: if (now - DateTime(installation_access_status.get('created_at'))) < 1.01:
should_notify = False should_notify = False
...@@ -119,33 +117,33 @@ if should_notify: ...@@ -119,33 +117,33 @@ if should_notify:
if support_request is None: if support_request is None:
person.notify(support_request_title=ticket_title, person.notify(support_request_title=ticket_title,
support_request_description=description, support_request_description=description % (context.getTitle(), reference, last_contact),
aggregate=context.getRelativeUrl()) aggregate=context.getRelativeUrl())
support_request_relative_url = context.REQUEST.get("support_request_relative_url") support_request_relative_url = context.REQUEST.get("support_request_relative_url")
if support_request_relative_url is None: if support_request_relative_url is None:
return return
support_request = portal.restrictedTraverse(support_request_relative_url) support_request = portal.restrictedTraverse(support_request_relative_url)
if support_request is None: if support_request is None:
return return
# Send Notification message # Send Notification message
notification_message = portal.portal_notifications.getDocumentValue( notification_message = portal.portal_notifications.getDocumentValue(
reference=notification_message_reference) reference=notification_message_reference)
if notification_message is None: if notification_message is None:
message = """%s""" % description message = """%s""" % (description % (context.getTitle(), reference, last_contact))
else: else:
mapping_dict = {'compute_node_title':context.getTitle(), mapping_dict = {'compute_node_title':context.getTitle(),
'compute_node_id':reference, 'compute_node_id':reference,
'last_contact':last_contact} 'last_contact':last_contact}
message = notification_message.asText( message = notification_message.asText(
substitution_method_parameter_dict={'mapping_dict': mapping_dict}) substitution_method_parameter_dict={'mapping_dict': mapping_dict})
event = support_request.SupportRequest_getLastEvent(ticket_title) event = support_request.SupportRequest_getLastEvent(ticket_title)
if event is None: if event is None:
support_request.notify(message_title=ticket_title, message=message) support_request.notify(message_title=ticket_title, message=message)
return support_request return support_request
# #
# XXX This ticket contains duplicated code found around SlapOS # XXX This ticket contains duplicated code found around SlapOS
# It is required to rewrite this in a generic way. # It is required to rewrite this in a generic way.
# See also: InstanceTree_checkSoftwareInstanceState # See also: InstanceTree_checkSoftwareInstanceState
# See also: ComputeNode_checkState # See also: ComputeNode_checkState
# #
...@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated": ...@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated":
if context.getPortalType() != "Support Request": if context.getPortalType() != "Support Request":
return "Not a Support Request" return "Not a Support Request"
now = DateTime()
portal = context.getPortalObject()
document = context.getAggregateValue() document = context.getAggregateValue()
if document is None: if document is None:
return True return True
...@@ -25,13 +28,63 @@ if aggregate_portal_type == "Compute Node": ...@@ -25,13 +28,63 @@ if aggregate_portal_type == "Compute Node":
d = document.getAccessStatus() d = document.getAccessStatus()
if d.get("no_data", None) == 1: if d.get("no_data", None) == 1:
return "No Contact Information" return "No Contact Information"
last_contact = DateTime(d.get('created_at')) last_contact = DateTime(d.get('created_at'))
if (DateTime() - last_contact) < 0.01: if (now - last_contact) < 0.01:
# If server has no partitions skip
compute_partition_uid_list = [
x.getUid() for x in document.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy']
if compute_partition_uid_list:
is_instance_stalled = True
last_contact = None
instance_list = portal.portal_catalog(
portal_type='Software Instance',
default_aggregate_uid=compute_partition_uid_list)
for instance in instance_list:
instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None):
# Ignore if there isn't any data
continue
# At least one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
is_instance_stalled = False
break
if is_instance_stalled and len(instance_list):
return "Process instance stalled, last contact was %s" % last_contact
# Since server is contacting, check for stalled software releases processes
is_software_stalled = True
last_contact = None
software_installation_list = portal.portal_catalog(
portal_type='Software Installation',
default_aggregate_uid=document.getUid(),
validation_state='validated')
# Test if the server didn't process its internal software releases for more than 24h
for installation in software_installation_list:
installation_access_status = installation.getAccessStatus()
if installation_access_status.get('no_data', None):
# Ignore if there isn't any data on it
continue
last_contact = max(DateTime(installation_access_status.get('created_at')), last_contact)
if (now - DateTime(installation_access_status.get('created_at'))) < 1.01:
is_software_stalled = False
break
if is_software_stalled and len(software_installation_list):
return "Process instance stalled, last contact was %s" % last_contact
return "All OK, latest contact: %s " % last_contact return "All OK, latest contact: %s " % last_contact
else: else:
return "Problem, latest contact: %s" % last_contact return "Problem, latest contact: %s" % last_contact
if aggregate_portal_type == "Software Installation": if aggregate_portal_type == "Software Installation":
compute_node_title = document.getAggregateTitle() compute_node_title = document.getAggregateTitle()
if document.getAggregateValue().getMonitorScope() == "disabled": if document.getAggregateValue().getMonitorScope() == "disabled":
...@@ -40,11 +93,11 @@ if aggregate_portal_type == "Software Installation": ...@@ -40,11 +93,11 @@ if aggregate_portal_type == "Software Installation":
if document.getSlapState() not in ["start_requested", "stop_requested"]: if document.getSlapState() not in ["start_requested", "stop_requested"]:
return "Software Installation is Destroyed." return "Software Installation is Destroyed."
d = context.getAccessStatus() d = document.getAccessStatus()
if d.get("no_data", None) == 1: if d.get("no_data", None) == 1:
return "The software release %s did not started to build on %s since %s" % \ return "The software release %s did not started to build on %s since %s" % \
(document.getUrlString(), compute_node_title, document.getCreationDate()) (document.getUrlString(), compute_node_title, document.getCreationDate())
last_contact = DateTime(d.get('created_at')) last_contact = DateTime(d.get('created_at'))
if d.get("text").startswith("building"): if d.get("text").startswith("building"):
return "The software release %s is building for mode them 12 hours on %s, started on %s" % \ return "The software release %s is building for mode them 12 hours on %s, started on %s" % \
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment