Commit 2dde6bd4 authored by Rafael Monnerat's avatar Rafael Monnerat

slapos_crm: Merge ComputeNode_check* alarms into project based alarm

   Refactor the implementation of the alarms:

      - Search from the Project rather than querying compute nodes, directly respecting whether the project can create tickets
      - Split ticket creation from messaging introducing *_getReportedErrorDict to calculate the error messages (so we can re-use them)
       - Merge 2 alarms (for check compute node and check compute software installations) into a single one, so we launch at least half the amount of activities for compute nodes.
parent f835cdb0
# Alarm body: ask the catalog to call Project_checkMonitoringState on every
# validated Project, tagging the spawned activities with `tag`.
# `tag` is not assigned in this body; presumably it is a parameter of the
# script supplied by the caller -- TODO confirm.
catalog_tool = context.getPortalObject().portal_catalog
activity_kw = {'tag': tag}
catalog_tool.searchAndActivate(
  method_id='Project_checkMonitoringState',
  portal_type='Project',
  validation_state='validated',
  activate_kw=activity_kw
)

# Queue a trivial getId() call on the alarm itself with after_tag=tag;
# presumably this keeps the alarm "active" until the tagged activities
# have been processed -- verify against the activity tool semantics.
context.activate(after_tag=tag).getId()
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
</item> </item>
<item> <item>
<key> <string>id</string> </key> <key> <string>id</string> </key>
<value> <string>Alarm_checkComputeNodeState</string> </value> <value> <string>Alarm_checkProjectMonitoringState</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
# Alarm body: call ComputeNode_checkSoftwareInstallationState on every
# validated Compute Node whose monitor scope is the "enabled" category.
# `tag` is not assigned in this body; presumably it is a parameter of the
# script supplied by the caller -- TODO confirm.
portal = context.getPortalObject()

# Look the category up with a None default so that a missing category
# simply results in no search being launched.
enabled_scope = portal.restrictedTraverse(
  "portal_categories/monitor_scope/enabled", None)

if enabled_scope is not None:
  enabled_scope_uid = enabled_scope.getUid()
  portal.portal_catalog.searchAndActivate(
    method_id='ComputeNode_checkSoftwareInstallationState',
    portal_type='Compute Node',
    validation_state='validated',
    monitor_scope__uid=enabled_scope_uid,
    activate_kw={'tag': tag}
  )

# Always queue a trivial getId() on the alarm, ordered after the tagged
# activities (after_tag=tag).
context.activate(after_tag=tag).getId()
...@@ -8,51 +8,47 @@ project = context.getFollowUpValue() ...@@ -8,51 +8,47 @@ project = context.getFollowUpValue()
if project.Project_isSupportRequestCreationClosed(): if project.Project_isSupportRequestCreationClosed():
return return
software_installation_list = portal.portal_catalog( def createTicketForErrorDict(error_dict):
portal_type='Software Installation', if not error_dict['should_notify']:
aggregate__uid=context.getUid(), return
validation_state='validated',
sort_on=(('creation_date', 'DESC'),) support_request = project.Project_createSupportRequestWithCausality(
) error_dict['ticket_title'],
error_dict['ticket_description'],
support_request_list = [] causality=context.getRelativeUrl(),
should_notify = True destination_decision=project.getDestination()
)
tolerance = DateTime() - 0.5
for software_installation in software_installation_list: if support_request is not None:
should_notify = False
should_notify, ticket_title, description, last_contact = \
software_installation.SoftwareInstallation_hasReportedError(
tolerance=tolerance)
if should_notify:
project = context.getFollowUpValue()
support_request = project.Project_createSupportRequestWithCausality(
ticket_title,
description,
causality=context.getRelativeUrl(),
destination_decision=project.getDestination()
)
if support_request is None:
return
notification_message_reference = 'slapos-crm-compute_node_software_installation_state.notification'
support_request.Ticket_createProjectEvent( support_request.Ticket_createProjectEvent(
ticket_title, 'outgoing', 'Web Message', error_dict['ticket_title'], 'outgoing', 'Web Message',
portal.service_module.slapos_crm_information.getRelativeUrl(), portal.service_module.slapos_crm_information.getRelativeUrl(),
text_content=description, text_content=error_dict['ticket_description'],
content_type='text/plain', content_type='text/plain',
notification_message=notification_message_reference, notification_message=error_dict['notification_message_reference'],
#language=XXX, #language=XXX,
substitution_method_parameter_dict={ substitution_method_parameter_dict=error_dict
'compute_node_title':context.getTitle(),
# Maybe a mistake on compute_node_id
'compute_node_id': software_installation.getReference(),
'last_contact': last_contact
}
) )
return support_request
support_request_list = []
support_request = createTicketForErrorDict(
context.ComputeNode_getReportedErrorDict())
if support_request is not None:
support_request_list.append(support_request)
# Check all instances
for software_installation in portal.portal_catalog(
portal_type='Software Installation',
aggregate__uid=context.getUid(),
validation_state='validated',
sort_on=(('creation_date', 'DESC'),)
):
support_request = createTicketForErrorDict(
software_installation.SoftwareInstallation_getReportedErrorDict())
if support_request is not None:
support_request_list.append(support_request) support_request_list.append(support_request)
return support_request_list return support_request_list
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
</item> </item>
<item> <item>
<key> <string>id</string> </key> <key> <string>id</string> </key>
<value> <string>ComputeNode_checkState</string> </value> <value> <string>ComputeNode_checkMonitoringState</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
# Check the state of this Compute Node (`context`) and, when a problem is
# detected, create a support request on the node's project together with an
# outgoing 'Web Message' event.
#
# Returns the support request when one was obtained from
# Project_createSupportRequestWithCausality, otherwise None.
#
# Problems are looked for in this order:
#   1. the node never reported (access status has no_data == 1);
#   2. the node's last report is older than the threshold below;
#   3. ComputeNode_hasModifiedFile() returns something;
#   4. none of the instances on busy partitions reported recently.
from DateTime import DateTime
portal = context.getPortalObject()

# Monitoring explicitly disabled on this node: nothing to check.
if (context.getMonitorScope() == "disabled"):
  return

# The project decides whether tickets may be created at all.
project = context.getFollowUpValue()
if project.Project_isSupportRequestCreationClosed():
  return

reference = context.getReference()
compute_node_title = context.getTitle()

node_ticket_title = "Lost contact with compute_node %s" % reference
instance_ticket_title = "Compute Node %s has a stalled instance process" % reference
# Defaults describe the "lost contact" case; later branches override them.
ticket_title = node_ticket_title
description = ""
last_contact = "No Contact Information"
issue_document_reference = ""
notification_message_reference = 'slapos-crm-compute_node_check_state.notification'

now = DateTime()
d = context.getAccessStatus()

# Decide whether the node itself has a problem.
should_notify = False
if d.get("no_data") == 1:
  # The node has no access status data at all.
  should_notify = True
  description = "The Compute Node %s (%s) has not contacted the server (No Contact Information)" % (
    compute_node_title, reference)
else:
  last_contact = DateTime(d.get('created_at'))
  # NOTE(review): if DateTime differences are expressed in days, 0.01 is
  # about 14 minutes, while the message below says 30 minutes -- confirm.
  if (now - last_contact) > 0.01:
    should_notify = True
    description = "The Compute Node %s (%s) has not contacted the server for more than 30 minutes" \
      "(last contact date: %s)" % (compute_node_title, reference, last_contact)
  else:
    # The node is reporting; check whether it reported a modified file.
    data_array = context.ComputeNode_hasModifiedFile()
    if data_array:
      should_notify = True
      notification_message_reference = "slapos-crm-compute_node_check_modified_file.notification"
      ticket_title = "Compute Node %s has modified file" % reference
      issue_document_reference = data_array.getReference()
      description = "The Compute Node %s (%s) has modified file: %s" % (compute_node_title, reference, issue_document_reference)

if not should_notify:
  # Since server is contacting, check for stalled processes
  ticket_title = instance_ticket_title
  notification_message_reference = 'slapos-crm-compute_node_check_stalled_instance_state.notification'
  last_contact = "No Contact Information"

  # If server has no partitions skip
  compute_partition_uid_list = [
    x.getUid() for x in context.contentValues(portal_type="Compute Partition")
    if x.getSlapState() == 'busy']

  if compute_partition_uid_list:
    instance_list = portal.portal_catalog(
      portal_type='Software Instance',
      aggregate__uid=compute_partition_uid_list)

    # Assume a stall as soon as there is at least one instance; the loop
    # below clears the flag when a recent report is found.
    if instance_list:
      should_notify = True

    for instance in instance_list:
      instance_access_status = instance.getAccessStatus()
      if instance_access_status.get('no_data', None):
        # Ignore if there isn't any data
        continue

      # At least one partition contacted in the last 24h30min.
      # NOTE(review): on the first pass `last_contact` is still the string
      # "No Contact Information", so max() compares a DateTime with a str;
      # the outcome depends on the interpreter/DateTime comparison rules --
      # confirm this yields the intended value.
      last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
      # NOTE(review): 1.05 (presumably days) is a little over 25 hours, not
      # the 24h30min mentioned above -- confirm the intended threshold.
      if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
        should_notify = False
        break

    if should_notify:
      description = "The Compute Node %s (%s) didnt process its instances for more than 24 hours, last contact: %s" % (
        context.getTitle(), context.getReference(), last_contact)

if should_notify:
  support_request = project.Project_createSupportRequestWithCausality(
    ticket_title,
    description,
    causality=context.getRelativeUrl(),
    destination_decision=project.getDestination()
  )
  # The project helper may return None; in that case there is nothing to
  # attach an event to.
  if support_request is None:
    return

  support_request.Ticket_createProjectEvent(
    ticket_title, 'outgoing', 'Web Message',
    portal.service_module.slapos_crm_information.getRelativeUrl(),
    text_content=description,
    content_type='text/plain',
    notification_message=notification_message_reference,
    #language=XXX,
    substitution_method_parameter_dict={
      'compute_node_title':context.getTitle(),
      'compute_node_id':reference,
      'last_contact':last_contact,
      'issue_document_reference': issue_document_reference
    }
  )
  return support_request
portal = context.getPortalObject()
compute_node = context
reference = context.getReference()
compute_node_title = context.getTitle()
d = compute_node.getAccessStatus()
error_dict = {
'should_notify': None,
'ticket_title': None,
'ticket_description': None,
'notification_message_reference': None,
'compute_node_title': compute_node_title,
'compute_node_id': reference,
'last_contact': None,
'issue_document_reference': None
}
if compute_node.getMonitorScope() == "disabled":
if batch_mode:
return None
for i in ['ticket_title', 'ticket_description', 'last_contact']:
error_dict[i] = "Monitor is disabled on this Compute Node."
return error_dict
if d.get("no_data") == 1:
error_dict['last_contact'] = "No Contact Information"
if batch_mode:
return error_dict['last_contact']
error_dict['ticket_title'] = "Lost contact with compute_node %s" % reference
error_dict['ticket_description'] = \
"The Compute Node %s (%s) has not contacted the server (No Contact Information)" % (
compute_node_title.getTitle(), reference())
error_dict['notification_message_reference'] = 'slapos-crm-compute_node_check_state.notification'
error_dict['should_notify'] = True
return error_dict
last_contact = DateTime(d.get('created_at'))
now = DateTime()
if (now - last_contact) > 0.01:
error_dict['should_notify'] = True
error_dict['ticket_title'] = "Lost contact with compute_node %s" % reference
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
error_dict['notification_message_reference'] = 'slapos-crm-compute_node_check_state.notification'
error_dict['ticket_description'] = "The Compute Node %s (%s) has not contacted the server for more than 30 minutes" \
"(last contact date: %s)" % (compute_node_title, reference, last_contact)
return error_dict
data_array = context.ComputeNode_hasModifiedFile()
if data_array:
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
error_dict['should_notify'] = True
error_dict['notification_message_reference'] = "slapos-crm-compute_node_check_modified_file.notification"
error_dict['ticket_title'] = "Compute Node %s has modified file" % reference
error_dict['issue_document_reference'] = data_array.getReference()
error_dict['ticket_description'] = "The Compute Node %s (%s) has modified file: %s" % (
compute_node_title, reference, error_dict['issue_document_reference'])
return error_dict
# Since server is contacting, check for stalled processes
# If server has no partitions skip
compute_partition_uid_list = [
x.getUid() for x in context.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy']
if compute_partition_uid_list:
instance_list = portal.portal_catalog(
portal_type='Software Instance',
aggregate__uid=compute_partition_uid_list)
should_notify = True
instance_last_contact = -1
for instance in instance_list:
instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None):
# Ignore if there isnt any data
continue
# At lest one partition contacted in the last 24h30min.
instance_last_contact = max(DateTime(instance_access_status.get('created_at')),
instance_last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
should_notify = False
break
if len(instance_list) and should_notify:
if instance_last_contact == -1:
error_dict['last_contact'] = "No Contact Information"
else:
error_dict['last_contact'] = instance_last_contact
if batch_mode:
return error_dict['last_contact']
error_dict['should_notify'] = True
error_dict['notification_message_reference'] = "slapos-crm-compute_node_check_modified_file.notification"
error_dict['ticket_title'] = "Compute Node %s has a stalled instance process" % reference
error_dict['ticket_description'] = "The Compute Node %s (%s) didnt process its instances for more than 24 hours, last contact from the node: %s" % (
compute_node_title, reference, last_contact)
return error_dict
if batch_mode:
return
return error_dict
...@@ -50,11 +50,11 @@ ...@@ -50,11 +50,11 @@
</item> </item>
<item> <item>
<key> <string>_params</string> </key> <key> <string>_params</string> </key>
<value> <string>tolerance=None, batch_mode=False</string> </value> <value> <string>batch_mode=False</string> </value>
</item> </item>
<item> <item>
<key> <string>id</string> </key> <key> <string>id</string> </key>
<value> <string>SoftwareInstallation_hasReportedError</string> </value> <value> <string>ComputeNode_getReportedErrorDict</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
portal = context.getPortalObject() portal = context.getPortalObject()
monitor_enabled_category = portal.restrictedTraverse( monitor_enabled_category = portal.restrictedTraverse(
"portal_categories/monitor_scope/enabled", None) "portal_categories/monitor_scope/enabled", None)
if context.Project_isSupportRequestCreationClosed():
return
if monitor_enabled_category is not None: if monitor_enabled_category is not None:
portal.portal_catalog.searchAndActivate( portal.portal_catalog.searchAndActivate(
portal_type='Compute Node', portal_type='Compute Node',
validation_state='validated', validation_state='validated',
monitor_scope__uid=monitor_enabled_category.getUid(), monitor_scope__uid=monitor_enabled_category.getUid(),
method_id='ComputeNode_checkState', follow_up__uid=context.getUid(),
activate_kw={'tag':tag} method_id='ComputeNode_checkMonitoringState',
activate_kw={'tag': tag}
) )
context.activate(after_tag=tag).getId() context.activate(after_tag=tag).getId()
...@@ -50,11 +50,11 @@ ...@@ -50,11 +50,11 @@
</item> </item>
<item> <item>
<key> <string>_params</string> </key> <key> <string>_params</string> </key>
<value> <string></string> </value> <value> <string>tag</string> </value>
</item> </item>
<item> <item>
<key> <string>id</string> </key> <key> <string>id</string> </key>
<value> <string>ComputeNode_checkSoftwareInstallationState</string> </value> <value> <string>Project_checkMonitoringState</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
from DateTime import DateTime
tolerance = DateTime() - 0.5
software_installation = context
reference = software_installation.getReference()
compute_node_title = software_installation.getAggregateTitle()
d = software_installation.getAccessStatus()
error_dict = {
'should_notify': None,
'ticket_title': None,
'ticket_description': None,
'notification_message_reference': 'slapos-crm-compute_node_software_installation_state.notification',
'compute_node_title': compute_node_title,
'compute_node_id': reference,
'last_contact': None,
}
if (software_installation.getCreationDate() > tolerance) or \
(software_installation.getSlapState() != 'start_requested') or \
(d.get("no_data", None) == 1) or \
(d.get("text").startswith("#access")):
if batch_mode:
return None
return error_dict
last_contact = DateTime(d.get('created_at'))
if d.get("text").startswith("#building"):
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
error_dict['should_notify'] = True
error_dict['ticket_title'] = "%s is building for too long on %s" % (
reference, software_installation.getAggregateReference())
error_dict['ticket_description'] = "The software release %s is building for mode them 12 hours on %s, started on %s" % \
(software_installation.getUrlString(),
software_installation.getAggregateTitle(),
software_installation.getCreationDate())
return error_dict
if d.get("text").startswith("#error"):
error_dict['last_contact'] = last_contact
if batch_mode:
return error_dict['last_contact']
error_dict['should_notify'] = True
error_dict['ticket_title'] = "%s is failing to build on %s" % (reference, software_installation.getAggregateReference())
error_dict['ticket_description'] = "The software release %s is failing to build for too long on %s, started on %s" % \
(software_installation.getUrlString(),
software_installation.getAggregateTitle(),
software_installation.getCreationDate())
return error_dict
if batch_mode:
return None
return error_dict
...@@ -50,11 +50,11 @@ ...@@ -50,11 +50,11 @@
</item> </item>
<item> <item>
<key> <string>_params</string> </key> <key> <string>_params</string> </key>
<value> <string>tag, fixit, params</string> </value> <value> <string>batch_mode=False</string> </value>
</item> </item>
<item> <item>
<key> <string>id</string> </key> <key> <string>id</string> </key>
<value> <string>Alarm_checkSoftwareInstallationState</string> </value> <value> <string>SoftwareInstallation_getReportedErrorDict</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
from DateTime import DateTime
if tolerance is None:
tolerance = DateTime() - 0.5
software_installation = context
reference = software_installation.getReference()
d = software_installation.getAccessStatus()
def return_ok(batch_mode):
if batch_mode:
return None
return None, None, None, None
if software_installation.getCreationDate() > tolerance:
return return_ok(batch_mode)
if software_installation.getSlapState() != 'start_requested':
return return_ok(batch_mode)
if d.get("no_data", None) == 1:
return return_ok(batch_mode)
if d.get("text").startswith("#access"):
return return_ok(batch_mode)
last_contact = DateTime(d.get('created_at'))
if d.get("text").startswith("#building"):
if batch_mode:
# is it a problem...?
return last_contact
should_notify = True
ticket_title = "%s is building for too long on %s" % (reference, software_installation.getAggregateReference())
description = "The software release %s is building for mode them 12 hours on %s, started on %s" % \
(software_installation.getUrlString(), software_installation.getAggregateTitle(), software_installation.getCreationDate())
return should_notify, ticket_title, description, last_contact
if d.get("text").startswith("#error"):
if batch_mode:
return DateTime(d.get('created_at'))
should_notify = True
ticket_title = "%s is failing to build on %s" % (reference, software_installation.getAggregateReference())
description = "The software release %s is failing to build for too long on %s, started on %s" % \
(software_installation.getUrlString(), software_installation.getAggregateTitle(), software_installation.getCreationDate())
return should_notify, ticket_title, description, last_contact
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment