Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
slapos.core
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
slapos.core
Commits
015add27
Commit
015add27
authored
Dec 20, 2022
by
Rafael Monnerat
Browse files
Options
Browse Files
Download
Plain Diff
slapos_crm: Check whether slapgrid is stalled whenever checking if the computer is down
See merge request
nexedi/slapos.core!468
parents
024787b4
5348fed0
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
230 additions
and
33 deletions
+230
-33
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
...tal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
+108
-33
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
...ateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
+122
-0
No files found.
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
View file @
015add27
...
...
@@ -13,29 +13,105 @@ if context.getAllocationScope("open").startswith("close"):
reference
=
context
.
getReference
()
compute_node_title
=
context
.
getTitle
()
ticket_title
=
"[MONITORING] Lost contact with compute_node %s"
%
reference
node_ticket_title
=
"[MONITORING] Lost contact with compute_node %s"
%
reference
instance_ticket_title
=
"[MONITORING] Compute Node %s has a stalled instance process"
%
reference
software_ticket_title
=
"[MONITORING] Compute Node %s has a stalled software process"
%
reference
ticket_title
=
node_ticket_title
description
=
""
last_contact
=
"No Contact Information"
notification_message_reference
=
'slapos-crm-compute_node_check_state.notification'
now
=
DateTime
()
d
=
context
.
getAccessStatus
()
# Ignore if data isn't present.
should_notify
=
False
if
d
.
get
(
"no_data"
)
==
1
:
should_notify
=
True
description
=
"The Compute Node %s (%s) has not contacted the server (No Contact Information)"
%
(
compute_node_title
,
reference
)
else
:
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
if
(
DateTime
()
-
last_contact
)
>
0.01
:
if
(
now
-
last_contact
)
>
0.01
:
should_notify
=
True
description
=
"The Compute Node %s (%s) has not contacted the server for more than 30 minutes"
\
"(last contact date: %s)"
%
(
compute_node_title
,
reference
,
last_contact
)
else
:
# Nothing to notify.
return
support_request
=
person
.
Base_getSupportRequestInProgress
(
if
not
should_notify
:
# Since server is contacting, check for stalled processes
ticket_title
=
instance_ticket_title
notification_message_reference
=
'slapos-crm-compute_node_check_stalled_instance_state.notification'
last_contact
=
"No Contact Information"
# If server has no partitions skip
compute_partition_uid_list
=
[
x
.
getUid
()
for
x
in
context
.
contentValues
(
portal_type
=
"Compute Partition"
)
if
x
.
getSlapState
()
==
'busy'
]
if
compute_partition_uid_list
:
instance_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Instance'
,
default_aggregate_uid
=
compute_partition_uid_list
)
if
instance_list
:
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its instances for more them 24 hours"
%
(
compute_node_title
,
reference
)
for
instance
in
instance_list
:
instance_access_status
=
instance
.
getAccessStatus
()
if
instance_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isn't any data
continue
# At least one partition contacted in the last 24h30min.
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.01
:
should_notify
=
False
description
=
""
break
if
not
should_notify
:
ticket_title
=
software_ticket_title
notification_message_reference
=
'slapos-crm-compute_node_check_stalled_software_state.notification'
last_contact
=
"No Contact Information"
# Since server is contacting, check for stalled software releases processes
software_installation_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Installation'
,
default_aggregate_uid
=
context
.
getUid
(),
validation_state
=
'validated'
)
if
software_installation_list
:
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its software releases for more them 24 hours"
%
(
compute_node_title
,
reference
)
# Check whether any validated Software Installation of this node reported
# within the last 1.01 days; if one did, the software process is not stalled.
for installation in software_installation_list:
  installation_access_status = installation.getAccessStatus()
  if installation_access_status.get('no_data', None):
    # No access data recorded for this installation: ignore it.
    continue

  # Use the timestamp of *this installation*. The previous code read
  # `instance_access_status` here, a leftover name from the Software Instance
  # loop: it is either stale (last instance visited) or not bound at all when
  # the node has no busy partition / no instance.
  installation_contact = DateTime(installation_access_status.get('created_at'))
  # NOTE(review): last_contact starts as the string "No Contact Information",
  # so this max() compares a DateTime with a str on the first pass - confirm
  # the resulting value is the one wanted in the notification mapping.
  last_contact = max(installation_contact, last_contact)
  if (now - installation_contact) < 1.01:
    # At least one installation was processed recently: nothing to report.
    should_notify = False
    description = ""
    break
if
should_notify
:
support_request
=
person
.
Base_getSupportRequestInProgress
(
title
=
node_ticket_title
,
aggregate
=
context
.
getRelativeUrl
())
if
support_request
is
None
:
support_request
=
person
.
Base_getSupportRequestInProgress
(
title
=
ticket_title
,
aggregate
=
context
.
getRelativeUrl
())
if
support_request
is
None
:
if
support_request
is
None
:
person
.
notify
(
support_request_title
=
ticket_title
,
support_request_description
=
description
,
aggregate
=
context
.
getRelativeUrl
())
...
...
@@ -46,25 +122,24 @@ if support_request is None:
support_request
=
portal
.
restrictedTraverse
(
support_request_relative_url
)
if
support_request
is
None
:
if
support_request
is
None
:
return
# Send Notification message
notification_message
=
portal
.
portal_notifications
.
getDocumentValue
(
reference
=
notification_message_reference
)
# Send Notification message
notification_message
=
portal
.
portal_notifications
.
getDocumentValue
(
reference
=
'slapos-crm-compute_node_check_state.notification'
)
if
notification_message
is
None
:
if
notification_message
is
None
:
message
=
"""%s"""
%
description
else
:
else
:
mapping_dict
=
{
'compute_node_title'
:
context
.
getTitle
(),
'compute_node_id'
:
reference
,
'last_contact'
:
last_contact
}
message
=
notification_message
.
asText
(
substitution_method_parameter_dict
=
{
'mapping_dict'
:
mapping_dict
})
event
=
support_request
.
SupportRequest_getLastEvent
(
ticket_title
)
if
event
is
None
:
event
=
support_request
.
SupportRequest_getLastEvent
(
ticket_title
)
if
event
is
None
:
support_request
.
notify
(
message_title
=
ticket_title
,
message
=
message
)
return
support_request
return
support_request
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
View file @
015add27
...
...
@@ -1113,6 +1113,128 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
self
.
assertEqual
(
event
.
getDestination
(),
ticket
.
getSourceSection
())
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs', 'return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  ('assert reference == "slapos-crm-compute_node_check_stalled_instance_state.notification", reference\n'
   'return context.restrictedTraverse('
   'context.REQUEST["test_ComputeNode_checkState_stalled_instance"])'))
def test_ComputeNode_checkState_stalled_instance(self):
  # Scenario: the compute node has its access status set outside the pinned
  # clock, while the software instance has its access status set with the
  # clock pinned 1.1 days in the past. ComputeNode_checkState is then expected
  # to produce a "stalled instance process" support request with one event.
  owner = self.makePerson(user=0)
  compute_node = self._makeComputeNode(owner=owner)[0]
  self._makeComplexComputeNode()

  administrator = compute_node.getSourceAdministrationValue()

  # The simulated NotificationTool_getDocumentValue looks this entry up in
  # the REQUEST under the key below.
  simulated_notification = self._makeNotificationMessage(
    compute_node.getReference())
  self.portal.REQUEST[
    'test_ComputeNode_checkState_stalled_instance'] = simulated_notification

  # Computer is getting access
  compute_node.setAccessStatus("")

  # Instance access status is written while "now" is 1.1 days ago.
  try:
    self.pinDateTime(DateTime() - 1.1)
    self.start_requested_software_instance.setAccessStatus("")
  finally:
    self.unpinDateTime()

  compute_node.ComputeNode_checkState()
  self.tic()

  expected_title = (
    "[MONITORING] Compute Node %s has a stalled instance process"
    % compute_node.getReference())
  support_request = self._getGeneratedSupportRequest(
    compute_node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  follow_up_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(follow_up_list), 1)
  event = follow_up_list[0]

  self.assertEqual(event.getTitle(), support_request.getTitle())
  self.assertIn(compute_node.getReference(), event.getTextContent())
  self.assertEqual(event.getDestination(), support_request.getSourceSection())
  self.assertEqual(event.getSource(), administrator.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs', 'return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  ('assert reference == "slapos-crm-compute_node_check_stalled_software_state.notification", reference\n'
   'return context.restrictedTraverse('
   'context.REQUEST["test_ComputeNode_checkState_stalled_software"])'))
def test_ComputeNode_checkState_stalled_software(self):
  # Scenario: the compute node and the software instance both have their
  # access status set outside the pinned clock; only the software
  # installation has its status set with the clock pinned 1.1 days in the
  # past. ComputeNode_checkState is then expected to produce a "stalled
  # software process" support request with one event.
  owner = self.makePerson(user=0)
  compute_node = self._makeComputeNode(owner=owner)[0]
  self._makeComplexComputeNode()

  administrator = compute_node.getSourceAdministrationValue()

  # The simulated NotificationTool_getDocumentValue looks this entry up in
  # the REQUEST under the key below.
  simulated_notification = self._makeNotificationMessage(
    compute_node.getReference())
  self.portal.REQUEST[
    'test_ComputeNode_checkState_stalled_software'] = simulated_notification

  # Computer is getting access, also internal instance
  compute_node.setAccessStatus("")
  self.start_requested_software_instance.setAccessStatus("")

  # Installation access status is written while "now" is 1.1 days ago.
  try:
    self.pinDateTime(DateTime() - 1.1)
    self.start_requested_software_installation.setAccessStatus("")
  finally:
    self.unpinDateTime()

  compute_node.ComputeNode_checkState()
  self.tic()

  expected_title = (
    "[MONITORING] Compute Node %s has a stalled software process"
    % compute_node.getReference())
  support_request = self._getGeneratedSupportRequest(
    compute_node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  follow_up_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(follow_up_list), 1)
  event = follow_up_list[0]

  self.assertEqual(event.getTitle(), support_request.getTitle())
  self.assertIn(compute_node.getReference(), event.getTextContent())
  self.assertEqual(event.getDestination(), support_request.getSourceSection())
  self.assertEqual(event.getSource(), administrator.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs', 'return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  ('assert reference == "slapos-crm-compute_node_check_stalled_instance_state.notification", reference\n'
   'return context.restrictedTraverse('
   'context.REQUEST["test_ComputeNode_checkState_stalled_instance"])'))
def test_ComputeNode_checkState_stalled_instance_single(self):
  # Scenario: both the software instance and the software installation have
  # their access status set with the clock pinned 1.1 days in the past, while
  # the compute node status is set outside the pinned clock. The expected
  # outcome is a single "stalled instance process" support request with one
  # event (the expected title is the instance one, not the software one).
  owner = self.makePerson(user=0)
  compute_node = self._makeComputeNode(owner=owner)[0]
  self._makeComplexComputeNode()

  administrator = compute_node.getSourceAdministrationValue()

  # Same REQUEST key as the decorator above reads back.
  simulated_notification = self._makeNotificationMessage(
    compute_node.getReference())
  self.portal.REQUEST[
    'test_ComputeNode_checkState_stalled_instance'] = simulated_notification

  # Computer is getting access
  compute_node.setAccessStatus("")

  # Instance and installation statuses are written while "now" is 1.1 days ago.
  try:
    self.pinDateTime(DateTime() - 1.1)
    self.start_requested_software_instance.setAccessStatus("")
    self.start_requested_software_installation.setAccessStatus("")
  finally:
    self.unpinDateTime()

  compute_node.ComputeNode_checkState()
  self.tic()

  expected_title = (
    "[MONITORING] Compute Node %s has a stalled instance process"
    % compute_node.getReference())
  support_request = self._getGeneratedSupportRequest(
    compute_node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  follow_up_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(follow_up_list), 1)
  event = follow_up_list[0]

  self.assertEqual(event.getTitle(), support_request.getTitle())
  self.assertIn(compute_node.getReference(), event.getTextContent())
  self.assertEqual(event.getDestination(), support_request.getSourceSection())
  self.assertEqual(event.getSource(), administrator.getRelativeUrl())
class
TestSlapOSInstanceTree_createSupportRequestEvent
(
SlapOSTestCaseMixin
):
def
_makeNotificationMessage
(
self
,
reference
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment