Commit d8f23f51 authored by Rémy Coutable

Merge branch '33184-associate-self-managed-prometheus-alerts-and-issues' into 'master'

Associate self-managed Prometheus Alerts and Issues

Closes #33184

See merge request gitlab-org/gitlab!18046
parents 3f4c19cd 816d4b69
# frozen_string_literal: true
# Creates the `self_managed_prometheus_alert_events` table, which stores
# firing/resolved events for Prometheus alerts identified by a per-project
# `payload_key`.
class AddSelfManagedPrometheusAlerts < ActiveRecord::Migration[5.2]
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
# Reversible: only `create_table` is used.
def change
create_table :self_managed_prometheus_alert_events do |t|
# No standalone index on project_id; presumably the composite unique index
# declared at the end of this table (which leads with project_id) is meant
# to serve those lookups - confirm.
t.references :project, index: false, foreign_key: { on_delete: :cascade }, null: false
# Optional association: environment_id is nullable.
t.references :environment, index: true, foreign_key: { on_delete: :cascade }
t.datetime_with_timezone :started_at, null: false
# Nullable end timestamp.
t.datetime_with_timezone :ended_at
# Small integer status; NOT NULL.
t.integer :status, null: false, limit: 2
t.string :title, null: false, limit: 255
t.string :query_expression, limit: 255
t.string :payload_key, null: false, limit: 255
# A payload key may appear only once per project.
t.index [:project_id, :payload_key], unique: true, name: 'idx_project_id_payload_key_self_managed_prometheus_alert_events'
end
end
end
# frozen_string_literal: true
# Creates the join table linking issues to self-managed Prometheus alert
# events. The table has no primary key column (`id: false`).
class AddJoinTableForSelfManagedPrometheusAlertIssues < ActiveRecord::Migration[5.2]
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
# Reversible: only `create_table` is used.
def change
# Join table to Issues
create_table :issues_self_managed_prometheus_alert_events, id: false do |t|
t.references :issue, null: false,
index: false, # Uses the index below
foreign_key: { on_delete: :cascade }
# NOTE(review): the index name below is missing the "p" of "prometheus"
# ("rometheus") and starts with "issue_id_" although the indexed column is
# self_managed_prometheus_alert_event_id. It is left as-is here because the
# name is part of the created schema; renaming needs a coordinated change.
t.references :self_managed_prometheus_alert_event, null: false,
index: { name: 'issue_id_issues_self_managed_rometheus_alert_events_index' },
foreign_key: { on_delete: :cascade }
t.timestamps_with_timezone
# Each (issue, event) pair may be linked only once; issue_id is the leading
# column, which is why the issue reference above declares no index of its own.
t.index [:issue_id, :self_managed_prometheus_alert_event_id],
unique: true, name: 'issue_id_self_managed_prometheus_alert_event_id_index'
end
end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2019_10_16_072826) do
ActiveRecord::Schema.define(version: 2019_10_16_220135) do
# These are extensions that must be enabled in order to support this database
enable_extension "pg_trgm"
......@@ -1945,6 +1945,15 @@ ActiveRecord::Schema.define(version: 2019_10_16_072826) do
t.index ["prometheus_alert_event_id"], name: "issue_id_issues_prometheus_alert_events_index"
end
create_table "issues_self_managed_prometheus_alert_events", id: false, force: :cascade do |t|
t.bigint "issue_id", null: false
t.bigint "self_managed_prometheus_alert_event_id", null: false
t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false
t.index ["issue_id", "self_managed_prometheus_alert_event_id"], name: "issue_id_self_managed_prometheus_alert_event_id_index", unique: true
t.index ["self_managed_prometheus_alert_event_id"], name: "issue_id_issues_self_managed_rometheus_alert_events_index"
end
create_table "jira_connect_installations", force: :cascade do |t|
t.string "client_key"
t.string "encrypted_shared_secret"
......@@ -3318,6 +3327,19 @@ ActiveRecord::Schema.define(version: 2019_10_16_072826) do
t.index ["group_id", "token_encrypted"], name: "index_scim_oauth_access_tokens_on_group_id_and_token_encrypted", unique: true
end
create_table "self_managed_prometheus_alert_events", force: :cascade do |t|
t.bigint "project_id", null: false
t.bigint "environment_id"
t.datetime_with_timezone "started_at", null: false
t.datetime_with_timezone "ended_at"
t.integer "status", limit: 2, null: false
t.string "title", limit: 255, null: false
t.string "query_expression", limit: 255
t.string "payload_key", limit: 255, null: false
t.index ["environment_id"], name: "index_self_managed_prometheus_alert_events_on_environment_id"
t.index ["project_id", "payload_key"], name: "idx_project_id_payload_key_self_managed_prometheus_alert_events", unique: true
end
create_table "sent_notifications", id: :serial, force: :cascade do |t|
t.integer "project_id"
t.integer "noteable_id"
......@@ -4150,6 +4172,8 @@ ActiveRecord::Schema.define(version: 2019_10_16_072826) do
add_foreign_key "issues", "users", column: "updated_by_id", name: "fk_ffed080f01", on_delete: :nullify
add_foreign_key "issues_prometheus_alert_events", "issues", on_delete: :cascade
add_foreign_key "issues_prometheus_alert_events", "prometheus_alert_events", on_delete: :cascade
add_foreign_key "issues_self_managed_prometheus_alert_events", "issues", on_delete: :cascade
add_foreign_key "issues_self_managed_prometheus_alert_events", "self_managed_prometheus_alert_events", on_delete: :cascade
add_foreign_key "jira_connect_subscriptions", "jira_connect_installations", on_delete: :cascade
add_foreign_key "jira_connect_subscriptions", "namespaces", on_delete: :cascade
add_foreign_key "jira_tracker_data", "services", on_delete: :cascade
......@@ -4289,6 +4313,8 @@ ActiveRecord::Schema.define(version: 2019_10_16_072826) do
add_foreign_key "reviews", "users", column: "author_id", on_delete: :nullify
add_foreign_key "saml_providers", "namespaces", column: "group_id", on_delete: :cascade
add_foreign_key "scim_oauth_access_tokens", "namespaces", column: "group_id", on_delete: :cascade
add_foreign_key "self_managed_prometheus_alert_events", "environments", on_delete: :cascade
add_foreign_key "self_managed_prometheus_alert_events", "projects", on_delete: :cascade
add_foreign_key "services", "projects", name: "fk_71cce407f9", on_delete: :cascade
add_foreign_key "slack_integrations", "services", on_delete: :cascade
add_foreign_key "smartcard_identities", "users", on_delete: :cascade
......
# frozen_string_literal: true
# Shared lifecycle for alert event models: validations, a `status` state
# machine (none -> firing -> resolved), and query scopes. Everything inside
# `included` is evaluated in the context of the including model class.
module AlertEventLifecycle
extend ActiveSupport::Concern
included do
validates :started_at, presence: true
validates :status, presence: true
state_machine :status, initial: :none do
# Initial state, stored as nil.
state :none, value: nil
# A firing event (stored as 0) must have a payload key and no end time.
state :firing, value: 0 do
validates :payload_key, presence: true
validates :ended_at, absence: true
end
# A resolved event (stored as 1) must have an end time.
state :resolved, value: 1 do
validates :ended_at, presence: true
end
event :fire do
transition none: :firing
end
event :resolve do
transition firing: :resolved
end
# The first argument given to the transition into :firing becomes
# `started_at` (nil when no argument is passed).
before_transition to: :firing do |alert_event, transition|
started_at = transition.args.first
alert_event.started_at = started_at
end
# The first argument given to the transition into :resolved becomes
# `ended_at`; the current time is used when it is nil or omitted.
before_transition to: :resolved do |alert_event, transition|
ended_at = transition.args.first
alert_event.ended_at = ended_at || Time.current
end
end
scope :firing, -> { where(status: status_value_for(:firing)) }
scope :resolved, -> { where(status: status_value_for(:resolved)) }
# NOTE: the lambda ends with `.count`, so it executes a grouped count
# instead of returning a chainable relation.
scope :count_by_project_id, -> { group(:project_id).count }
# Returns the stored value configured for the named status state
# (e.g. :firing -> 0). Defined with `def self.` inside `included`, so it
# becomes a class method of the including model.
def self.status_value_for(name)
state_machines[:status].states[name].value
end
end
end
......@@ -27,6 +27,7 @@ module EE
end
end
has_and_belongs_to_many :self_managed_prometheus_alert_events, join_table: :issues_self_managed_prometheus_alert_events
has_and_belongs_to_many :prometheus_alert_events, join_table: :issues_prometheus_alert_events
has_many :prometheus_alerts, through: :prometheus_alert_events
......
......@@ -81,6 +81,7 @@ module EE
has_many :prometheus_alerts, inverse_of: :project
has_many :prometheus_alert_events, inverse_of: :project
has_many :self_managed_prometheus_alert_events, inverse_of: :project
has_many :operations_feature_flags, class_name: 'Operations::FeatureFlag'
has_one :operations_feature_flags_client, class_name: 'Operations::FeatureFlagsClient'
......
# frozen_string_literal: true
class PrometheusAlertEvent < ApplicationRecord
include AlertEventLifecycle
belongs_to :project, required: true, validate: true, inverse_of: :prometheus_alert_events
belongs_to :prometheus_alert, required: true, validate: true, inverse_of: :prometheus_alert_events
has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_prometheus_alert_events
validates :payload_key, uniqueness: { scope: :prometheus_alert_id }
validates :started_at, presence: true
validates :status, presence: true
delegate :title, :prometheus_metric_id, to: :prometheus_alert
state_machine :status, initial: :none do
state :none, value: nil
state :firing, value: 0 do
validates :payload_key, presence: true
validates :ended_at, absence: true
end
state :resolved, value: 1 do
validates :payload_key, absence: true
validates :ended_at, presence: true
end
event :fire do
transition none: :firing
end
event :resolve do
transition firing: :resolved
end
before_transition to: :firing do |alert_event, transition|
started_at = transition.args.first
alert_event.started_at = started_at
end
before_transition to: :resolved do |alert_event, transition|
ended_at = transition.args.first
alert_event.payload_key = nil
alert_event.ended_at = ended_at
end
end
scope :firing, -> { where(status: status_value_for(:firing)) }
scope :resolved, -> { where(status: status_value_for(:resolved)) }
scope :for_environment, -> (environment) do
joins(:prometheus_alert).where(prometheus_alerts: { environment_id: environment })
end
scope :count_by_project_id, -> { group(:project_id).count }
scope :with_prometheus_alert, -> { includes(:prometheus_alert) }
def self.last_by_project_id
......
# frozen_string_literal: true
# Alert event recorded for a Prometheus alert that is looked up by a
# per-project payload key. Lifecycle behaviour (status state machine,
# validations, scopes) comes from AlertEventLifecycle.
class SelfManagedPrometheusAlertEvent < ApplicationRecord
  include AlertEventLifecycle

  belongs_to :project, validate: true, inverse_of: :self_managed_prometheus_alert_events
  belongs_to :environment, validate: true

  has_and_belongs_to_many :related_issues, class_name: 'Issue', join_table: :issues_self_managed_prometheus_alert_events

  # A payload key may be used only once within a project.
  validates :payload_key, uniqueness: { scope: :project_id }

  class << self
    # Finds the event for +project+ and +payload_key+, or initializes a new
    # (unsaved) one. An optional block is yielded the event by
    # `find_or_initialize_by`.
    def find_or_initialize_by_payload_key(project, payload_key)
      find_or_initialize_by(project: project, payload_key: payload_key) { |event| yield event if block_given? }
    end

    # Returns the SHA1 hex digest of the three values joined with '/'.
    def payload_key_for(started_at, alert_name, query_expression)
      Digest::SHA1.hexdigest([started_at, alert_name, query_expression].join('/'))
    end
  end
end
......@@ -16,31 +16,23 @@ module Projects
end
def create_event(payload)
return unless payload.respond_to?(:dig)
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: payload)
status = payload.dig('status')
return unless status
return unless parsed_alert.valid?
started_at = validate_date(payload['startsAt'])
return unless started_at
ended_at = validate_date(payload['endsAt'])
return unless ended_at
gitlab_alert_id = payload.dig('labels', 'gitlab_alert_id')
return unless gitlab_alert_id
alert = find_alert(gitlab_alert_id)
return unless alert
payload_key = PrometheusAlertEvent.payload_key_for(gitlab_alert_id, started_at)
event = PrometheusAlertEvent.find_or_initialize_by_payload_key(project, alert, payload_key)
event = if parsed_alert.gitlab_managed?
build_managed_prometheus_alert_event(parsed_alert)
else
build_self_managed_prometheus_alert_event(parsed_alert)
end
result = case status
if event
result = case parsed_alert.status
when 'firing'
event.fire(started_at)
event.fire(parsed_alert.starts_at)
when 'resolved'
event.resolve(ended_at)
event.resolve(parsed_alert.ends_at)
end
end
event if result
......@@ -57,12 +49,24 @@ module Projects
.first
end
def validate_date(date)
return unless date
def build_managed_prometheus_alert_event(parsed_alert)
alert = find_alert(parsed_alert.metric_id)
Time.rfc3339(date)
date
rescue ArgumentError
return if alert.blank?
payload_key = PrometheusAlertEvent.payload_key_for(parsed_alert.metric_id, parsed_alert.starts_at_raw)
PrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, alert, payload_key)
end
# Finds or initializes the SelfManagedPrometheusAlertEvent matching the parsed
# alert. The payload key is derived from the raw start timestamp, the title
# and the full query; environment, title and query expression are assigned
# inside the block passed to `find_or_initialize_by_payload_key`.
def build_self_managed_prometheus_alert_event(parsed_alert)
  raw_start = parsed_alert.starts_at_raw
  alert_title = parsed_alert.title
  query = parsed_alert.full_query
  key = SelfManagedPrometheusAlertEvent.payload_key_for(raw_start, alert_title, query)

  SelfManagedPrometheusAlertEvent.find_or_initialize_by_payload_key(parsed_alert.project, key) do |new_event|
    new_event.environment = parsed_alert.environment
    new_event.title = alert_title
    new_event.query_expression = query
  end
end
end
end
......
......@@ -10,7 +10,8 @@ module IncidentManagement
project = find_project(project_id)
return unless project
event = find_prometheus_alert_event(alert_hash)
parsed_alert = Gitlab::Alerting::Alert.new(project: project, payload: alert_hash)
event = find_prometheus_alert_event(parsed_alert)
issue = create_issue(project, alert_hash)
relate_issue_to_event(event, issue)
......@@ -22,14 +23,26 @@ module IncidentManagement
Project.find_by_id(project_id)
end
def find_prometheus_alert_event(alert_hash)
started_at = alert_hash.dig('startsAt')
gitlab_alert_id = alert_hash.dig('labels', 'gitlab_alert_id')
payload_key = PrometheusAlertEvent.payload_key_for(gitlab_alert_id, started_at)
# Looks up the stored event for the parsed alert, dispatching on whether the
# alert reports itself as GitLab-managed.
def find_prometheus_alert_event(alert)
  return find_gitlab_managed_event(alert) if alert.gitlab_managed?

  find_self_managed_event(alert)
end
# Finds the PrometheusAlertEvent whose payload key is derived from the alert's
# metric id and raw start timestamp.
def find_gitlab_managed_event(alert)
  PrometheusAlertEvent.find_by_payload_key(
    PrometheusAlertEvent.payload_key_for(alert.metric_id, alert.starts_at_raw)
  )
end
# Finds the SelfManagedPrometheusAlertEvent for the parsed alert.
#
# The payload key is derived from the raw start timestamp, the title and the
# full query. Payload keys are only unique per project, so the lookup is
# scoped to the alert's project; a key-only lookup could return an event that
# belongs to a different project.
#
# Returns the matching event, or nil when none exists.
def find_self_managed_event(alert)
  payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(alert.starts_at_raw, alert.title, alert.full_query)

  SelfManagedPrometheusAlertEvent.find_by(project: alert.project, payload_key: payload_key)
end
def create_issue(project, alert)
IncidentManagement::CreateIssueService
.new(project, alert)
......
---
title: Associate self-managed Prometheus Alerts and Issues
merge_request: 18046
author:
type: added
......@@ -15,6 +15,12 @@ module Gitlab
end
end
# Memoized value of the `gitlab_alert_id` label in the payload; nil when the
# payload or the label is missing.
def metric_id
  strong_memoize(:metric_id) { payload&.dig('labels', 'gitlab_alert_id') }
end
def title
strong_memoize(:title) do
gitlab_alert&.title || parse_title_from_payload
......@@ -28,7 +34,9 @@ module Gitlab
end
def environment
gitlab_alert&.environment
strong_memoize(:environment) do
gitlab_alert&.environment || parse_environment_from_payload
end
end
def annotations
......@@ -43,6 +51,18 @@ module Gitlab
end
end
# Memoized, unparsed `startsAt` value exactly as it appears in the payload;
# nil when the payload or the field is missing.
def starts_at_raw
  strong_memoize(:starts_at_raw) { payload&.dig('startsAt') }
end
# Memoized result of `parse_datetime_from_payload('endsAt')` (helper defined
# elsewhere in this class).
def ends_at
  strong_memoize(:ends_at) { parse_datetime_from_payload('endsAt') }
end
def full_query
strong_memoize(:full_query) do
gitlab_alert&.full_query || parse_expr_from_payload
......@@ -55,8 +75,18 @@ module Gitlab
end
end
# Memoized `status` value from the payload; nil when the payload or the field
# is missing.
def status
  strong_memoize(:status) { payload&.dig('status') }
end
# True when the alert carries a non-blank metric id (see #metric_id).
def gitlab_managed?
metric_id.present?
end
def valid?
project && title && starts_at
payload.respond_to?(:dig) && project && title && starts_at
end
def present
......@@ -65,8 +95,17 @@ module Gitlab
private
# Resolves the environment named by the payload's `gitlab_environment_name`
# label via EnvironmentsFinder (called with a nil second argument). Returns
# nil when the label is absent, or the first entry of the finder's result.
def parse_environment_from_payload
  name = payload&.dig('labels', 'gitlab_environment_name')
  return unless name

  finder = EnvironmentsFinder.new(project, nil, { name: name })
  finder.find&.first
end
def parse_gitlab_alert_from_payload
metric_id = payload&.dig('labels', 'gitlab_alert_id')
return unless metric_id
Projects::Prometheus::AlertsFinder
......
# frozen_string_literal: true
# Factory for SelfManagedPrometheusAlertEvent. The default record is in the
# firing state with a unique payload key.
FactoryBot.define do
  factory :self_managed_prometheus_alert_event do
    project
    sequence(:payload_key) { |n| "hash payload key #{n}" }
    status { SelfManagedPrometheusAlertEvent.status_value_for(:firing) }
    title { 'alert' }
    query_expression { 'vector(2)' }
    started_at { Time.now }

    # Resolved event. The payload key is intentionally kept: the column is
    # NOT NULL and the lifecycle does not clear it on resolve, so overriding
    # it with nil would make the record impossible to persist.
    trait :resolved do
      status { SelfManagedPrometheusAlertEvent.status_value_for(:resolved) }
      ended_at { Time.now }
    end

    # Event in the initial (:none) state. NOTE(review): status and started_at
    # are NOT NULL columns, so this trait is presumably meant for `build`
    # only - confirm.
    trait :none do
      status { nil }
      started_at { nil }
    end
  end
end
......@@ -46,6 +46,7 @@ describe Issue do
it { is_expected.to have_many(:designs) }
it { is_expected.to have_many(:design_versions) }
it { is_expected.to have_and_belong_to_many(:prometheus_alert_events) }
it { is_expected.to have_and_belong_to_many(:self_managed_prometheus_alert_events) }
it { is_expected.to have_many(:prometheus_alerts) }
describe 'versions.most_recent' do
......
......@@ -85,7 +85,6 @@ describe PrometheusAlertEvent do
expect(result).to eq(true)
expect(subject).to be_resolved
expect(subject.ended_at).to be_like_time(ended_at)
expect(subject.payload_key).to be_nil
end
end
......
......@@ -32,6 +32,18 @@ describe Projects::Prometheus::Alerts::CreateEventsService do
end
end
# Shared expectations for payloads that should create a
# SelfManagedPrometheusAlertEvent: the service returns a non-empty result and
# the self-managed event count changes.
shared_examples 'self managed events persisted' do
subject { service.execute }
it 'returns created events' do
expect(subject).not_to be_empty
end
it 'does change self managed event count' do
expect { subject }.to change { SelfManagedPrometheusAlertEvent.count }
end
end
context 'with valid alerts_payload' do
let!(:alert) { create(:prometheus_alert, prometheus_metric: metric, project: project) }
......@@ -221,14 +233,14 @@ describe Projects::Prometheus::Alerts::CreateEventsService do
end
describe '`ended_at`' do
context 'is missing' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: nil)] } }
context 'is missing and status is resolved' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: nil, status: 'resolved')] } }
it_behaves_like 'no events persisted'
end
context 'is invalid' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: 'invalid date')] } }
context 'is invalid and status is resolved' do
let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: 'invalid date', status: 'resolved')] } }
it_behaves_like 'no events persisted'
end
......@@ -242,6 +254,25 @@ describe Projects::Prometheus::Alerts::CreateEventsService do
it_behaves_like 'no events persisted'
end
context 'is missing but title is given' do
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil, title: 'alert')] } }
it_behaves_like 'self managed events persisted'
end
context 'is missing and environment name is given' do
let(:environment) { create(:environment, project: project) }
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil, title: 'alert', environment: environment.name)] } }
it_behaves_like 'self managed events persisted'
it 'associates the environment to the alert event' do
service.execute
expect(SelfManagedPrometheusAlertEvent.last.environment).to eq environment
end
end
context 'is invalid' do
let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: '-1')] } }
......@@ -254,13 +285,16 @@ describe Projects::Prometheus::Alerts::CreateEventsService do
private
def alert_payload(status: 'firing', started_at: Time.now, ended_at: Time.now, gitlab_alert_id: alert.prometheus_metric_id)
def alert_payload(status: 'firing', started_at: Time.now, ended_at: Time.now, gitlab_alert_id: alert.prometheus_metric_id, title: nil, environment: nil)
payload = {}
payload['status'] = status if status
payload['startsAt'] = utc_rfc3339(started_at) if started_at
payload['endsAt'] = utc_rfc3339(ended_at) if ended_at
payload['labels'] = { 'gitlab_alert_id' => gitlab_alert_id.to_s } if gitlab_alert_id
payload['labels'] = {}
payload['labels']['gitlab_alert_id'] = gitlab_alert_id.to_s if gitlab_alert_id
payload['labels']['alertname'] = title if title
payload['labels']['gitlab_environment_name'] = environment if environment
payload
end
......
......@@ -29,7 +29,9 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert.related_issues, :count).from(0).to(1)
.to change(prometheus_alert.related_issues, :count)
.from(0)
.to(1)
end
context 'when project could not be found' do
......@@ -56,7 +58,7 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
it 'does not relate issue to an event' do
expect { subject.perform(project.id, alert_params) }
.not_to change(Issue, :count)
.not_to change(prometheus_alert.related_issues, :count)
end
end
......@@ -72,5 +74,55 @@ describe IncidentManagement::ProcessPrometheusAlertWorker do
.not_to change(prometheus_alert.related_issues, :count)
end
end
# Alerts without a `gitlab_alert_id` label: the worker should still create an
# issue and relate it to the matching SelfManagedPrometheusAlertEvent.
context 'self-managed alert' do
let(:alert_name) { 'alert' }
let(:starts_at) { Time.now.rfc3339 }
# NOTE: despite its name, this is a self-managed alert *event*. Its payload
# key is built from the same start time and alert name as `alert_params`,
# plus 'vector(1)', the expression URL-encoded in `generatorURL` below.
let!(:prometheus_alert) do
payload_key = SelfManagedPrometheusAlertEvent.payload_key_for(starts_at, alert_name, 'vector(1)')
create(:self_managed_prometheus_alert_event, project: project, payload_key: payload_key)
end
let(:alert_params) do
{
startsAt: starts_at,
generatorURL: 'http://localhost:9090/graph?g0.expr=vector%281%29&g0.tab=1',
labels: {
alertname: alert_name
}
}.with_indifferent_access
end
it 'creates an issue' do
expect { subject.perform(project.id, alert_params) }
.to change(Issue, :count)
.by(1)
end
it 'relates issue to an event' do
expect { subject.perform(project.id, alert_params) }
.to change(prometheus_alert.related_issues, :count)
.from(0)
.to(1)
end
context 'when event could not be found' do
# Replaces the generator URL with one that has no `g0.expr` parameter;
# presumably this changes the parsed query so the computed payload key no
# longer matches the stored event - confirm against the alert parser.
before do
alert_params[:generatorURL] = 'http://somethingelse.com'
end
it 'creates an issue' do
expect { subject.perform(project.id, alert_params) }
.to change(Issue, :count)
.by(1)
end
it 'does not relate issue to an event' do
expect { subject.perform(project.id, alert_params) }
.not_to change(prometheus_alert.related_issues, :count)
end
end
end
end
end
......@@ -27,6 +27,7 @@ issues:
- design_versions
- prometheus_alerts
- prometheus_alert_events
- self_managed_prometheus_alert_events
events:
- author
- project
......@@ -401,6 +402,7 @@ project:
- operations_feature_flags_client
- prometheus_alerts
- prometheus_alert_events
- self_managed_prometheus_alert_events
- software_license_policies
- project_registry
- packages
......@@ -474,6 +476,8 @@ prometheus_alerts:
- prometheus_alert_events
prometheus_alert_events:
- project
self_managed_prometheus_alert_events:
- project
epic_issues:
- issue
- epic
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment