Commit 029d4eea authored by Alex Kalderimis

Merge branch 'pl-prometheus-notify-payload-too-large-ops-ff' into 'master'

Truncate alerts for large Prometheus payloads when FF is enabled

See merge request gitlab-org/gitlab!77168
parents fe9bb368 d473fefa
...@@ -18,6 +18,14 @@ module Projects ...@@ -18,6 +18,14 @@ module Projects
SUPPORTED_VERSION = '4' SUPPORTED_VERSION = '4'
# When the :prometheus_notify_max_alerts feature flag is enabled, truncate
# the payload to its first 100 alerts and process only those.
# When the feature flag is disabled, process any number of alerts.
#
# This is to mitigate incident:
# https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6086
PROCESS_MAX_ALERTS = 100
def initialize(project, payload) def initialize(project, payload)
@project = project @project = project
@payload = payload @payload = payload
...@@ -28,6 +36,8 @@ module Projects ...@@ -28,6 +36,8 @@ module Projects
return unprocessable_entity unless self.class.processable?(payload) return unprocessable_entity unless self.class.processable?(payload)
return unauthorized unless valid_alert_manager_token?(token, integration) return unauthorized unless valid_alert_manager_token?(token, integration)
truncate_alerts! if max_alerts_exceeded?
alert_responses = process_prometheus_alerts alert_responses = process_prometheus_alerts
alert_response(alert_responses) alert_response(alert_responses)
...@@ -49,12 +59,23 @@ module Projects ...@@ -49,12 +59,23 @@ module Projects
Gitlab::Utils::DeepSize.new(payload).valid? Gitlab::Utils::DeepSize.new(payload).valid?
end end
def firings def max_alerts_exceeded?
@firings ||= alerts_by_status('firing') return false unless Feature.enabled?(:prometheus_notify_max_alerts, project, type: :ops)
alerts.size > PROCESS_MAX_ALERTS
end end
def alerts_by_status(status) def truncate_alerts!
alerts.select { |alert| alert['status'] == status } Gitlab::AppLogger.warn(
message: 'Prometheus payload exceeded maximum amount of alerts. Truncating alerts.',
project_id: project.id,
alerts: {
total: alerts.size,
max: PROCESS_MAX_ALERTS
}
)
payload['alerts'] = alerts.first(PROCESS_MAX_ALERTS)
end end
def alerts def alerts
...@@ -137,7 +158,7 @@ module Projects ...@@ -137,7 +158,7 @@ module Projects
end end
def alert_response(alert_responses) def alert_response(alert_responses)
alerts = alert_responses.map { |resp| resp.payload[:alert] }.compact alerts = alert_responses.flat_map { |resp| resp.payload[:alerts] }.compact
success(alerts) success(alerts)
end end
......
---
name: prometheus_notify_max_alerts
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/77168
rollout_issue_url: https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6086
milestone: '14.7'
type: ops
group: group::monitor
default_enabled: false
...@@ -224,6 +224,78 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do ...@@ -224,6 +224,78 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
end end
end end
end end
# Covers the cap on the number of alerts processed from a single payload.
#
# The cap (PROCESS_MAX_ALERTS) is stubbed down to 1 so the two-alert payload
# exceeds it. Each nested context toggles the :prometheus_notify_max_alerts
# feature flag and checks whether the alerts were truncated (with a warning
# logged) or processed in full (with no warning).
#
# `subject`, `project`, `token` and `payload_raw` are not defined in this
# block; they are expected to come from the enclosing example groups.
context 'when payload exceeds max amount of processable alerts' do
  # We are defining 2 alerts in payload_raw above
  let(:max_alerts) { 1 }

  before do
    # Lower the cap below the number of alerts in the payload.
    stub_const("#{described_class}::PROCESS_MAX_ALERTS", max_alerts)

    create(:prometheus_integration, project: project)
    create(:project_alerting_setting, project: project, token: token)

    # Set up the logger as a spy so examples can assert on it afterwards
    # with `have_received`.
    allow(Gitlab::AppLogger).to receive(:warn)
  end

  # The payload is cut down to `max_alerts` and the truncation is logged.
  shared_examples 'process truncated alerts' do
    it 'returns 200, processes only the truncated alerts and logs a warning', :aggregate_failures do
      expect(subject).to be_success
      expect(subject.payload[:alerts].size).to eq(max_alerts)
      expect(Gitlab::AppLogger)
        .to have_received(:warn)
        .with(
          message: 'Prometheus payload exceeded maximum amount of alerts. Truncating alerts.',
          project_id: project.id,
          alerts: {
            total: 2,
            max: max_alerts
          })
    end
  end

  # Both alerts from the payload are returned and nothing is logged.
  shared_examples 'process all alerts' do
    it 'returns 200 and processes all alerts without warnings', :aggregate_failures do
      expect(subject).to be_success
      expect(subject.payload[:alerts].size).to eq(2)
      expect(Gitlab::AppLogger).not_to have_received(:warn)
    end
  end

  context 'with feature flag globally enabled' do
    before do
      stub_feature_flags(prometheus_notify_max_alerts: true)
    end

    include_examples 'process truncated alerts'
  end

  context 'with feature flag enabled on project' do
    before do
      stub_feature_flags(prometheus_notify_max_alerts: project)
    end

    include_examples 'process truncated alerts'
  end

  context 'with feature flag enabled on unrelated project' do
    let(:another_project) { create(:project) }

    before do
      stub_feature_flags(prometheus_notify_max_alerts: another_project)
    end

    include_examples 'process all alerts'
  end

  context 'with feature flag disabled' do
    before do
      stub_feature_flags(prometheus_notify_max_alerts: false)
    end

    include_examples 'process all alerts'
  end
end
end end
context 'with invalid payload' do context 'with invalid payload' do
......
...@@ -64,12 +64,16 @@ RSpec.shared_examples 'processes never-before-seen recovery alert' do ...@@ -64,12 +64,16 @@ RSpec.shared_examples 'processes never-before-seen recovery alert' do
end end
RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' do RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' do
it 'creates AlertManagement::Alert' do it 'creates alerts and returns them in the payload', :aggregate_failures do
expect(Gitlab::AppLogger).not_to receive(:warn) expect(Gitlab::AppLogger).not_to receive(:warn)
expect { subject } expect { subject }
.to change(AlertManagement::Alert, :count).by(2) .to change(AlertManagement::Alert, :count).by(2)
.and change(Note, :count).by(4) .and change(Note, :count).by(4)
expect(subject).to be_success
expect(subject.payload[:alerts]).to all(be_a_kind_of(AlertManagement::Alert))
expect(subject.payload[:alerts].size).to eq(2)
end end
it_behaves_like 'processes incident issues' it_behaves_like 'processes incident issues'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment