Fix Prometheus endpoint to return created alerts

Due to a bug, we always returned an empty list even if alerts were created. This commit adds a new ops feature flag, :prometheus_notify_max_alerts, which limits the number of processable alerts in a single Prometheus payload. When enabled, it truncates incoming alerts if the number of alerts in the provided Prometheus payload exceeds 100. Changelog: fixed

Fix Prometheus endpoint to return created alerts
Due to a bug, we always returned an empty list even if alerts were created. This commit adds a new ops feature flag, :prometheus_notify_max_alerts, which limits the number of processable alerts in a single Prometheus payload. When enabled, it truncates incoming alerts if the number of alerts in the provided Prometheus payload exceeds 100. Changelog: fixed
d473fefa · Peter Leitzen · Alex Kalderimis · 55d31b13 · d473fefa · d473fefa
Commit d473fefa authored Dec 22, 2021 by Peter Leitzen Committed by Alex Kalderimis Dec 22, 2021
4 changed files
--- a/app/services/projects/prometheus/alerts/notify_service.rb
+++ b/app/services/projects/prometheus/alerts/notify_service.rb
@@ -18,6 +18,14 @@ module Projects
        SUPPORTED_VERSION = '4'
+        # If the feature flag :prometheus_notify_max_alerts is enabled, truncate
+        # the alerts to the first 100 and process only those.
+        # If the feature flag is disabled, process any number of alerts.
+        #
+        # This is to mitigate incident:
+        # https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6086
+        PROCESS_MAX_ALERTS = 100
        def initialize(project, payload)
          @project = project
          @payload = payload
@@ -28,6 +36,8 @@ module Projects
          return unprocessable_entity unless self.class.processable?(payload)
          return unauthorized unless valid_alert_manager_token?(token, integration)
+          truncate_alerts! if max_alerts_exceeded?
          alert_responses = process_prometheus_alerts
          alert_response(alert_responses)
@@ -49,12 +59,23 @@ module Projects
          Gitlab::Utils::DeepSize.new(payload).valid?
        end
-        def firings
+        def max_alerts_exceeded?
-          @firings ||= alerts_by_status('firing')
+          return false unless Feature.enabled?(:prometheus_notify_max_alerts, project, type: :ops)
+          alerts.size > PROCESS_MAX_ALERTS
        end
-        def alerts_by_status(status)
+        def truncate_alerts!
-          alerts.select { |alert| alert['status'] == status }
+          Gitlab::AppLogger.warn(
+            message: 'Prometheus payload exceeded maximum amount of alerts. Truncating alerts.',
+            project_id: project.id,
+            alerts: {
+              total: alerts.size,
+              max: PROCESS_MAX_ALERTS
+            }
+          )
+          payload['alerts'] = alerts.first(PROCESS_MAX_ALERTS)
        end
        def alerts
@@ -137,7 +158,7 @@ module Projects
        end
        def alert_response(alert_responses)
-          alerts = alert_responses.map { |resp| resp.payload[:alert] }.compact
+          alerts = alert_responses.flat_map { |resp| resp.payload[:alerts] }.compact
          success(alerts)
        end

--- a/config/feature_flags/ops/prometheus_notify_max_alerts.yml
+++ b/config/feature_flags/ops/prometheus_notify_max_alerts.yml
+---
+name: prometheus_notify_max_alerts
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/77168
+rollout_issue_url: https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6086
+milestone: '14.7'
+type: ops
+group: group::monitor
+default_enabled: false
--- a/spec/services/projects/prometheus/alerts/notify_service_spec.rb
+++ b/spec/services/projects/prometheus/alerts/notify_service_spec.rb
@@ -224,6 +224,78 @@ RSpec.describe Projects::Prometheus::Alerts::NotifyService do
        end
      end
    end
+    context 'when payload exceeds max amount of processable alerts' do
+      # We are defining 2 alerts in payload_raw above
+      let(:max_alerts) { 1 }
+      before do
+        stub_const("#{described_class}::PROCESS_MAX_ALERTS", max_alerts)
+        create(:prometheus_integration, project: project)
+        create(:project_alerting_setting, project: project, token: token)
+        allow(Gitlab::AppLogger).to receive(:warn)
+      end
+      shared_examples 'process truncated alerts' do
+        it 'returns 200 but skips processing and logs a warning', :aggregate_failures do
+          expect(subject).to be_success
+          expect(subject.payload[:alerts].size).to eq(max_alerts)
+          expect(Gitlab::AppLogger)
+            .to have_received(:warn)
+            .with(
+              message: 'Prometheus payload exceeded maximum amount of alerts. Truncating alerts.',
+              project_id: project.id,
+              alerts: {
+                total: 2,
+                max: max_alerts
+              })
+        end
+      end
+      shared_examples 'process all alerts' do
+        it 'returns 200 and process alerts without warnings', :aggregate_failures do
+          expect(subject).to be_success
+          expect(subject.payload[:alerts].size).to eq(2)
+          expect(Gitlab::AppLogger).not_to have_received(:warn)
+        end
+      end
+      context 'with feature flag globally enabled' do
+        before do
+          stub_feature_flags(prometheus_notify_max_alerts: true)
+        end
+        include_examples 'process truncated alerts'
+      end
+      context 'with feature flag enabled on project' do
+        before do
+          stub_feature_flags(prometheus_notify_max_alerts: project)
+        end
+        include_examples 'process truncated alerts'
+      end
+      context 'with feature flag enabled on unrelated project' do
+        let(:another_project) { create(:project) }
+        before do
+          stub_feature_flags(prometheus_notify_max_alerts: another_project)
+        end
+        include_examples 'process all alerts'
+      end
+      context 'with feature flag disabled' do
+        before do
+          stub_feature_flags(prometheus_notify_max_alerts: false)
+        end
+        include_examples 'process all alerts'
+      end
+    end
  end
  context 'with invalid payload' do

--- a/spec/support/shared_examples/services/alert_management_shared_examples.rb
+++ b/spec/support/shared_examples/services/alert_management_shared_examples.rb
@@ -64,12 +64,16 @@ RSpec.shared_examples 'processes never-before-seen recovery alert' do
 end
 RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' do
-  it 'creates AlertManagement::Alert' do
+  it 'creates alerts and returns them in the payload', :aggregate_failures do
    expect(Gitlab::AppLogger).not_to receive(:warn)
    expect { subject }
      .to change(AlertManagement::Alert, :count).by(2)
      .and change(Note, :count).by(4)
+    expect(subject).to be_success
+    expect(subject.payload[:alerts]).to all(be_a_kind_of(AlertManagement::Alert))
+    expect(subject.payload[:alerts].size).to eq(2)
  end
  it_behaves_like 'processes incident issues'