Track test failures on pipeline completion

We moved this to pipeline completion so that we have the total count of failures across all builds. We need this count to check if the limit is reached.

Track test failures on pipeline completion
We moved this to pipeline completion so that we have the total count of failures across all builds. We need this count to check if the limit is reached.
22ce4dff · Erick Bajao · 5559a68e · 22ce4dff · 5559a68e · 22ce4dff
Commit 22ce4dff authored Nov 28, 2020 by Erick Bajao
14 changed files
--- a/app/models/ci/pipeline.rb
+++ b/app/models/ci/pipeline.rb
@@ -269,6 +269,12 @@ module Ci
        end
      end
+      after_transition any => ::Ci::Pipeline.completed_statuses do |pipeline|
+        pipeline.run_after_commit do
+          ::Ci::TestFailureHistoryService.new(pipeline).async.perform_if_needed # rubocop: disable CodeReuse/ServiceClass
+        end
+      end
      after_transition any => [:success, :failed] do |pipeline|
        ref_status = pipeline.ci_ref&.update_status_by!(pipeline)
@@ -938,10 +944,18 @@ module Ci
      builds.latest.with_reports(reports_scope)
    end
+    def latest_test_report_builds
+      latest_report_builds(Ci::JobArtifact.test_reports).preload(:project)
+    end
    def builds_with_coverage
      builds.latest.with_coverage
    end
+    def builds_with_failed_tests(limit: nil)
+      latest_test_report_builds.failed.limit(limit)
+    end
    def has_reports?(reports_scope)
      complete? && latest_report_builds(reports_scope).exists?
    end
@@ -962,7 +976,7 @@ module Ci
    def test_reports
      Gitlab::Ci::Reports::TestReports.new.tap do |test_reports|
-        latest_report_builds(Ci::JobArtifact.test_reports).preload(:project).find_each do |build|
+        latest_test_report_builds.find_each do |build|
          build.collect_test_reports!(test_reports)
        end
      end

--- a/app/services/ci/test_cases_service.rb
+++ b/app/services/ci/test_cases_service.rb
-# frozen_string_literal: true
-module Ci
-  class TestCasesService
-    MAX_TRACKABLE_FAILURES = 200
-    def execute(build)
-      return unless Feature.enabled?(:test_failure_history, build.project)
-      return unless build.has_test_reports?
-      return unless build.project.default_branch_or_master == build.ref
-      test_suite = generate_test_suite_report(build)
-      track_failures(build, test_suite)
-    end
-    private
-    def generate_test_suite_report(build)
-      build.collect_test_reports!(Gitlab::Ci::Reports::TestReports.new)
-    end
-    def track_failures(build, test_suite)
-      return if test_suite.failed_count > MAX_TRACKABLE_FAILURES
-      test_suite.failed.keys.each_slice(100) do |keys|
-        Ci::TestCase.transaction do
-          test_cases = Ci::TestCase.find_or_create_by_batch(build.project, keys)
-          Ci::TestCaseFailure.insert_all(test_case_failures(test_cases, build))
-        end
-      end
-    end
-    def test_case_failures(test_cases, build)
-      test_cases.map do |test_case|
-        {
-          test_case_id: test_case.id,
-          build_id: build.id,
-          failed_at: build.finished_at
-        }
-      end
-    end
-  end
-end
--- a/app/services/ci/test_failure_history_service.rb
+++ b/app/services/ci/test_failure_history_service.rb
+# frozen_string_literal: true
+module Ci
+  class TestFailureHistoryService
+    class Async
+      attr_reader :service
+      def initialize(service)
+        @service = service
+      end
+      def perform_if_needed
+        TestFailureHistoryWorker.perform_async(service.pipeline.id) if service.should_track_failures?
+      end
+    end
+    MAX_TRACKABLE_FAILURES = 200
+    attr_reader :pipeline
+    delegate :project, to: :pipeline
+    def initialize(pipeline)
+      @pipeline = pipeline
+    end
+    def execute
+      return unless should_track_failures?
+      track_failures
+    end
+    def should_track_failures?
+      return false unless Feature.enabled?(:test_failure_history, project)
+      return false unless project.default_branch_or_master == pipeline.ref
+      # We fetch for up to MAX_TRACKABLE_FAILURES + 1 builds. So if ever we get
+      # 201 total number of builds with the assumption that each job has at least
+      # 1 failed test case, then we have at least 201 failed test cases which exceeds
+      # the MAX_TRACKABLE_FAILURES of 200. If this is the case, let's early exit so we
+      # don't have to parse each JUnit report of each of the 201 builds.
+      failed_builds.length <= MAX_TRACKABLE_FAILURES
+    end
+    def async
+      Async.new(self)
+    end
+    private
+    def failed_builds
+      @failed_builds ||= pipeline.builds_with_failed_tests(limit: MAX_TRACKABLE_FAILURES + 1)
+    end
+    def track_failures
+      failed_test_cases = gather_failed_test_cases(failed_builds)
+      return if failed_test_cases.size > MAX_TRACKABLE_FAILURES
+      failed_test_cases.keys.each_slice(100) do |key_hashes|
+        Ci::TestCase.transaction do
+          ci_test_cases = Ci::TestCase.find_or_create_by_batch(project, key_hashes)
+          failures = test_case_failures(ci_test_cases, failed_test_cases)
+          Ci::TestCaseFailure.insert_all(failures)
+        end
+      end
+    end
+    def gather_failed_test_cases(failed_builds)
+      failed_builds.each_with_object({}) do |build, failed_test_cases|
+        test_suite = generate_test_suite!(build)
+        test_suite.failed.keys.each do |key|
+          failed_test_cases[key] = build
+        end
+      end
+    end
+    def generate_test_suite!(build)
+      # Returns an instance of Gitlab::Ci::Reports::TestSuite
+      build.collect_test_reports!(Gitlab::Ci::Reports::TestReports.new)
+    end
+    def test_case_failures(ci_test_cases, failed_test_cases)
+      ci_test_cases.map do |test_case|
+        build = failed_test_cases[test_case.key_hash]
+        {
+          test_case_id: test_case.id,
+          build_id: build.id,
+          failed_at: build.finished_at
+        }
+      end
+    end
+  end
+end
--- a/app/workers/all_queues.yml
+++ b/app/workers/all_queues.yml
@@ -1101,6 +1101,14 @@
  :weight: 1
  :idempotent: true
  :tags: []
+- :name: pipeline_background:ci_test_failure_history
+  :feature_category: :continuous_integration
+  :has_external_dependencies:
+  :urgency: :low
+  :resource_boundary: :unknown
+  :weight: 1
+  :idempotent: true
+  :tags: []
 - :name: pipeline_cache:expire_job_cache
  :feature_category: :continuous_integration
  :has_external_dependencies:

--- a/app/workers/build_finished_worker.rb
+++ b/app/workers/build_finished_worker.rb
@@ -33,11 +33,6 @@ class BuildFinishedWorker # rubocop:disable Scalability/IdempotentWorker
    BuildCoverageWorker.new.perform(build.id)
    Ci::BuildReportResultWorker.new.perform(build.id)
-    # TODO: As per https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/194, it may be
-    # best to avoid creating more workers that we have no intention of calling async.
-    # Change the previous worker calls on top to also just call the service directly.
-    Ci::TestCasesService.new.execute(build)
    # We execute these async as these are independent operations.
    BuildHooksWorker.perform_async(build.id)
    ExpirePipelineCacheWorker.perform_async(build.pipeline_id) if build.pipeline.cacheable?

--- a/app/workers/ci/test_failure_history_worker.rb
+++ b/app/workers/ci/test_failure_history_worker.rb
+# frozen_string_literal: true
+module Ci
+  class TestFailureHistoryWorker
+    include ApplicationWorker
+    include PipelineBackgroundQueue
+    idempotent!
+    def perform(pipeline_id)
+      Ci::Pipeline.find_by_id(pipeline_id).try do |pipeline|
+        Ci::TestFailureHistoryService.new(pipeline).execute
+      end
+    end
+  end
+end
--- a/changelogs/unreleased/eb-track-failures-on-pipeline-complete.yml
+++ b/changelogs/unreleased/eb-track-failures-on-pipeline-complete.yml
+---
+title: Track test failures on pipeline completion
+merge_request: 48695
+author:
+type: changed
--- a/spec/factories/ci/pipelines.rb
+++ b/spec/factories/ci/pipelines.rb
@@ -126,10 +126,10 @@ FactoryBot.define do
      end
      trait :with_test_reports_with_three_failures do
-        status { :success }
+        status { :failed }
        after(:build) do |pipeline, _evaluator|
-          pipeline.builds << build(:ci_build, :test_reports_with_three_failures, pipeline: pipeline, project: pipeline.project)
+          pipeline.builds << build(:ci_build, :failed, :test_reports_with_three_failures, pipeline: pipeline, project: pipeline.project)
        end
      end

--- a/spec/models/ci/pipeline_spec.rb
+++ b/spec/models/ci/pipeline_spec.rb
@@ -4030,4 +4030,70 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do
      bridge
    end
  end
+  describe 'test failure history processing' do
+    it 'performs the service asynchronously when the pipeline is completed' do
+      service = double
+      expect(Ci::TestFailureHistoryService).to receive(:new).with(pipeline).and_return(service)
+      expect(service).to receive_message_chain(:async, :perform_if_needed)
+      pipeline.succeed!
+    end
+  end
+  describe '#latest_test_report_builds' do
+    it 'returns pipeline builds with test report artifacts' do
+      test_build = create(:ci_build, :test_reports, pipeline: pipeline, project: project)
+      create(:ci_build, :artifacts, pipeline: pipeline, project: project)
+      expect(pipeline.latest_test_report_builds).to contain_exactly(test_build)
+    end
+    it 'preloads project on each build to avoid N+1 queries' do
+      create(:ci_build, :test_reports, pipeline: pipeline, project: project)
+      control_count = ActiveRecord::QueryRecorder.new do
+        pipeline.latest_test_report_builds.map(&:project).map(&:full_path)
+      end
+      multi_build_pipeline = create(:ci_empty_pipeline, status: :created, project: project)
+      create(:ci_build, :test_reports, pipeline: multi_build_pipeline, project: project)
+      create(:ci_build, :test_reports, pipeline: multi_build_pipeline, project: project)
+      expect { multi_build_pipeline.latest_test_report_builds.map(&:project).map(&:full_path) }
+        .not_to exceed_query_limit(control_count)
+    end
+  end
+  describe '#builds_with_failed_tests' do
+    it 'returns pipeline builds with test report artifacts' do
+      failed_build = create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: project)
+      create(:ci_build, :success, :test_reports, pipeline: pipeline, project: project)
+      expect(pipeline.builds_with_failed_tests).to contain_exactly(failed_build)
+    end
+    it 'supports limiting the number of builds to fetch' do
+      create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: project)
+      create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: project)
+      expect(pipeline.builds_with_failed_tests(limit: 1).count).to eq(1)
+    end
+    it 'preloads project on each build to avoid N+1 queries' do
+      create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: project)
+      control_count = ActiveRecord::QueryRecorder.new do
+        pipeline.builds_with_failed_tests.map(&:project).map(&:full_path)
+      end
+      multi_build_pipeline = create(:ci_empty_pipeline, status: :created, project: project)
+      create(:ci_build, :failed, :test_reports, pipeline: multi_build_pipeline, project: project)
+      create(:ci_build, :failed, :test_reports, pipeline: multi_build_pipeline, project: project)
+      expect { multi_build_pipeline.builds_with_failed_tests.map(&:project).map(&:full_path) }
+        .not_to exceed_query_limit(control_count)
+    end
+  end
 end
--- a/spec/services/ci/compare_test_reports_service_spec.rb
+++ b/spec/services/ci/compare_test_reports_service_spec.rb
@@ -67,7 +67,7 @@ RSpec.describe Ci::CompareTestReportsService do
        # The JUnit fixture for the given build has 3 failures.
        # This service will create 1 test case failure record for each.
-        Ci::TestCasesService.new.execute(build)
+        Ci::TestFailureHistoryService.new(head_pipeline).execute
      end
      it 'loads recent failures on limited test cases to avoid building up a huge DB query', :aggregate_failures do

--- a/spec/services/ci/test_cases_service_spec.rb
+++ b/spec/services/ci/test_cases_service_spec.rb
-# frozen_string_literal: true
-require 'spec_helper'
-RSpec.describe Ci::TestCasesService, :aggregate_failures do
-  describe '#execute' do
-    subject(:execute_service) { described_class.new.execute(build) }
-    context 'when build has test reports' do
-      let(:build) { create(:ci_build, :success, :test_reports) } # The test report has 2 test case failures
-      it 'creates test case failures records' do
-        execute_service
-        expect(Ci::TestCase.count).to eq(2)
-        expect(Ci::TestCaseFailure.count).to eq(2)
-      end
-      context 'when feature flag for test failure history is disabled' do
-        before do
-          stub_feature_flags(test_failure_history: false)
-        end
-        it 'does not persist data' do
-          execute_service
-          expect(Ci::TestCase.count).to eq(0)
-          expect(Ci::TestCaseFailure.count).to eq(0)
-        end
-      end
-      context 'when build is not for the default branch' do
-        before do
-          build.update_column(:ref, 'new-feature')
-        end
-        it 'does not persist data' do
-          execute_service
-          expect(Ci::TestCase.count).to eq(0)
-          expect(Ci::TestCaseFailure.count).to eq(0)
-        end
-      end
-      context 'when test failure data have already been persisted with the same exact attributes' do
-        before do
-          execute_service
-        end
-        it 'does not fail but does not persist new data' do
-          expect { described_class.new.execute(build) }.not_to raise_error
-          expect(Ci::TestCase.count).to eq(2)
-          expect(Ci::TestCaseFailure.count).to eq(2)
-        end
-      end
-      context 'when test failure data have duplicates within the same payload (happens when the JUnit report has duplicate test case names but have different failures)' do
-        let(:build) { create(:ci_build, :success, :test_reports_with_duplicate_failed_test_names) } # The test report has 2 test case failures but with the same test case keys
-        it 'does not fail but does not persist duplicate data' do
-          expect { described_class.new.execute(build) }.not_to raise_error
-          expect(Ci::TestCase.count).to eq(1)
-          expect(Ci::TestCaseFailure.count).to eq(1)
-        end
-      end
-      context 'when number of failed test cases exceed the limit' do
-        before do
-          stub_const("#{described_class.name}::MAX_TRACKABLE_FAILURES", 1)
-        end
-        it 'does not persist data' do
-          execute_service
-          expect(Ci::TestCase.count).to eq(0)
-          expect(Ci::TestCaseFailure.count).to eq(0)
-        end
-      end
-    end
-    context 'when build has no test reports' do
-      let(:build) { create(:ci_build, :running) }
-      it 'does not persist data' do
-        execute_service
-        expect(Ci::TestCase.count).to eq(0)
-        expect(Ci::TestCaseFailure.count).to eq(0)
-      end
-    end
-  end
-end
--- a/spec/services/ci/test_failure_history_service_spec.rb
+++ b/spec/services/ci/test_failure_history_service_spec.rb
+# frozen_string_literal: true
+require 'spec_helper'
+RSpec.describe Ci::TestFailureHistoryService, :aggregate_failures do
+  describe '#execute' do
+    let(:project) { create(:project) }
+    let(:pipeline) { create(:ci_empty_pipeline, status: :created, project: project) }
+    subject(:execute_service) { described_class.new(pipeline).execute }
+    context 'when pipeline has failed builds with test reports' do
+      before do
+        # The test report has 2 test case failures
+        create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: project)
+      end
+      it 'creates test case failures records' do
+        execute_service
+        expect(Ci::TestCase.count).to eq(2)
+        expect(Ci::TestCaseFailure.count).to eq(2)
+      end
+      context 'when feature flag for test failure history is disabled' do
+        before do
+          stub_feature_flags(test_failure_history: false)
+        end
+        it 'does not persist data' do
+          execute_service
+          expect(Ci::TestCase.count).to eq(0)
+          expect(Ci::TestCaseFailure.count).to eq(0)
+        end
+      end
+      context 'when pipeline is not for the default branch' do
+        before do
+          pipeline.update_column(:ref, 'new-feature')
+        end
+        it 'does not persist data' do
+          execute_service
+          expect(Ci::TestCase.count).to eq(0)
+          expect(Ci::TestCaseFailure.count).to eq(0)
+        end
+      end
+      context 'when test failure data have already been persisted with the same exact attributes' do
+        before do
+          execute_service
+        end
+        it 'does not fail but does not persist new data' do
+          expect { described_class.new(pipeline).execute }.not_to raise_error
+          expect(Ci::TestCase.count).to eq(2)
+          expect(Ci::TestCaseFailure.count).to eq(2)
+        end
+      end
+      context 'when number of failed test cases exceed the limit' do
+        before do
+          stub_const("#{described_class.name}::MAX_TRACKABLE_FAILURES", 1)
+        end
+        it 'does not persist data' do
+          execute_service
+          expect(Ci::TestCase.count).to eq(0)
+          expect(Ci::TestCaseFailure.count).to eq(0)
+        end
+      end
+      context 'when number of failed test cases across multiple builds exceed the limit' do
+        before do
+          stub_const("#{described_class.name}::MAX_TRACKABLE_FAILURES", 2)
+          # This other test report has 1 unique test case failure which brings us to 3 total failures across all builds
+          # thus exceeding the limit of 2 for MAX_TRACKABLE_FAILURES
+          create(:ci_build, :failed, :test_reports_with_duplicate_failed_test_names, pipeline: pipeline, project: project)
+        end
+        it 'does not persist data' do
+          execute_service
+          expect(Ci::TestCase.count).to eq(0)
+          expect(Ci::TestCaseFailure.count).to eq(0)
+        end
+      end
+    end
+    context 'when test failure data have duplicates within the same payload (happens when the JUnit report has duplicate test case names but have different failures)' do
+      before do
+        # The test report has 2 test case failures but with the same test case keys
+        create(:ci_build, :failed, :test_reports_with_duplicate_failed_test_names, pipeline: pipeline, project: project)
+      end
+      it 'does not fail but does not persist duplicate data' do
+        expect { execute_service }.not_to raise_error
+        expect(Ci::TestCase.count).to eq(1)
+        expect(Ci::TestCaseFailure.count).to eq(1)
+      end
+    end
+    context 'when pipeline has no failed builds with test reports' do
+      before do
+        create(:ci_build, :test_reports, pipeline: pipeline, project: project)
+        create(:ci_build, :failed, pipeline: pipeline, project: project)
+      end
+      it 'does not persist data' do
+        execute_service
+        expect(Ci::TestCase.count).to eq(0)
+        expect(Ci::TestCaseFailure.count).to eq(0)
+      end
+    end
+  end
+  describe '#should_track_failures?' do
+    let(:project) { create(:project, :repository) }
+    let(:pipeline) { create(:ci_empty_pipeline, status: :created, project: project, ref: project.default_branch) }
+    subject { described_class.new(pipeline).should_track_failures? }
+    before do
+      create(:ci_build, :test_reports, :failed, pipeline: pipeline, project: project)
+      create(:ci_build, :test_reports, :failed, pipeline: pipeline, project: project)
+    end
+    context 'when feature flag is enabled and pipeline ref is the default branch' do
+      it { is_expected.to eq(true) }
+    end
+    context 'when feature flag is disabled' do
+      before do
+        stub_feature_flags(test_failure_history: false)
+      end
+      it { is_expected.to eq(false) }
+    end
+    context 'when pipeline is not equal to the project default branch' do
+      before do
+        pipeline.update_column(:ref, 'some-other-branch')
+      end
+      it { is_expected.to eq(false) }
+    end
+    context 'when total number of builds with failed tests exceeds the max number of trackable failures' do
+      before do
+        stub_const("#{described_class.name}::MAX_TRACKABLE_FAILURES", 1)
+      end
+      it { is_expected.to eq(false) }
+    end
+  end
+  describe '#async' do
+    let(:pipeline) { double(id: 1) }
+    let(:service) { described_class.new(pipeline) }
+    context 'when service should track failures' do
+      before do
+        allow(service).to receive(:should_track_failures?).and_return(true)
+      end
+      it 'enqueues the worker when #perform_if_needed is called' do
+        expect(Ci::TestFailureHistoryWorker).to receive(:perform_async).with(pipeline.id)
+        service.async.perform_if_needed
+      end
+    end
+    context 'when service should not track failures' do
+      before do
+        allow(service).to receive(:should_track_failures?).and_return(false)
+      end
+      it 'does not enqueue the worker when #perform_if_needed is called' do
+        expect(Ci::TestFailureHistoryWorker).not_to receive(:perform_async)
+        service.async.perform_if_needed
+      end
+    end
+  end
+end
--- a/spec/workers/build_finished_worker_spec.rb
+++ b/spec/workers/build_finished_worker_spec.rb
@@ -26,9 +26,6 @@ RSpec.describe BuildFinishedWorker do
        expect_next_instance_of(Ci::BuildReportResultWorker) do |instance|
          expect(instance).to receive(:perform)
        end
-        expect_next_instance_of(Ci::TestCasesService) do |instance|
-          expect(instance).to receive(:execute)
-        end
        expect(BuildHooksWorker).to receive(:perform_async)
        expect(ExpirePipelineCacheWorker).to receive(:perform_async)

--- a/spec/workers/ci/test_failure_history_worker_spec.rb
+++ b/spec/workers/ci/test_failure_history_worker_spec.rb
+# frozen_string_literal: true
+require 'spec_helper'
+RSpec.describe ::Ci::TestFailureHistoryWorker do
+  describe '#perform' do
+    subject(:perform) { described_class.new.perform(pipeline_id) }
+    context 'when pipeline exists' do
+      let(:pipeline) { create(:ci_pipeline) }
+      let(:pipeline_id) { pipeline.id }
+      it 'executes test failure history service' do
+        expect_next_instance_of(::Ci::TestFailureHistoryService) do |service|
+          expect(service).to receive(:execute)
+        end
+        perform
+      end
+    end
+    context 'when pipeline does not exist' do
+      let(:pipeline_id) { non_existing_record_id }
+      it 'does not execute test failure history service' do
+        expect(Ci::TestFailureHistoryService).not_to receive(:new)
+        perform
+      end
+    end
+  end
+  include_examples 'an idempotent worker' do
+    let(:pipeline) { create(:ci_pipeline) }
+    let(:job_args) { [pipeline.id] }
+    it 'tracks test failures' do
+      # The test report has 2 test case failures
+      create(:ci_build, :failed, :test_reports, pipeline: pipeline, project: pipeline.project)
+      subject
+      expect(Ci::TestCase.count).to eq(2)
+      expect(Ci::TestCaseFailure.count).to eq(2)
+    end
+  end
+end