diff --git a/app/services/ci/archive_trace_service.rb b/app/services/ci/archive_trace_service.rb new file mode 100644 index 0000000000000000000000000000000000000000..a1dd00721b59553c286f7a56d3e2886890f87274 --- /dev/null +++ b/app/services/ci/archive_trace_service.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module Ci + class ArchiveTraceService + def execute(job) + job.trace.archive! + rescue ::Gitlab::Ci::Trace::AlreadyArchivedError + # It's already archived, thus we can safely ignore this exception. + rescue => e + # Tracks this error with application logs, Sentry, and Prometheus. + # If `archive!` keeps failing for over a week, that could incur data loss. + # (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture) + # In order to avoid interrupting the system, we do not raise an exception here. + archive_error(e, job) + end + + private + + def failed_archive_counter + @failed_archive_counter ||= + Gitlab::Metrics.counter(:job_trace_archive_failed_total, + "Counter of failed attempts of trace archiving") + end + + def archive_error(error, job) + failed_archive_counter.increment + Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" + + Gitlab::Sentry + .track_exception(error, + issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502', + extra: { job_id: job.id }) + end + end +end diff --git a/app/workers/archive_trace_worker.rb b/app/workers/archive_trace_worker.rb index c1283e9b2fc9e8682437cdb331907a7160db9497..4a9becf0ca7b0a357ed5dc3b39e04f187e63400d 100644 --- a/app/workers/archive_trace_worker.rb +++ b/app/workers/archive_trace_worker.rb @@ -7,7 +7,7 @@ class ArchiveTraceWorker # rubocop: disable CodeReuse/ActiveRecord def perform(job_id) Ci::Build.without_archived_trace.find_by(id: job_id).try do |job| - job.trace.archive! + Ci::ArchiveTraceService.new.execute(job) end end # rubocop: enable CodeReuse/ActiveRecord diff --git a/app/workers/ci/archive_traces_cron_worker.rb b/app/workers/ci/archive_traces_cron_worker.rb index 7443aad13802caf3b6def4fde583995c76af0286..f65ff239866242ecf5ac75011487cf6bd9f2431a 100644 --- a/app/workers/ci/archive_traces_cron_worker.rb +++ b/app/workers/ci/archive_traces_cron_worker.rb @@ -11,21 +11,9 @@ module Ci # This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL # More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791 Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build| - begin - build.trace.archive! - rescue ::Gitlab::Ci::Trace::AlreadyArchivedError - rescue => e - failed_archive_counter.increment - Rails.logger.error "Failed to archive stale live trace. id: #{build.id} message: #{e.message}" - end + Ci::ArchiveTraceService.new.execute(build) end end # rubocop: enable CodeReuse/ActiveRecord - - private - - def failed_archive_counter - @failed_archive_counter ||= Gitlab::Metrics.counter(:job_trace_archive_failed_total, "Counter of failed attempts of traces archiving") - end end end diff --git a/spec/services/ci/archive_trace_service_spec.rb b/spec/services/ci/archive_trace_service_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..8e9cb65f3bcd44040b2e26edbebaf23c5f097bfc --- /dev/null +++ b/spec/services/ci/archive_trace_service_spec.rb @@ -0,0 +1,51 @@ +require 'spec_helper' + +describe Ci::ArchiveTraceService, '#execute' do + subject { described_class.new.execute(job) } + + context 'when job is finished' do + let(:job) { create(:ci_build, :success, :trace_live) } + + it 'creates an archived trace' do + expect { subject }.not_to raise_error + + expect(job.reload.job_artifacts_trace).to be_exist + end + + context 'when trace is already archived' do + let!(:job) { create(:ci_build, :success, :trace_artifact) } + + it 'ignores an exception' do + expect { subject }.not_to raise_error + end + + it 'does not create an archived trace' do + expect { subject }.not_to change { Ci::JobArtifact.trace.count } + end + end + end + + context 'when job is running' do + let(:job) { create(:ci_build, :running, :trace_live) } + + it 'increments Prometheus counter, sends crash report to Sentry and ignore an error for continuing to archive' do + expect(Gitlab::Sentry) + .to receive(:track_exception) + .with(::Gitlab::Ci::Trace::ArchiveError, + issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502', + extra: { job_id: job.id } ).once + + expect(Rails.logger) + .to receive(:error) + .with("Failed to archive trace. id: #{job.id} message: Job is not finished yet") + .and_call_original + + expect(Gitlab::Metrics) + .to receive(:counter) + .with(:job_trace_archive_failed_total, "Counter of failed attempts of trace archiving") + .and_call_original + + expect { subject }.not_to raise_error + end + end +end diff --git a/spec/workers/archive_trace_worker_spec.rb b/spec/workers/archive_trace_worker_spec.rb index b768588c6e13f31d6784d3268105f324bcfd8067..7244ad4f199023a5ace5ac84dadf8fadf0cb72a2 100644 --- a/spec/workers/archive_trace_worker_spec.rb +++ b/spec/workers/archive_trace_worker_spec.rb @@ -5,10 +5,11 @@ describe ArchiveTraceWorker do subject { described_class.new.perform(job&.id) } context 'when job is found' do - let(:job) { create(:ci_build) } + let(:job) { create(:ci_build, :trace_live) } it 'executes service' do - expect_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) + expect_any_instance_of(Ci::ArchiveTraceService) + .to receive(:execute).with(job) subject end @@ -18,7 +19,8 @@ describe ArchiveTraceWorker do let(:job) { nil } it 'does not execute service' do - expect_any_instance_of(Gitlab::Ci::Trace).not_to receive(:archive!) + expect_any_instance_of(Ci::ArchiveTraceService) + .not_to receive(:execute) subject end diff --git a/spec/workers/ci/archive_traces_cron_worker_spec.rb b/spec/workers/ci/archive_traces_cron_worker_spec.rb index 23f5dda298a40e765d54a8d6692ad7e4f08309e7..478fb7d2c0f0a2a32a4d7ea0a3ba37d13de0515d 100644 --- a/spec/workers/ci/archive_traces_cron_worker_spec.rb +++ b/spec/workers/ci/archive_traces_cron_worker_spec.rb @@ -30,6 +30,13 @@ describe Ci::ArchiveTracesCronWorker do it_behaves_like 'archives trace' + it 'executes service' do + expect_any_instance_of(Ci::ArchiveTraceService) + .to receive(:execute).with(build) + + subject + end + context 'when a trace had already been archived' do let!(:build) { create(:ci_build, :success, :trace_live, :trace_artifact) } let!(:build2) { create(:ci_build, :success, :trace_live) } @@ -46,11 +53,12 @@ describe Ci::ArchiveTracesCronWorker do let!(:build) { create(:ci_build, :success, :trace_live) } before do + allow(Gitlab::Sentry).to receive(:track_exception) allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!).and_raise('Unexpected error') end it 'puts a log' do - expect(Rails.logger).to receive(:error).with("Failed to archive stale live trace. id: #{build.id} message: Unexpected error") + expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error") subject end