Commit 1f187906 authored by Tomasz Maczukin's avatar Tomasz Maczukin

Add proper exceptions handling in StuckCiJobsWorker

Issues like https://gitlab.com/gitlab-org/gitlab/issues/34749 show, that
the worker is vulnerable for exceptions happening during model update in
DB. What's worse is that exceptios happening on this level, are failing
the whole execution of StuckCiJobsWorker, which unnoticed ends with
thousands of stale pending/running/scheduled jobs waiting in the queue.

This commit re-uses the fix added with
https://gitlab.com/gitlab-org/gitlab/merge_requests/19150 to properly
handle and log such exceptions and drop the jobs regardless of them.
parent 69562679
...@@ -73,5 +73,19 @@ class StuckCiJobsWorker ...@@ -73,5 +73,19 @@ class StuckCiJobsWorker
Gitlab::OptimisticLocking.retry_lock(build, 3) do |b| Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
b.drop(reason) b.drop(reason)
end end
rescue => ex
build.doom!
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::Sentry.track_acceptable_exception(ex, extra: {
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
})
end end
end end
---
title: Properly handle exceptions in StuckCiJobsWorker
merge_request: 19465
author:
type: fixed
...@@ -18,15 +18,30 @@ describe StuckCiJobsWorker do ...@@ -18,15 +18,30 @@ describe StuckCiJobsWorker do
end end
shared_examples 'job is dropped' do shared_examples 'job is dropped' do
before do it "changes status" do
worker.perform worker.perform
job.reload job.reload
end
it "changes status" do
expect(job).to be_failed expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure expect(job).to be_stuck_or_timeout_failure
end end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::Sentry).to receive(:track_acceptable_exception)
.with(anything, a_hash_including(extra: a_hash_including(build_id: job.id)))
.once
.and_call_original
worker.perform
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end
end
end end
shared_examples 'job is unchanged' do shared_examples 'job is unchanged' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment