Commit 6d6e8e42 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch '291010-stuckcijobsworker-always-times-out' into 'master'

Optimize StuckCiJobsWorker Queries because they timeout in production

See merge request gitlab-org/gitlab!62239
parents 712f99a7 8ab6cefe
...@@ -57,6 +57,9 @@ class CommitStatus < ApplicationRecord ...@@ -57,6 +57,9 @@ class CommitStatus < ApplicationRecord
scope :in_pipelines, ->(pipelines) { where(pipeline: pipelines) } scope :in_pipelines, ->(pipelines) { where(pipeline: pipelines) }
scope :eager_load_pipeline, -> { eager_load(:pipeline, project: { namespace: :route }) } scope :eager_load_pipeline, -> { eager_load(:pipeline, project: { namespace: :route }) }
scope :with_pipeline, -> { joins(:pipeline) } scope :with_pipeline, -> { joins(:pipeline) }
scope :updated_before, ->(lookback:, timeout:) {
where('(ci_builds.created_at BETWEEN ? AND ?) AND (ci_builds.updated_at BETWEEN ? AND ?)', lookback, timeout, lookback, timeout)
}
scope :for_project_paths, -> (paths) do scope :for_project_paths, -> (paths) do
where(project: Project.where_full_path_in(Array(paths))) where(project: Project.where_full_path_in(Array(paths)))
......
...@@ -15,22 +15,46 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -15,22 +15,46 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def perform def perform
return unless try_obtain_lease return unless try_obtain_lease
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds" Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure
drop :scheduled, BUILD_SCHEDULED_OUTDATED_TIMEOUT, 'scheduled_at IS NOT NULL AND scheduled_at < ?', :stale_schedule drop(
drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT, 'ci_builds.updated_at < ?', :stuck_or_timeout_failure Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
remove_lease remove_lease
end end
private private
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def try_obtain_lease def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain @uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end end
...@@ -39,28 +63,27 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -39,28 +63,27 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid) Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end end
def drop(status, timeout, condition, reason) def drop(builds, failure_reason:)
search(status, timeout, condition) do |build| fetch(builds) do |build|
drop_build :outdated, build, status, timeout, reason drop_build :outdated, build, failure_reason
end end
end end
def drop_stuck(status, timeout, condition, reason) def drop_stuck(builds, failure_reason:)
search(status, timeout, condition) do |build| fetch(builds) do |build|
break unless build.stuck? break unless build.stuck?
drop_build :stuck, build, status, timeout, reason drop_build :stuck, build, failure_reason
end end
end end
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def search(status, timeout, condition) def fetch(builds)
loop do loop do
jobs = Ci::Build.where(status: status) jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.where(condition, timeout.ago)
.includes(:tags, :runner, project: [:namespace, :route])
.limit(100) .limit(100)
.to_a .to_a
break if jobs.empty? break if jobs.empty?
jobs.each do |job| jobs.each do |job|
...@@ -70,8 +93,8 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -70,8 +93,8 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, status, timeout, reason) def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})" Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b| Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason) b.drop(reason)
end end
......
...@@ -79,6 +79,32 @@ RSpec.describe CommitStatus do ...@@ -79,6 +79,32 @@ RSpec.describe CommitStatus do
end end
end end
describe '.updated_before' do
let!(:lookback) { 5.days.ago }
let!(:timeout) { 1.day.ago }
let!(:before_lookback) { lookback - 1.hour }
let!(:after_lookback) { lookback + 1.hour }
let!(:before_timeout) { timeout - 1.hour }
let!(:after_timeout) { timeout + 1.hour }
subject { described_class.updated_before(lookback: lookback, timeout: timeout) }
def create_build_with_set_timestamps(created_at:, updated_at:)
travel_to(created_at) { create(:ci_build, created_at: Time.current) }.tap do |build|
travel_to(updated_at) { build.update!(status: :failed) }
end
end
it 'finds builds updated and created in the window between lookback and timeout' do
build_in_lookback_timeout_window = create_build_with_set_timestamps(created_at: after_lookback, updated_at: before_timeout)
build_outside_lookback_window = create_build_with_set_timestamps(created_at: before_lookback, updated_at: before_timeout)
build_outside_timeout_window = create_build_with_set_timestamps(created_at: after_lookback, updated_at: after_timeout)
expect(subject).to contain_exactly(build_in_lookback_timeout_window)
expect(subject).not_to include(build_outside_lookback_window, build_outside_timeout_window)
end
end
describe '#processed' do describe '#processed' do
subject { commit_status.processed } subject { commit_status.processed }
......
...@@ -9,12 +9,17 @@ RSpec.describe StuckCiJobsWorker do ...@@ -9,12 +9,17 @@ RSpec.describe StuckCiJobsWorker do
let!(:job) { create :ci_build, runner: runner } let!(:job) { create :ci_build, runner: runner }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY } let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid } let(:worker_lease_uuid) { SecureRandom.uuid }
let(:created_at) { }
let(:updated_at) { }
subject(:worker) { described_class.new } subject(:worker) { described_class.new }
before do before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid) stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
job.update!(status: status, updated_at: updated_at) job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end end
shared_examples 'job is dropped' do shared_examples 'job is dropped' do
...@@ -63,22 +68,70 @@ RSpec.describe StuckCiJobsWorker do ...@@ -63,22 +68,70 @@ RSpec.describe StuckCiJobsWorker do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false) allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false)
end end
context 'when job was not updated for more than 1 day ago' do context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 2.days.ago } let(:updated_at) { 1.5.days.ago }
it_behaves_like 'job is dropped' context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
context 'when job was updated in less than 1 day ago' do context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago } let(:updated_at) { 6.hours.ago }
it_behaves_like 'job is unchanged' context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
context 'when job was not updated for more than 1 hour ago' do context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago } let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is unchanged' context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
end end
...@@ -87,17 +140,48 @@ RSpec.describe StuckCiJobsWorker do ...@@ -87,17 +140,48 @@ RSpec.describe StuckCiJobsWorker do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true) allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true)
end end
context 'when job was not updated for more than 1 hour ago' do context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 2.hours.ago } let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
context 'when job was updated in less than 1 context 'when job was updated in less than 1 hour ago' do
hour ago' do
let(:updated_at) { 30.minutes.ago } let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged' context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
end end
end end
...@@ -105,7 +189,7 @@ RSpec.describe StuckCiJobsWorker do ...@@ -105,7 +189,7 @@ RSpec.describe StuckCiJobsWorker do
context 'when job is running' do context 'when job is running' do
let(:status) { 'running' } let(:status) { 'running' }
context 'when job was not updated for more than 1 hour ago' do context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago } let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped' it_behaves_like 'job is dropped'
...@@ -123,7 +207,23 @@ RSpec.describe StuckCiJobsWorker do ...@@ -123,7 +207,23 @@ RSpec.describe StuckCiJobsWorker do
let(:status) { status } let(:status) { status }
let(:updated_at) { 2.days.ago } let(:updated_at) { 2.days.ago }
it_behaves_like 'job is unchanged' context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment