Commit 49338c97 authored by Kamil Trzciński

Introduce DR mode for CI queueing

There's a known set of deficiencies in CI queueing
that might impact our ability to process builds.

As a way to temporarily mitigate them, we allow
disabling the affected optimizations for a very limited period.

Use one feature flag for two related optimizations
and rename defcon to disaster recovery.
parent e8fd52b4
......@@ -253,17 +253,23 @@ module Ci
# rubocop: disable CodeReuse/ActiveRecord
def builds_for_shared_runner
new_builds.
relation = new_builds.
# don't run projects which have not enabled shared runners and builds
joins(:project).where(projects: { shared_runners_enabled: true, pending_delete: false })
.joins('LEFT JOIN project_features ON ci_builds.project_id = project_features.project_id')
.where('project_features.builds_access_level IS NULL or project_features.builds_access_level > 0').
.where('project_features.builds_access_level IS NULL or project_features.builds_access_level > 0')
# Implement fair scheduling
# this returns builds that are ordered by number of running builds
# we prefer projects that don't use shared runners at all
joins("LEFT JOIN (#{running_builds_for_shared_runners.to_sql}) AS project_builds ON ci_builds.project_id=project_builds.project_id")
.order(Arel.sql('COALESCE(project_builds.running_builds, 0) ASC'), 'ci_builds.id ASC')
if Feature.enabled?(:ci_queueing_disaster_recovery, runner, type: :ops, default_enabled: :yaml)
# if disaster recovery is enabled, we fallback to FIFO scheduling
relation.order('ci_builds.id ASC')
else
# Implement fair scheduling
# this returns builds that are ordered by number of running builds
# we prefer projects that don't use shared runners at all
relation
.joins("LEFT JOIN (#{running_builds_for_shared_runners.to_sql}) AS project_builds ON ci_builds.project_id=project_builds.project_id")
.order(Arel.sql('COALESCE(project_builds.running_builds, 0) ASC'), 'ci_builds.id ASC')
end
end
# rubocop: enable CodeReuse/ActiveRecord
......
---
name: ci_queueing_disaster_recovery
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/56658
rollout_issue_url:
milestone: "13.12"
type: ops
group: group::continuous integration
default_enabled: false
---
stage: Enablement
group: Distribution
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
type: reference
---
# Disaster Recovery
This document describes a feature that allows you to easily disable some important but computationally
expensive parts of the application, in order to relieve stress on the database during ongoing downtime.
## `ci_queueing_disaster_recovery`
This feature flag, if enabled, temporarily disables fair scheduling on shared runners.
This can help reduce system resource usage on the `jobs/request` endpoint
by significantly reducing computations being performed.
Side effects:
- In case of a large backlog of jobs, the jobs will be processed in the order
they were put in the system instead of balancing the jobs across many projects.
- Jobs from projects that are out of quota will be run. This affects
only jobs that were created during the last hour, as prior jobs are canceled
by a periodic background worker (`StuckCiJobsWorker`).
......@@ -41,9 +41,12 @@ module EE
end
def builds_for_shared_runner
return super unless shared_runner_build_limits_feature_enabled?
enforce_minutes_based_on_cost_factors(super)
# if disaster recovery is enabled, we disable quota
if ::Feature.enabled?(:ci_queueing_disaster_recovery, runner, type: :ops, default_enabled: :yaml)
super
else
enforce_minutes_based_on_cost_factors(super)
end
end
# rubocop: disable CodeReuse/ActiveRecord
......@@ -89,10 +92,6 @@ module EE
::Gitlab::CurrentSettings.shared_runners_minutes
end
def shared_runner_build_limits_feature_enabled?
ENV['DISABLE_SHARED_RUNNER_BUILD_MINUTES_LIMIT'].to_s != 'true'
end
def traversal_ids_enabled?
::Feature.enabled?(:sync_traversal_ids, default_enabled: :yaml) &&
::Feature.enabled?(:traversal_ids_for_quota_calculation, type: :development, default_enabled: :yaml)
......
......@@ -47,7 +47,7 @@ RSpec.describe Ci::RegisterJobService do
shared_runners_seconds: runners_minutes_used * 60)
end
context 'with flags enabled' do
context 'with traversal_ids enabled' do
before do
stub_feature_flags(sync_traversal_ids: true)
stub_feature_flags(traversal_ids_for_quota_calculation: true)
......@@ -56,13 +56,19 @@ RSpec.describe Ci::RegisterJobService do
it { is_expected.to be_kind_of(Ci::Build) }
end
context 'with flag disabled' do
context 'with traversal_ids disabled' do
before do
stub_feature_flags(traversal_ids_for_quota_calculation: false)
end
it { is_expected.to be_kind_of(Ci::Build) }
end
it 'when in disaster recovery it ignores quota and returns anyway' do
stub_feature_flags(ci_queueing_disaster_recovery: true)
is_expected.to be_kind_of(Ci::Build)
end
end
shared_examples 'does not return a build' do |runners_minutes_used|
......@@ -71,7 +77,7 @@ RSpec.describe Ci::RegisterJobService do
shared_runners_seconds: runners_minutes_used * 60)
end
context 'with flags enabled' do
context 'with traversal_ids enabled' do
before do
stub_feature_flags(sync_traversal_ids: true)
stub_feature_flags(traversal_ids_for_quota_calculation: true)
......@@ -80,13 +86,19 @@ RSpec.describe Ci::RegisterJobService do
it { is_expected.to be_nil }
end
context 'with flag disabled' do
context 'with traversal_ids disabled' do
before do
stub_feature_flags(traversal_ids_for_quota_calculation: false)
end
it { is_expected.to be_nil }
end
it 'when in disaster recovery it ignores quota and returns anyway' do
stub_feature_flags(ci_queueing_disaster_recovery: true)
is_expected.to be_kind_of(Ci::Build)
end
end
context 'when limit set at global level' do
......
......@@ -82,31 +82,69 @@ module Ci
let!(:build2_project2) { FactoryBot.create :ci_build, pipeline: pipeline2 }
let!(:build1_project3) { FactoryBot.create :ci_build, pipeline: pipeline3 }
it 'prefers projects without builds first' do
# it gets for one build from each of the projects
expect(execute(shared_runner)).to eq(build1_project1)
expect(execute(shared_runner)).to eq(build1_project2)
expect(execute(shared_runner)).to eq(build1_project3)
# then it gets a second build from each of the projects
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build2_project2)
# in the end the third build
expect(execute(shared_runner)).to eq(build3_project1)
end
it 'equalises number of running builds' do
# after finishing the first build for project 1, get a second build from the same project
expect(execute(shared_runner)).to eq(build1_project1)
build1_project1.reload.success
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build1_project2)
build1_project2.reload.success
expect(execute(shared_runner)).to eq(build2_project2)
expect(execute(shared_runner)).to eq(build1_project3)
expect(execute(shared_runner)).to eq(build3_project1)
context 'when using fair scheduling' do
context 'when all builds are pending' do
it 'prefers projects without builds first' do
# it gets for one build from each of the projects
expect(execute(shared_runner)).to eq(build1_project1)
expect(execute(shared_runner)).to eq(build1_project2)
expect(execute(shared_runner)).to eq(build1_project3)
# then it gets a second build from each of the projects
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build2_project2)
# in the end the third build
expect(execute(shared_runner)).to eq(build3_project1)
end
end
context 'when some builds transition to success' do
it 'equalises number of running builds' do
# after finishing the first build for project 1, get a second build from the same project
expect(execute(shared_runner)).to eq(build1_project1)
build1_project1.reload.success
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build1_project2)
build1_project2.reload.success
expect(execute(shared_runner)).to eq(build2_project2)
expect(execute(shared_runner)).to eq(build1_project3)
expect(execute(shared_runner)).to eq(build3_project1)
end
end
end
context 'when using DEFCON mode that disables fair scheduling' do
before do
stub_feature_flags(ci_queueing_disaster_recovery: true)
end
context 'when all builds are pending' do
it 'returns builds in order of creation (FIFO)' do
# builds are handed out strictly in creation order, regardless of project
expect(execute(shared_runner)).to eq(build1_project1)
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build3_project1)
expect(execute(shared_runner)).to eq(build1_project2)
expect(execute(shared_runner)).to eq(build2_project2)
expect(execute(shared_runner)).to eq(build1_project3)
end
end
context 'when some builds transition to success' do
it 'returns builds in order of creation (FIFO)' do
expect(execute(shared_runner)).to eq(build1_project1)
build1_project1.reload.success
expect(execute(shared_runner)).to eq(build2_project1)
expect(execute(shared_runner)).to eq(build3_project1)
build2_project1.reload.success
expect(execute(shared_runner)).to eq(build1_project2)
expect(execute(shared_runner)).to eq(build2_project2)
expect(execute(shared_runner)).to eq(build1_project3)
end
end
end
end
......
......@@ -253,6 +253,9 @@ RSpec.configure do |config|
# tests, until we introduce it in user settings
stub_feature_flags(forti_token_cloud: false)
# This feature flag is by default disabled and used in disaster recovery mode
stub_feature_flags(ci_queueing_disaster_recovery: false)
enable_rugged = example.metadata[:enable_rugged].present?
# Disable Rugged features by default
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment