Commit ba9ab364 authored by Fabio Pitino

Drop running builds when CI minutes quota exceeded

When a group exceeds its CI minutes quota, we drop its
running builds. To do this, we track live consumption
in Redis and decrement the remaining minutes every time
the Runner contacts GitLab Rails for a ping or a trace
update.
parent c0b13f15
......@@ -22,6 +22,7 @@ module Enums
forward_deployment_failure: 13,
user_blocked: 14,
project_deleted: 15,
ci_quota_exceeded: 16,
insufficient_bridge_permissions: 1_001,
downstream_bridge_project_not_found: 1_002,
invalid_bridge_trigger: 1_003,
......
......@@ -23,7 +23,8 @@ class CommitStatusPresenter < Gitlab::View::Presenter::Delegated
secrets_provider_not_found: 'The secrets provider can not be found',
reached_max_descendant_pipelines_depth: 'You reached the maximum depth of child pipelines',
project_deleted: 'The job belongs to a deleted project',
user_blocked: 'The user who created this job is blocked'
user_blocked: 'The user who created this job is blocked',
ci_quota_exceeded: 'No more CI minutes available'
}.freeze
private_constant :CALLOUT_FAILURE_MESSAGES
......
---
name: ci_minutes_track_live_consumption
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/59263
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/329197
milestone: '13.12'
type: development
group: group::continuous integration
default_enabled: false
......@@ -64,6 +64,10 @@ module Ci
namespace.root? && namespace.any_project_with_shared_runners_enabled?
end
# Balance of CI minutes: the total allowance minus the minutes already used.
# Unlike a clamped "remaining" figure, the result may be negative once usage
# exceeds the allowance. A nil allowance is treated as zero.
def current_balance
  allowance = total_minutes.to_i
  allowance - total_minutes_used
end
private
def minutes_limit
......@@ -83,7 +87,7 @@ module Ci
end
# Minutes still available to the namespace, never negative: the balance
# computed by `current_balance` clamped at zero.
def total_minutes_remaining
  [current_balance, 0].max
end
def monthly_minutes_used_up?
......
# frozen_string_literal: true
module Ci
  module Minutes
    # Tracks the CI minutes consumed by a running build between two consecutive
    # updates and drops the build once the cached namespace balance falls
    # below CONSUMPTION_THRESHOLD.
    #
    # Two Redis keys are kept per build, both expiring after TTL_RUNNING_BUILDS:
    # the time of the last tracked update and the accumulated live consumption.
    class TrackLiveConsumptionService
      TTL_RUNNING_BUILDS = 5.minutes

      # We allow remaining minutes to drop below this number to avoid dropping
      # builds immediately when the quota is exceeded
      CONSUMPTION_THRESHOLD = -1000

      # @param build the build whose live consumption is tracked
      def initialize(build)
        @build = build
      end

      # Records the consumption since the previous update and drops the build
      # when the new balance is below CONSUMPTION_THRESHOLD.
      #
      # @return [ServiceResponse] an error response when a precondition is not
      #   met; otherwise a success response. When some consumption was tracked
      #   the payload carries the new balance under :current_balance.
      def execute
        result = validate_preconditions
        return result if result.error?

        consumption = consumption_since_last_update
        return ServiceResponse.success(message: 'Build consumption is zero') if consumption == 0 # first build update

        accumulate_total_build_consumption(consumption)
        new_balance = cached_quota.track_consumption(consumption)

        if new_balance < CONSUMPTION_THRESHOLD
          build.drop(:ci_quota_exceeded)
          metrics.ci_minutes_exceeded_builds_counter.increment

          ServiceResponse.success(message: 'Build dropped due to CI minutes limit exceeded', payload: { current_balance: new_balance })
        else
          ServiceResponse.success(message: 'CI minutes limit not exceeded', payload: { current_balance: new_balance })
        end
      end

      # @return [Float] consumption accumulated so far for this build
      #   (0.0 when nothing has been stored)
      def live_consumption
        ::Gitlab::Redis::SharedState.with do |redis|
          redis.get(consumption_key).to_f
        end
      end

      # Atomically swaps the stored "last update" time with new_time and
      # refreshes the key's TTL.
      #
      # @param new_time the time to store as the latest update
      # @return the previously stored time, or new_time when none was stored
      def time_last_tracked_consumption!(new_time)
        old_time = nil

        ::Gitlab::Redis::SharedState.with do |redis|
          # Queue every command on the yielded transaction object, as
          # accumulate_total_build_consumption does.
          redis.multi do |multi|
            key = last_build_update_key
            # Inside the transaction this is a future; its value is only
            # available once the block has completed.
            old_time = multi.get(key)
            multi.set(key, new_time)
            multi.expire(key, TTL_RUNNING_BUILDS)
          end
        end

        if old_time&.value
          # Parse into a time-zone aware value instead of a legacy DateTime so
          # both branches return objects of the Time family.
          Time.zone.parse(old_time.value)
        else
          new_time
        end
      end

      private

      attr_reader :build

      # Returns the first failing precondition as an error response, or a
      # success response when tracking should proceed.
      def validate_preconditions
        if !feature_enabled?
          ServiceResponse.error(message: 'Feature not enabled')
        elsif !build.running?
          ServiceResponse.error(message: 'Build is not running')
        elsif !free_or_trial_plan?
          ServiceResponse.error(message: 'Project is not on Free or trial plan')
        elsif !build.shared_runners_minutes_limit_enabled?
          ServiceResponse.error(message: 'CI minutes limit not enabled for build')
        else
          ServiceResponse.success
        end
      end

      def feature_enabled?
        Feature.enabled?(:ci_minutes_track_live_consumption, build.project, default_enabled: :yaml)
      end

      def free_or_trial_plan?
        Gitlab.com? && (root_namespace.free_plan? || root_namespace.trial?)
      end

      # Consumption for the time elapsed since the previous tracked update.
      def consumption_since_last_update
        # Read the clock once: the stored timestamp and the measured duration
        # must share the same end point. Otherwise the gap between two reads is
        # counted again by the next update, and the very first update is not
        # exactly zero.
        now = Time.current.utc
        last_tracking = time_last_tracked_consumption!(now)
        duration = now - last_tracking

        ::Gitlab::Ci::Minutes::BuildConsumption.new(build, duration).amount
      end

      def last_build_update_key
        "ci:minutes:builds:#{build.id}:last_update"
      end

      # Adds consumption to the per-build total and refreshes the key's TTL.
      def accumulate_total_build_consumption(consumption)
        ::Gitlab::Redis::SharedState.with do |redis|
          redis.multi do |multi|
            multi.incrbyfloat(consumption_key, consumption)
            multi.expire(consumption_key, TTL_RUNNING_BUILDS)
          end
        end
      end

      def consumption_key
        "ci:minutes:builds:#{build.id}:consumption"
      end

      def cached_quota
        @cached_quota ||= Gitlab::Ci::Minutes::CachedQuota.new(root_namespace)
      end

      def root_namespace
        @root_namespace ||= build.project.root_namespace
      end

      def metrics
        @metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new
      end
    end
  end
end
......@@ -8,7 +8,7 @@ module Ci
return unless build.complete?
return unless build.duration&.positive?
consumption = ::Gitlab::Ci::Minutes::BuildConsumption.new(build).amount
consumption = ::Gitlab::Ci::Minutes::BuildConsumption.new(build, build.duration).amount
return unless consumption > 0
......
......@@ -29,6 +29,11 @@ module EE
super
end
override :track_ci_minutes_usage!
# Runs live CI minutes tracking for the given build and returns the result of
# the tracking service. The runner argument is accepted but not used.
def track_ci_minutes_usage!(build, runner)
  tracker = ::Ci::Minutes::TrackLiveConsumptionService.new(build)
  tracker.execute
end
end
end
end
......
......@@ -8,12 +8,13 @@ module Gitlab
# The amount returned is a float so that internally we could track
# an accurate usage of minutes/credits.
class BuildConsumption
# @param build the build whose consumption is calculated
# @param duration the duration to convert into consumption; `amount`
#   divides it by 60, and callers in this change pass seconds
def initialize(build, duration)
  @build = build
  @duration = duration
end
# Consumption for the stored duration: the duration divided by 60, multiplied
# by the cost factor and rounded to two decimals. The value is memoized after
# the first call.
def amount
  @amount ||= (@duration.to_f / 60 * cost_factor).round(2)
end
private
......
# frozen_string_literal: true
module Gitlab
  module Ci
    module Minutes
      # Tracks current remaining minutes in Redis for faster access and tracking
      # consumption of running builds.
      class CachedQuota
        include ::Gitlab::Utils::StrongMemoize

        TTL_REMAINING_MINUTES = 10.minutes

        attr_reader :root_namespace

        # @param root_namespace the namespace whose balance is cached
        def initialize(root_namespace)
          @root_namespace = root_namespace
        end

        # TODO:
        # - when monthly minutes are updated via the API (e.g. plan change)
        # - when extra_shared_runners_minutes_limit are updated via the API
        # - when minutes consumption is reset via the controller (TS or admin)
        def expire!
          # todo
        end

        # Reduces the remaining minutes by the consumption argument.
        # Then returns the new balance of remaining minutes.
        #
        # @param consumption [Numeric] amount to subtract from the cached balance
        # @return [Float] the balance after the decrement
        def track_consumption(consumption)
          new_balance = nil

          ::Gitlab::Redis::SharedState.with do |redis|
            # NOTE(review): this relies on `exists` returning a boolean. On
            # redis-rb versions where it returns an Integer, 0 is truthy in Ruby
            # and the cold branch would never run - confirm the client version
            # and switch to `exists?` if it is available.
            if redis.exists(cache_key)
              redis.multi do |multi|
                # Decrement first, then set the TTL. EXPIRE on a missing key is
                # a no-op, so if the key vanished after the check above the
                # original order would leave the recreated key without expiry.
                new_balance = multi.incrbyfloat(cache_key, -consumption)
                multi.expire(cache_key, TTL_REMAINING_MINUTES)
              end
            else
              redis.multi do |multi|
                # `nx: true` keeps a value written concurrently by another
                # process. The command is queued on the transaction object
                # like the others.
                multi.set(cache_key, uncached_current_balance, nx: true, ex: TTL_REMAINING_MINUTES)
                new_balance = multi.incrbyfloat(cache_key, -consumption)
              end
            end
          end

          # incrbyfloat inside the transaction yields a future; read it once the
          # transaction has been executed.
          new_balance.value.to_f
        end

        # We include the current month in the key so that the entry
        # automatically expires on the 1st of the month, when we reset CI minutes.
        def cache_key
          strong_memoize(:cache_key) do
            now = Time.current.utc
            "ci:minutes:namespaces:#{root_namespace.id}:#{now.year}#{now.month}:remaining"
          end
        end

        private

        # Balance read from the namespace quota, used to seed a cold cache.
        def uncached_current_balance
          root_namespace.ci_minutes_quota.current_balance.to_f
        end
      end
    end
  end
end
......@@ -5,7 +5,7 @@ require 'spec_helper'
RSpec.describe Gitlab::Ci::Minutes::BuildConsumption do
using RSpec::Parameterized::TableSyntax
let(:consumption) { described_class.new(build) }
let(:consumption) { described_class.new(build, build.duration) }
let(:build) { build_stubbed(:ci_build, runner: runner, project: project) }
let_it_be(:project) { create(:project) }
......
# frozen_string_literal: true
require 'spec_helper'
# Exercises CachedQuota#track_consumption against Redis, both when the cached
# balance is missing (cold) and when it is already present (warm).
RSpec.describe Gitlab::Ci::Minutes::CachedQuota do
# The namespace is created with a limit of 100 shared runner minutes.
let_it_be(:namespace) { create(:namespace, shared_runners_minutes_limit: 100) }
let(:cached_quota) { described_class.new(namespace) }
describe '#track_consumption', :redis do
subject { cached_quota.track_consumption(consumption) }
let(:consumption) { 10 }
context 'when the cache is cold' do
it 'stores the remaining minutes in the cache and decrements them from there' do
freeze_time do
# With no cached entry the balance must be read from the uncached quota
# before the 10 minutes are subtracted from it.
expect(cached_quota).to receive(:uncached_current_balance).and_call_original
expect(subject).to eq(90.0)
::Gitlab::Redis::SharedState.with do |redis|
expect(redis.ttl(cached_quota.cache_key)).to eq(described_class::TTL_REMAINING_MINUTES)
end
end
end
end
context 'when the cache is warm' do
before do
# Seed a balance of 80 with a short TTL (20 seconds) so the example can
# check that the TTL is refreshed.
::Gitlab::Redis::SharedState.with do |redis|
redis.set(cached_quota.cache_key, 80.0, ex: 20)
end
end
it 'only decrements the consumption' do
freeze_time do
# The cached value is used directly; the uncached quota must not be read.
expect(cached_quota).not_to receive(:uncached_current_balance)
expect(subject).to eq(70.0)
::Gitlab::Redis::SharedState.with do |redis|
expect(redis.ttl(cached_quota.cache_key)).to eq(described_class::TTL_REMAINING_MINUTES)
end
end
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Request spec checking that a job update (PUT /api/v4/jobs/:id) triggers live
# CI minutes tracking and drops the job once the quota is exceeded.
RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
let_it_be(:group) { create(:group, shared_runners_minutes_limit: 100) }
let_it_be(:project) { create(:project, :private, namespace: group, shared_runners_enabled: true) }
let_it_be(:pipeline) { create(:ci_pipeline, project: project, ref: 'master') }
let_it_be(:runner) { create(:ci_runner, :instance) }
let_it_be(:user) { create(:user) }
# NOTE(review): `headers` does not appear to be used by `update_job` below - confirm whether it is needed.
let(:headers) { { API::Helpers::Runner::JOB_TOKEN_HEADER => job.token, 'Content-Type' => 'text/plain' } }
before do
# The tracking service only proceeds when Gitlab.com? is true.
allow(Gitlab).to receive(:com?).and_return(true)
end
describe 'PUT /api/v4/jobs/:id' do
let(:job) do
create(:ci_build, :running, :trace_live,
project: project,
user: user,
runner: runner,
pipeline: pipeline)
end
# 95 minutes plus the absolute value of the threshold. With the limit of 100
# this presumably leaves the group 5 minutes short of the drop threshold
# (assuming a cost factor of 1 - confirm).
let(:minutes_already_consumed) do
95 + Ci::Minutes::TrackLiveConsumptionService::CONSUMPTION_THRESHOLD.abs
end
let!(:statistics) do
create(:namespace_statistics,
namespace: group,
shared_runners_seconds: minutes_already_consumed.minutes)
end
it 'tracks CI minutes usage of running job' do
expect(Ci::Minutes::TrackLiveConsumptionService).to receive(:new).with(job).and_call_original
update_job(state: 'running')
end
context 'when CI minutes usage is exceeded' do
it 'drops the job' do
freeze_time do
# Pretend the last tracked update happened 10 minutes ago so this update accounts for 10 minutes.
Ci::Minutes::TrackLiveConsumptionService.new(job).time_last_tracked_consumption!(10.minutes.ago)
update_job(state: 'running')
expect(response).to have_gitlab_http_status(:ok)
expect(job.reload).to be_failed
expect(job.failure_reason).to eq('ci_quota_exceeded')
end
end
end
context 'when CI minutes usage is not exceeded' do
it 'does not drop the job' do
freeze_time do
# Only 2 minutes are accounted for here, so the job is expected to keep running.
Ci::Minutes::TrackLiveConsumptionService.new(job).time_last_tracked_consumption!(2.minutes.ago)
update_job(state: 'running')
expect(response).to have_gitlab_http_status(:ok)
expect(job.reload).to be_running
end
end
end
# Sends the job update request, authenticating with the job token passed as a parameter.
def update_job(token = job.token, **params)
new_params = params.merge(token: token)
put api("/jobs/#{job.id}"), params: new_params
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Request spec checking that a trace update (PATCH /api/v4/jobs/:id/trace)
# triggers live CI minutes tracking and drops the job once the quota is exceeded.
RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
let_it_be(:group) { create(:group, shared_runners_minutes_limit: 100) }
let_it_be(:project) { create(:project, :private, namespace: group, shared_runners_enabled: true) }
let_it_be(:pipeline) { create(:ci_pipeline, project: project, ref: 'master') }
let_it_be(:runner) { create(:ci_runner, :instance) }
let_it_be(:user) { create(:user) }
# NOTE(review): `patch_the_trace` below builds its own local `headers`, so this `let` looks unused - confirm.
let(:headers) { { API::Helpers::Runner::JOB_TOKEN_HEADER => job.token, 'Content-Type' => 'text/plain' } }
before do
# The tracking service only proceeds when Gitlab.com? is true.
allow(Gitlab).to receive(:com?).and_return(true)
end
describe 'PATCH /api/v4/jobs/:id/trace' do
let(:job) do
create(:ci_build, :running, :trace_live,
project: project,
user: user,
runner: runner,
pipeline: pipeline)
end
# 95 minutes plus the absolute value of the threshold. With the limit of 100
# this presumably leaves the group 5 minutes short of the drop threshold
# (assuming a cost factor of 1 - confirm).
let(:minutes_already_consumed) do
95 + Ci::Minutes::TrackLiveConsumptionService::CONSUMPTION_THRESHOLD.abs
end
let!(:statistics) do
create(:namespace_statistics,
namespace: group,
shared_runners_seconds: minutes_already_consumed.minutes)
end
it 'tracks CI minutes usage of running job' do
expect(Ci::Minutes::TrackLiveConsumptionService).to receive(:new).with(job).and_call_original
patch_the_trace
end
context 'when CI minutes usage is exceeded' do
it 'drops the job' do
freeze_time do
# Pretend the last tracked update happened 10 minutes ago so this update accounts for 10 minutes.
Ci::Minutes::TrackLiveConsumptionService.new(job).time_last_tracked_consumption!(10.minutes.ago)
patch_the_trace
# The trace is still accepted and appended even though the job ends up failed.
expect(response).to have_gitlab_http_status(:accepted)
expect(response.header['Job-Status']).to eq('failed')
expect(job.reload.trace.raw).to eq 'BUILD TRACE appended'
expect(response.header).to have_key 'Range'
expect(response.header).to have_key 'X-GitLab-Trace-Update-Interval'
expect(job).to be_failed
expect(job.failure_reason).to eq('ci_quota_exceeded')
end
end
end
context 'when CI minutes usage is not exceeded' do
it 'does not drop the job' do
freeze_time do
# Only 2 minutes are accounted for here, so the job is expected to keep running.
Ci::Minutes::TrackLiveConsumptionService.new(job).time_last_tracked_consumption!(2.minutes.ago)
patch_the_trace
expect(response).to have_gitlab_http_status(:accepted)
expect(response.header['Job-Status']).to eq('running')
expect(job.reload.trace.raw).to eq 'BUILD TRACE appended'
expect(response.header).to have_key 'Range'
expect(response.header).to have_key 'X-GitLab-Trace-Update-Interval'
end
end
end
# Appends `content` to the job trace through the API, using a Content-Range
# that starts at the current trace size, then reloads the job.
def patch_the_trace(content = ' appended')
headers = { API::Helpers::Runner::JOB_TOKEN_HEADER => job.token, 'Content-Type' => 'text/plain' }
job.trace.read do |stream|
offset = stream.size
limit = offset + content.length - 1
headers = headers.merge({ 'Content-Range' => "#{offset}-#{limit}" })
end
patch api("/jobs/#{job.id}/trace"), params: content, headers: headers
job.reload
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Unit spec for the live CI minutes tracking service: precondition failures,
# balance calculation, dropping of builds and accumulation of live consumption.
RSpec.describe Ci::Minutes::TrackLiveConsumptionService do
let(:project) { create(:project, :private, shared_runners_enabled: true, namespace: namespace) }
let(:namespace) { create(:namespace, shared_runners_minutes_limit: 100) }
let(:build) { create(:ci_build, :running, project: project, runner: runner) }
let(:runner) { create(:ci_runner, :instance) }
let(:service) { described_class.new(build) }
before do
# The service only proceeds when Gitlab.com? is true; one context below overrides this.
allow(Gitlab).to receive(:com?).and_return(true)
end
describe '#execute', :clean_gitlab_redis_shared_state do
subject { service.execute }
# Asserts that the service returns an error response with the given message.
shared_examples 'returns early' do |error_message|
it 'returns an error response' do
response = subject
expect(response).to be_error
expect(response.message).to eq(error_message)
end
end
# Asserts a success response with the expected (rounded) balance and the
# expected accumulated consumption, without the build being dropped.
shared_examples 'limit not exceeded' do |expected_balance, expected_consumption|
it 'does not drop the build', :aggregate_failures do
response = subject
expect(response).to be_success
expect(response.message).to eq('CI minutes limit not exceeded')
expect(response.payload.fetch(:current_balance).round).to eq(expected_balance)
expect(service.live_consumption.to_i).to eq(expected_consumption)
end
end
# Asserts that the build is dropped with the ci_quota_exceeded reason.
# Expects the including context to define `minutes_consumption`.
shared_examples 'limit exceeded' do
it 'drops the build' do
response = subject
expect(response).to be_success
expect(response.message).to eq('Build dropped due to CI minutes limit exceeded')
expect(response.payload.fetch(:current_balance).round).to eq(-1001)
expect(build.reload).to be_failed
expect(build.failure_reason).to eq('ci_quota_exceeded')
expect(service.live_consumption.to_i).to eq(minutes_consumption)
end
end
context 'when build is not running' do
let(:build) { create(:ci_build, :success) }
it_behaves_like 'returns early', 'Build is not running'
end
context 'when runner is not of instance type' do
let(:runner) { create(:ci_runner, :project) }
it_behaves_like 'returns early', 'CI minutes limit not enabled for build'
end
context 'when project is not on Free plan' do
before do
create(:gitlab_subscription, :premium, namespace: namespace)
end
it_behaves_like 'returns early', 'Project is not on Free or trial plan'
end
context 'when running on self-hosted' do
before do
allow(Gitlab).to receive(:com?).and_return(false)
end
it_behaves_like 'returns early', 'Project is not on Free or trial plan'
end
context 'when shared runners limit is not enabled for build' do
before do
allow(build).to receive(:shared_runners_minutes_limit_enabled?).and_return(false)
end
it_behaves_like 'returns early', 'CI minutes limit not enabled for build'
end
context 'when build has not been tracked recently' do
# Without a previously stored update time the service reports zero consumption.
it 'considers the current consumption as zero' do
response = subject
expect(response).to be_success
expect(response.message).to eq('Build consumption is zero')
end
end
context 'when build has been tracked recently' do
before do
service.time_last_tracked_consumption!(1.minute.ago.utc)
end
# 1 minute consumed out of the 100 minute limit.
it_behaves_like 'limit not exceeded', 99, 1
end
context 'when current consumption exceeds the limit but not the grace period' do
before do
service.time_last_tracked_consumption!(200.minutes.ago.utc)
end
# 200 minutes consumed: the balance is negative but still above CONSUMPTION_THRESHOLD.
it_behaves_like 'limit not exceeded', -100, 200
end
context 'when current consumption exceeds the limit and the grace period' do
# One minute more than the limit plus the absolute threshold, which yields
# the balance of -1001 asserted by the 'limit exceeded' examples.
let(:minutes_consumption) do
namespace.shared_runners_minutes_limit + described_class::CONSUMPTION_THRESHOLD.abs + 1
end
before do
service.time_last_tracked_consumption!(minutes_consumption.minutes.ago.utc)
end
it_behaves_like 'limit exceeded'
context 'when namespace is on a trial hosted plan' do
before do
create(:gitlab_subscription, :premium, :active_trial, namespace: namespace)
end
it_behaves_like 'limit exceeded'
end
context 'when feature flag is disabled' do
before do
stub_feature_flags(ci_minutes_track_live_consumption: false)
end
it_behaves_like 'returns early', 'Feature not enabled'
end
end
end
describe '#live_consumption', :clean_gitlab_redis_shared_state do
subject { service.live_consumption }
context 'when build has not been tracked' do
it { is_expected.to be_zero }
end
context 'when build has been tracked once' do
it 'returns the consumption since last update' do
freeze_time do
service.time_last_tracked_consumption!(3.minutes.ago)
service.execute
expect(subject).to eq(3.0)
end
end
end
context 'when build has been tracked multiple times' do
before do
# Start 7 minutes in the past, then run the service at three points in
# time; the tracked intervals (2 + 5 + 10 minutes) should add up to 17.
service.time_last_tracked_consumption!(7.minutes.ago)
travel_to 5.minutes.ago do
service.execute # track 2 min
end
service.execute # track 5 min
travel_to 10.minutes.from_now do
service.execute # track 10 min
end
end
it 'accumulates the consumption over different runs' do
expect(subject.to_i).to eq(17)
end
end
end
end
......@@ -184,6 +184,8 @@ module API
.new(job, declared_params(include_missing: false))
service.execute.then do |result|
track_ci_minutes_usage!(job, current_runner)
header 'X-GitLab-Trace-Update-Interval', result.backoff
status result.status
body result.status.to_s
......@@ -214,6 +216,8 @@ module API
break error!('416 Range Not Satisfiable', 416, { 'Range' => "0-#{result.stream_size}" })
end
track_ci_minutes_usage!(job, current_runner)
status result.status
header 'Job-Status', job.status
header 'Range', "0-#{result.stream_size}"
......
......@@ -87,6 +87,10 @@ module API
project: -> { current_job.project }
)
end
# Hook called by the job update and trace endpoints to track CI minutes usage.
# This implementation does nothing and returns nil; both arguments are ignored.
def track_ci_minutes_usage!(_build, _runner)
# noop: overridden in EE
end
end
end
end
......@@ -63,6 +63,13 @@ module Gitlab
Gitlab::Metrics.counter(name, comment)
end
# Counter metric for builds that were dropped because CI minutes were exceeded.
def ci_minutes_exceeded_builds_counter
  Gitlab::Metrics.counter(
    :ci_minutes_exceeded_builds_counter,
    'Count of builds dropped due to CI minutes exceeded'
  )
end
end
end
end
......
......@@ -28,7 +28,8 @@ module Gitlab
secrets_provider_not_found: 'secrets provider can not be found',
reached_max_descendant_pipelines_depth: 'reached maximum depth of child pipelines',
project_deleted: 'pipeline project was deleted',
user_blocked: 'pipeline user was blocked'
user_blocked: 'pipeline user was blocked',
ci_quota_exceeded: 'no more CI minutes available'
}.freeze
private_constant :REASONS
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment