Commit 8aac62b2 authored by Kamil Trzciński

Merge branch 'feature/gb/runner-register-build-metrics' into 'master'

Add new runner build registration metrics [RUN ALL RSPEC] [RUN AS-IF-FOSS]

See merge request gitlab-org/gitlab!54909
parents 4b373026 5f8c8203
@@ -4,7 +4,7 @@ module Ci
# This class is responsible for assigning
# a proper pending build to a runner on a runner API request
class RegisterJobService
attr_reader :runner
attr_reader :runner, :metrics
Result = Struct.new(:build, :build_json, :valid?)
@@ -13,8 +13,18 @@ module Ci
@metrics = ::Gitlab::Ci::Queue::Metrics.new(runner)
end
# rubocop: disable CodeReuse/ActiveRecord
def execute(params = {})
@metrics.increment_queue_operation(:queue_attempt)
@metrics.observe_queue_time do
process_queue(params)
end
end
private
# rubocop: disable CodeReuse/ActiveRecord
def process_queue(params)
builds =
if runner.instance_type?
builds_for_shared_runner
@@ -24,8 +34,6 @@ module Ci
builds_for_project_runner
end
valid = true
# pick builds that do not have tags other than the runner's
builds = builds.matches_tag_ids(runner.tags.ids)
@@ -39,14 +47,23 @@ module Ci
builds = builds.queued_before(params[:job_age].seconds.ago)
end
@metrics.observe_queue_size(-> { builds.to_a.size })
valid = true
depth = 0
builds.each do |build|
depth += 1
@metrics.increment_queue_operation(:queue_iteration)
result = process_build(build, params)
next unless result
if result.valid?
@metrics.register_success(result.build)
@metrics.observe_queue_depth(:found, depth)
return result
return result # rubocop:disable Cop/AvoidReturnFromBlocks
else
# The usage of valid: is described in
# handling of ActiveRecord::StaleObjectError
@@ -54,22 +71,30 @@ module Ci
end
end
@metrics.increment_queue_operation(:queue_conflict) unless valid
@metrics.observe_queue_depth(:conflict, depth) unless valid
@metrics.observe_queue_depth(:not_found, depth) if valid
@metrics.register_failure
Result.new(nil, nil, valid)
end
# rubocop: enable CodeReuse/ActiveRecord
private
def process_build(build, params)
return unless runner.can_pick?(build)
if runner.can_pick?(build)
@metrics.increment_queue_operation(:build_can_pick)
else
@metrics.increment_queue_operation(:build_not_pick)
return
end
# In case two runners try to assign the same build, the second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when calling the run! or save method.
if assign_runner!(build, params)
present_build!(build)
end
rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
rescue ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting
# It also indicates that this build can be picked and passed to runner.
# If we don't do it, basically a bunch of runners would be competing for a build
@@ -79,8 +104,16 @@ module Ci
# In case we hit the concurrency-access lock,
# we still have to return 409 in the end,
# to make sure that this is properly handled by runner.
@metrics.increment_queue_operation(:build_conflict_lock)
Result.new(nil, nil, false)
rescue StateMachines::InvalidTransition
@metrics.increment_queue_operation(:build_conflict_transition)
Result.new(nil, nil, false)
rescue => ex
@metrics.increment_queue_operation(:build_conflict_exception)
# If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
# the result, consider this as a failure to be retried.
scheduler_failure!(build)
@@ -106,8 +139,12 @@ module Ci
failure_reason, _ = pre_assign_runner_checks.find { |_, check| check.call(build, params) }
if failure_reason
@metrics.increment_queue_operation(:runner_pre_assign_checks_failed)
build.drop!(failure_reason)
else
@metrics.increment_queue_operation(:runner_pre_assign_checks_success)
build.run!
end
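The instrumentation above is spread across several hunks, so here is a condensed, runnable sketch of the resulting control flow as a reading aid. It is illustrative only, with stand-in names (QueueMetricsSketch, pickable) that are not part of the MR: execute counts the attempt and times the whole queue scan, while the scan counts every iteration and records the depth at which it ended.

# Illustrative stand-in for Gitlab::Ci::Queue::Metrics; it only prints.
class QueueMetricsSketch
  def increment_queue_operation(operation)
    puts "counter  #{operation}"
  end

  def observe_queue_depth(queue, depth)
    puts "depth    #{queue}=#{depth}"
  end

  def observe_queue_time
    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = yield
    puts "duration #{Process.clock_gettime(Process::CLOCK_MONOTONIC) - started}s"
    result
  end
end

def execute(builds, metrics)
  metrics.increment_queue_operation(:queue_attempt)
  metrics.observe_queue_time { process_queue(builds, metrics) }
end

def process_queue(builds, metrics)
  depth = 0

  builds.each do |build|
    depth += 1
    metrics.increment_queue_operation(:queue_iteration)
    next unless build[:pickable]

    metrics.observe_queue_depth(:found, depth)
    return build
  end

  metrics.observe_queue_depth(:not_found, depth)
  nil
end

execute([{ pickable: false }, { pickable: true }], QueueMetricsSketch.new)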
---
name: gitlab_ci_builds_queuing_metrics
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/54909
rollout_issue_url:
milestone: '13.10'
type: development
group: group::continuous integration
default_enabled: false
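In the hunks above, only the histogram observations (queue depth, queue size, queue time) check this flag; increment_queue_operation is recorded unconditionally. Since the flag ships with default_enabled: false, a hypothetical Rails console session for turning it on locally could look like this:

# Hypothetical console session (not part of the MR): enable the flag so the
# queue depth/size/duration histograms start being observed.
Feature.enable(:gitlab_ci_builds_queuing_metrics)
Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)
# => true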
@@ -21,6 +21,8 @@ module EE
# "Hi, we don't have any more builds now, but not everything is right anyway, so try again".
# The runner will retry, again against the replica, and again check whether the replication lag has caught up.
if !db_all_caught_up && !result.build
metrics.increment_queue_operation(:queue_replication_lag)
return ::Ci::RegisterJobService::Result.new(nil, false) # rubocop:disable Cop/AvoidReturnFromBlocks
end
end
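The new :queue_replication_lag counter makes this early-return path observable. A hypothetical RSpec-style fragment for asserting it, where metrics and service are assumed test doubles/subjects and the setup that puts the replica behind is omitted:

# Hypothetical spec fragment (not from the MR): assert the lag path is counted
# before the service tells the runner to retry.
expect(metrics).to receive(:increment_queue_operation).with(:queue_replication_lag)

service.execute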
@@ -7,10 +7,34 @@ module Gitlab
extend Gitlab::Utils::StrongMemoize
QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30, 60, 300, 900, 1800, 3600].freeze
QUEUE_DEPTH_TOTAL_BUCKETS = [1, 2, 3, 5, 8, 16, 32, 50, 100, 250, 500, 1000, 2000, 5000].freeze
QUEUE_SIZE_TOTAL_BUCKETS = [1, 5, 10, 50, 100, 500, 1000, 2000, 5000].freeze
QUEUE_ITERATION_DURATION_SECONDS_BUCKETS = [0.1, 0.3, 0.5, 1, 5, 10, 30, 60, 180, 300].freeze
METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'
DEFAULT_METRICS_SHARD = 'default'
JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze
OPERATION_COUNTERS = [
:build_can_pick,
:build_not_pick,
:build_conflict_lock,
:build_conflict_exception,
:build_conflict_transition,
:queue_attempt,
:queue_conflict,
:queue_iteration,
:queue_replication_lag,
:runner_pre_assign_checks_failed,
:runner_pre_assign_checks_success
].to_set.freeze
QUEUE_DEPTH_HISTOGRAMS = [
:found,
:not_found,
:conflict
].to_set.freeze
attr_reader :runner
def initialize(runner)
@@ -47,6 +71,43 @@ module Gitlab
end
# rubocop: enable CodeReuse/ActiveRecord
def increment_queue_operation(operation)
if !Rails.env.production? && !OPERATION_COUNTERS.include?(operation)
raise ArgumentError, "unknown queue operation: #{operation}"
end
self.class.queue_operations_total.increment(operation: operation)
end
def observe_queue_depth(queue, size)
return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)
if !Rails.env.production? && !QUEUE_DEPTH_HISTOGRAMS.include?(queue)
raise ArgumentError, "unknown queue depth label: #{queue}"
end
self.class.queue_depth_total.observe({ queue: queue }, size.to_f)
end
def observe_queue_size(size_proc)
return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)
self.class.queue_size_total.observe({}, size_proc.call.to_f)
end
def observe_queue_time
start_time = ::Gitlab::Metrics::System.monotonic_time
result = yield
return result unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, default_enabled: false)
seconds = ::Gitlab::Metrics::System.monotonic_time - start_time
self.class.queue_iteration_duration_seconds.observe({}, seconds.to_f)
result
end
def self.failed_attempt_counter
strong_memoize(:failed_attempt_counter) do
name = :job_register_attempts_failed_total
@@ -75,6 +136,48 @@ module Gitlab
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
def self.queue_operations_total
strong_memoize(:queue_operations_total) do
name = :gitlab_ci_queue_operations_total
comment = 'Counts all the operations that are happening inside a queue'
Gitlab::Metrics.counter(name, comment)
end
end
def self.queue_depth_total
strong_memoize(:queue_depth_total) do
name = :gitlab_ci_queue_depth_total
comment = 'Size of a CI/CD builds queue in relation to the operation result'
labels = {}
buckets = QUEUE_DEPTH_TOTAL_BUCKETS
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
def self.queue_size_total
strong_memoize(:queue_size_total) do
name = :gitlab_ci_queue_size_total
comment = 'Size of initialized CI/CD builds queue'
labels = {}
buckets = QUEUE_SIZE_TOTAL_BUCKETS
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
def self.queue_iteration_duration_seconds
strong_memoize(:queue_iteration_duration_seconds) do
name = :gitlab_ci_queue_iteration_duration_seconds
comment = 'Time it takes to find a build in CI/CD queue'
labels = {}
buckets = QUEUE_ITERATION_DURATION_SECONDS_BUCKETS
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
end
end
end
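Taken together, the surface added to Gitlab::Ci::Queue::Metrics above is small. A hypothetical caller-side sketch, where runner and builds are assumed to exist in the caller (as they do in RegisterJobService):

# Hypothetical usage sketch of the API added above. Counters are always
# recorded, the histogram observations are skipped unless the
# :gitlab_ci_builds_queuing_metrics flag is enabled, and unknown operation
# names raise ArgumentError outside production.
metrics = Gitlab::Ci::Queue::Metrics.new(runner)

metrics.increment_queue_operation(:queue_attempt)
metrics.observe_queue_size(-> { builds.to_a.size })

picked = metrics.observe_queue_time do
  builds.detect { |build| runner.can_pick?(build) } # stand-in for the real scan
end

metrics.register_success(picked) if picked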