Commit 5bdca904 authored by Furkan Ayhan

Add metrics to track failure reasons of pipelines and jobs

This will help us track possible dramatic changes in failures
when we deploy new features or bug fixes.
parent c821e3a3
...@@ -286,9 +286,11 @@ module Ci ...@@ -286,9 +286,11 @@ module Ci
end end
after_transition any => [:failed] do |pipeline| after_transition any => [:failed] do |pipeline|
next unless pipeline.auto_devops_source? pipeline.run_after_commit do
::Gitlab::Ci::Pipeline::Metrics.pipeline_failure_reason_counter.increment(reason: pipeline.failure_reason)
pipeline.run_after_commit { AutoDevops::DisableWorker.perform_async(pipeline.id) } AutoDevops::DisableWorker.perform_async(pipeline.id) if pipeline.auto_devops_source?
end
end end
end end
......
...@@ -179,6 +179,12 @@ class CommitStatus < ApplicationRecord ...@@ -179,6 +179,12 @@ class CommitStatus < ApplicationRecord
ExpireJobCacheWorker.perform_async(id) ExpireJobCacheWorker.perform_async(id)
end end
end end
# On any transition into :failed, count the failure labelled by the status's
# failure_reason. The increment is deferred through run_after_commit.
after_transition any => :failed do |failed_status|
  failed_status.run_after_commit do
    failure_counter = ::Gitlab::Ci::Pipeline::Metrics.job_failure_reason_counter
    failure_counter.increment(reason: failed_status.failure_reason)
  end
end
end end
def self.names def self.names
......
...@@ -19,7 +19,7 @@ module Ci ...@@ -19,7 +19,7 @@ module Ci
end end
def metrics def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new @metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end end
private private
......
...@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do ...@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
project: project, project: project,
current_user: user, current_user: user,
save_incompleted: false, save_incompleted: false,
pipeline_seed: double(:seed, size: 2)) pipeline_seed: double(:seed, size: 2),
increment_pipeline_failure_reason_counter: true)
end end
it 'does not drop the pipeline' do it 'does not drop the pipeline' do
...@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do ...@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
expect(step.break?).to be true expect(step.break?).to be true
end end
# The command must be asked to count the failure with the size-limit reason.
it 'increments the error metric' do
  expect(command)
    .to receive(:increment_pipeline_failure_reason_counter)
    .with(:size_limit_exceeded)

  subject
end
end end
end end
......
...@@ -84,7 +84,7 @@ module Gitlab ...@@ -84,7 +84,7 @@ module Gitlab
end end
def metrics def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new @metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end end
def observe_creation_duration(duration) def observe_creation_duration(duration)
...@@ -97,6 +97,11 @@ module Gitlab ...@@ -97,6 +97,11 @@ module Gitlab
.observe({ source: pipeline.source.to_s }, pipeline.total_size) .observe({ source: pipeline.source.to_s }, pipeline.total_size)
end end
# Increments the pipeline failure reason counter exposed by `metrics`.
#
# @param reason [Symbol, String, nil] failure reason used as the `reason`
#   label; a nil (or false) reason is reported as "unknown_failure".
def increment_pipeline_failure_reason_counter(reason)
  reason_label = (reason || :unknown_failure).to_s
  failure_counter = metrics.pipeline_failure_reason_counter

  failure_counter.increment(reason: reason_label)
end
def dangling_build? def dangling_build?
%i[ondemand_dast_scan webide].include?(source) %i[ondemand_dast_scan webide].include?(source)
end end
......
...@@ -13,16 +13,7 @@ module Gitlab ...@@ -13,16 +13,7 @@ module Gitlab
pipeline.add_error_message(message) pipeline.add_error_message(message)
if drop_reason && persist_pipeline? drop_pipeline!(drop_reason)
if Feature.enabled?(:ci_pipeline_ensure_iid_on_drop, pipeline.project, default_enabled: :yaml)
# Project iid must be called outside a transaction, so we ensure it is set here
# otherwise it may be set within the state transition transaction of the drop! call
# which it will lock the InternalId row for the whole transaction
pipeline.ensure_project_iid!
end
pipeline.drop!(drop_reason)
end
# TODO: consider not to rely on AR errors directly as they can be # TODO: consider not to rely on AR errors directly as they can be
# polluted with other unrelated errors (e.g. state machine) # polluted with other unrelated errors (e.g. state machine)
...@@ -34,8 +25,23 @@ module Gitlab ...@@ -34,8 +25,23 @@ module Gitlab
pipeline.add_warning_message(message) pipeline.add_warning_message(message)
end end
def persist_pipeline? private
command.save_incompleted && !pipeline.readonly?
def drop_pipeline!(drop_reason)
return if pipeline.readonly?
if drop_reason && command.save_incompleted
if Feature.enabled?(:ci_pipeline_ensure_iid_on_drop, pipeline.project, default_enabled: :yaml)
# Project iid must be called outside a transaction, so we ensure it is set here
# otherwise it may be set within the state transition transaction of the drop! call
# which it will lock the InternalId row for the whole transaction
pipeline.ensure_project_iid!
end
pipeline.drop!(drop_reason)
else
command.increment_pipeline_failure_reason_counter(drop_reason)
end
end end
end end
end end
......
...@@ -14,7 +14,7 @@ module Gitlab ...@@ -14,7 +14,7 @@ module Gitlab
end end
def counter def counter
::Gitlab::Ci::Pipeline::Metrics.new.pipelines_created_counter ::Gitlab::Ci::Pipeline::Metrics.pipelines_created_counter
end end
end end
end end
......
...@@ -4,55 +4,57 @@ module Gitlab ...@@ -4,55 +4,57 @@ module Gitlab
module Ci module Ci
module Pipeline module Pipeline
class Metrics class Metrics
include Gitlab::Utils::StrongMemoize def self.pipeline_creation_duration_histogram
name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration'
labels = {}
buckets = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 20.0, 50.0, 240.0]
def pipeline_creation_duration_histogram ::Gitlab::Metrics.histogram(name, comment, labels, buckets)
strong_memoize(:pipeline_creation_duration_histogram) do end
name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration' def self.pipeline_size_histogram
labels = {} name = :gitlab_ci_pipeline_size_builds
buckets = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 20.0, 50.0, 240.0] comment = 'Pipeline size'
labels = { source: nil }
buckets = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
def self.pipeline_processing_events_counter
name = :gitlab_ci_pipeline_processing_events_total
comment = 'Total amount of pipeline processing events'
::Gitlab::Metrics.histogram(name, comment, labels, buckets) Gitlab::Metrics.counter(name, comment)
end
end end
def pipeline_size_histogram def self.pipelines_created_counter
strong_memoize(:pipeline_size_histogram) do name = :pipelines_created_total
name = :gitlab_ci_pipeline_size_builds comment = 'Counter of pipelines created'
comment = 'Pipeline size'
labels = { source: nil }
buckets = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
::Gitlab::Metrics.histogram(name, comment, labels, buckets) Gitlab::Metrics.counter(name, comment)
end
end end
def pipeline_processing_events_counter def self.legacy_update_jobs_counter
strong_memoize(:pipeline_processing_events_counter) do name = :ci_legacy_update_jobs_as_retried_total
name = :gitlab_ci_pipeline_processing_events_total comment = 'Counter of occurrences when jobs were not being set as retried before update_retried'
comment = 'Total amount of pipeline processing events'
Gitlab::Metrics.counter(name, comment) Gitlab::Metrics.counter(name, comment)
end
end end
def pipelines_created_counter def self.pipeline_failure_reason_counter
strong_memoize(:pipelines_created_count) do name = :gitlab_ci_pipeline_failure_reasons
name = :pipelines_created_total comment = 'Counter of pipeline failure reasons'
comment = 'Counter of pipelines created'
Gitlab::Metrics.counter(name, comment) Gitlab::Metrics.counter(name, comment)
end
end end
def legacy_update_jobs_counter def self.job_failure_reason_counter
strong_memoize(:legacy_update_jobs_counter) do name = :gitlab_ci_job_failure_reasons
name = :ci_legacy_update_jobs_as_retried_total comment = 'Counter of job failure reasons'
comment = 'Counter of occurrences when jobs were not being set as retried before update_retried'
Gitlab::Metrics.counter(name, comment) Gitlab::Metrics.counter(name, comment)
end
end end
end end
end end
......
...@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do ...@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do
it { is_expected.to be_falsey } it { is_expected.to be_falsey }
end end
end end
# Covers Command#increment_pipeline_failure_reason_counter for an explicit
# reason and for the nil fallback.
describe '#increment_pipeline_failure_reason_counter' do
  let(:command) { described_class.new }
  let(:reason) { :size_limit_exceeded }
  let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }

  subject { command.increment_pipeline_failure_reason_counter(reason) }

  it 'increments the error metric' do
    expect { subject }
      .to change { counter.get(reason: reason.to_s) }
      .by(1)
  end

  context 'when the reason is nil' do
    let(:reason) { nil }

    it 'increments the error metric with unknown_failure' do
      expect { subject }
        .to change { counter.get(reason: 'unknown_failure') }
        .by(1)
    end
  end
end
end end
...@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
let(:save_incompleted) { false } let(:save_incompleted) { false }
let(:command) do let(:command) do
double(:command, Gitlab::Ci::Pipeline::Chain::Command.new(
project: project, project: project,
pipeline_seed: pipeline_seed, pipeline_seed: pipeline_seed,
save_incompleted: save_incompleted save_incompleted: save_incompleted
...@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.deployments_limit_exceeded?).to be true expect(pipeline.deployments_limit_exceeded?).to be true
end end
# Running the step must bump the counter for the deployments-limit reason.
it 'calls increment_pipeline_failure_reason_counter' do
  failure_reasons = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')

  expect { perform }
    .to change { failure_reasons.get(reason: 'deployments_limit_exceeded') }
    .by(1)
end
end end
context 'when not saving incomplete pipelines' do context 'when not saving incomplete pipelines' do
...@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.']) expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.'])
end end
# The command must be asked to count the failure with the deployments-limit reason.
it 'increments the error metric' do
  expect(command)
    .to receive(:increment_pipeline_failure_reason_counter)
    .with(:deployments_limit_exceeded)

  perform
end
end end
it 'logs the error' do it 'logs the error' do
......
...@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do ...@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do
it 'wastes pipeline iid' do it 'wastes pipeline iid' do
expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0 expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0
end end
# Running the chain must bump the counter under the 'unknown_failure' label.
it 'increments the error metric' do
  failure_reasons = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')

  expect { run_chain }
    .to change { failure_reasons.get(reason: 'unknown_failure') }
    .by(1)
end
end end
describe 'pipeline protect' do describe 'pipeline protect' do
......
...@@ -3902,6 +3902,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do ...@@ -3902,6 +3902,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do
pipeline.drop pipeline.drop
end end
end end
# Dropping a running pipeline with a reason must be reflected in the
# pipeline failure reason counter under that reason's label.
context 'with failure_reason' do
  let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }
  let(:failure_reason) { 'config_error' }
  let(:pipeline) { create(:ci_pipeline, :running) }

  it 'increments the counter with the failure_reason' do
    expect { pipeline.drop!(failure_reason) }
      .to change { counter.get(reason: failure_reason) }
      .by(1)
  end
end
end end
end end
......
...@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do ...@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do
end end
end end
describe 'set failure_reason when drop' do describe '#drop' do
let(:commit_status) { create(:commit_status, :created) } let(:commit_status) { create(:commit_status, :created) }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_job_failure_reasons, 'desc') }
let(:failure_reason) { reason.to_s }
subject do subject do
commit_status.drop!(reason) commit_status.drop!(reason)
commit_status commit_status
end end
# Shared check: evaluating `subject` raises the count read from `counter`
# for the `failure_reason` label by exactly one.
shared_examples 'incrementing failure reason counter' do
  it 'increments the counter with the failure_reason' do
    expect { subject }
      .to change { counter.get(reason: failure_reason) }
      .by(1)
  end
end
context 'when failure_reason is nil' do context 'when failure_reason is nil' do
let(:reason) { } let(:reason) { }
let(:failure_reason) { 'unknown_failure' }
it { is_expected.to be_unknown_failure } it { is_expected.to be_unknown_failure }
it_behaves_like 'incrementing failure reason counter'
end end
context 'when failure_reason is script_failure' do context 'when failure_reason is script_failure' do
let(:reason) { :script_failure } let(:reason) { :script_failure }
it { is_expected.to be_script_failure } it { is_expected.to be_script_failure }
it_behaves_like 'incrementing failure reason counter'
end end
context 'when failure_reason is unmet_prerequisites' do context 'when failure_reason is unmet_prerequisites' do
let(:reason) { :unmet_prerequisites } let(:reason) { :unmet_prerequisites }
it { is_expected.to be_unmet_prerequisites } it { is_expected.to be_unmet_prerequisites }
it_behaves_like 'incrementing failure reason counter'
end end
end end
......
...@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do ...@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do
end end
it 'increments the prometheus counter' do it 'increments the prometheus counter' do
expect(Gitlab::Metrics).to receive(:counter) counter = spy('pipeline created counter')
.with(:pipelines_created_total, "Counter of pipelines created")
.and_call_original allow(Gitlab::Ci::Pipeline::Metrics)
allow(Gitlab::Metrics).to receive(:counter).and_call_original # allow other counters .to receive(:pipelines_created_counter).and_return(counter)
pipeline pipeline
expect(counter).to have_received(:increment)
end end
it 'records pipeline size in a prometheus histogram' do it 'records pipeline size in a prometheus histogram' do
histogram = spy('pipeline size histogram') histogram = spy('pipeline size histogram')
allow(Gitlab::Ci::Pipeline::Metrics) allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:new).and_return(histogram) .to receive(:pipeline_size_histogram).and_return(histogram)
execute_service execute_service
...@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do ...@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do
it_behaves_like 'a failed pipeline' it_behaves_like 'a failed pipeline'
# Executing the service with the stubbed CI YAML must bump the counter
# under the 'config_error' label.
it 'increments the error metric' do
  stub_ci_pipeline_yaml_file(ci_yaml)

  failure_reasons = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')

  expect { execute_service }
    .to change { failure_reasons.get(reason: 'config_error') }
    .by(1)
end
context 'when receive git commit' do context 'when receive git commit' do
before do before do
allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message } allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message }
......
...@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do
create(:ci_empty_pipeline, ref: 'master', project: project) create(:ci_empty_pipeline, ref: 'master', project: project)
end end
# Test doubles standing in for the two counters the examples below inspect.
let(:pipeline_processing_events_counter) { double(increment: true) }
let(:legacy_update_jobs_counter) { double(increment: true) }

# Double exposing both counter doubles under the reader names used above.
let(:metrics) do
  counter_stubs = {
    pipeline_processing_events_counter: pipeline_processing_events_counter,
    legacy_update_jobs_counter: legacy_update_jobs_counter
  }

  double(counter_stubs)
end
subject { described_class.new(pipeline) } subject { described_class.new(pipeline) }
before do before do
...@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do
stub_not_protect_default_branch stub_not_protect_default_branch
project.add_developer(user) project.add_developer(user)
allow(subject).to receive(:metrics).and_return(metrics)
end end
describe 'processing events counter' do describe 'processing events counter' do
let(:metrics) { double('pipeline metrics') }
let(:counter) { double('events counter') }
before do
allow(subject)
.to receive(:metrics).and_return(metrics)
allow(metrics)
.to receive(:pipeline_processing_events_counter)
.and_return(counter)
end
it 'increments processing events counter' do it 'increments processing events counter' do
expect(counter).to receive(:increment) expect(pipeline_processing_events_counter).to receive(:increment)
subject.execute subject.execute
end end
...@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do
expect(all_builds.retried).to contain_exactly(build_retried) expect(all_builds.retried).to contain_exactly(build_retried)
end end
context 'counter ci_legacy_update_jobs_as_retried_total' do it 'increments the counter' do
let(:counter) { double(increment: true) } expect(legacy_update_jobs_counter).to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do before do
allow(Gitlab::Metrics).to receive(:counter).and_call_original build_retried.update_columns(retried: true)
allow(Gitlab::Metrics).to receive(:counter)
.with(:ci_legacy_update_jobs_as_retried_total, anything)
.and_return(counter)
end end
it 'increments the counter' do it 'does not increment the counter' do
expect(counter).to receive(:increment) expect(legacy_update_jobs_counter).not_to receive(:increment)
subject.execute subject.execute
end end
context 'when the previous build has already retried column true' do
before do
build_retried.update_columns(retried: true)
end
it 'does not increment the counter' do
expect(counter).not_to receive(:increment)
subject.execute
end
end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment