Commit 1d138989 authored by Qingyu Zhao's avatar Qingyu Zhao

Add Sidekiq job CPU time prometheus metrics

parent 670fa76a
...@@ -19,10 +19,16 @@ module Gitlab ...@@ -19,10 +19,16 @@ module Gitlab
@metrics[:sidekiq_jobs_retried_total].increment(labels, 1) @metrics[:sidekiq_jobs_retried_total].increment(labels, 1)
end end
job_thread_cputime_start = get_thread_cputime
realtime = Benchmark.realtime do realtime = Benchmark.realtime do
yield yield
end end
job_thread_cputime_end = get_thread_cputime
job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, realtime) @metrics[:sidekiq_jobs_completion_seconds].observe(labels, realtime)
rescue Exception # rubocop: disable Lint/RescueException rescue Exception # rubocop: disable Lint/RescueException
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1) @metrics[:sidekiq_jobs_failed_total].increment(labels, 1)
...@@ -35,6 +41,7 @@ module Gitlab ...@@ -35,6 +41,7 @@ module Gitlab
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
...@@ -47,6 +54,10 @@ module Gitlab ...@@ -47,6 +54,10 @@ module Gitlab
queue: queue queue: queue
} }
end end
def get_thread_cputime
defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0
end
end end
end end
end end
...@@ -8,12 +8,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -8,12 +8,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do
let(:worker) { double(:worker) } let(:worker) { double(:worker) }
let(:completion_seconds_metric) { double('completion seconds metric') } let(:completion_seconds_metric) { double('completion seconds metric') }
let(:user_execution_seconds_metric) { double('user execution seconds metric') }
let(:failed_total_metric) { double('failed total metric') } let(:failed_total_metric) { double('failed total metric') }
let(:retried_total_metric) { double('retried total metric') } let(:retried_total_metric) { double('retried total metric') }
let(:running_jobs_metric) { double('running jobs metric') } let(:running_jobs_metric) { double('running jobs metric') }
before do before do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :livesum).and_return(running_jobs_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :livesum).and_return(running_jobs_metric)
...@@ -23,13 +25,16 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -23,13 +25,16 @@ describe Gitlab::SidekiqMiddleware::Metrics do
it 'yields block' do it 'yields block' do
allow(completion_seconds_metric).to receive(:observe) allow(completion_seconds_metric).to receive(:observe)
allow(user_execution_seconds_metric).to receive(:observe)
expect { |b| middleware.call(worker, {}, :test, &b) }.to yield_control.once expect { |b| middleware.call(worker, {}, :test, &b) }.to yield_control.once
end end
it 'sets metrics' do it 'sets metrics' do
labels = { queue: :test } labels = { queue: :test }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 3)
expect(user_execution_seconds_metric).to receive(:observe).with(labels, 2)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1) expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(completion_seconds_metric).to receive(:observe).with(labels, kind_of(Numeric)) expect(completion_seconds_metric).to receive(:observe).with(labels, kind_of(Numeric))
...@@ -37,9 +42,17 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -37,9 +42,17 @@ describe Gitlab::SidekiqMiddleware::Metrics do
middleware.call(worker, {}, :test) { nil } middleware.call(worker, {}, :test) { nil }
end end
it 'ignore user execution when measured 0' do
allow(completion_seconds_metric).to receive(:observe)
allow(middleware).to receive(:get_thread_cputime).and_return(0, 0)
expect(user_execution_seconds_metric).not_to receive(:observe)
end
context 'when job is retried' do context 'when job is retried' do
it 'sets sidekiq_jobs_retried_total metric' do it 'sets sidekiq_jobs_retried_total metric' do
allow(completion_seconds_metric).to receive(:observe) allow(completion_seconds_metric).to receive(:observe)
expect(user_execution_seconds_metric).to receive(:observe)
expect(retried_total_metric).to receive(:increment) expect(retried_total_metric).to receive(:increment)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment