Commit 08650a8d authored by Oswaldo Ferreira's avatar Oswaldo Ferreira

Add Redis metrics of Sidekiq job execution

Adds metrics to measure the amount of calls and
how much time is spent querying Redis.
parent f3f4b432
---
title: Add metrics for Redis usage during Sidekiq job execution
merge_request: 32265
author:
type: added
...@@ -123,9 +123,11 @@ configuration option in `gitlab.yml`. These metrics are served from the ...@@ -123,9 +123,11 @@ configuration option in `gitlab.yml`. These metrics are served from the
| `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_redis_requests_duration_seconds` | Histogram | 13.1 | Duration in seconds that a Sidekiq job spent querying a Redis server | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_redis_requests_total` | Counter | 13.1 | Redis requests during a Sidekiq job execution | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | | | `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | |
| `geo_db_replication_lag_seconds` | Gauge | 10.2 | Database replication lag (seconds) | `url` | | `geo_db_replication_lag_seconds` | Gauge | 10.2 | Database replication lag (seconds) | `url` |
......
...@@ -24,6 +24,9 @@ module Gitlab ...@@ -24,6 +24,9 @@ module Gitlab
REDIS_CALL_DURATION = :redis_call_duration REDIS_CALL_DURATION = :redis_call_duration
REDIS_CALL_DETAILS = :redis_call_details REDIS_CALL_DETAILS = :redis_call_details
# Milliseconds represented in seconds (from 1 to 500 milliseconds).
QUERY_TIME_BUCKETS = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5].freeze
def self.get_request_count def self.get_request_count
::RequestStore[REDIS_REQUEST_COUNT] || 0 ::RequestStore[REDIS_REQUEST_COUNT] || 0
end end
......
...@@ -47,6 +47,8 @@ module Gitlab ...@@ -47,6 +47,8 @@ module Gitlab
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time) @metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
@metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000) @metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000)
@metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, get_gitaly_time(job)) @metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, get_gitaly_time(job))
@metrics[:sidekiq_redis_requests_total].increment(labels, get_redis_calls(job))
@metrics[:sidekiq_redis_requests_duration_seconds].observe(labels, get_redis_time(job))
end end
end end
...@@ -54,15 +56,17 @@ module Gitlab ...@@ -54,15 +56,17 @@ module Gitlab
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_redis_requests_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_redis_requests_duration_seconds, 'Duration in seconds that a Sidekiq job spent requests a Redis server', {}, Gitlab::Instrumentation::Redis::QUERY_TIME_BUCKETS),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all) sidekiq_redis_requests_total: ::Gitlab::Metrics.counter(:sidekiq_redis_requests_total, 'Redis requests during a Sidekiq job execution'),
sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all)
} }
end end
...@@ -70,6 +74,14 @@ module Gitlab ...@@ -70,6 +74,14 @@ module Gitlab
defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0 defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0
end end
def get_redis_time(job)
job.fetch(:redis_duration_s, 0)
end
def get_redis_calls(job)
job.fetch(:redis_calls, 0)
end
def get_gitaly_time(job) def get_gitaly_time(job)
job.fetch(:gitaly_duration_s, 0) job.fetch(:gitaly_duration_s, 0)
end end
......
...@@ -31,7 +31,9 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -31,7 +31,9 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:gitaly_seconds_metric) { double('gitaly seconds metric') } let(:gitaly_seconds_metric) { double('gitaly seconds metric') }
let(:failed_total_metric) { double('failed total metric') } let(:failed_total_metric) { double('failed total metric') }
let(:retried_total_metric) { double('retried total metric') } let(:retried_total_metric) { double('retried total metric') }
let(:redis_requests_total) { double('redis calls total metric') }
let(:running_jobs_metric) { double('running jobs metric') } let(:running_jobs_metric) { double('running jobs metric') }
let(:redis_seconds_metric) { double('redis seconds metric') }
before do before do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds)
...@@ -39,8 +41,10 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -39,8 +41,10 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_redis_requests_duration_seconds, anything, anything, anything).and_return(redis_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_redis_requests_total, anything).and_return(redis_requests_total)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric)
...@@ -69,21 +73,27 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -69,21 +73,27 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:db_duration) { 3 } let(:db_duration) { 3 }
let(:gitaly_duration) { 4 } let(:gitaly_duration) { 4 }
let(:redis_calls) { 2 }
let(:redis_duration) { 0.01 }
before do before do
allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after) allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after) allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job) allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000) allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000)
allow(subject).to receive(:get_gitaly_time).and_return(gitaly_duration)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) job[:gitaly_duration_s] = gitaly_duration
expect(running_jobs_metric).to receive(:increment).with(labels, -1) job[:redis_calls] = redis_calls
job[:redis_duration_s] = redis_duration
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration) allow(running_jobs_metric).to receive(:increment)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration) allow(redis_requests_total).to receive(:increment)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration) allow(queue_duration_seconds).to receive(:observe)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration) allow(user_execution_seconds_metric).to receive(:observe)
allow(db_seconds_metric).to receive(:observe)
allow(gitaly_seconds_metric).to receive(:observe)
allow(completion_seconds_metric).to receive(:observe)
allow(redis_seconds_metric).to receive(:observe)
end end
it 'yields block' do it 'yields block' do
...@@ -91,6 +101,16 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -91,6 +101,16 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
end end
it 'sets queue specific metrics' do it 'sets queue specific metrics' do
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration)
expect(redis_seconds_metric).to receive(:observe).with(labels_with_job_status, redis_duration)
expect(redis_requests_total).to receive(:increment).with(labels_with_job_status, redis_calls)
subject.call(worker, job, :test) { nil } subject.call(worker, job, :test) { nil }
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment