Commit 317cc11c authored by Sean McGivern's avatar Sean McGivern

Merge branch 'osw-add-redis-metrics-to-sidekiq-job-run' into 'master'

Add metrics for Redis usage during Sidekiq job execution

See merge request gitlab-org/gitlab!32265
parents 41b9d34c 08650a8d
---
title: Add metrics for Redis usage during Sidekiq job execution
merge_request: 32265
author:
type: added
...@@ -123,9 +123,11 @@ configuration option in `gitlab.yml`. These metrics are served from the ...@@ -123,9 +123,11 @@ configuration option in `gitlab.yml`. These metrics are served from the
| `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` | | `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_redis_requests_duration_seconds` | Histogram | 13.1 | Duration in seconds that a Sidekiq job spent querying a Redis server | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_redis_requests_total` | Counter | 13.1 | Redis requests during a Sidekiq job execution | `queue`, `boundary`, `external_dependencies`, `feature_category`, `job_status`, `urgency` |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` | | `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | `queue`, `boundary`, `external_dependencies`, `feature_category`, `urgency` |
| `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | | | `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | |
| `geo_db_replication_lag_seconds` | Gauge | 10.2 | Database replication lag (seconds) | `url` | | `geo_db_replication_lag_seconds` | Gauge | 10.2 | Database replication lag (seconds) | `url` |
......
...@@ -24,6 +24,9 @@ module Gitlab ...@@ -24,6 +24,9 @@ module Gitlab
REDIS_CALL_DURATION = :redis_call_duration REDIS_CALL_DURATION = :redis_call_duration
REDIS_CALL_DETAILS = :redis_call_details REDIS_CALL_DETAILS = :redis_call_details
# Milliseconds represented in seconds (from 1 to 500 milliseconds).
QUERY_TIME_BUCKETS = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5].freeze
def self.get_request_count def self.get_request_count
::RequestStore[REDIS_REQUEST_COUNT] || 0 ::RequestStore[REDIS_REQUEST_COUNT] || 0
end end
......
...@@ -47,6 +47,8 @@ module Gitlab ...@@ -47,6 +47,8 @@ module Gitlab
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time) @metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
@metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000) @metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000)
@metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, get_gitaly_time(job)) @metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, get_gitaly_time(job))
@metrics[:sidekiq_redis_requests_total].increment(labels, get_redis_calls(job))
@metrics[:sidekiq_redis_requests_duration_seconds].observe(labels, get_redis_time(job))
end end
end end
...@@ -54,15 +56,17 @@ module Gitlab ...@@ -54,15 +56,17 @@ module Gitlab
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_redis_requests_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_redis_requests_duration_seconds, 'Duration in seconds that a Sidekiq job spent requests a Redis server', {}, Gitlab::Instrumentation::Redis::QUERY_TIME_BUCKETS),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all) sidekiq_redis_requests_total: ::Gitlab::Metrics.counter(:sidekiq_redis_requests_total, 'Redis requests during a Sidekiq job execution'),
sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all),
sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all)
} }
end end
...@@ -70,6 +74,14 @@ module Gitlab ...@@ -70,6 +74,14 @@ module Gitlab
defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0 defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0
end end
def get_redis_time(job)
job.fetch(:redis_duration_s, 0)
end
def get_redis_calls(job)
job.fetch(:redis_calls, 0)
end
def get_gitaly_time(job) def get_gitaly_time(job)
job.fetch(:gitaly_duration_s, 0) job.fetch(:gitaly_duration_s, 0)
end end
......
...@@ -31,7 +31,9 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -31,7 +31,9 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:gitaly_seconds_metric) { double('gitaly seconds metric') } let(:gitaly_seconds_metric) { double('gitaly seconds metric') }
let(:failed_total_metric) { double('failed total metric') } let(:failed_total_metric) { double('failed total metric') }
let(:retried_total_metric) { double('retried total metric') } let(:retried_total_metric) { double('retried total metric') }
let(:redis_requests_total) { double('redis calls total metric') }
let(:running_jobs_metric) { double('running jobs metric') } let(:running_jobs_metric) { double('running jobs metric') }
let(:redis_seconds_metric) { double('redis seconds metric') }
before do before do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds)
...@@ -39,8 +41,10 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -39,8 +41,10 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_redis_requests_duration_seconds, anything, anything, anything).and_return(redis_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_redis_requests_total, anything).and_return(redis_requests_total)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric)
...@@ -69,21 +73,27 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -69,21 +73,27 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:db_duration) { 3 } let(:db_duration) { 3 }
let(:gitaly_duration) { 4 } let(:gitaly_duration) { 4 }
let(:redis_calls) { 2 }
let(:redis_duration) { 0.01 }
before do before do
allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after) allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after) allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job) allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000) allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000)
allow(subject).to receive(:get_gitaly_time).and_return(gitaly_duration)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) job[:gitaly_duration_s] = gitaly_duration
expect(running_jobs_metric).to receive(:increment).with(labels, -1) job[:redis_calls] = redis_calls
job[:redis_duration_s] = redis_duration
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration) allow(running_jobs_metric).to receive(:increment)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration) allow(redis_requests_total).to receive(:increment)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration) allow(queue_duration_seconds).to receive(:observe)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration) allow(user_execution_seconds_metric).to receive(:observe)
allow(db_seconds_metric).to receive(:observe)
allow(gitaly_seconds_metric).to receive(:observe)
allow(completion_seconds_metric).to receive(:observe)
allow(redis_seconds_metric).to receive(:observe)
end end
it 'yields block' do it 'yields block' do
...@@ -91,6 +101,16 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -91,6 +101,16 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
end end
it 'sets queue specific metrics' do it 'sets queue specific metrics' do
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration)
expect(redis_seconds_metric).to receive(:observe).with(labels_with_job_status, redis_duration)
expect(redis_requests_total).to receive(:increment).with(labels_with_job_status, redis_calls)
subject.call(worker, job, :test) { nil } subject.call(worker, job, :test) { nil }
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment