Commit a97c30c9 authored by Sean McGivern's avatar Sean McGivern

Add Sidekiq metrics for Gitaly and DB time

These metrics map to details already in our logs, but as they're in
Prometheus, make it easier to spot trends.
parent 09f08640
---
title: Add Prometheus metrics for Gitaly and database time in background jobs
merge_request: 26384
author:
type: changed
...@@ -86,13 +86,15 @@ The following metrics are available: ...@@ -86,13 +86,15 @@ The following metrics are available:
| `failed_login_captcha_total` | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | | | `failed_login_captcha_total` | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | |
| `successful_login_captcha_total` | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | | | `successful_login_captcha_total` | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | |
| `auto_devops_pipelines_completed_total` | Counter | 12.7 | Counter of completed Auto DevOps pipelines, labeled by status | | | `auto_devops_pipelines_completed_total` | Counter | 12.7 | Counter of completed Auto DevOps pipelines, labeled by status | |
| `sidekiq_jobs_cpu_seconds` | Histogram | 12.4 | Seconds of cpu time to run Sidekiq job | | | `sidekiq_jobs_cpu_seconds` | Histogram | 12.4 | Seconds of cpu time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | | | `sidekiq_jobs_completion_seconds` | Histogram | 12.2 | Seconds to complete Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | | | `sidekiq_jobs_db_seconds` | Histogram | 12.9 | Seconds of DB time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | | | `sidekiq_jobs_gitaly_seconds` | Histogram | 12.9 | Seconds of Gitaly time to run Sidekiq job | queue, boundary, external_dependencies, feature_category, job_status, urgency |
| `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | | | `sidekiq_jobs_queue_duration_seconds` | Histogram | 12.5 | Duration in seconds that a Sidekiq job was queued before being executed | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | | | `sidekiq_jobs_failed_total` | Counter | 12.2 | Sidekiq jobs failed | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | | | `sidekiq_jobs_retried_total` | Counter | 12.2 | Sidekiq jobs retried | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_running_jobs` | Gauge | 12.2 | Number of Sidekiq jobs running | queue, boundary, external_dependencies, feature_category, urgency |
| `sidekiq_concurrency` | Gauge | 12.5 | Maximum number of Sidekiq jobs | |
## Metrics controlled by a feature flag ## Metrics controlled by a feature flag
......
...@@ -45,6 +45,8 @@ module Gitlab ...@@ -45,6 +45,8 @@ module Gitlab
labels[:job_status] = job_succeeded ? "done" : "fail" labels[:job_status] = job_succeeded ? "done" : "fail"
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime) @metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time) @metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
@metrics[:sidekiq_jobs_db_seconds].observe(labels, ActiveRecord::LogSubscriber.runtime / 1000)
@metrics[:sidekiq_jobs_gitaly_seconds].observe(labels, Gitlab::GitalyClient.query_time)
end end
end end
...@@ -54,6 +56,8 @@ module Gitlab ...@@ -54,6 +56,8 @@ module Gitlab
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_db_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_db_seconds, 'Seconds of database time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_gitaly_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_gitaly_seconds, 'Seconds of Gitaly time to run Sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_queue_duration_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_queue_duration_seconds, 'Duration in seconds that a Sidekiq job was queued before being executed', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
......
...@@ -20,6 +20,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -20,6 +20,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:queue_duration_seconds) { double('queue duration seconds metric') } let(:queue_duration_seconds) { double('queue duration seconds metric') }
let(:completion_seconds_metric) { double('completion seconds metric') } let(:completion_seconds_metric) { double('completion seconds metric') }
let(:user_execution_seconds_metric) { double('user execution seconds metric') } let(:user_execution_seconds_metric) { double('user execution seconds metric') }
let(:db_seconds_metric) { double('db seconds metric') }
let(:gitaly_seconds_metric) { double('gitaly seconds metric') }
let(:failed_total_metric) { double('failed total metric') } let(:failed_total_metric) { double('failed total metric') }
let(:retried_total_metric) { double('retried total metric') } let(:retried_total_metric) { double('retried total metric') }
let(:running_jobs_metric) { double('running jobs metric') } let(:running_jobs_metric) { double('running jobs metric') }
...@@ -28,6 +30,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -28,6 +30,8 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_queue_duration_seconds, anything, anything, anything).and_return(queue_duration_seconds)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_db_seconds, anything, anything, anything).and_return(db_seconds_metric)
allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_gitaly_seconds, anything, anything, anything).and_return(gitaly_seconds_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric)
allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric)
allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric) allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric)
...@@ -55,16 +59,23 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do ...@@ -55,16 +59,23 @@ describe Gitlab::SidekiqMiddleware::ServerMetrics do
let(:queue_duration_for_job) { 0.01 } let(:queue_duration_for_job) { 0.01 }
let(:db_duration) { 3 }
let(:gitaly_duration) { 4 }
before do before do
allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after) allow(subject).to receive(:get_thread_cputime).and_return(thread_cputime_before, thread_cputime_after)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after) allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(monotonic_time_before, monotonic_time_after)
allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job) allow(Gitlab::InstrumentationHelper).to receive(:queue_duration_for_job).with(job).and_return(queue_duration_for_job)
allow(ActiveRecord::LogSubscriber).to receive(:runtime).and_return(db_duration * 1000)
allow(Gitlab::GitalyClient).to receive(:query_time).and_return(gitaly_duration)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1) expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job expect(queue_duration_seconds).to receive(:observe).with(labels, queue_duration_for_job) if queue_duration_for_job
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration) expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, thread_cputime_duration)
expect(db_seconds_metric).to receive(:observe).with(labels_with_job_status, db_duration)
expect(gitaly_seconds_metric).to receive(:observe).with(labels_with_job_status, gitaly_duration)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration) expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, monotonic_time_duration)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment