Commit f44a8165 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Merge branch '344629-metrics-server-metrics' into 'master'

Sidekiq metrics server exports own metrics

See merge request gitlab-org/gitlab!76553
parents 26a6a3f2 7fe9a980
......@@ -70,17 +70,17 @@ if !Rails.env.test? && Gitlab::Metrics.prometheus_metrics_enabled?
Gitlab::Cluster::LifecycleEvents.on_worker_start do
defined?(::Prometheus::Client.reinitialize_on_pid_change) && ::Prometheus::Client.reinitialize_on_pid_change
Gitlab::Metrics::Samplers::RubySampler.initialize_instance.start
Gitlab::Metrics::Samplers::DatabaseSampler.initialize_instance.start
Gitlab::Metrics::Samplers::ThreadsSampler.initialize_instance.start
logger = Gitlab::AppLogger
Gitlab::Metrics::Samplers::RubySampler.initialize_instance(logger: logger).start
Gitlab::Metrics::Samplers::DatabaseSampler.initialize_instance(logger: logger).start
Gitlab::Metrics::Samplers::ThreadsSampler.initialize_instance(logger: logger).start
if Gitlab::Runtime.web_server?
Gitlab::Metrics::Samplers::ActionCableSampler.instance.start
Gitlab::Metrics::Samplers::ActionCableSampler.instance(logger: logger).start
end
if Gitlab.ee? && Gitlab::Runtime.sidekiq?
Gitlab::Metrics::Samplers::GlobalSearchSampler.instance.start
Gitlab::Metrics::Samplers::GlobalSearchSampler.instance(logger: logger).start
end
Gitlab::Ci::Parsers.instrument!
......
......@@ -6,8 +6,8 @@ module Gitlab
class ActionCableSampler < BaseSampler
DEFAULT_SAMPLING_INTERVAL_SECONDS = 5
def initialize(interval = nil, action_cable: ::ActionCable.server)
super(interval)
def initialize(action_cable: ::ActionCable.server, **options)
super(**options)
@action_cable = action_cable
end
......
......@@ -9,7 +9,10 @@ module Gitlab
attr_reader :interval
# interval - The sampling interval in seconds.
def initialize(interval = nil)
# warmup - When true, takes a single sample eagerly before entering the sampling loop.
# This can be useful to ensure that all metrics files exist after `start` returns,
# since prometheus-client-mmap creates them lazily upon first access.
def initialize(interval: nil, logger: Logger.new($stdout), warmup: false, **options)
interval ||= ENV[interval_env_key]&.to_i
interval ||= self.class::DEFAULT_SAMPLING_INTERVAL_SECONDS
interval_half = interval.to_f / 2
......@@ -17,13 +20,16 @@ module Gitlab
@interval = interval
@interval_steps = (-interval_half..interval_half).step(0.1).to_a
super()
@logger = logger
@warmup = warmup
super(**options)
end
def safe_sample
sample
rescue StandardError => e
::Gitlab::AppLogger.warn("#{self.class}: #{e}, stopping")
@logger.warn("#{self.class}: #{e}, stopping")
stop
end
......@@ -63,6 +69,8 @@ module Gitlab
def start_working
@running = true
safe_sample if @warmup
true
end
......
......@@ -7,12 +7,12 @@ module Gitlab
DEFAULT_SAMPLING_INTERVAL_SECONDS = 60
GC_REPORT_BUCKETS = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1].freeze
def initialize(*)
def initialize(...)
GC::Profiler.clear
metrics[:process_start_time_seconds].set(labels, Time.now.to_i)
super
super(...)
end
def metrics
......
......@@ -6,6 +6,7 @@ require 'fileutils'
require 'active_support/concern'
require 'active_support/inflector'
require 'active_support/core_ext/numeric/bytes'
require 'prometheus/client'
require 'rack'
......@@ -18,6 +19,9 @@ require_relative '../lib/gitlab/utils/strong_memoize'
require_relative '../lib/prometheus/cleanup_multiproc_dir_service'
require_relative '../lib/gitlab/metrics/prometheus'
require_relative '../lib/gitlab/metrics'
require_relative '../lib/gitlab/metrics/system'
require_relative '../lib/gitlab/metrics/samplers/base_sampler'
require_relative '../lib/gitlab/metrics/samplers/ruby_sampler'
require_relative '../lib/gitlab/metrics/exporter/base_exporter'
require_relative '../lib/gitlab/metrics/exporter/sidekiq_exporter'
require_relative '../lib/gitlab/health_checks/probes/collection'
......
......@@ -40,14 +40,20 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
def start
::Prometheus::Client.configure do |config|
config.multiprocess_files_dir = @metrics_dir
config.pid_provider = proc { "#{@target}_exporter" }
end
FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
settings = Settings.new(Settings.monitoring[name])
# We need to `warmup: true` since otherwise the sampler and exporter threads enter
# a race where not all Prometheus db files will be visible to the exporter, resulting
# in missing metrics.
# Warming up ensures that these files exist prior to the exporter starting up.
Gitlab::Metrics::Samplers::RubySampler.initialize_instance(warmup: true).start
exporter_class = "Gitlab::Metrics::Exporter::#{@target.camelize}Exporter".constantize
settings = Settings.new(Settings.monitoring[name])
server = exporter_class.instance(settings, synchronous: true)
server.start
......
# rubocop:disable Naming/FileName
# frozen_string_literal: true
# We need to supply this outside of Rails because:
# RubySampler needs Gitlab::Metrics needs Gitlab::Metrics::Prometheus needs Gitlab::CurrentSettings needs ::Settings
# to check for `prometheus_metrics_enabled`. We therefore simply redirect it to our own Settings type.
module Gitlab
module CurrentSettings
class << self
def prometheus_metrics_enabled
# We make the simplified assumption that when the metrics-server runs,
# Prometheus metrics are enabled. Since the latter is a setting stored
# in the application database, we have no access to it here, so we need
# to hard-code it.
true
end
end
end
end
# rubocop:enable Naming/FileName
......@@ -9,6 +9,11 @@
# Here we make the necessary constants available conditionally.
require_relative 'override_rails_constants' unless Object.const_defined?('Rails')
# We need to supply this outside of Rails because:
# RubySampler needs Gitlab::Metrics needs Gitlab::Metrics::Prometheus needs Gitlab::CurrentSettings needs ::Settings
# to check for `prometheus_metrics_enabled`. We therefore simply redirect it to our own Settings type.
require_relative 'override_gitlab_current_settings' unless Object.const_defined?('Gitlab::CurrentSettings')
require_relative '../config/settings'
# rubocop:enable Naming/FileName
......@@ -5,7 +5,7 @@ require 'spec_helper'
RSpec.describe Gitlab::Metrics::Samplers::ActionCableSampler do
let(:action_cable) { instance_double(ActionCable::Server::Base) }
subject { described_class.new(action_cable: action_cable) }
subject { described_class.new(action_cable: action_cable, logger: double) }
it_behaves_like 'metrics sampler', 'ACTION_CABLE_SAMPLER'
......
......@@ -84,7 +84,7 @@ RSpec.describe Gitlab::Metrics::Samplers::RubySampler do
end
describe '#sample_gc' do
let!(:sampler) { described_class.new(5) }
let!(:sampler) { described_class.new }
let(:gc_reports) { [{ GC_TIME: 0.1 }, { GC_TIME: 0.2 }, { GC_TIME: 0.3 }] }
......
......@@ -8,18 +8,24 @@ require_relative '../support/helpers/next_instance_of'
RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
include NextInstanceOf
let(:metrics_dir) { Dir.mktmpdir }
before do
# We do not want this to have knock-on effects on the test process.
allow(Gitlab::ProcessManagement).to receive(:modify_signals)
end
after do
FileUtils.rm_rf(metrics_dir, secure: true)
end
describe '.spawn' do
context 'when in parent process' do
it 'forks into a new process and detaches it' do
expect(Process).to receive(:fork).and_return(99)
expect(Process).to receive(:detach).with(99)
described_class.spawn('sidekiq', metrics_dir: 'path/to/metrics')
described_class.spawn('sidekiq', metrics_dir: metrics_dir)
end
end
......@@ -35,13 +41,13 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
expect(server).to receive(:start)
end
described_class.spawn('sidekiq', metrics_dir: 'path/to/metrics')
described_class.spawn('sidekiq', metrics_dir: metrics_dir)
end
it 'resets signal handlers from parent process' do
expect(Gitlab::ProcessManagement).to receive(:modify_signals).with(%i[A B], 'DEFAULT')
described_class.spawn('sidekiq', metrics_dir: 'path/to/metrics', trapped_signals: %i[A B])
described_class.spawn('sidekiq', metrics_dir: metrics_dir, trapped_signals: %i[A B])
end
end
end
......@@ -50,9 +56,9 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
let(:exporter_class) { Class.new(Gitlab::Metrics::Exporter::BaseExporter) }
let(:exporter_double) { double('fake_exporter', start: true) }
let(:prometheus_config) { ::Prometheus::Client.configuration }
let(:metrics_dir) { Dir.mktmpdir }
let(:settings) { { "fake_exporter" => { "enabled" => true } } }
let!(:old_metrics_dir) { prometheus_config.multiprocess_files_dir }
let(:ruby_sampler_double) { double(Gitlab::Metrics::Samplers::RubySampler) }
subject(:metrics_server) { described_class.new('fake', metrics_dir, true)}
......@@ -60,11 +66,13 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
stub_const('Gitlab::Metrics::Exporter::FakeExporter', exporter_class)
expect(exporter_class).to receive(:instance).with(settings['fake_exporter'], synchronous: true).and_return(exporter_double)
expect(Settings).to receive(:monitoring).and_return(settings)
allow(Gitlab::Metrics::Samplers::RubySampler).to receive(:initialize_instance).and_return(ruby_sampler_double)
allow(ruby_sampler_double).to receive(:start)
end
after do
Gitlab::Metrics.reset_registry!
FileUtils.rm_rf(metrics_dir, secure: true)
prometheus_config.multiprocess_files_dir = old_metrics_dir
end
......@@ -72,6 +80,7 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
metrics_server.start
expect(prometheus_config.multiprocess_files_dir).to eq metrics_dir
expect(::Prometheus::Client.configuration.pid_provider.call).to eq 'fake_exporter'
end
it 'ensures that metrics directory exists in correct mode (0700)' do
......@@ -105,5 +114,11 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
metrics_server.start
end
it 'starts a RubySampler instance' do
expect(ruby_sampler_double).to receive(:start)
subject.start
end
end
end
......@@ -2,26 +2,98 @@
RSpec.shared_examples 'metrics sampler' do |env_prefix|
context 'when sampling interval is passed explicitly' do
subject { described_class.new(42) }
subject(:sampler) { described_class.new(interval: 42, logger: double) }
specify { expect(subject.interval).to eq(42) }
specify { expect(sampler.interval).to eq(42) }
end
context 'when sampling interval is passed through the environment' do
subject { described_class.new }
subject(:sampler) { described_class.new(logger: double) }
before do
stub_env("#{env_prefix}_INTERVAL_SECONDS", '42')
end
specify { expect(subject.interval).to eq(42) }
specify { expect(sampler.interval).to eq(42) }
end
context 'when no sampling interval is passed anywhere' do
subject { described_class.new }
subject(:sampler) { described_class.new(logger: double) }
it 'uses the hardcoded default' do
expect(subject.interval).to eq(described_class::DEFAULT_SAMPLING_INTERVAL_SECONDS)
expect(sampler.interval).to eq(described_class::DEFAULT_SAMPLING_INTERVAL_SECONDS)
end
end
describe '#start' do
include WaitHelpers
subject(:sampler) { described_class.new(interval: 0.1) }
it 'calls the sample method on the sampler thread' do
sampling_threads = []
expect(sampler).to receive(:sample).at_least(:once) { sampling_threads << Thread.current }
sampler.start
wait_for('sampler has sampled', max_wait_time: 3) { sampling_threads.any? }
expect(sampling_threads.first.name).to eq(sampler.thread_name)
sampler.stop
end
context 'with warmup set to true' do
subject(:sampler) { described_class.new(interval: 0.1, warmup: true) }
it 'calls the sample method first on the caller thread' do
sampling_threads = []
current_thread = Thread.current
# Instead of sampling, we're keeping track of which thread the sampling happened on.
# We want the first sample to be on the spec thread, which would mean a blocking sample
# before the actual sampler thread starts.
expect(sampler).to receive(:sample).at_least(:once) { sampling_threads << Thread.current }
sampler.start
wait_for('sampler has sampled', max_wait_time: 3) { sampling_threads.size == 2 }
expect(sampling_threads.first).to be(current_thread)
expect(sampling_threads.last.name).to eq(sampler.thread_name)
sampler.stop
end
end
end
describe '#safe_sample' do
let(:logger) { Logger.new(File::NULL) }
subject(:sampler) { described_class.new(logger: logger) }
it 'calls #sample once' do
expect(sampler).to receive(:sample)
sampler.safe_sample
end
context 'when sampling fails with error' do
before do
expect(sampler).to receive(:sample).and_raise "something failed"
end
it 'recovers from errors' do
expect { sampler.safe_sample }.not_to raise_error
end
context 'with logger' do
let(:logger) { double('logger') }
it 'logs errors' do
expect(logger).to receive(:warn).with(an_instance_of(String))
expect { sampler.safe_sample }.not_to raise_error
end
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment