Commit 4d048f64 authored by Mark Chao's avatar Mark Chao

Merge branch '356505-fix-missing-sidekiq-exporter-metrics' into 'master'

Fix missing metrics for Sidekiq exporter server

See merge request gitlab-org/gitlab!83407
parents d4654fc3 0e77c181
......@@ -4,7 +4,7 @@ PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRI
require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER
# Keep separate directories for separate processes
def prometheus_default_multiproc_dir
def metrics_temp_dir
return unless Rails.env.development? || Rails.env.test?
if Gitlab::Runtime.sidekiq?
......@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir
end
end
def puma_metrics_server_process?
Prometheus::PidProvider.worker_id == 'puma_master'
def prometheus_metrics_dir
ENV['prometheus_multiproc_dir'] || metrics_temp_dir
end
def sidekiq_metrics_server_process?
Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0')
def puma_metrics_server_process?
Prometheus::PidProvider.worker_id == 'puma_master'
end
if puma_metrics_server_process? || sidekiq_metrics_server_process?
if puma_metrics_server_process?
# The following is necessary to ensure stale Prometheus metrics don't accumulate over time.
# It needs to be done as early as here to ensure metrics files aren't deleted.
# After we hit our app in `warmup`, first metrics and corresponding files already being created,
# for example in `lib/gitlab/metrics/requests_rack_middleware.rb`.
Prometheus::CleanupMultiprocDirService.new.execute
# It needs to be done as early as possible to ensure new metrics aren't being deleted.
#
# Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the
# sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead,
# since it must happen prior to any worker processes or the metrics server starting up.
Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute
::Prometheus::Client.reinitialize_on_pid_change(force: true)
end
......@@ -37,7 +39,7 @@ end
::Prometheus::Client.configure do |config|
config.logger = Gitlab::AppLogger
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir
config.multiprocess_files_dir = prometheus_metrics_dir
config.pid_provider = ::Prometheus::PidProvider.method(:worker_id)
end
......
......@@ -2,22 +2,17 @@
module Prometheus
class CleanupMultiprocDirService
include Gitlab::Utils::StrongMemoize
def execute
FileUtils.rm_rf(old_metrics) if old_metrics
def initialize(metrics_dir)
@metrics_dir = metrics_dir
end
private
def execute
return if @metrics_dir.blank?
def old_metrics
strong_memoize(:old_metrics) do
Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir
end
end
files_to_delete = Dir[File.join(@metrics_dir, '*.db')]
return if files_to_delete.blank?
def multiprocess_files_dir
::Prometheus::Client.configuration.multiprocess_files_dir
FileUtils.rm_rf(files_to_delete)
end
end
end
......@@ -84,7 +84,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end
FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir
# We need to `warmup: true` since otherwise the sampler and exporter threads enter
# a race where not all Prometheus db files will be visible to the exporter, resulting
......
......@@ -118,6 +118,11 @@ module Gitlab
return if @dryrun
# Make sure we reset the metrics directory prior to:
# - starting a metrics server process
# - starting new workers
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute
ProcessManagement.write_pid(@pid) if @pid
supervisor = SidekiqProcessSupervisor.instance(
......@@ -137,7 +142,7 @@ module Gitlab
# and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid)
@logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
metrics_server_pid = restart_metrics_server
all_pids = worker_pids + Array(metrics_server_pid)
else
# If a worker process died we'll just terminate the whole cluster.
......@@ -154,15 +159,14 @@ module Gitlab
def start_metrics_server
return unless metrics_server_enabled?
restart_metrics_server(wipe_metrics_dir: true)
restart_metrics_server
end
def restart_metrics_server(wipe_metrics_dir: false)
def restart_metrics_server
@logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
MetricsServer.fork(
'sidekiq',
metrics_dir: @metrics_dir,
wipe_metrics_dir: wipe_metrics_dir,
reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS
)
end
......
......@@ -41,6 +41,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
end
let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) }
before do
stub_env('RAILS_ENV', 'test')
......@@ -54,6 +55,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
allow(supervisor).to receive(:supervise)
allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service)
end
after do
......@@ -300,6 +303,12 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
end
it 'wipes the metrics directory' do
expect(metrics_cleanup_service).to receive(:execute)
cli.run(%w(foo))
end
context 'when there are no sidekiq_health_checks settings set' do
let(:sidekiq_exporter_enabled) { true }
......@@ -379,7 +388,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
with_them do
specify do
if start_metrics_server
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals)
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals)
else
expect(MetricsServer).not_to receive(:fork)
end
......
# frozen_string_literal: true
require 'spec_helper'
require 'fast_spec_helper'
RSpec.describe Prometheus::CleanupMultiprocDirService do
describe '.call' do
subject { described_class.new.execute }
describe '#execute' do
let(:metrics_multiproc_dir) { Dir.mktmpdir }
let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') }
subject(:service) { described_class.new(metrics_dir_arg).execute }
before do
FileUtils.touch(metrics_file_path)
end
after do
FileUtils.rm_r(metrics_multiproc_dir)
FileUtils.rm_rf(metrics_multiproc_dir)
end
context 'when `multiprocess_files_dir` is defined' do
before do
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(metrics_multiproc_dir)
.at_least(:once)
end
let(:metrics_dir_arg) { metrics_multiproc_dir }
it 'removes old metrics' do
expect { subject }
expect { service }
.to change { File.exist?(metrics_file_path) }
.from(true)
.to(false)
......@@ -34,15 +29,10 @@ RSpec.describe Prometheus::CleanupMultiprocDirService do
end
context 'when `multiprocess_files_dir` is not defined' do
before do
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(nil)
.at_least(:once)
end
let(:metrics_dir_arg) { nil }
it 'does not remove any files' do
expect { subject }
expect { service }
.not_to change { File.exist?(metrics_file_path) }
.from(true)
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment