Commit 0e77c181 authored by Matthias Käppler's avatar Matthias Käppler Committed by Mark Chao

Fix missing metrics for Sidekiq exporter server

We were accidentally deleting metrics the exporter
server exported about itself. Actual sidekiq worker
metrics were not affected.

This happened because we would wipe the metrics dir
whenever sidekiq_0 starts up, but this happens after
the metrics server starts, so the worker was deleting
those existing metrics.

Changelog: fixed
parent 18cbe3da
......@@ -4,7 +4,7 @@ PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRI
require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER
# Keep separate directories for separate processes
def prometheus_default_multiproc_dir
def metrics_temp_dir
return unless Rails.env.development? || Rails.env.test?
if Gitlab::Runtime.sidekiq?
......@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir
end
end
def puma_metrics_server_process?
Prometheus::PidProvider.worker_id == 'puma_master'
def prometheus_metrics_dir
ENV['prometheus_multiproc_dir'] || metrics_temp_dir
end
def sidekiq_metrics_server_process?
Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0')
def puma_metrics_server_process?
Prometheus::PidProvider.worker_id == 'puma_master'
end
if puma_metrics_server_process? || sidekiq_metrics_server_process?
if puma_metrics_server_process?
# The following is necessary to ensure stale Prometheus metrics don't accumulate over time.
# It needs to be done as early as here to ensure metrics files aren't deleted.
# After we hit our app in `warmup`, first metrics and corresponding files already being created,
# for example in `lib/gitlab/metrics/requests_rack_middleware.rb`.
Prometheus::CleanupMultiprocDirService.new.execute
# It needs to be done as early as possible to ensure new metrics aren't being deleted.
#
# Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the
# sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead,
# since it must happen prior to any worker processes or the metrics server starting up.
Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute
::Prometheus::Client.reinitialize_on_pid_change(force: true)
end
......@@ -37,7 +39,7 @@ end
::Prometheus::Client.configure do |config|
config.logger = Gitlab::AppLogger
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir
config.multiprocess_files_dir = prometheus_metrics_dir
config.pid_provider = ::Prometheus::PidProvider.method(:worker_id)
end
......
......@@ -2,22 +2,17 @@
module Prometheus
class CleanupMultiprocDirService
include Gitlab::Utils::StrongMemoize
def execute
FileUtils.rm_rf(old_metrics) if old_metrics
def initialize(metrics_dir)
@metrics_dir = metrics_dir
end
private
def execute
return if @metrics_dir.blank?
def old_metrics
strong_memoize(:old_metrics) do
Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir
end
end
files_to_delete = Dir[File.join(@metrics_dir, '*.db')]
return if files_to_delete.blank?
def multiprocess_files_dir
::Prometheus::Client.configuration.multiprocess_files_dir
FileUtils.rm_rf(files_to_delete)
end
end
end
......@@ -84,7 +84,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass
end
FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir
# We need to `warmup: true` since otherwise the sampler and exporter threads enter
# a race where not all Prometheus db files will be visible to the exporter, resulting
......
......@@ -118,6 +118,11 @@ module Gitlab
return if @dryrun
# Make sure we reset the metrics directory prior to:
# - starting a metrics server process
# - starting new workers
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute
ProcessManagement.write_pid(@pid) if @pid
supervisor = SidekiqProcessSupervisor.instance(
......@@ -137,7 +142,7 @@ module Gitlab
# and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid)
@logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
metrics_server_pid = restart_metrics_server
all_pids = worker_pids + Array(metrics_server_pid)
else
# If a worker process died we'll just terminate the whole cluster.
......@@ -154,15 +159,14 @@ module Gitlab
def start_metrics_server
return unless metrics_server_enabled?
restart_metrics_server(wipe_metrics_dir: true)
restart_metrics_server
end
def restart_metrics_server(wipe_metrics_dir: false)
def restart_metrics_server
@logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
MetricsServer.fork(
'sidekiq',
metrics_dir: @metrics_dir,
wipe_metrics_dir: wipe_metrics_dir,
reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS
)
end
......
......@@ -41,6 +41,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
end
let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) }
before do
stub_env('RAILS_ENV', 'test')
......@@ -54,6 +55,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
allow(supervisor).to receive(:supervise)
allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service)
end
after do
......@@ -300,6 +303,12 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
end
it 'wipes the metrics directory' do
expect(metrics_cleanup_service).to receive(:execute)
cli.run(%w(foo))
end
context 'when there are no sidekiq_health_checks settings set' do
let(:sidekiq_exporter_enabled) { true }
......@@ -379,7 +388,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
with_them do
specify do
if start_metrics_server
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals)
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals)
else
expect(MetricsServer).not_to receive(:fork)
end
......
# frozen_string_literal: true
require 'spec_helper'
require 'fast_spec_helper'
RSpec.describe Prometheus::CleanupMultiprocDirService do
describe '.call' do
subject { described_class.new.execute }
describe '#execute' do
let(:metrics_multiproc_dir) { Dir.mktmpdir }
let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') }
subject(:service) { described_class.new(metrics_dir_arg).execute }
before do
FileUtils.touch(metrics_file_path)
end
after do
FileUtils.rm_r(metrics_multiproc_dir)
FileUtils.rm_rf(metrics_multiproc_dir)
end
context 'when `multiprocess_files_dir` is defined' do
before do
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(metrics_multiproc_dir)
.at_least(:once)
end
let(:metrics_dir_arg) { metrics_multiproc_dir }
it 'removes old metrics' do
expect { subject }
expect { service }
.to change { File.exist?(metrics_file_path) }
.from(true)
.to(false)
......@@ -34,15 +29,10 @@ RSpec.describe Prometheus::CleanupMultiprocDirService do
end
context 'when `multiprocess_files_dir` is not defined' do
before do
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(nil)
.at_least(:once)
end
let(:metrics_dir_arg) { nil }
it 'does not remove any files' do
expect { subject }
expect { service }
.not_to change { File.exist?(metrics_file_path) }
.from(true)
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment