Commit 62fb215e authored by Qingyu Zhao

Add Sidekiq memory killer prometheus metrics

parent 2cd6006a
......@@ -41,6 +41,9 @@ Sidekiq.configure_server do |config|
# after all workers have forked, but I don't know how at this point.
::Prometheus::Client.reinitialize_on_pid_change(force: true)
# temporary solution before fix https://gitlab.com/gitlab-org/gitlab/issues/33125
::Prometheus::Client.reinitialize_on_pid_change(force: true)
Gitlab::Metrics::Exporter::SidekiqExporter.instance.start
end
end
......
......@@ -20,15 +20,33 @@ module Gitlab
# Developers/admins should always set `memory_killer_max_memory_growth_kb` explicitly.
# If it is not set, default to 300M as an extra safety measure.
DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000
# Phases of memory killer
# Each phase maps to the integer that `update_metrics` writes to the
# `sidekiq_memory_killer_phase` gauge. The numbers are spelled out explicitly
# because they are the values exported through that gauge.
PHASE = {
running: 1, # initial phase used by `rss_within_range?` while polling RSS
above_soft_limit: 2, # set once a poll did not find RSS below the soft limit
stop_fetching_new_jobs: 3, # reported just before sending SIGTSTP
shutting_down: 4, # reported just before sending SIGTERM
killing_sidekiq: 5 # reported just before sending SIGKILL to the process group
}.freeze
# Sets up the memory killer: runs the superclass initializer, marks the
# instance as enabled and creates the gauges it reports to.
def initialize
super
@enabled = true # NOTE(review): presumably the flag behind `enabled?` — confirm against its definition
@metrics = init_metrics # hash of gauges keyed by metric name; written to by `update_metrics`
end
private
# Registers the gauges reported by the memory killer.
#
# Returns a Hash keyed by metric name whose values are whatever
# `::Gitlab::Metrics.gauge` returns for that name and description.
def init_metrics
  gauge_descriptions = {
    sidekiq_current_rss: 'Current RSS of Sidekiq Worker',
    sidekiq_memory_killer_soft_limit_rss: 'Current soft_limit_rss of Sidekiq Worker',
    sidekiq_memory_killer_hard_limit_rss: 'Current hard_limit_rss of Sidekiq Worker',
    sidekiq_memory_killer_phase: 'Current phase of Sidekiq Worker'
  }

  # Gauges are registered in the order listed above.
  gauge_descriptions.each_with_object({}) do |(metric_name, description), gauges|
    gauges[metric_name] = ::Gitlab::Metrics.gauge(metric_name, description)
  end
end
def run_thread
Sidekiq.logger.info(
class: self.class.to_s,
......@@ -77,27 +95,37 @@ module Gitlab
# Tell Sidekiq to stop fetching new jobs
# We first SIGNAL and then wait given time
# We also monitor the number of running jobs and allow an early restart
update_metrics(PHASE[:stop_fetching_new_jobs], get_rss, get_soft_limit_rss, get_hard_limit_rss)
signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
return unless enabled?
# Tell sidekiq to restart itself
# Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
update_metrics(PHASE[:shutting_down], get_rss, get_soft_limit_rss, get_hard_limit_rss)
signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
return unless enabled?
# Ideally we should never reach this condition
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
# Kill the whole pgroup, so we can be sure no children are left behind
update_metrics(PHASE[:killing_sidekiq], get_rss, get_soft_limit_rss, get_hard_limit_rss)
signal_pgroup('SIGKILL', 'die')
end
def rss_within_range?
phase = PHASE[:running]
current_rss = nil
soft_limit_rss = nil
hard_limit_rss = nil
deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
loop do
return true unless enabled?
current_rss = get_rss
soft_limit_rss = get_soft_limit_rss
hard_limit_rss = get_hard_limit_rss
update_metrics(phase, current_rss, soft_limit_rss, hard_limit_rss)
# RSS going above the hard limit should trigger a forcible shutdown right away
break if current_rss > hard_limit_rss
......@@ -105,6 +133,8 @@ module Gitlab
# RSS go below the soft limit
return true if current_rss < soft_limit_rss
phase = PHASE[:above_soft_limit]
# RSS did not go below the soft limit within deadline, restart
break if Gitlab::Metrics::System.monotonic_time > deadline
......@@ -116,6 +146,13 @@ module Gitlab
false
end
# Publishes the current memory-killer state to the gauges held in `@metrics`.
#
# phase          - value for the `sidekiq_memory_killer_phase` gauge
# current_rss    - value for the `sidekiq_current_rss` gauge
# soft_limit_rss - value for the `sidekiq_memory_killer_soft_limit_rss` gauge
# hard_limit_rss - value for the `sidekiq_memory_killer_hard_limit_rss` gauge
#
# Every gauge is set with an empty label set. Returns whatever the last
# `set` call (hard limit) returns.
def update_metrics(phase, current_rss, soft_limit_rss, hard_limit_rss)
  phase_gauge, rss_gauge, soft_limit_gauge, hard_limit_gauge = @metrics.values_at(
    :sidekiq_memory_killer_phase,
    :sidekiq_current_rss,
    :sidekiq_memory_killer_soft_limit_rss,
    :sidekiq_memory_killer_hard_limit_rss
  )

  phase_gauge.set({}, phase)
  rss_gauge.set({}, current_rss)
  soft_limit_gauge.set({}, soft_limit_rss)
  hard_limit_gauge.set({}, hard_limit_rss)
end
def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
Sidekiq.logger.warn(
class: self.class.to_s,
......@@ -143,11 +180,11 @@ module Gitlab
output.to_i
end
def soft_limit_rss
def get_soft_limit_rss
SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
end
def hard_limit_rss
def get_hard_limit_rss
HARD_LIMIT_RSS_KB
end
......
......@@ -5,11 +5,23 @@ require 'spec_helper'
describe Gitlab::SidekiqDaemon::MemoryKiller do
let(:memory_killer) { described_class.new }
let(:pid) { 12345 }
let(:current_rss_metric) { double('current rss metric') }
let(:soft_limit_rss_metric) { double('soft limit rss metric') }
let(:hard_limit_rss_metric) { double('hard limit rss metric') }
let(:current_phase_metric) { double('current phase metric') }
before do
  allow(Sidekiq.logger).to receive(:info)
  allow(Sidekiq.logger).to receive(:warn)

  # Stub the gauge factory BEFORE the first reference to `memory_killer`.
  # `memory_killer` is a lazy `let` built with `described_class.new`, and the
  # constructor calls `init_metrics`, which asks `Gitlab::Metrics.gauge` for
  # each gauge. Touching `memory_killer` earlier would build it with the
  # unstubbed gauges, so the doubles below would never be used.
  allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_current_rss, anything).and_return(current_rss_metric)
  allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_memory_killer_soft_limit_rss, anything).and_return(soft_limit_rss_metric)
  allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_memory_killer_hard_limit_rss, anything).and_return(hard_limit_rss_metric)
  allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_memory_killer_phase, anything).and_return(current_phase_metric)

  # First use of `memory_killer`: it is instantiated here, with the doubles injected.
  allow(memory_killer).to receive(:pid).and_return(pid)

  # Let the gauge doubles accept `set` so examples that do not assert on
  # metrics are not broken by unexpected messages.
  allow(current_rss_metric).to receive(:set)
  allow(soft_limit_rss_metric).to receive(:set)
  allow(hard_limit_rss_metric).to receive(:set)
  allow(current_phase_metric).to receive(:set)
end
describe '#run_thread' do
......@@ -121,8 +133,10 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when everything is within limit' do
expect(memory_killer).to receive(:get_rss).and_return(100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:running], 100, 200, 300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
expect(memory_killer).not_to receive(:log_rss_out_of_range)
......@@ -132,9 +146,10 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return false when rss exceeds hard_limit_rss' do
expect(memory_killer).to receive(:get_rss).and_return(400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:running], 400, 200, 300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200)
......@@ -144,9 +159,11 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return false when rss exceed hard_limit_rss after a while' do
expect(memory_killer).to receive(:get_rss).and_return(250, 400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:running], 250, 200, 300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:above_soft_limit], 400, 200, 300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
......@@ -157,9 +174,11 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).and_return(250, 100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200, 200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300, 300)
expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200, 200)
expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300, 300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:running], 250, 200, 300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:above_soft_limit], 100, 200, 300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
......@@ -170,9 +189,11 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return false when rss exceed soft_limit_rss longer than GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).exactly(4).times.and_return(250)
expect(memory_killer).to receive(:soft_limit_rss).exactly(5).times.and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).exactly(5).times.and_return(300)
expect(memory_killer).to receive(:get_soft_limit_rss).exactly(4).times.and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).exactly(4).times.and_return(300)
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:running], 250, 200, 300)
expect(memory_killer).to receive(:update_metrics).exactly(3).times.with(described_class::PHASE[:above_soft_limit], 250, 200, 300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).exactly(5).times.and_call_original
expect(memory_killer).to receive(:sleep).exactly(3).times.with(check_interval_seconds).and_call_original
......@@ -190,11 +211,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
before do
  stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds)
  allow(Sidekiq).to receive(:options).and_return(timeout: 9)

  # Fixed memory readings so the `update_metrics` arguments are predictable.
  allow(memory_killer).to receive_messages(
    get_rss: 100,
    get_soft_limit_rss: 200,
    get_hard_limit_rss: 300
  )
end
it 'send signal' do
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:stop_fetching_new_jobs], 100, 200, 300).ordered
expect(memory_killer).to receive(:signal_and_wait).with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs').ordered
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:shutting_down], 100, 200, 300).ordered
expect(memory_killer).to receive(:signal_and_wait).with(11, 'SIGTERM', 'gracefully shut down').ordered
expect(memory_killer).to receive(:update_metrics).with(described_class::PHASE[:killing_sidekiq], 100, 200, 300).ordered
expect(memory_killer).to receive(:signal_pgroup).with('SIGKILL', 'die').ordered
subject
......@@ -401,4 +428,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
expect(subject).to eq(10)
end
end
describe '#update_metrics' do
  subject { memory_killer.send(:update_metrics, 2, 150, 200, 300) }

  it 'calls gitlab metrics gauge set methods' do
    # Each gauge double must receive its value with an empty label set.
    expected_values = [
      [current_phase_metric, 2],
      [current_rss_metric, 150],
      [soft_limit_rss_metric, 200],
      [hard_limit_rss_metric, 300]
    ]

    expected_values.each do |metric, value|
      expect(metric).to receive(:set).with({}, value)
    end

    subject
  end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment