Commit 147e93ac authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch 'add-sidekiq-memory-killer-metrics-prometheus' into 'master'

Add Sidekiq memory killer prometheus metrics

See merge request gitlab-org/gitlab!17994
parents 1b354ed3 bb1b81c8
......@@ -21,14 +21,46 @@ module Gitlab
# In case not set, default to 300M. This is for extra-safe.
DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000
# Phases of memory killer
PHASE = {
running: 1,
above_soft_limit: 2,
stop_fetching_new_jobs: 3,
shutting_down: 4,
killing_sidekiq: 5
}.freeze
def initialize
super
@enabled = true
@metrics = init_metrics
end
private
def init_metrics
{
sidekiq_current_rss: ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'),
sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'),
sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'),
sidekiq_memory_killer_phase: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker')
}
end
def refresh_state(phase)
@phase = PHASE.fetch(phase)
@current_rss = get_rss
@soft_limit_rss = get_soft_limit_rss
@hard_limit_rss = get_hard_limit_rss
# track the current state as prometheus gauges
@metrics[:sidekiq_memory_killer_phase].set({}, @phase)
@metrics[:sidekiq_current_rss].set({}, @current_rss)
@metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss)
@metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss)
end
def run_thread
Sidekiq.logger.info(
class: self.class.to_s,
......@@ -77,41 +109,51 @@ module Gitlab
# Tell Sidekiq to stop fetching new jobs
# We first SIGNAL and then wait given time
# We also monitor a number of running jobs and allow to restart early
refresh_state(:stop_fetching_new_jobs)
signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
return unless enabled?
# Tell sidekiq to restart itself
# Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
refresh_state(:shutting_down)
signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
return unless enabled?
# Ideally we should never reach this condition
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
# Kill the whole pgroup, so we can be sure no children are left behind
refresh_state(:killing_sidekiq)
signal_pgroup('SIGKILL', 'die')
end
def rss_within_range?
current_rss = nil
refresh_state(:running)
deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
loop do
return true unless enabled?
current_rss = get_rss
# RSS go above hard limit should trigger forcible shutdown right away
break if current_rss > hard_limit_rss
break if @current_rss > @hard_limit_rss
# RSS go below the soft limit
return true if current_rss < soft_limit_rss
return true if @current_rss < @soft_limit_rss
# RSS did not go below the soft limit within deadline, restart
break if Gitlab::Metrics::System.monotonic_time > deadline
sleep(CHECK_INTERVAL_SECONDS)
refresh_state(:above_soft_limit)
end
log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
# There are two chances to break from loop:
# - above hard limit, or
# - above soft limit after deadline
# When `above hard limit`, it immediately go to `stop_fetching_new_jobs`
# So ignore `above hard limit` and always set `above_soft_limit` here
refresh_state(:above_soft_limit)
log_rss_out_of_range(@current_rss, @hard_limit_rss, @soft_limit_rss)
false
end
......@@ -143,11 +185,11 @@ module Gitlab
output.to_i
end
def soft_limit_rss
def get_soft_limit_rss
SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
end
def hard_limit_rss
def get_hard_limit_rss
HARD_LIMIT_RSS_KB
end
......
......@@ -7,9 +7,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
let(:pid) { 12345 }
before do
allow(memory_killer).to receive(:pid).and_return(pid)
allow(Sidekiq.logger).to receive(:info)
allow(Sidekiq.logger).to receive(:warn)
allow(memory_killer).to receive(:pid).and_return(pid)
# make sleep no-op
allow(memory_killer).to receive(:sleep) {}
end
describe '#run_thread' do
......@@ -39,8 +42,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
pid: pid,
message: "Exception from run_thread: My Exception")
expect(memory_killer).to receive(:rss_within_range?).twice.and_raise(StandardError, 'My Exception')
expect(memory_killer).to receive(:sleep).twice.with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS)
expect(memory_killer).to receive(:rss_within_range?)
.twice
.and_raise(StandardError, 'My Exception')
expect { subject }.not_to raise_exception
end
......@@ -52,7 +56,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
pid: pid,
message: "Exception from run_thread: My Exception")
expect(memory_killer).to receive(:rss_within_range?).once.and_raise(Exception, 'My Exception')
expect(memory_killer).to receive(:rss_within_range?)
.once
.and_raise(Exception, 'My Exception')
expect(memory_killer).to receive(:sleep).with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS)
expect(Sidekiq.logger).to receive(:warn).once
......@@ -78,7 +84,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end
it 'not invoke restart_sidekiq when rss in range' do
expect(memory_killer).to receive(:rss_within_range?).twice.and_return(true)
expect(memory_killer).to receive(:rss_within_range?)
.twice
.and_return(true)
expect(memory_killer).not_to receive(:restart_sidekiq)
......@@ -86,9 +94,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end
it 'invoke restart_sidekiq when rss not in range' do
expect(memory_killer).to receive(:rss_within_range?).at_least(:once).and_return(false)
expect(memory_killer).to receive(:rss_within_range?)
.at_least(:once)
.and_return(false)
expect(memory_killer).to receive(:restart_sidekiq).at_least(:once)
expect(memory_killer).to receive(:restart_sidekiq)
.at_least(:once)
subject
end
......@@ -97,10 +108,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
describe '#stop_working' do
subject { memory_killer.send(:stop_working)}
it 'changed enable? to false' do
expect(memory_killer.send(:enabled?)).to be true
subject
expect(memory_killer.send(:enabled?)).to be false
it 'changes enable? to false' do
expect { subject }.to change { memory_killer.send(:enabled?) }
.from(true).to(false)
end
end
......@@ -121,8 +131,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when everything is within limit' do
expect(memory_killer).to receive(:get_rss).and_return(100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
expect(memory_killer).not_to receive(:log_rss_out_of_range)
......@@ -131,9 +145,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end
it 'return false when rss exceeds hard_limit_rss' do
expect(memory_killer).to receive(:get_rss).and_return(400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:get_rss).at_least(:once).and_return(400)
expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
......@@ -143,13 +165,21 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end
it 'return false when rss exceed hard_limit_rss after a while' do
expect(memory_killer).to receive(:get_rss).and_return(250, 400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:get_rss).and_return(250, 400, 400)
expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.at_least(:once)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200)
expect(subject).to be false
......@@ -157,8 +187,16 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).and_return(250, 100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200, 200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300, 300)
expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200, 200)
expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300, 300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
......@@ -168,17 +206,27 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
expect(subject).to be true
end
it 'return false when rss exceed soft_limit_rss longer than GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).exactly(4).times.and_return(250)
expect(memory_killer).to receive(:soft_limit_rss).exactly(5).times.and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).exactly(5).times.and_return(300)
context 'when exceeding GRACE_BALLOON_SECONDS' do
let(:grace_balloon_seconds) { 0 }
expect(Gitlab::Metrics::System).to receive(:monotonic_time).exactly(5).times.and_call_original
expect(memory_killer).to receive(:sleep).exactly(3).times.with(check_interval_seconds).and_call_original
it 'return false when rss exceed soft_limit_rss' do
allow(memory_killer).to receive(:get_rss).and_return(250)
allow(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
allow(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:log_rss_out_of_range).with(250, 300, 200)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(subject).to be false
expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(memory_killer).to receive(:log_rss_out_of_range)
.with(250, 300, 200)
expect(subject).to be false
end
end
end
......@@ -190,19 +238,42 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
before do
stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds)
allow(Sidekiq).to receive(:options).and_return(timeout: 9)
allow(memory_killer).to receive(:get_rss).and_return(100)
allow(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
allow(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
end
it 'send signal' do
expect(memory_killer).to receive(:signal_and_wait).with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs').ordered
expect(memory_killer).to receive(:signal_and_wait).with(11, 'SIGTERM', 'gracefully shut down').ordered
expect(memory_killer).to receive(:signal_pgroup).with('SIGKILL', 'die').ordered
expect(memory_killer).to receive(:refresh_state)
.with(:stop_fetching_new_jobs)
.ordered
.and_call_original
expect(memory_killer).to receive(:signal_and_wait)
.with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs')
.ordered
expect(memory_killer).to receive(:refresh_state)
.with(:shutting_down)
.ordered
.and_call_original
expect(memory_killer).to receive(:signal_and_wait)
.with(11, 'SIGTERM', 'gracefully shut down')
.ordered
expect(memory_killer).to receive(:refresh_state)
.with(:killing_sidekiq)
.ordered
.and_call_original
expect(memory_killer).to receive(:signal_pgroup)
.with('SIGKILL', 'die')
.ordered
subject
end
end
describe '#signal_and_wait' do
let(:time) { 7 }
let(:time) { 0 }
let(:signal) { 'my-signal' }
let(:explanation) { 'my-explanation' }
let(:check_interval_seconds) { 2 }
......@@ -226,14 +297,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end
it 'send signal and wait till deadline if any job not finished' do
expect(Process).to receive(:kill).with(signal, pid).ordered
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original.at_least(:once)
expect(Process).to receive(:kill)
.with(signal, pid)
.ordered
expect(Gitlab::Metrics::System).to receive(:monotonic_time)
.and_call_original
.at_least(:once)
expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:once)
expect(memory_killer).to receive(:any_jobs?).and_return(true).at_least(:once)
expect(memory_killer).to receive(:sleep).and_call_original.exactly(4).times
subject
end
end
......@@ -401,4 +475,27 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
expect(subject).to eq(10)
end
end
describe '#refresh_state' do
let(:metrics) { memory_killer.instance_variable_get(:@metrics) }
subject { memory_killer.send(:refresh_state, :shutting_down) }
it 'calls gitlab metrics gauge set methods' do
expect(memory_killer).to receive(:get_rss) { 1010 }
expect(memory_killer).to receive(:get_soft_limit_rss) { 1020 }
expect(memory_killer).to receive(:get_hard_limit_rss) { 1040 }
expect(metrics[:sidekiq_memory_killer_phase]).to receive(:set)
.with({}, described_class::PHASE[:shutting_down])
expect(metrics[:sidekiq_current_rss]).to receive(:set)
.with({}, 1010)
expect(metrics[:sidekiq_memory_killer_soft_limit_rss]).to receive(:set)
.with({}, 1020)
expect(metrics[:sidekiq_memory_killer_hard_limit_rss]).to receive(:set)
.with({}, 1040)
subject
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment