Commit 147e93ac authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch 'add-sidekiq-memory-killer-metrics-prometheus' into 'master'

Add Sidekiq memory killer prometheus metrics

See merge request gitlab-org/gitlab!17994
parents 1b354ed3 bb1b81c8
...@@ -21,14 +21,46 @@ module Gitlab ...@@ -21,14 +21,46 @@ module Gitlab
# In case not set, default to 300M. This is for extra-safe. # In case not set, default to 300M. This is for extra-safe.
DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000 DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000
# Phases of memory killer
PHASE = {
running: 1,
above_soft_limit: 2,
stop_fetching_new_jobs: 3,
shutting_down: 4,
killing_sidekiq: 5
}.freeze
def initialize def initialize
super super
@enabled = true @enabled = true
@metrics = init_metrics
end end
private private
def init_metrics
{
sidekiq_current_rss: ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'),
sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'),
sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'),
sidekiq_memory_killer_phase: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker')
}
end
def refresh_state(phase)
@phase = PHASE.fetch(phase)
@current_rss = get_rss
@soft_limit_rss = get_soft_limit_rss
@hard_limit_rss = get_hard_limit_rss
# track the current state as prometheus gauges
@metrics[:sidekiq_memory_killer_phase].set({}, @phase)
@metrics[:sidekiq_current_rss].set({}, @current_rss)
@metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss)
@metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss)
end
def run_thread def run_thread
Sidekiq.logger.info( Sidekiq.logger.info(
class: self.class.to_s, class: self.class.to_s,
...@@ -77,41 +109,51 @@ module Gitlab ...@@ -77,41 +109,51 @@ module Gitlab
# Tell Sidekiq to stop fetching new jobs # Tell Sidekiq to stop fetching new jobs
# We first SIGNAL and then wait given time # We first SIGNAL and then wait given time
# We also monitor a number of running jobs and allow to restart early # We also monitor a number of running jobs and allow to restart early
refresh_state(:stop_fetching_new_jobs)
signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs') signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
return unless enabled? return unless enabled?
# Tell sidekiq to restart itself # Tell sidekiq to restart itself
# Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL # Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
refresh_state(:shutting_down)
signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down') signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
return unless enabled? return unless enabled?
# Ideally we should never reach this condition # Ideally we should never reach this condition
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
# Kill the whole pgroup, so we can be sure no children are left behind # Kill the whole pgroup, so we can be sure no children are left behind
refresh_state(:killing_sidekiq)
signal_pgroup('SIGKILL', 'die') signal_pgroup('SIGKILL', 'die')
end end
def rss_within_range? def rss_within_range?
current_rss = nil refresh_state(:running)
deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
loop do loop do
return true unless enabled? return true unless enabled?
current_rss = get_rss
# RSS go above hard limit should trigger forcible shutdown right away # RSS go above hard limit should trigger forcible shutdown right away
break if current_rss > hard_limit_rss break if @current_rss > @hard_limit_rss
# RSS go below the soft limit # RSS go below the soft limit
return true if current_rss < soft_limit_rss return true if @current_rss < @soft_limit_rss
# RSS did not go below the soft limit within deadline, restart # RSS did not go below the soft limit within deadline, restart
break if Gitlab::Metrics::System.monotonic_time > deadline break if Gitlab::Metrics::System.monotonic_time > deadline
sleep(CHECK_INTERVAL_SECONDS) sleep(CHECK_INTERVAL_SECONDS)
refresh_state(:above_soft_limit)
end end
log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss) # There are two chances to break from loop:
# - above hard limit, or
# - above soft limit after deadline
# When `above hard limit`, it immediately go to `stop_fetching_new_jobs`
# So ignore `above hard limit` and always set `above_soft_limit` here
refresh_state(:above_soft_limit)
log_rss_out_of_range(@current_rss, @hard_limit_rss, @soft_limit_rss)
false false
end end
...@@ -143,11 +185,11 @@ module Gitlab ...@@ -143,11 +185,11 @@ module Gitlab
output.to_i output.to_i
end end
def soft_limit_rss def get_soft_limit_rss
SOFT_LIMIT_RSS_KB + rss_increase_by_jobs SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
end end
def hard_limit_rss def get_hard_limit_rss
HARD_LIMIT_RSS_KB HARD_LIMIT_RSS_KB
end end
......
...@@ -7,9 +7,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -7,9 +7,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
let(:pid) { 12345 } let(:pid) { 12345 }
before do before do
allow(memory_killer).to receive(:pid).and_return(pid)
allow(Sidekiq.logger).to receive(:info) allow(Sidekiq.logger).to receive(:info)
allow(Sidekiq.logger).to receive(:warn) allow(Sidekiq.logger).to receive(:warn)
allow(memory_killer).to receive(:pid).and_return(pid)
# make sleep no-op
allow(memory_killer).to receive(:sleep) {}
end end
describe '#run_thread' do describe '#run_thread' do
...@@ -39,8 +42,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -39,8 +42,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
pid: pid, pid: pid,
message: "Exception from run_thread: My Exception") message: "Exception from run_thread: My Exception")
expect(memory_killer).to receive(:rss_within_range?).twice.and_raise(StandardError, 'My Exception') expect(memory_killer).to receive(:rss_within_range?)
expect(memory_killer).to receive(:sleep).twice.with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS) .twice
.and_raise(StandardError, 'My Exception')
expect { subject }.not_to raise_exception expect { subject }.not_to raise_exception
end end
...@@ -52,7 +56,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -52,7 +56,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
pid: pid, pid: pid,
message: "Exception from run_thread: My Exception") message: "Exception from run_thread: My Exception")
expect(memory_killer).to receive(:rss_within_range?).once.and_raise(Exception, 'My Exception') expect(memory_killer).to receive(:rss_within_range?)
.once
.and_raise(Exception, 'My Exception')
expect(memory_killer).to receive(:sleep).with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS) expect(memory_killer).to receive(:sleep).with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS)
expect(Sidekiq.logger).to receive(:warn).once expect(Sidekiq.logger).to receive(:warn).once
...@@ -78,7 +84,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -78,7 +84,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end end
it 'not invoke restart_sidekiq when rss in range' do it 'not invoke restart_sidekiq when rss in range' do
expect(memory_killer).to receive(:rss_within_range?).twice.and_return(true) expect(memory_killer).to receive(:rss_within_range?)
.twice
.and_return(true)
expect(memory_killer).not_to receive(:restart_sidekiq) expect(memory_killer).not_to receive(:restart_sidekiq)
...@@ -86,9 +94,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -86,9 +94,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end end
it 'invoke restart_sidekiq when rss not in range' do it 'invoke restart_sidekiq when rss not in range' do
expect(memory_killer).to receive(:rss_within_range?).at_least(:once).and_return(false) expect(memory_killer).to receive(:rss_within_range?)
.at_least(:once)
.and_return(false)
expect(memory_killer).to receive(:restart_sidekiq).at_least(:once) expect(memory_killer).to receive(:restart_sidekiq)
.at_least(:once)
subject subject
end end
...@@ -97,10 +108,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -97,10 +108,9 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
describe '#stop_working' do describe '#stop_working' do
subject { memory_killer.send(:stop_working)} subject { memory_killer.send(:stop_working)}
it 'changed enable? to false' do it 'changes enable? to false' do
expect(memory_killer.send(:enabled?)).to be true expect { subject }.to change { memory_killer.send(:enabled?) }
subject .from(true).to(false)
expect(memory_killer.send(:enabled?)).to be false
end end
end end
...@@ -121,8 +131,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -121,8 +131,12 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when everything is within limit' do it 'return true when everything is within limit' do
expect(memory_killer).to receive(:get_rss).and_return(100) expect(memory_killer).to receive(:get_rss).and_return(100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200) expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300) expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
expect(memory_killer).not_to receive(:log_rss_out_of_range) expect(memory_killer).not_to receive(:log_rss_out_of_range)
...@@ -131,9 +145,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -131,9 +145,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end end
it 'return false when rss exceeds hard_limit_rss' do it 'return false when rss exceeds hard_limit_rss' do
expect(memory_killer).to receive(:get_rss).and_return(400) expect(memory_killer).to receive(:get_rss).at_least(:once).and_return(400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200) expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300) expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original
...@@ -143,13 +165,21 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -143,13 +165,21 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end end
it 'return false when rss exceed hard_limit_rss after a while' do it 'return false when rss exceed hard_limit_rss after a while' do
expect(memory_killer).to receive(:get_rss).and_return(250, 400) expect(memory_killer).to receive(:get_rss).and_return(250, 400, 400)
expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200) expect(memory_killer).to receive(:get_soft_limit_rss).at_least(:once).and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300) expect(memory_killer).to receive(:get_hard_limit_rss).at_least(:once).and_return(300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.at_least(:once)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds) expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200) expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200)
expect(subject).to be false expect(subject).to be false
...@@ -157,8 +187,16 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -157,8 +187,16 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS' do it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).and_return(250, 100) expect(memory_killer).to receive(:get_rss).and_return(250, 100)
expect(memory_killer).to receive(:soft_limit_rss).and_return(200, 200) expect(memory_killer).to receive(:get_soft_limit_rss).and_return(200, 200)
expect(memory_killer).to receive(:hard_limit_rss).and_return(300, 300) expect(memory_killer).to receive(:get_hard_limit_rss).and_return(300, 300)
expect(memory_killer).to receive(:refresh_state)
.with(:running)
.and_call_original
expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original
expect(memory_killer).to receive(:sleep).with(check_interval_seconds) expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
...@@ -168,19 +206,29 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -168,19 +206,29 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
expect(subject).to be true expect(subject).to be true
end end
it 'return false when rss exceed soft_limit_rss longer than GRACE_BALLOON_SECONDS' do context 'when exceeding GRACE_BALLOON_SECONDS' do
expect(memory_killer).to receive(:get_rss).exactly(4).times.and_return(250) let(:grace_balloon_seconds) { 0 }
expect(memory_killer).to receive(:soft_limit_rss).exactly(5).times.and_return(200)
expect(memory_killer).to receive(:hard_limit_rss).exactly(5).times.and_return(300) it 'return false when rss exceed soft_limit_rss' do
allow(memory_killer).to receive(:get_rss).and_return(250)
allow(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
allow(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).exactly(5).times.and_call_original expect(memory_killer).to receive(:refresh_state)
expect(memory_killer).to receive(:sleep).exactly(3).times.with(check_interval_seconds).and_call_original .with(:running)
.and_call_original
expect(memory_killer).to receive(:log_rss_out_of_range).with(250, 300, 200) expect(memory_killer).to receive(:refresh_state)
.with(:above_soft_limit)
.and_call_original
expect(memory_killer).to receive(:log_rss_out_of_range)
.with(250, 300, 200)
expect(subject).to be false expect(subject).to be false
end end
end end
end
describe '#restart_sidekiq' do describe '#restart_sidekiq' do
let(:shutdown_timeout_seconds) { 7 } let(:shutdown_timeout_seconds) { 7 }
...@@ -190,19 +238,42 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -190,19 +238,42 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
before do before do
stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds) stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds)
allow(Sidekiq).to receive(:options).and_return(timeout: 9) allow(Sidekiq).to receive(:options).and_return(timeout: 9)
allow(memory_killer).to receive(:get_rss).and_return(100)
allow(memory_killer).to receive(:get_soft_limit_rss).and_return(200)
allow(memory_killer).to receive(:get_hard_limit_rss).and_return(300)
end end
it 'send signal' do it 'send signal' do
expect(memory_killer).to receive(:signal_and_wait).with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs').ordered expect(memory_killer).to receive(:refresh_state)
expect(memory_killer).to receive(:signal_and_wait).with(11, 'SIGTERM', 'gracefully shut down').ordered .with(:stop_fetching_new_jobs)
expect(memory_killer).to receive(:signal_pgroup).with('SIGKILL', 'die').ordered .ordered
.and_call_original
expect(memory_killer).to receive(:signal_and_wait)
.with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs')
.ordered
expect(memory_killer).to receive(:refresh_state)
.with(:shutting_down)
.ordered
.and_call_original
expect(memory_killer).to receive(:signal_and_wait)
.with(11, 'SIGTERM', 'gracefully shut down')
.ordered
expect(memory_killer).to receive(:refresh_state)
.with(:killing_sidekiq)
.ordered
.and_call_original
expect(memory_killer).to receive(:signal_pgroup)
.with('SIGKILL', 'die')
.ordered
subject subject
end end
end end
describe '#signal_and_wait' do describe '#signal_and_wait' do
let(:time) { 7 } let(:time) { 0 }
let(:signal) { 'my-signal' } let(:signal) { 'my-signal' }
let(:explanation) { 'my-explanation' } let(:explanation) { 'my-explanation' }
let(:check_interval_seconds) { 2 } let(:check_interval_seconds) { 2 }
...@@ -226,14 +297,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -226,14 +297,17 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
end end
it 'send signal and wait till deadline if any job not finished' do it 'send signal and wait till deadline if any job not finished' do
expect(Process).to receive(:kill).with(signal, pid).ordered expect(Process).to receive(:kill)
expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original.at_least(:once) .with(signal, pid)
.ordered
expect(Gitlab::Metrics::System).to receive(:monotonic_time)
.and_call_original
.at_least(:once)
expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:once) expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:once)
expect(memory_killer).to receive(:any_jobs?).and_return(true).at_least(:once) expect(memory_killer).to receive(:any_jobs?).and_return(true).at_least(:once)
expect(memory_killer).to receive(:sleep).and_call_original.exactly(4).times
subject subject
end end
end end
...@@ -401,4 +475,27 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do ...@@ -401,4 +475,27 @@ describe Gitlab::SidekiqDaemon::MemoryKiller do
expect(subject).to eq(10) expect(subject).to eq(10)
end end
end end
describe '#refresh_state' do
let(:metrics) { memory_killer.instance_variable_get(:@metrics) }
subject { memory_killer.send(:refresh_state, :shutting_down) }
it 'calls gitlab metrics gauge set methods' do
expect(memory_killer).to receive(:get_rss) { 1010 }
expect(memory_killer).to receive(:get_soft_limit_rss) { 1020 }
expect(memory_killer).to receive(:get_hard_limit_rss) { 1040 }
expect(metrics[:sidekiq_memory_killer_phase]).to receive(:set)
.with({}, described_class::PHASE[:shutting_down])
expect(metrics[:sidekiq_current_rss]).to receive(:set)
.with({}, 1010)
expect(metrics[:sidekiq_memory_killer_soft_limit_rss]).to receive(:set)
.with({}, 1020)
expect(metrics[:sidekiq_memory_killer_hard_limit_rss]).to receive(:set)
.with({}, 1040)
subject
end
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment