Commit bb1b81c8 authored by Kamil Trzciński, committed by Qingyu Zhao

Introduce and use `refresh_state`

This updates the metrics on demand
and tracks the current state as gauges.

This also improves test execution time.
parent 62fb215e
...@@ -41,9 +41,6 @@ Sidekiq.configure_server do |config| ...@@ -41,9 +41,6 @@ Sidekiq.configure_server do |config|
# after all workers have forked, but I don't know how at this point. # after all workers have forked, but I don't know how at this point.
::Prometheus::Client.reinitialize_on_pid_change(force: true) ::Prometheus::Client.reinitialize_on_pid_change(force: true)
# temporary solution before fix https://gitlab.com/gitlab-org/gitlab/issues/33125
::Prometheus::Client.reinitialize_on_pid_change(force: true)
Gitlab::Metrics::Exporter::SidekiqExporter.instance.start Gitlab::Metrics::Exporter::SidekiqExporter.instance.start
end end
end end
......
...@@ -20,6 +20,7 @@ module Gitlab ...@@ -20,6 +20,7 @@ module Gitlab
# Developer/admin should always set `memory_killer_max_memory_growth_kb` explicitly # Developer/admin should always set `memory_killer_max_memory_growth_kb` explicitly
# In case not set, default to 300M. This is for extra-safe. # In case not set, default to 300M. This is for extra-safe.
DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000 DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000
# Phases of memory killer # Phases of memory killer
PHASE = { PHASE = {
running: 1, running: 1,
...@@ -47,6 +48,19 @@ module Gitlab ...@@ -47,6 +48,19 @@ module Gitlab
} }
end end
# Records the given phase together with fresh memory readings on the
# instance, then publishes all four values to the gauges held in @metrics.
#
# phase - a key of PHASE (e.g. :running); `PHASE.fetch` raises KeyError
#         for an unknown key.
#
# Returns whatever the final gauge `set` call returns.
def refresh_state(phase)
  @phase = PHASE.fetch(phase)

  # Sample the current RSS and both limits, in this order.
  @current_rss = get_rss
  @soft_limit_rss = get_soft_limit_rss
  @hard_limit_rss = get_hard_limit_rss

  # Gauge name => value, listed in the order the gauges are updated.
  gauge_values = [
    [:sidekiq_memory_killer_phase, @phase],
    [:sidekiq_current_rss, @current_rss],
    [:sidekiq_memory_killer_soft_limit_rss, @soft_limit_rss],
    [:sidekiq_memory_killer_hard_limit_rss, @hard_limit_rss]
  ]

  # `map ... last` keeps the method's result equal to the last `set` result.
  gauge_values.map { |gauge_name, reading| @metrics[gauge_name].set({}, reading) }.last
end
def run_thread def run_thread
Sidekiq.logger.info( Sidekiq.logger.info(
class: self.class.to_s, class: self.class.to_s,
...@@ -95,64 +109,55 @@ module Gitlab ...@@ -95,64 +109,55 @@ module Gitlab
# Tell Sidekiq to stop fetching new jobs # Tell Sidekiq to stop fetching new jobs
# We first SIGNAL and then wait given time # We first SIGNAL and then wait given time
# We also monitor a number of running jobs and allow to restart early # We also monitor a number of running jobs and allow to restart early
update_metrics(PHASE[:stop_fetching_new_jobs], get_rss, get_soft_limit_rss, get_hard_limit_rss) refresh_state(:stop_fetching_new_jobs)
signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs') signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
return unless enabled? return unless enabled?
# Tell sidekiq to restart itself # Tell sidekiq to restart itself
# Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL # Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
update_metrics(PHASE[:shutting_down], get_rss, get_soft_limit_rss, get_hard_limit_rss) refresh_state(:shutting_down)
signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down') signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
return unless enabled? return unless enabled?
# Ideally we should never reach this condition # Ideally we should never reach this condition
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
# Kill the whole pgroup, so we can be sure no children are left behind # Kill the whole pgroup, so we can be sure no children are left behind
update_metrics(PHASE[:killing_sidekiq], get_rss, get_soft_limit_rss, get_hard_limit_rss) refresh_state(:killing_sidekiq)
signal_pgroup('SIGKILL', 'die') signal_pgroup('SIGKILL', 'die')
end end
def rss_within_range? def rss_within_range?
phase = PHASE[:running] refresh_state(:running)
current_rss = nil
soft_limit_rss = nil
hard_limit_rss = nil
deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
loop do loop do
return true unless enabled? return true unless enabled?
current_rss = get_rss
soft_limit_rss = get_soft_limit_rss
hard_limit_rss = get_hard_limit_rss
update_metrics(phase, current_rss, soft_limit_rss, hard_limit_rss)
# RSS go above hard limit should trigger forcible shutdown right away # RSS go above hard limit should trigger forcible shutdown right away
break if current_rss > hard_limit_rss break if @current_rss > @hard_limit_rss
# RSS go below the soft limit # RSS go below the soft limit
return true if current_rss < soft_limit_rss return true if @current_rss < @soft_limit_rss
phase = PHASE[:above_soft_limit]
# RSS did not go below the soft limit within deadline, restart # RSS did not go below the soft limit within deadline, restart
break if Gitlab::Metrics::System.monotonic_time > deadline break if Gitlab::Metrics::System.monotonic_time > deadline
sleep(CHECK_INTERVAL_SECONDS) sleep(CHECK_INTERVAL_SECONDS)
refresh_state(:above_soft_limit)
end end
log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss) # There are two ways to break out of the loop:
# - RSS is above the hard limit, or
# - RSS is still above the soft limit after the deadline
# When above the hard limit, the killer immediately goes to `stop_fetching_new_jobs`,
# so ignore the `above hard limit` case and always set `above_soft_limit` here
refresh_state(:above_soft_limit)
log_rss_out_of_range(@current_rss, @hard_limit_rss, @soft_limit_rss)
false false
end end
# Pushes the supplied phase and memory readings to the gauges in @metrics.
#
# phase          - value written to the memory killer phase gauge
# current_rss    - value written to the current RSS gauge
# soft_limit_rss - value written to the soft limit gauge
# hard_limit_rss - value written to the hard limit gauge
#
# Returns whatever the final gauge `set` call returns.
def update_metrics(phase, current_rss, soft_limit_rss, hard_limit_rss)
  # Every gauge is updated with an empty label set.
  publish = ->(gauge_name, reading) { @metrics[gauge_name].set({}, reading) }

  publish.call(:sidekiq_memory_killer_phase, phase)
  publish.call(:sidekiq_current_rss, current_rss)
  publish.call(:sidekiq_memory_killer_soft_limit_rss, soft_limit_rss)
  publish.call(:sidekiq_memory_killer_hard_limit_rss, hard_limit_rss)
end
def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss) def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
Sidekiq.logger.warn( Sidekiq.logger.warn(
class: self.class.to_s, class: self.class.to_s,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment