Commit 51de613f authored by Stan Hu

Merge branch '36859-implement-db-lock-retry-mechanism' into 'master'

Implement with_lock_retries migration helper

Closes #36859

See merge request gitlab-org/gitlab!22479
parents 306bbfb4 4fc05a36
@@ -280,6 +280,46 @@ module Gitlab
end
end
# Executes the block with a retry mechanism that alters the +lock_timeout+ and +sleep_time+ between attempts.
# The timings can be controlled via the +timing_configuration+ parameter.
# If the lock was not acquired within the retry period, a last attempt is made without using +lock_timeout+.
#
# ==== Examples
# # Invoking without parameters
# with_lock_retries do
# drop_table :my_table
# end
#
# # Invoking with custom +timing_configuration+
# t = [
# [1.second, 1.second],
# [2.seconds, 2.seconds]
# ]
#
# with_lock_retries(timing_configuration: t) do
# drop_table :my_table # this will be retried twice
# end
#
# # Disabling the retries using an environment variable
# > export DISABLE_LOCK_RETRIES=true
#
# with_lock_retries do
# drop_table :my_table # one invocation, it will not retry at all
# end
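#
# # Invoking from a migration (a sketch; +DropMyTable+ and +:my_table+ are hypothetical)
# class DropMyTable < ActiveRecord::Migration[5.2]
#   include Gitlab::Database::MigrationHelpers
#
#   def up
#     with_lock_retries { drop_table :my_table }
#   end
# end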
#
# ==== Parameters
# * +timing_configuration+ - [[ActiveSupport::Duration, ActiveSupport::Duration], ...] each pair holds the lock timeout for the block and the sleep time before the next attempt, defaults to `Gitlab::Database::WithLockRetries::DEFAULT_TIMING_CONFIGURATION`
# * +logger+ - [Gitlab::JsonLogger]
# * +env+ - [Hash] custom environment hash, see the example with `DISABLE_LOCK_RETRIES`
def with_lock_retries(**args, &block)
merged_args = {
klass: self.class,
logger: Gitlab::BackgroundMigration::Logger
}.merge(args)
Gitlab::Database::WithLockRetries.new(merged_args).run(&block)
end
def true_value
Database.true_value
end
...
# frozen_string_literal: true
module Gitlab
module Database
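# Runs the given block inside a transaction with an escalating PostgreSQL
# +lock_timeout+, retrying whenever ActiveRecord::LockWaitTimeout is raised.
#
# A minimal direct-usage sketch (the +with_lock_retries+ migration helper wraps
# this class; the table name below is hypothetical):
#
#   Gitlab::Database::WithLockRetries.new(timing_configuration: [[1.second, 1.second]]).run do
#     ActiveRecord::Base.connection.execute('DROP TABLE my_table')
#   end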
class WithLockRetries
NULL_LOGGER = Gitlab::JsonLogger.new('/dev/null')
# Each element of the array represents a retry iteration.
# - DEFAULT_TIMING_CONFIGURATION.size provides the iteration count.
# - First element: DB lock_timeout
# - Second element: Sleep time after unsuccessful lock attempt (LockWaitTimeout error raised)
# - Worst case, this configuration would retry for about 40 minutes.
DEFAULT_TIMING_CONFIGURATION = [
[0.1.seconds, 0.05.seconds], # short timings, lock_timeout: 100ms, sleep after LockWaitTimeout: 50ms
[0.1.seconds, 0.05.seconds],
[0.2.seconds, 0.05.seconds],
[0.3.seconds, 0.10.seconds],
[0.4.seconds, 0.15.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[1.second, 5.seconds], # probably high traffic, increase timings
[1.second, 1.minute],
[0.1.seconds, 0.05.seconds],
[0.1.seconds, 0.05.seconds],
[0.2.seconds, 0.05.seconds],
[0.3.seconds, 0.10.seconds],
[0.4.seconds, 0.15.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[3.seconds, 3.minutes], # probably high traffic or long locks, increase timings
[0.1.seconds, 0.05.seconds],
[0.1.seconds, 0.05.seconds],
[0.5.seconds, 2.seconds],
[0.5.seconds, 2.seconds],
[5.seconds, 2.minutes],
[0.5.seconds, 0.5.seconds],
[0.5.seconds, 0.5.seconds],
[7.seconds, 5.minutes],
[0.5.seconds, 0.5.seconds],
[0.5.seconds, 0.5.seconds],
[7.seconds, 5.minutes],
[0.5.seconds, 0.5.seconds],
[0.5.seconds, 0.5.seconds],
[7.seconds, 5.minutes],
[0.1.seconds, 0.05.seconds],
[0.1.seconds, 0.05.seconds],
[0.5.seconds, 2.seconds],
[10.seconds, 10.minutes],
[0.1.seconds, 0.05.seconds],
[0.5.seconds, 2.seconds],
[10.seconds, 10.minutes]
].freeze
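# A rough check of the "about 40 minutes" worst case mentioned above (a sketch,
# not part of the class): summing both columns of the configuration gives the
# upper bound of time spent waiting on locks plus sleeping between attempts.
#
#   DEFAULT_TIMING_CONFIGURATION.sum { |lock_timeout, sleep_time| lock_timeout + sleep_time }
#   # => a bit over 40 minutes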
def initialize(logger: NULL_LOGGER, timing_configuration: DEFAULT_TIMING_CONFIGURATION, klass: nil, env: ENV)
@logger = logger
@klass = klass
@timing_configuration = timing_configuration
@env = env
@current_iteration = 1
@log_params = { method: 'with_lock_retries', class: klass.to_s }
end
def run(&block)
raise 'no block given' unless block_given?
@block = block
if lock_retries_disabled?
log(message: 'DISABLE_LOCK_RETRIES environment variable is true, executing the block without retry')
return run_block
end
begin
run_block_with_transaction
rescue ActiveRecord::LockWaitTimeout
if retry_with_lock_timeout?
wait_until_next_retry
retry
else
run_block_without_lock_timeout
end
end
end
private
attr_reader :logger, :env, :block, :current_iteration, :log_params, :timing_configuration
def run_block
block.call
end
def run_block_with_transaction
ActiveRecord::Base.transaction(requires_new: true) do
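# SET LOCAL scopes the lock_timeout to this transaction, so the session default is restored once the transaction ends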
execute("SET LOCAL lock_timeout TO '#{current_lock_timeout_in_ms}ms'")
log(message: 'Lock timeout is set', current_iteration: current_iteration, lock_timeout_in_ms: current_lock_timeout_in_ms)
run_block
log(message: 'Migration finished', current_iteration: current_iteration, lock_timeout_in_ms: current_lock_timeout_in_ms)
end
end
def retry_with_lock_timeout?
current_iteration != retry_count
end
def wait_until_next_retry
log(message: 'ActiveRecord::LockWaitTimeout error, retrying after sleep', current_iteration: current_iteration, sleep_time_in_seconds: current_sleep_time_in_seconds)
sleep(current_sleep_time_in_seconds)
@current_iteration += 1
end
def run_block_without_lock_timeout
log(message: "Couldn't acquire lock to perform the migration", current_iteration: current_iteration)
log(message: "Executing the migration without lock timeout", current_iteration: current_iteration)
execute("SET LOCAL lock_timeout TO '0'")
run_block
log(message: 'Migration finished', current_iteration: current_iteration)
end
def lock_retries_disabled?
Gitlab::Utils.to_boolean(env['DISABLE_LOCK_RETRIES'])
end
def log(params)
logger.info(log_params.merge(params))
end
def execute(statement)
ActiveRecord::Base.connection.execute(statement)
end
def retry_count
timing_configuration.size
end
def current_lock_timeout_in_ms
timing_configuration[current_iteration - 1][0].in_milliseconds
end
def current_sleep_time_in_seconds
timing_configuration[current_iteration - 1][1].to_i
end
end
end
end
@@ -1518,4 +1518,17 @@ describe Gitlab::Database::MigrationHelpers do
model.create_or_update_plan_limit('project_hooks', 'free', 10)
end
end
describe '#with_lock_retries' do
let(:buffer) { StringIO.new }
let(:in_memory_logger) { Gitlab::JsonLogger.new(buffer) }
let(:env) { { 'DISABLE_LOCK_RETRIES' => 'true' } }
it 'sets the migration class name in the logs' do
model.with_lock_retries(env: env, logger: in_memory_logger) { }
buffer.rewind
expect(buffer.read).to include("\"class\":\"#{model.class}\"")
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::Database::WithLockRetries do
let(:env) { {} }
let(:logger) { Gitlab::Database::WithLockRetries::NULL_LOGGER }
let(:subject) { described_class.new(env: env, logger: logger, timing_configuration: timing_configuration) }
let(:timing_configuration) do
[
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second]
]
end
describe '#run' do
it 'requires block' do
expect { subject.run }.to raise_error(StandardError, 'no block given')
end
context 'when DISABLE_LOCK_RETRIES is set' do
let(:env) { { 'DISABLE_LOCK_RETRIES' => 'true' } }
it 'executes the passed block without retrying' do
object = double
expect(object).to receive(:method).once
subject.run { object.method }
end
end
context 'when lock retry is enabled' do
class ActiveRecordSecond < ActiveRecord::Base
end
let(:lock_fiber) do
Fiber.new do
# Initiating a second DB connection for the lock
conn = ActiveRecordSecond.establish_connection(Rails.configuration.database_configuration[Rails.env]).connection
conn.transaction do
conn.execute("LOCK TABLE #{Project.table_name} in exclusive mode")
Fiber.yield
end
ActiveRecordSecond.remove_connection # force disconnect
end
end
before do
lock_fiber.resume # start the transaction and lock the table
end
context 'lock_fiber' do
it 'acquires lock successfully' do
check_exclusive_lock_query = """
SELECT 1
FROM pg_locks l
JOIN pg_class t ON l.relation = t.oid
WHERE t.relkind = 'r' AND l.mode = 'ExclusiveLock' AND t.relname = '#{Project.table_name}'
"""
expect(ActiveRecord::Base.connection.execute(check_exclusive_lock_query).to_a).to be_present
end
end
shared_examples 'retriable exclusive lock on `projects`' do
it 'succeeds executing the given block' do
lock_attempts = 0
lock_acquired = false
expect_any_instance_of(Gitlab::Database::WithLockRetries).to receive(:sleep).exactly(retry_count - 1).times # we don't sleep in the last iteration
allow_any_instance_of(Gitlab::Database::WithLockRetries).to receive(:run_block_with_transaction).and_wrap_original do |method|
lock_fiber.resume if lock_attempts == retry_count
method.call
end
subject.run do
lock_attempts += 1
if lock_attempts == retry_count # we reached the last retry iteration; resuming the fiber releases the lock, so the final attempt (without lock_timeout) can succeed
lock_fiber.resume
end
ActiveRecord::Base.transaction do
ActiveRecord::Base.connection.execute("LOCK TABLE #{Project.table_name} in exclusive mode")
lock_acquired = true
end
end
expect(lock_attempts).to eq(retry_count)
expect(lock_acquired).to eq(true)
end
end
context 'after 3 iterations' do
let(:retry_count) { 4 }
it_behaves_like 'retriable exclusive lock on `projects`'
end
context 'after the retries, without setting lock_timeout' do
let(:retry_count) { timing_configuration.size }
it_behaves_like 'retriable exclusive lock on `projects`'
end
context 'when statement timeout is reached' do
it 'raises QueryCanceled error' do
lock_acquired = false
ActiveRecord::Base.connection.execute("SET LOCAL statement_timeout='100ms'")
expect do
subject.run do
ActiveRecord::Base.connection.execute("SELECT 1 FROM pg_sleep(0.11)") # 110ms
lock_acquired = true
end
end.to raise_error(ActiveRecord::QueryCanceled)
expect(lock_acquired).to eq(false)
end
end
end
end
end