Commit 606831f9 authored by Andreas Brandl's avatar Andreas Brandl Committed by Yannis Roussos

Drop index with lock timeout and retry

This avoids a long running query: Instead of setting a high statement
timeout, we use rather long lock timeouts and retries in between
attempts. This should gracefully resolve the situation when a vacuum
operation is blocking the `DROP INDEX` statement.

Still, if we run out of retries, we force the execution
without lock timeout (and potentially see a long running query then).

Relates to https://gitlab.com/gitlab-org/gitlab/-/issues/329724
parent 52e6c365
......@@ -11,7 +11,14 @@ module Gitlab
PG_IDENTIFIER_LENGTH = 63
TEMPORARY_INDEX_PREFIX = 'tmp_reindex_'
REPLACED_INDEX_PREFIX = 'old_reindex_'
STATEMENT_TIMEOUT = 6.hours
STATEMENT_TIMEOUT = 9.hours
# When dropping an index, we acquire a SHARE UPDATE EXCLUSIVE lock,
# which only conflicts with DDL and vacuum. We therefore execute this with a rather
# high lock timeout and a long pause in between retries. This is an alternative to
# setting a high statement timeout, which would lead to a long running query with effects
# on e.g. vacuum.
REMOVE_INDEX_RETRY_CONFIG = [[1.minute, 9.minutes]] * 30
attr_reader :index, :logger
......@@ -95,7 +102,13 @@ module Gitlab
def remove_index(schema, name)
logger.info("Removing index #{schema}.#{name}")
set_statement_timeout do
retries = Gitlab::Database::WithLockRetriesOutsideTransaction.new(
timing_configuration: REMOVE_INDEX_RETRY_CONFIG,
klass: self.class,
logger: logger
)
retries.run(raise_on_exhaustion: false) do
connection.execute(<<~SQL)
DROP INDEX CONCURRENTLY
IF EXISTS #{quote_table_name(schema)}.#{quote_table_name(name)}
......@@ -121,7 +134,6 @@ module Gitlab
def with_lock_retries(&block)
arguments = { klass: self.class, logger: logger }
Gitlab::Database::WithLockRetries.new(**arguments).run(raise_on_exhaustion: true, &block)
end
......
......@@ -92,7 +92,7 @@ module Gitlab
end
begin
run_block_with_transaction
run_block_with_lock_timeout
rescue ActiveRecord::LockWaitTimeout
if retry_with_lock_timeout?
disable_idle_in_transaction_timeout if ActiveRecord::Base.connection.transaction_open?
......@@ -121,7 +121,7 @@ module Gitlab
block.call
end
def run_block_with_transaction
def run_block_with_lock_timeout
ActiveRecord::Base.transaction(requires_new: true) do
execute("SET LOCAL lock_timeout TO '#{current_lock_timeout_in_ms}ms'")
......
# frozen_string_literal: true
module Gitlab
module Database
# This retry method behaves similar to WithLockRetries
# except it does not wrap itself into a transaction scope.
#
# In our context, this is only useful if directly connected to
# PostgreSQL. When going through pgbouncer, this method **won't work**
# as it relies on using `SET` outside transactions (and hence can be
# multiplexed across different connections).
class WithLockRetriesOutsideTransaction < WithLockRetries
private
def run_block_with_lock_timeout
execute("SET lock_timeout TO '#{current_lock_timeout_in_ms}ms'")
log(message: 'Lock timeout is set', current_iteration: current_iteration, lock_timeout_in_ms: current_lock_timeout_in_ms)
run_block
log(message: 'Migration finished', current_iteration: current_iteration, lock_timeout_in_ms: current_lock_timeout_in_ms)
end
def run_block_without_lock_timeout
log(message: "Couldn't acquire lock to perform the migration", current_iteration: current_iteration)
log(message: "Executing without lock timeout", current_iteration: current_iteration)
disable_lock_timeout
run_block
log(message: 'Migration finished', current_iteration: current_iteration)
end
def disable_lock_timeout
execute("SET lock_timeout TO '0'")
end
end
end
end
......@@ -111,7 +111,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
end
it 'replaces the existing index with an identical index' do
expect(connection).to receive(:execute).with('SET statement_timeout TO \'21600s\'').twice
expect(connection).to receive(:execute).with('SET statement_timeout TO \'32400s\'')
expect_to_execute_concurrently_in_order(create_index)
......@@ -123,6 +123,10 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect_index_rename(replacement_name, index.name)
expect_index_rename(replaced_name, replacement_name)
expect_next_instance_of(::Gitlab::Database::WithLockRetries) do |instance|
expect(instance).to receive(:run).with(raise_on_exhaustion: false).and_yield
end
expect_to_execute_concurrently_in_order(drop_index)
subject.perform
......@@ -136,7 +140,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
end
it 'rebuilds table statistics before dropping the original index' do
expect(connection).to receive(:execute).with('SET statement_timeout TO \'21600s\'').twice
expect(connection).to receive(:execute).with('SET statement_timeout TO \'32400s\'')
expect_to_execute_concurrently_in_order(create_index)
......@@ -152,7 +156,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect_index_rename(replacement_name, index.name)
expect_index_rename(replaced_name, replacement_name)
expect_to_execute_concurrently_in_order(drop_index)
expect_index_drop(drop_index)
subject.perform
......@@ -166,9 +170,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
end
it 'replaces the existing index with an identical index' do
expect(connection).to receive(:execute).with('SET statement_timeout TO \'21600s\'').exactly(3).times
expect_to_execute_concurrently_in_order(drop_index)
expect_index_drop(drop_index)
expect_to_execute_concurrently_in_order(create_index)
expect_next_instance_of(::Gitlab::Database::WithLockRetries) do |instance|
......@@ -179,7 +181,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect_index_rename(replacement_name, index.name)
expect_index_rename(replaced_name, replacement_name)
expect_to_execute_concurrently_in_order(drop_index)
expect_index_drop(drop_index)
subject.perform
......@@ -192,6 +194,10 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect(connection).to receive(:execute).with(create_index).ordered
.and_raise(ActiveRecord::ConnectionTimeoutError, 'connect timeout')
expect_next_instance_of(::Gitlab::Database::WithLockRetries) do |instance|
expect(instance).to receive(:run).with(raise_on_exhaustion: false).and_yield
end
expect_to_execute_concurrently_in_order(drop_index)
expect { subject.perform }.to raise_error(ActiveRecord::ConnectionTimeoutError, /connect timeout/)
......@@ -207,6 +213,10 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect_to_execute_concurrently_in_order(create_index)
expect_next_instance_of(::Gitlab::Database::WithLockRetries) do |instance|
expect(instance).to receive(:run).with(raise_on_exhaustion: false).and_yield
end
expect_to_execute_concurrently_in_order(drop_index)
expect { subject.perform }.to raise_error(described_class::ReindexError, /replacement index was created as INVALID/)
......@@ -228,7 +238,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
expect_index_rename(index.name, replaced_name).and_raise(ActiveRecord::ConnectionTimeoutError, 'connect timeout')
expect_to_execute_concurrently_in_order(drop_index)
expect_index_drop(drop_index)
expect { subject.perform }.to raise_error(ActiveRecord::ConnectionTimeoutError, /connect timeout/)
......@@ -245,7 +255,7 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
.and_raise(::Gitlab::Database::WithLockRetries::AttemptsExhaustedError, 'exhausted')
end
expect_to_execute_concurrently_in_order(drop_index)
expect_index_drop(drop_index)
expect { subject.perform }.to raise_error(::Gitlab::Database::WithLockRetries::AttemptsExhaustedError, /exhausted/)
......@@ -270,6 +280,14 @@ RSpec.describe Gitlab::Database::Reindexing::ConcurrentReindex, '#perform' do
SQL
end
def expect_index_drop(drop_index)
expect_next_instance_of(::Gitlab::Database::WithLockRetries) do |instance|
expect(instance).to receive(:run).with(raise_on_exhaustion: false).and_yield
end
expect_to_execute_concurrently_in_order(drop_index)
end
def find_index_create_statement
ActiveRecord::Base.connection.select_value(<<~SQL)
SELECT indexdef
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::WithLockRetriesOutsideTransaction do
let(:env) { {} }
let(:logger) { Gitlab::Database::WithLockRetries::NULL_LOGGER }
let(:subject) { described_class.new(env: env, logger: logger, timing_configuration: timing_configuration) }
let(:timing_configuration) do
[
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second],
[1.second, 1.second]
]
end
describe '#run' do
it 'requires block' do
expect { subject.run }.to raise_error(StandardError, 'no block given')
end
context 'when DISABLE_LOCK_RETRIES is set' do
let(:env) { { 'DISABLE_LOCK_RETRIES' => 'true' } }
it 'executes the passed block without retrying' do
object = double
expect(object).to receive(:method).once
subject.run { object.method }
end
end
context 'when lock retry is enabled' do
let(:lock_fiber) do
Fiber.new do
# Initiating a second DB connection for the lock
conn = ActiveRecordSecond.establish_connection(Rails.configuration.database_configuration[Rails.env]).connection
conn.transaction do
conn.execute("LOCK TABLE #{Project.table_name} in exclusive mode")
Fiber.yield
end
ActiveRecordSecond.remove_connection # force disconnect
end
end
before do
stub_const('ActiveRecordSecond', Class.new(ActiveRecord::Base))
lock_fiber.resume # start the transaction and lock the table
end
after do
lock_fiber.resume if lock_fiber.alive?
end
context 'lock_fiber' do
it 'acquires lock successfully' do
check_exclusive_lock_query = """
SELECT 1
FROM pg_locks l
JOIN pg_class t ON l.relation = t.oid
WHERE t.relkind = 'r' AND l.mode = 'ExclusiveLock' AND t.relname = '#{Project.table_name}'
"""
expect(ActiveRecord::Base.connection.execute(check_exclusive_lock_query).to_a).to be_present
end
end
shared_examples 'retriable exclusive lock on `projects`' do
it 'succeeds executing the given block' do
lock_attempts = 0
lock_acquired = false
# the actual number of attempts to run_block_with_lock_timeout can never exceed the number of
# timings_configurations, so here we limit the retry_count if it exceeds that value
#
# also, there is no call to sleep after the final attempt, which is why it will always be one less
expected_runs_with_timeout = [retry_count, timing_configuration.size].min
expect(subject).to receive(:sleep).exactly(expected_runs_with_timeout - 1).times
expect(subject).to receive(:run_block_with_lock_timeout).exactly(expected_runs_with_timeout).times.and_wrap_original do |method|
lock_fiber.resume if lock_attempts == retry_count
method.call
end
subject.run do
lock_attempts += 1
if lock_attempts == retry_count # we reached the last retry iteration, if we kill the thread, the last try (no lock_timeout) will succeed
lock_fiber.resume
end
ActiveRecord::Base.transaction do
ActiveRecord::Base.connection.execute("LOCK TABLE #{Project.table_name} in exclusive mode")
lock_acquired = true
end
end
expect(lock_attempts).to eq(retry_count)
expect(lock_acquired).to eq(true)
end
end
context 'after 3 iterations' do
it_behaves_like 'retriable exclusive lock on `projects`' do
let(:retry_count) { 4 }
end
context 'setting the idle transaction timeout' do
context 'when there is no outer transaction: disable_ddl_transaction! is set in the migration' do
it 'does not disable the idle transaction timeout' do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
allow(subject).to receive(:run_block_with_lock_timeout).once.and_raise(ActiveRecord::LockWaitTimeout)
allow(subject).to receive(:run_block_with_lock_timeout).once
expect(subject).not_to receive(:disable_idle_in_transaction_timeout)
subject.run {}
end
end
context 'when there is outer transaction: disable_ddl_transaction! is not set in the migration' do
it 'disables the idle transaction timeout so the code can sleep and retry' do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(true)
n = 0
allow(subject).to receive(:run_block_with_lock_timeout).twice do
n += 1
raise(ActiveRecord::LockWaitTimeout) if n == 1
end
expect(subject).to receive(:disable_idle_in_transaction_timeout).once
subject.run {}
end
end
end
end
context 'after the retries are exhausted' do
let(:timing_configuration) do
[
[1.second, 1.second]
]
end
it 'disables the lock_timeout' do
allow(subject).to receive(:run_block_with_lock_timeout).once.and_raise(ActiveRecord::LockWaitTimeout)
expect(subject).to receive(:disable_lock_timeout)
subject.run {}
end
end
context 'after the retries, without setting lock_timeout' do
let(:retry_count) { timing_configuration.size + 1 }
it_behaves_like 'retriable exclusive lock on `projects`' do
before do
expect(subject).to receive(:run_block_without_lock_timeout).and_call_original
end
end
end
context 'after the retries, when requested to raise an error' do
let(:expected_attempts_with_timeout) { timing_configuration.size }
let(:retry_count) { timing_configuration.size + 1 }
it 'raises an error instead of waiting indefinitely for the lock' do
lock_attempts = 0
lock_acquired = false
expect(subject).to receive(:sleep).exactly(expected_attempts_with_timeout - 1).times
expect(subject).to receive(:run_block_with_lock_timeout).exactly(expected_attempts_with_timeout).times.and_call_original
expect do
subject.run(raise_on_exhaustion: true) do
lock_attempts += 1
ActiveRecord::Base.transaction do
ActiveRecord::Base.connection.execute("LOCK TABLE #{Project.table_name} in exclusive mode")
lock_acquired = true
end
end
end.to raise_error(described_class::AttemptsExhaustedError)
expect(lock_attempts).to eq(retry_count - 1)
expect(lock_acquired).to eq(false)
end
end
context 'when statement timeout is reached' do
it 'raises StatementInvalid error' do
lock_acquired = false
ActiveRecord::Base.connection.execute("SET statement_timeout='100ms'")
expect do
subject.run do
ActiveRecord::Base.connection.execute("SELECT 1 FROM pg_sleep(0.11)") # 110ms
lock_acquired = true
end
end.to raise_error(ActiveRecord::StatementInvalid)
expect(lock_acquired).to eq(false)
end
end
end
end
context 'restore local database variables' do
it do
expect { subject.run {} }.not_to change { ActiveRecord::Base.connection.execute("SHOW lock_timeout").to_a }
end
it do
expect { subject.run {} }.not_to change { ActiveRecord::Base.connection.execute("SHOW idle_in_transaction_session_timeout").to_a }
end
end
context 'casting durations correctly' do
let(:timing_configuration) { [[0.015.seconds, 0.025.seconds], [0.015.seconds, 0.025.seconds]] } # 15ms, 25ms
it 'executes `SET lock_timeout` using the configured timeout value in milliseconds' do
expect(ActiveRecord::Base.connection).to receive(:execute).with('RESET idle_in_transaction_session_timeout; RESET lock_timeout').and_call_original
expect(ActiveRecord::Base.connection).to receive(:execute).with("SET lock_timeout TO '15ms'").and_call_original
subject.run { }
end
it 'calls `sleep` after the first iteration fails, using the configured sleep time' do
expect(subject).to receive(:run_block_with_lock_timeout).and_raise(ActiveRecord::LockWaitTimeout).twice
expect(subject).to receive(:sleep).with(0.025)
subject.run { }
end
end
end
......@@ -76,14 +76,14 @@ RSpec.describe Gitlab::Database::WithLockRetries do
lock_attempts = 0
lock_acquired = false
# the actual number of attempts to run_block_with_transaction can never exceed the number of
# the actual number of attempts to run_block_with_lock_timeout can never exceed the number of
# timings_configurations, so here we limit the retry_count if it exceeds that value
#
# also, there is no call to sleep after the final attempt, which is why it will always be one less
expected_runs_with_timeout = [retry_count, timing_configuration.size].min
expect(subject).to receive(:sleep).exactly(expected_runs_with_timeout - 1).times
expect(subject).to receive(:run_block_with_transaction).exactly(expected_runs_with_timeout).times.and_wrap_original do |method|
expect(subject).to receive(:run_block_with_lock_timeout).exactly(expected_runs_with_timeout).times.and_wrap_original do |method|
lock_fiber.resume if lock_attempts == retry_count
method.call
......@@ -116,8 +116,8 @@ RSpec.describe Gitlab::Database::WithLockRetries do
context 'when there is no outer transaction: disable_ddl_transaction! is set in the migration' do
it 'does not disable the idle transaction timeout' do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
allow(subject).to receive(:run_block_with_transaction).once.and_raise(ActiveRecord::LockWaitTimeout)
allow(subject).to receive(:run_block_with_transaction).once
allow(subject).to receive(:run_block_with_lock_timeout).once.and_raise(ActiveRecord::LockWaitTimeout)
allow(subject).to receive(:run_block_with_lock_timeout).once
expect(subject).not_to receive(:disable_idle_in_transaction_timeout)
......@@ -130,7 +130,7 @@ RSpec.describe Gitlab::Database::WithLockRetries do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(true)
n = 0
allow(subject).to receive(:run_block_with_transaction).twice do
allow(subject).to receive(:run_block_with_lock_timeout).twice do
n += 1
raise(ActiveRecord::LockWaitTimeout) if n == 1
end
......@@ -153,7 +153,7 @@ RSpec.describe Gitlab::Database::WithLockRetries do
context 'when there is no outer transaction: disable_ddl_transaction! is set in the migration' do
it 'does not disable the lock_timeout' do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(false)
allow(subject).to receive(:run_block_with_transaction).once.and_raise(ActiveRecord::LockWaitTimeout)
allow(subject).to receive(:run_block_with_lock_timeout).once.and_raise(ActiveRecord::LockWaitTimeout)
expect(subject).not_to receive(:disable_lock_timeout)
......@@ -164,7 +164,7 @@ RSpec.describe Gitlab::Database::WithLockRetries do
context 'when there is outer transaction: disable_ddl_transaction! is not set in the migration' do
it 'disables the lock_timeout' do
allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(true)
allow(subject).to receive(:run_block_with_transaction).once.and_raise(ActiveRecord::LockWaitTimeout)
allow(subject).to receive(:run_block_with_lock_timeout).once.and_raise(ActiveRecord::LockWaitTimeout)
expect(subject).to receive(:disable_lock_timeout)
......@@ -192,7 +192,7 @@ RSpec.describe Gitlab::Database::WithLockRetries do
lock_acquired = false
expect(subject).to receive(:sleep).exactly(expected_attempts_with_timeout - 1).times
expect(subject).to receive(:run_block_with_transaction).exactly(expected_attempts_with_timeout).times.and_call_original
expect(subject).to receive(:run_block_with_lock_timeout).exactly(expected_attempts_with_timeout).times.and_call_original
expect do
subject.run(raise_on_exhaustion: true) do
......@@ -251,7 +251,7 @@ RSpec.describe Gitlab::Database::WithLockRetries do
end
it 'calls `sleep` after the first iteration fails, using the configured sleep time' do
expect(subject).to receive(:run_block_with_transaction).and_raise(ActiveRecord::LockWaitTimeout).twice
expect(subject).to receive(:run_block_with_lock_timeout).and_raise(ActiveRecord::LockWaitTimeout).twice
expect(subject).to receive(:sleep).with(0.025)
subject.run { }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment