Commit 4985088f authored by Grzegorz Bizon's avatar Grzegorz Bizon

Merge branch 'fix/gb/fix-chunks-migration-race' into 'master'

Use optimistic locking to safely migrate a build trace chunk

See merge request gitlab-org/gitlab!44588
parents ba729e60 07f36f22
......@@ -7,6 +7,7 @@ module Ci
include ::FastDestroyAll
include ::Checksummable
include ::Gitlab::ExclusiveLeaseHelpers
include ::Gitlab::OptimisticLocking
belongs_to :build, class_name: "Ci::Build", foreign_key: :build_id
......@@ -116,12 +117,8 @@ module Ci
(start_offset...end_offset)
end
def persist_data!
in_lock(*lock_params) { unsafe_persist_data! }
end
def schedule_to_persist!
return if persisted?
return if flushed?
Ci::BuildTraceChunkFlushWorker.perform_async(id)
end
......@@ -131,13 +128,30 @@ module Ci
# happen that a chunk gets migrated after being loaded by another worker
# but before the worker acquires a lock to perform the migration.
#
# We want to reset a chunk in that case and retry migration. If it fails
# again, we want to re-raise the exception.
# We are using Redis locking to ensure that we perform this operation
# inside an exclusive lock, but this does not prevent us from running into
# race conditions related to updating a model representation in the
# database. Optimistic locking is another mechanism that help here.
#
# We are using optimistic locking combined with Redis locking to ensure
# that a chunk gets migrated properly.
#
def flush!
persist_data!
rescue FailedToPersistDataError
self.reset.persist_data!
def persist_data!
in_lock(*lock_params) do # exclusive Redis lock is acquired first
raise FailedToPersistDataError, 'Modifed build trace chunk detected' if has_changes_to_save?
self.reset.then do |chunk| # we ensure having latest lock_version
chunk.unsafe_persist_data! # we migrate the data and update data store
end
end
rescue ActiveRecord::StaleObjectError
raise FailedToPersistDataError, <<~MSG
Data migration race condition detected
store: #{data_store}
build: #{build.id}
index: #{chunk_index}
MSG
end
##
......@@ -149,10 +163,14 @@ module Ci
build.pending_state.present? && chunks_max_index == chunk_index
end
def persisted?
def flushed?
!redis?
end
def migrated?
flushed?
end
def live?
redis?
end
......@@ -163,7 +181,7 @@ module Ci
self.chunk_index <=> other.chunk_index
end
private
protected
def get_data
# Redis / database return UTF-8 encoded string by default
......@@ -182,7 +200,7 @@ module Ci
data is not fulfilled in a bucket
size: #{current_size}
state: #{build.pending_state.present?}
state: #{pending_state?}
max: #{chunks_max_index}
index: #{chunk_index}
MSG
......@@ -239,6 +257,12 @@ module Ci
size == CHUNK_SIZE
end
private
def pending_state?
build.pending_state.present?
end
def current_store
self.class.get_store_class(data_store)
end
......
......@@ -8,8 +8,10 @@ module Ci
idempotent!
# rubocop: disable CodeReuse/ActiveRecord
def perform(chunk_id)
::Ci::BuildTraceChunk.find_by(id: chunk_id).try(&:flush!)
def perform(id)
::Ci::BuildTraceChunk.find_by(id: id).try do |chunk|
chunk.persist_data!
end
end
# rubocop: enable CodeReuse/ActiveRecord
end
......
---
title: Use optimistic locking to safely migrate a build trace chunk
merge_request: 44588
author:
type: fixed
# frozen_string_literal: true
class AddLockVersionToCiBuildTraceChunk < ActiveRecord::Migration[6.0]
DOWNTIME = false
def change
add_column :ci_build_trace_chunks, :lock_version, :integer, default: 0, null: false
end
end
761cad9a584d98e3086e716f7a5c1d9b4aba87b084efcfcee7272cfdf1179372
\ No newline at end of file
......@@ -9911,7 +9911,8 @@ CREATE TABLE ci_build_trace_chunks (
chunk_index integer NOT NULL,
data_store integer NOT NULL,
raw_data bytea,
checksum bytea
checksum bytea,
lock_version integer DEFAULT 0 NOT NULL
);
CREATE SEQUENCE ci_build_trace_chunks_id_seq
......
......@@ -502,6 +502,10 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
describe '#persist_data!' do
let(:build) { create(:ci_build, :running) }
before do
build_trace_chunk.save!
end
subject { build_trace_chunk.persist_data! }
shared_examples_for 'Atomic operation' do
......@@ -575,6 +579,33 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(build_trace_chunk.fog?).to be_truthy
end
end
context 'when the chunk has been modifed by a different worker' do
it 'reloads the chunk before migration' do
described_class
.find(build_trace_chunk.id)
.update!(data_store: :fog)
build_trace_chunk.persist_data!
end
it 'verifies the operation using optimistic locking' do
allow(build_trace_chunk)
.to receive(:save!)
.and_raise(ActiveRecord::StaleObjectError)
expect { build_trace_chunk.persist_data! }
.to raise_error(described_class::FailedToPersistDataError)
end
it 'does not allow flushing unpersisted chunk' do
build_trace_chunk.checksum = '12345'
expect { build_trace_chunk.persist_data! }
.to raise_error(described_class::FailedToPersistDataError,
/Modifed build trace chunk detected/)
end
end
end
end
......@@ -780,51 +811,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
end
end
describe '#flush!' do
context 'when chunk can be flushed without problems' do
before do
allow(build_trace_chunk).to receive(:persist_data!)
end
it 'completes migration successfully' do
expect { build_trace_chunk.flush! }.not_to raise_error
end
end
context 'when the flush operation fails at first' do
it 'retries reloads the chunk' do
expect(build_trace_chunk)
.to receive(:persist_data!)
.and_raise(described_class::FailedToPersistDataError)
.ordered
expect(build_trace_chunk).to receive(:reset)
.and_return(build_trace_chunk)
.ordered
expect(build_trace_chunk)
.to receive(:persist_data!)
.ordered
build_trace_chunk.flush!
end
end
context 'when the flush constatly fails' do
before do
allow(build_trace_chunk)
.to receive(:persist_data!)
.and_raise(described_class::FailedToPersistDataError)
end
it 'attems to reset the chunk but eventually fails too' do
expect(build_trace_chunk).to receive(:reset)
.and_return(build_trace_chunk)
expect { build_trace_chunk.flush! }
.to raise_error(described_class::FailedToPersistDataError)
end
end
end
describe 'comparable build trace chunks' do
describe '#<=>' do
context 'when chunks are associated with different builds' do
......
......@@ -14,7 +14,7 @@ RSpec.describe Ci::BuildTraceChunkFlushWorker do
described_class.new.perform(chunk.id)
expect(chunk.reload).to be_persisted
expect(chunk.reload).to be_migrated
end
describe '#perform' do
......@@ -24,7 +24,7 @@ RSpec.describe Ci::BuildTraceChunkFlushWorker do
it 'migrates build trace chunk to a safe store' do
subject
expect(chunk.reload).to be_persisted
expect(chunk.reload).to be_migrated
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment