Commit a475ea20 authored by Mayra Cabrera's avatar Mayra Cabrera

Merge branch 'pb-update-conversion-helper-for-batched-migrations' into 'master'

Split up the int column conversion helpers

See merge request gitlab-org/gitlab!56788
parents 6badaecc 263fb869
......@@ -3,12 +3,12 @@
module Gitlab
module Database
module BackgroundMigration
class Scheduler
def perform(migration_wrapper: BatchedMigrationWrapper.new)
active_migration = BatchedMigration.active.queue_order.first
return unless active_migration&.interval_elapsed?
class BatchedMigrationRunner
def initialize(migration_wrapper = BatchedMigrationWrapper.new)
@migration_wrapper = migration_wrapper
end
def run_migration_job(active_migration)
if next_batched_job = create_next_batched_job!(active_migration)
migration_wrapper.perform(next_batched_job)
else
......@@ -16,8 +16,18 @@ module Gitlab
end
end
def run_entire_migration(migration)
while migration.active?
run_migration_job(migration)
migration.reload_last_job
end
end
private
attr_reader :migration_wrapper
def create_next_batched_job!(active_migration)
next_batch_range = find_next_batch_range(active_migration)
......
......@@ -4,6 +4,7 @@ module Gitlab
module Database
module MigrationHelpers
include Migrations::BackgroundMigrationHelpers
include DynamicModelHelpers
# https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
MAX_IDENTIFIER_NAME_LENGTH = 63
......@@ -927,19 +928,67 @@ module Gitlab
# This is crucial for Primary Key conversions, because setting a column
# as the PK converts even check constraints to NOT NULL constraints
# and forces an inline re-verification of the whole table.
# - It backfills the new column with the values of the existing primary key
# by scheduling background jobs.
# - It tracks the scheduled background jobs through the use of
# Gitlab::Database::BackgroundMigrationJob
# - It sets up a trigger to keep the two columns in sync.
#
# Note: this helper is intended to be used in a regular (pre-deployment) migration.
#
# This helper is part 1 of a multi-step migration process:
# 1. initialize_conversion_of_integer_to_bigint to create the new column and database triggers
# 2. backfill_conversion_of_integer_to_bigint to copy historic data using background migrations
# 3. remaining steps TBD, see #288005
#
# table - The name of the database table containing the column
# column - The name of the column that we want to convert to bigint.
# primary_key - The name of the primary key column (most often :id)
def initialize_conversion_of_integer_to_bigint(table, column, primary_key: :id)
unless table_exists?(table)
raise "Table #{table} does not exist"
end
unless column_exists?(table, primary_key)
raise "Column #{primary_key} does not exist on #{table}"
end
unless column_exists?(table, column)
raise "Column #{column} does not exist on #{table}"
end
check_trigger_permissions!(table)
old_column = column_for(table, column)
tmp_column = "#{column}_convert_to_bigint"
with_lock_retries do
if (column.to_s == primary_key.to_s) || !old_column.null
# If the column to be converted is either a PK or is defined as NOT NULL,
# set it to `NOT NULL DEFAULT 0` and we'll copy paste the correct values bellow
# That way, we skip the expensive validation step required to add
# a NOT NULL constraint at the end of the process
add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false)
else
add_column(table, tmp_column, :bigint, default: old_column.default)
end
install_rename_triggers(table, column, tmp_column)
end
end
# Backfills the new column used in the conversion of an integer column to bigint using background migrations.
#
# - This helper should be called from a post-deployment migration.
# - In order for this helper to work properly, the new column must be first initialized with
# the `initialize_conversion_of_integer_to_bigint` helper.
# - It tracks the scheduled background jobs through Gitlab::Database::BackgroundMigration::BatchedMigration,
# which allows a more thorough check that all jobs succeeded in the
# cleanup migration and is way faster for very large tables.
# - It sets up a trigger to keep the two columns in sync
# - It does not schedule a cleanup job: we have to do that with followup
# post deployment migrations in the next release.
#
# This needs to be done manually by using the
# `cleanup_initialize_conversion_of_integer_to_bigint`
# (not yet implemented - check #288005)
# Note: this helper is intended to be used in a post-deployment migration, to ensure any new code is
# deployed (including background job changes) before we begin processing the background migration.
#
# This helper is part 2 of a multi-step migration process:
# 1. initialize_conversion_of_integer_to_bigint to create the new column and database triggers
# 2. backfill_conversion_of_integer_to_bigint to copy historic data using background migrations
# 3. remaining steps TBD, see #288005
#
# table - The name of the database table containing the column
# column - The name of the column that we want to convert to bigint.
......@@ -960,7 +1009,7 @@ module Gitlab
# and set the batch_size to 50_000 which will require
# ~50s = (50000 / 200) * (0.1 + 0.1) to complete and leaves breathing space
# between the scheduled jobs
def initialize_conversion_of_integer_to_bigint(
def backfill_conversion_of_integer_to_bigint(
table,
column,
primary_key: :id,
......@@ -969,10 +1018,6 @@ module Gitlab
interval: 2.minutes
)
if transaction_open?
raise 'initialize_conversion_of_integer_to_bigint can not be run inside a transaction'
end
unless table_exists?(table)
raise "Table #{table} does not exist"
end
......@@ -985,46 +1030,26 @@ module Gitlab
raise "Column #{column} does not exist on #{table}"
end
check_trigger_permissions!(table)
old_column = column_for(table, column)
tmp_column = "#{column}_convert_to_bigint"
with_lock_retries do
if (column.to_s == primary_key.to_s) || !old_column.null
# If the column to be converted is either a PK or is defined as NOT NULL,
# set it to `NOT NULL DEFAULT 0` and we'll copy paste the correct values bellow
# That way, we skip the expensive validation step required to add
# a NOT NULL constraint at the end of the process
add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false)
else
add_column(table, tmp_column, :bigint, default: old_column.default)
end
install_rename_triggers(table, column, tmp_column)
end
source_model = Class.new(ActiveRecord::Base) do
include EachBatch
self.table_name = table
self.inheritance_column = :_type_disabled
unless column_exists?(table, tmp_column)
raise 'The temporary column does not exist, initialize it with `initialize_conversion_of_integer_to_bigint`'
end
queue_background_migration_jobs_by_range_at_intervals(
source_model,
batched_migration = queue_batched_background_migration(
'CopyColumnUsingBackgroundMigrationJob',
interval,
table,
primary_key,
column,
tmp_column,
job_interval: interval,
batch_size: batch_size,
other_job_arguments: [table, primary_key, sub_batch_size, column, tmp_column],
track_jobs: true,
primary_column_name: primary_key
)
sub_batch_size: sub_batch_size)
if perform_background_migration_inline?
# To ensure the schema is up to date immediately we perform the
# migration inline in dev / test environments.
Gitlab::BackgroundMigration.steal('CopyColumnUsingBackgroundMigrationJob')
Gitlab::Database::BackgroundMigration::BatchedMigrationRunner.new.run_entire_migration(batched_migration)
end
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::BackgroundMigration::BatchedMigrationRunner do
let(:migration_wrapper) { double('test wrapper') }
let(:runner) { described_class.new(migration_wrapper) }
describe '#run_migration_job' do
shared_examples_for 'it has completed the migration' do
it 'does not create and run a migration job' do
expect(migration_wrapper).not_to receive(:perform)
expect do
runner.run_migration_job(migration)
end.not_to change { Gitlab::Database::BackgroundMigration::BatchedJob.count }
end
it 'marks the migration as finished' do
relation = Gitlab::Database::BackgroundMigration::BatchedMigration.finished.where(id: migration.id)
expect { runner.run_migration_job(migration) }.to change { relation.count }.by(1)
end
end
context 'when the migration has no previous jobs' do
let(:migration) { create(:batched_background_migration, :active, batch_size: 2) }
let(:job_relation) do
Gitlab::Database::BackgroundMigration::BatchedJob.where(batched_background_migration_id: migration.id)
end
context 'when the migration has batches to process' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
it 'runs the job for the first batch' do
migration.update!(min_value: event1.id, max_value: event2.id)
expect(migration_wrapper).to receive(:perform) do |job_record|
expect(job_record).to eq(job_relation.first)
end
expect { runner.run_migration_job(migration) }.to change { job_relation.count }.by(1)
expect(job_relation.first).to have_attributes(
min_value: event1.id,
max_value: event2.id,
batch_size: migration.batch_size,
sub_batch_size: migration.sub_batch_size)
end
end
context 'when the batch maximum exceeds the migration maximum' do
let!(:events) { create_list(:event, 3) }
let(:event1) { events[0] }
let(:event2) { events[1] }
it 'clamps the batch maximum to the migration maximum' do
migration.update!(min_value: event1.id, max_value: event2.id, batch_size: 5)
expect(migration_wrapper).to receive(:perform)
expect { runner.run_migration_job(migration) }.to change { job_relation.count }.by(1)
expect(job_relation.first).to have_attributes(
min_value: event1.id,
max_value: event2.id,
batch_size: migration.batch_size,
sub_batch_size: migration.sub_batch_size)
end
end
context 'when the migration has no batches to process' do
it_behaves_like 'it has completed the migration'
end
end
context 'when the migration has previous jobs' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
let!(:migration) do
create(:batched_background_migration, :active, batch_size: 2, min_value: event1.id, max_value: event3.id)
end
let!(:previous_job) do
create(:batched_background_migration_job,
batched_migration: migration,
min_value: event1.id,
max_value: event2.id,
batch_size: 2,
sub_batch_size: 1)
end
let(:job_relation) do
Gitlab::Database::BackgroundMigration::BatchedJob.where(batched_background_migration_id: migration.id)
end
context 'when the migration has batches to process' do
it 'runs the migration job for the next batch' do
expect(migration_wrapper).to receive(:perform) do |job_record|
expect(job_record).to eq(job_relation.last)
end
expect { runner.run_migration_job(migration) }.to change { job_relation.count }.by(1)
expect(job_relation.last).to have_attributes(
min_value: event3.id,
max_value: event3.id,
batch_size: migration.batch_size,
sub_batch_size: migration.sub_batch_size)
end
context 'when the batch minimum exceeds the migration maximum' do
before do
migration.update!(batch_size: 5, max_value: event2.id)
end
it_behaves_like 'it has completed the migration'
end
end
context 'when the migration has no batches remaining' do
before do
create(:batched_background_migration_job,
batched_migration: migration,
min_value: event3.id,
max_value: event3.id,
batch_size: 2,
sub_batch_size: 1)
end
it_behaves_like 'it has completed the migration'
end
end
end
describe '#run_entire_migration' do
context 'when the given migration is not active' do
it 'does not create and run migration jobs' do
migration = build(:batched_background_migration, :finished)
expect(migration_wrapper).not_to receive(:perform)
expect do
runner.run_entire_migration(migration)
end.not_to change { Gitlab::Database::BackgroundMigration::BatchedJob.count }
end
end
context 'when the given migration is active' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
let!(:migration) do
create(:batched_background_migration, :active, batch_size: 2, min_value: event1.id, max_value: event3.id)
end
let(:job_relation) do
Gitlab::Database::BackgroundMigration::BatchedJob.where(batched_background_migration_id: migration.id)
end
it 'runs all jobs inline until finishing the migration' do
expect(migration_wrapper).to receive(:perform) do |job_record|
expect(job_record).to eq(job_relation.first)
end
expect(migration_wrapper).to receive(:perform) do |job_record|
expect(job_record).to eq(job_relation.last)
end
expect { runner.run_entire_migration(migration) }.to change { job_relation.count }.by(2)
expect(job_relation.first).to have_attributes(min_value: event1.id, max_value: event2.id)
expect(job_relation.last).to have_attributes(min_value: event3.id, max_value: event3.id)
expect(migration.reload).to be_finished
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::BackgroundMigration::Scheduler, '#perform' do
let(:scheduler) { described_class.new }
shared_examples_for 'it has no jobs to run' do
it 'does not create and run a migration job' do
test_wrapper = double('test wrapper')
expect(test_wrapper).not_to receive(:perform)
expect do
scheduler.perform(migration_wrapper: test_wrapper)
end.not_to change { Gitlab::Database::BackgroundMigration::BatchedJob.count }
end
end
context 'when there are no active migrations' do
let!(:migration) { create(:batched_background_migration, :finished) }
it_behaves_like 'it has no jobs to run'
end
shared_examples_for 'it has completed the migration' do
it 'marks the migration as finished' do
relation = Gitlab::Database::BackgroundMigration::BatchedMigration.finished.where(id: first_migration.id)
expect { scheduler.perform }.to change { relation.count }.by(1)
end
end
context 'when there are active migrations' do
let!(:first_migration) { create(:batched_background_migration, :active, batch_size: 2) }
let!(:last_migration) { create(:batched_background_migration, :active) }
let(:job_relation) do
Gitlab::Database::BackgroundMigration::BatchedJob.where(batched_background_migration_id: first_migration.id)
end
context 'when the migration interval has not elapsed' do
before do
expect_next_found_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigration) do |migration|
expect(migration).to receive(:interval_elapsed?).and_return(false)
end
end
it_behaves_like 'it has no jobs to run'
end
context 'when the interval has elapsed' do
before do
expect_next_found_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigration) do |migration|
expect(migration).to receive(:interval_elapsed?).and_return(true)
end
end
context 'when the first migration has no previous jobs' do
context 'when the migration has batches to process' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
it 'runs the job for the first batch' do
first_migration.update!(min_value: event1.id, max_value: event3.id)
expect_next_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigrationWrapper) do |wrapper|
expect(wrapper).to receive(:perform).and_wrap_original do |_, job_record|
expect(job_record).to eq(job_relation.first)
end
end
expect { scheduler.perform }.to change { job_relation.count }.by(1)
expect(job_relation.first).to have_attributes(
min_value: event1.id,
max_value: event2.id,
batch_size: first_migration.batch_size,
sub_batch_size: first_migration.sub_batch_size)
end
end
context 'when the migration has no batches to process' do
it_behaves_like 'it has no jobs to run'
it_behaves_like 'it has completed the migration'
end
end
context 'when the first migration has previous jobs' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
let!(:previous_job) do
create(:batched_background_migration_job,
batched_migration: first_migration,
min_value: event1.id,
max_value: event2.id,
batch_size: 2,
sub_batch_size: 1)
end
context 'when the migration is ready to process another job' do
it 'runs the migration job for the next batch' do
first_migration.update!(min_value: event1.id, max_value: event3.id)
expect_next_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigrationWrapper) do |wrapper|
expect(wrapper).to receive(:perform).and_wrap_original do |_, job_record|
expect(job_record).to eq(job_relation.last)
end
end
expect { scheduler.perform }.to change { job_relation.count }.by(1)
expect(job_relation.last).to have_attributes(
min_value: event3.id,
max_value: event3.id,
batch_size: first_migration.batch_size,
sub_batch_size: first_migration.sub_batch_size)
end
end
context 'when the migration has no batches remaining' do
let!(:final_job) do
create(:batched_background_migration_job,
batched_migration: first_migration,
min_value: event3.id,
max_value: event3.id,
batch_size: 2,
sub_batch_size: 1)
end
it_behaves_like 'it has no jobs to run'
it_behaves_like 'it has completed the migration'
end
end
context 'when the bounds of the next batch exceed the migration maximum value' do
let!(:events) { create_list(:event, 3) }
let(:event1) { events[0] }
let(:event2) { events[1] }
context 'when the batch maximum exceeds the migration maximum' do
it 'clamps the batch maximum to the migration maximum' do
first_migration.update!(batch_size: 5, min_value: event1.id, max_value: event2.id)
expect_next_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigrationWrapper) do |wrapper|
expect(wrapper).to receive(:perform)
end
expect { scheduler.perform }.to change { job_relation.count }.by(1)
expect(job_relation.first).to have_attributes(
min_value: event1.id,
max_value: event2.id,
batch_size: first_migration.batch_size,
sub_batch_size: first_migration.sub_batch_size)
end
end
context 'when the batch minimum exceeds the migration maximum' do
let!(:previous_job) do
create(:batched_background_migration_job,
batched_migration: first_migration,
min_value: event1.id,
max_value: event2.id,
batch_size: 5,
sub_batch_size: 1)
end
before do
first_migration.update!(batch_size: 5, min_value: 1, max_value: event2.id)
end
it_behaves_like 'it has no jobs to run'
it_behaves_like 'it has completed the migration'
end
end
end
end
end
......@@ -1702,65 +1702,171 @@ RSpec.describe Gitlab::Database::MigrationHelpers do
end
describe '#initialize_conversion_of_integer_to_bigint' do
let(:user) { create(:user) }
let(:project) { create(:project, :repository) }
let(:issue) { create(:issue, project: project) }
let!(:event) do
create(:event, :created, project: project, target: issue, author: user)
let(:table) { :test_table }
let(:column) { :id }
let(:tmp_column) { "#{column}_convert_to_bigint" }
before do
model.create_table table, id: false do |t|
t.integer :id, primary_key: true
t.integer :non_nullable_column, null: false
t.integer :nullable_column
t.timestamps
end
end
context 'in a transaction' do
it 'raises RuntimeError' do
allow(model).to receive(:transaction_open?).and_return(true)
context 'when the target table does not exist' do
it 'raises an error' do
expect { model.initialize_conversion_of_integer_to_bigint(:this_table_is_not_real, column) }
.to raise_error('Table this_table_is_not_real does not exist')
end
end
expect { model.initialize_conversion_of_integer_to_bigint(:events, :id) }
.to raise_error(RuntimeError)
context 'when the primary key does not exist' do
it 'raises an error' do
expect { model.initialize_conversion_of_integer_to_bigint(table, column, primary_key: :foobar) }
.to raise_error("Column foobar does not exist on #{table}")
end
end
context 'outside a transaction' do
before do
allow(model).to receive(:transaction_open?).and_return(false)
context 'when the column to convert does not exist' do
let(:column) { :foobar }
it 'raises an error' do
expect { model.initialize_conversion_of_integer_to_bigint(table, column) }
.to raise_error("Column #{column} does not exist on #{table}")
end
end
it 'creates a bigint column and starts backfilling it' do
expect(model)
.to receive(:add_column)
.with(
:events,
'id_convert_to_bigint',
:bigint,
default: 0,
null: false
)
context 'when the column to convert is the primary key' do
it 'creates a not-null bigint column and installs triggers' do
expect(model).to receive(:add_column).with(table, tmp_column, :bigint, default: 0, null: false)
expect(model)
.to receive(:install_rename_triggers)
.with(:events, :id, 'id_convert_to_bigint')
expect(model).to receive(:install_rename_triggers).with(table, column, tmp_column)
expect(model).to receive(:queue_background_migration_jobs_by_range_at_intervals).and_call_original
model.initialize_conversion_of_integer_to_bigint(table, column)
end
end
expect(BackgroundMigrationWorker)
.to receive(:perform_in)
.ordered
.with(
2.minutes,
'CopyColumnUsingBackgroundMigrationJob',
[event.id, event.id, :events, :id, 100, :id, 'id_convert_to_bigint']
)
context 'when the column to convert is not the primary key, but non-nullable' do
let(:column) { :non_nullable_column }
it 'creates a not-null bigint column and installs triggers' do
expect(model).to receive(:add_column).with(table, tmp_column, :bigint, default: 0, null: false)
expect(model).to receive(:install_rename_triggers).with(table, column, tmp_column)
model.initialize_conversion_of_integer_to_bigint(table, column)
end
end
context 'when the column to convert is not the primary key, but nullable' do
let(:column) { :nullable_column }
it 'creates a nullable bigint column and installs triggers' do
expect(model).to receive(:add_column).with(table, tmp_column, :bigint, default: nil)
expect(model).to receive(:install_rename_triggers).with(table, column, tmp_column)
model.initialize_conversion_of_integer_to_bigint(table, column)
end
end
end
describe '#backfill_conversion_of_integer_to_bigint' do
let(:table) { :_test_backfill_table }
let(:column) { :id }
let(:tmp_column) { "#{column}_convert_to_bigint" }
before do
model.create_table table, id: false do |t|
t.integer :id, primary_key: true
t.text :message, null: false
t.timestamps
end
expect(Gitlab::BackgroundMigration)
.to receive(:steal)
.ordered
.with('CopyColumnUsingBackgroundMigrationJob')
allow(model).to receive(:perform_background_migration_inline?).and_return(false)
end
context 'when the target table does not exist' do
it 'raises an error' do
expect { model.backfill_conversion_of_integer_to_bigint(:this_table_is_not_real, column) }
.to raise_error('Table this_table_is_not_real does not exist')
end
end
model.initialize_conversion_of_integer_to_bigint(
:events,
:id,
batch_size: 300,
sub_batch_size: 100
context 'when the primary key does not exist' do
it 'raises an error' do
expect { model.backfill_conversion_of_integer_to_bigint(table, column, primary_key: :foobar) }
.to raise_error("Column foobar does not exist on #{table}")
end
end
context 'when the column to convert does not exist' do
let(:column) { :foobar }
it 'raises an error' do
expect { model.backfill_conversion_of_integer_to_bigint(table, column) }
.to raise_error("Column #{column} does not exist on #{table}")
end
end
context 'when the temporary column does not exist' do
it 'raises an error' do
expect { model.backfill_conversion_of_integer_to_bigint(table, column) }
.to raise_error('The temporary column does not exist, initialize it with `initialize_conversion_of_integer_to_bigint`')
end
end
context 'when the conversion is properly initialized' do
let(:model_class) do
Class.new(ActiveRecord::Base) do
self.table_name = :_test_backfill_table
end
end
let(:migration_relation) { Gitlab::Database::BackgroundMigration::BatchedMigration.active }
before do
model.initialize_conversion_of_integer_to_bigint(table, column)
model_class.create!(message: 'hello')
model_class.create!(message: 'so long')
end
it 'creates the batched migration tracking record' do
last_record = model_class.create!(message: 'goodbye')
expect do
model.backfill_conversion_of_integer_to_bigint(table, column, batch_size: 2, sub_batch_size: 1)
end.to change { migration_relation.count }.by(1)
expect(migration_relation.last).to have_attributes(
job_class_name: 'CopyColumnUsingBackgroundMigrationJob',
table_name: table.to_s,
column_name: column.to_s,
min_value: 1,
max_value: last_record.id,
interval: 120,
batch_size: 2,
sub_batch_size: 1,
job_arguments: [column.to_s, "#{column}_convert_to_bigint"]
)
end
context 'when the migration should be performed inline' do
it 'calls the runner to run the entire migration' do
expect(model).to receive(:perform_background_migration_inline?).and_return(true)
expect_next_instance_of(Gitlab::Database::BackgroundMigration::BatchedMigrationRunner) do |scheduler|
expect(scheduler).to receive(:run_entire_migration) do |batched_migration|
expect(batched_migration).to eq(migration_relation.last)
end
end
model.backfill_conversion_of_integer_to_bigint(table, column, batch_size: 2, sub_batch_size: 1)
end
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment