Commit 0849c691 authored by Yannis Roussos's avatar Yannis Roussos

Merge branch '292841-add-migration-helpers-for-initializing-int-to-bigint-convertion' into 'master'

Add migration helpers for converting int columns to bigint

See merge request gitlab-org/gitlab!49778
parents 599ff623 ecc0fe2b
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# Background migration that extends CopyColumn to update the value of a
# column using the value of another column in the same table.
#
# - The {start_id, end_id} arguments are at the start so that it can be used
# with `queue_background_migration_jobs_by_range_at_intervals`
# - Provides support for background job tracking through the use of
# Gitlab::Database::BackgroundMigrationJob
# - Uses sub-batching so that we can keep each update's execution time at
# low 100s ms, while being able to update more records per 2 minutes
# that we allow background migration jobs to be scheduled one after the other
# - We skip the NULL checks as they may result in not using an index scan
# - The table that is migrated does _not_ need `id` as the primary key
# We use the provided primary_key column to perform the update.
class CopyColumnUsingBackgroundMigrationJob
include Gitlab::Database::DynamicModelHelpers
PAUSE_SECONDS = 0.1
# start_id - The start ID of the range of rows to update.
# end_id - The end ID of the range of rows to update.
# table - The name of the table that contains the columns.
# primary_key - The primary key column of the table.
# copy_from - The column containing the data to copy.
# copy_to - The column to copy the data to.
# sub_batch_size - We don't want updates to take more than ~100ms
# This allows us to run multiple smaller batches during
# the minimum 2.minute interval that we can schedule jobs
def perform(start_id, end_id, table, primary_key, copy_from, copy_to, sub_batch_size)
quoted_copy_from = connection.quote_column_name(copy_from)
quoted_copy_to = connection.quote_column_name(copy_to)
parent_batch_relation = relation_scoped_to_range(table, primary_key, start_id, end_id)
parent_batch_relation.each_batch(column: primary_key, of: sub_batch_size) do |sub_batch|
sub_batch.update_all("#{quoted_copy_to}=#{quoted_copy_from}")
sleep(PAUSE_SECONDS)
end
# We have to add all arguments when marking a job as succeeded as they
# are all used to track the job by `queue_background_migration_jobs_by_range_at_intervals`
mark_job_as_succeeded(start_id, end_id, table, primary_key, copy_from, copy_to, sub_batch_size)
end
private
def connection
ActiveRecord::Base.connection
end
def mark_job_as_succeeded(*arguments)
Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(self.class.name, arguments)
end
def relation_scoped_to_range(source_table, source_key_column, start_id, stop_id)
define_batchable_model(source_table).where(source_key_column => start_id..stop_id)
end
end
end
end
...@@ -858,6 +858,120 @@ module Gitlab ...@@ -858,6 +858,120 @@ module Gitlab
end end
end end
# Initializes the conversion of an integer column to bigint
#
# It can be used for converting both a Primary Key and any Foreign Keys
# that may reference it or any other integer column that we may want to
# upgrade (e.g. columns that store IDs, but are not set as FKs).
#
# - For primary keys and Foreign Keys (or other columns) defined as NOT NULL,
# the new bigint column is added with a hardcoded NOT NULL DEFAULT 0
# which allows us to skip a very costly verification step once we
# are ready to switch it.
# This is crucial for Primary Key conversions, because setting a column
# as the PK converts even check constraints to NOT NULL constraints
# and forces an inline re-verification of the whole table.
# - It backfills the new column with the values of the existing primary key
# by scheduling background jobs.
# - It tracks the scheduled background jobs through the use of
# Gitlab::Database::BackgroundMigrationJob
# which allows a more thorough check that all jobs succeeded in the
# cleanup migration and is way faster for very large tables.
# - It sets up a trigger to keep the two columns in sync
# - It does not schedule a cleanup job: we have to do that with followup
# post deployment migrations in the next release.
#
# This needs to be done manually by using the
# `cleanup_initialize_conversion_of_integer_to_bigint`
# (not yet implemented - check #288005)
#
# table - The name of the database table containing the column
# column - The name of the column that we want to convert to bigint.
# primary_key - The name of the primary key column (most often :id)
# batch_size - The number of rows to schedule in a single background migration
# sub_batch_size - The smaller batches that will be used by each scheduled job
# to update the table. Useful to keep each update at ~100ms while executing
# more updates per interval (2.minutes)
# Note that each execution of a sub-batch adds a constant 100ms sleep
# time in between the updates, which must be taken into account
# while calculating the batch, sub_batch and interval values.
# interval - The time interval between every background migration
#
# example:
# Assume that we have figured out that updating 200 records of the events
# table takes ~100ms on average.
# We can set the sub_batch_size to 200, leave the interval to the default
# and set the batch_size to 50_000 which will require
# ~50s = (50000 / 200) * (0.1 + 0.1) to complete and leaves breathing space
# between the scheduled jobs
def initialize_conversion_of_integer_to_bigint(
table,
column,
primary_key: :id,
batch_size: 20_000,
sub_batch_size: 1000,
interval: 2.minutes
)
if transaction_open?
raise 'initialize_conversion_of_integer_to_bigint can not be run inside a transaction'
end
unless table_exists?(table)
raise "Table #{table} does not exist"
end
unless column_exists?(table, primary_key)
raise "Column #{primary_key} does not exist on #{table}"
end
unless column_exists?(table, column)
raise "Column #{column} does not exist on #{table}"
end
check_trigger_permissions!(table)
old_column = column_for(table, column)
tmp_column = "#{column}_convert_to_bigint"
with_lock_retries do
if (column.to_s == primary_key.to_s) || !old_column.null
# If the column to be converted is either a PK or is defined as NOT NULL,
# set it to `NOT NULL DEFAULT 0` and we'll copy paste the correct values bellow
# That way, we skip the expensive validation step required to add
# a NOT NULL constraint at the end of the process
add_column(table, tmp_column, :bigint, default: old_column.default || 0, null: false)
else
add_column(table, tmp_column, :bigint, default: old_column.default)
end
install_rename_triggers(table, column, tmp_column)
end
source_model = Class.new(ActiveRecord::Base) do
include EachBatch
self.table_name = table
self.inheritance_column = :_type_disabled
end
queue_background_migration_jobs_by_range_at_intervals(
source_model,
'CopyColumnUsingBackgroundMigrationJob',
interval,
batch_size: batch_size,
other_job_arguments: [table, primary_key, column, tmp_column, sub_batch_size],
track_jobs: true,
primary_column_name: primary_key
)
if perform_background_migration_inline?
# To ensure the schema is up to date immediately we perform the
# migration inline in dev / test environments.
Gitlab::BackgroundMigration.steal('CopyColumnUsingBackgroundMigrationJob')
end
end
# Performs a concurrent column rename when using PostgreSQL. # Performs a concurrent column rename when using PostgreSQL.
def install_rename_triggers_for_postgresql(trigger, table, old, new) def install_rename_triggers_for_postgresql(trigger, table, old, new)
execute <<-EOF.strip_heredoc execute <<-EOF.strip_heredoc
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob do
let(:table_name) { :copy_primary_key_test }
let(:test_table) { table(table_name) }
let(:sub_batch_size) { 1000 }
before do
ActiveRecord::Base.connection.execute(<<~SQL)
CREATE TABLE #{table_name}
(
id integer NOT NULL,
name character varying,
fk integer NOT NULL,
id_convert_to_bigint bigint DEFAULT 0 NOT NULL,
fk_convert_to_bigint bigint DEFAULT 0 NOT NULL,
name_convert_to_text text DEFAULT 'no name'
);
SQL
# Insert some data, it doesn't make a difference
test_table.create!(id: 11, name: 'test1', fk: 1)
test_table.create!(id: 12, name: 'test2', fk: 2)
test_table.create!(id: 15, name: nil, fk: 3)
test_table.create!(id: 19, name: 'test4', fk: 4)
end
after do
# Make sure that the temp table we created is dropped (it is not removed by the database_cleaner)
ActiveRecord::Base.connection.execute(<<~SQL)
DROP TABLE IF EXISTS #{table_name};
SQL
end
subject { described_class.new }
describe '#perform' do
let(:migration_class) { described_class.name }
let!(:job1) do
table(:background_migration_jobs).create!(
class_name: migration_class,
arguments: [1, 10, table_name, 'id', 'id', 'id_convert_to_bigint', sub_batch_size]
)
end
let!(:job2) do
table(:background_migration_jobs).create!(
class_name: migration_class,
arguments: [11, 20, table_name, 'id', 'id', 'id_convert_to_bigint', sub_batch_size]
)
end
it 'copies all primary keys in range' do
subject.perform(12, 15, table_name, 'id', 'id', 'id_convert_to_bigint', sub_batch_size)
expect(test_table.where('id = id_convert_to_bigint').pluck(:id)).to contain_exactly(12, 15)
expect(test_table.where(id_convert_to_bigint: 0).pluck(:id)).to contain_exactly(11, 19)
expect(test_table.all.count).to eq(4)
end
it 'copies all foreign keys in range' do
subject.perform(10, 14, table_name, 'id', 'fk', 'fk_convert_to_bigint', sub_batch_size)
expect(test_table.where('fk = fk_convert_to_bigint').pluck(:id)).to contain_exactly(11, 12)
expect(test_table.where(fk_convert_to_bigint: 0).pluck(:id)).to contain_exactly(15, 19)
expect(test_table.all.count).to eq(4)
end
it 'copies columns with NULLs' do
expect(test_table.where("name_convert_to_text = 'no name'").count).to eq(4)
subject.perform(10, 20, table_name, 'id', 'name', 'name_convert_to_text', sub_batch_size)
expect(test_table.where('name = name_convert_to_text').pluck(:id)).to contain_exactly(11, 12, 19)
expect(test_table.where('name is NULL and name_convert_to_text is NULL').pluck(:id)).to contain_exactly(15)
expect(test_table.where("name_convert_to_text = 'no name'").count).to eq(0)
end
it 'tracks completion with BackgroundMigrationJob' do
expect do
subject.perform(11, 20, table_name, 'id', 'id', 'id_convert_to_bigint', sub_batch_size)
end.to change { Gitlab::Database::BackgroundMigrationJob.succeeded.count }.from(0).to(1)
expect(job1.reload.status).to eq(0)
expect(job2.reload.status).to eq(1)
expect(test_table.where('id = id_convert_to_bigint').count).to eq(4)
end
end
end
...@@ -1548,6 +1548,69 @@ RSpec.describe Gitlab::Database::MigrationHelpers do ...@@ -1548,6 +1548,69 @@ RSpec.describe Gitlab::Database::MigrationHelpers do
end end
end end
describe '#initialize_conversion_of_integer_to_bigint' do
let(:user) { create(:user) }
let(:project) { create(:project, :repository) }
let(:issue) { create(:issue, project: project) }
let!(:event) do
create(:event, :created, project: project, target: issue, author: user)
end
context 'in a transaction' do
it 'raises RuntimeError' do
allow(model).to receive(:transaction_open?).and_return(true)
expect { model.initialize_conversion_of_integer_to_bigint(:events, :id) }
.to raise_error(RuntimeError)
end
end
context 'outside a transaction' do
before do
allow(model).to receive(:transaction_open?).and_return(false)
end
it 'creates a bigint column and starts backfilling it' do
expect(model)
.to receive(:add_column)
.with(
:events,
'id_convert_to_bigint',
:bigint,
default: 0,
null: false
)
expect(model)
.to receive(:install_rename_triggers)
.with(:events, :id, 'id_convert_to_bigint')
expect(model).to receive(:queue_background_migration_jobs_by_range_at_intervals).and_call_original
expect(BackgroundMigrationWorker)
.to receive(:perform_in)
.ordered
.with(
2.minutes,
'CopyColumnUsingBackgroundMigrationJob',
[event.id, event.id, :events, :id, :id, 'id_convert_to_bigint', 100]
)
expect(Gitlab::BackgroundMigration)
.to receive(:steal)
.ordered
.with('CopyColumnUsingBackgroundMigrationJob')
model.initialize_conversion_of_integer_to_bigint(
:events,
:id,
batch_size: 300,
sub_batch_size: 100
)
end
end
end
describe '#index_exists_by_name?' do describe '#index_exists_by_name?' do
it 'returns true if an index exists' do it 'returns true if an index exists' do
ActiveRecord::Base.connection.execute( ActiveRecord::Base.connection.execute(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment