Commit 502eec17 authored by Adam Hegyi's avatar Adam Hegyi

Merge branch '337100_backfill_project_namespaces' into 'master'

Add migration for backfilling project namespaces

See merge request gitlab-org/gitlab!72527
parents 083d9a51 6af41560
......@@ -17,6 +17,9 @@ class Namespace < ApplicationRecord
include EachBatch
ignore_column :delayed_project_removal, remove_with: '14.1', remove_after: '2021-05-22'
# Temporary column used for back-filling project namespaces.
# Remove it once the back-filling of all project namespaces is done.
ignore_column :tmp_project_id, remove_with: '14.7', remove_after: '2022-01-22'
# Tells ActiveRecord not to store the full class name, in order to save some space
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/69794
......
# frozen_string_literal: true
# Adds the temporary `tmp_project_id` integer column to the `namespaces` table.
class AddTmpProjectIdColumnToNamespaces < Gitlab::Database::Migration[1.0]
  enable_lock_retries!

  def change
    # Temporary column: it lets project namespace records be batch-inserted into
    # `namespaces` and afterwards linked back to their rows in `projects`.
    add_column(:namespaces, :tmp_project_id, :integer) # rubocop: disable Migration/AddColumnsToWideTables
  end
end
# frozen_string_literal: true
# Creates (and on rollback drops) the unique index on `namespaces.tmp_project_id`.
class AddIndexToTmpProjectIdColumnOnNamespacesTable < Gitlab::Database::Migration[1.0]
  INDEX_NAME = 'tmp_index_on_tmp_project_id_on_namespaces'

  # The concurrent index helpers below are run outside of a DDL transaction.
  disable_ddl_transaction!

  # Adds the unique index named INDEX_NAME.
  def up
    add_concurrent_index(:namespaces, :tmp_project_id, unique: true, name: INDEX_NAME)
  end

  # Removes the index by its name.
  def down
    remove_concurrent_index_by_name(:namespaces, INDEX_NAME)
  end
end
# frozen_string_literal: true
# Adds a foreign key from the temporary `namespaces.tmp_project_id` column to
# `projects`. The resulting constraint in structure.sql is `ON DELETE CASCADE`.
class AddFkToTmpProjectIdColumnOnNamespacesTable < Gitlab::Database::Migration[1.0]
  disable_ddl_transaction!

  # Adds the foreign key using the concurrent helper.
  def up
    add_concurrent_foreign_key :namespaces, :projects, column: :tmp_project_id
  end

  # Drops the foreign key again.
  #
  # Removing a foreign key needs locks on both tables involved. Because this
  # migration runs without a DDL transaction, the removal is wrapped in
  # `with_lock_retries` so it retries with a lock timeout instead of queueing
  # behind (and blocking) other statements on busy tables.
  def down
    with_lock_retries do
      remove_foreign_key :namespaces, column: :tmp_project_id
    end
  end
end
# frozen_string_literal: true
# Creates (and on rollback drops) a btree index on `web_hooks.group_id`.
class AddIndexToGroupIdColumnOnWebhooksTable < Gitlab::Database::Migration[1.0]
  INDEX_NAME = 'index_on_group_id_on_webhooks'

  # The concurrent index helpers below are run outside of a DDL transaction.
  disable_ddl_transaction!

  # Adds the index named INDEX_NAME.
  def up
    add_concurrent_index(:web_hooks, :group_id, name: INDEX_NAME)
  end

  # Removes the index by its name.
  def down
    remove_concurrent_index_by_name(:web_hooks, INDEX_NAME)
  end
end
1cadc3a932d5b62cfeafcd4090eddc37b44997dbbd0b34da1c7c87a5774bb683
\ No newline at end of file
9a62f0ec43ab295619d82494090c38539cb16408c8971bdde86bb8d02546f558
\ No newline at end of file
30e9632877d3ad33528be0f56962c0ab57f5eee3889183d9638cbaea903a3d82
\ No newline at end of file
14bb815cbdad2db56dafb7eaaff893de96116a1a9e8d6c5ed95f4bef9b9717fc
\ No newline at end of file
......@@ -16375,7 +16375,8 @@ CREATE TABLE namespaces (
push_rule_id bigint,
shared_runners_enabled boolean DEFAULT true NOT NULL,
allow_descendants_override_disabled_shared_runners boolean DEFAULT false NOT NULL,
traversal_ids integer[] DEFAULT '{}'::integer[] NOT NULL
traversal_ids integer[] DEFAULT '{}'::integer[] NOT NULL,
tmp_project_id integer
);
CREATE SEQUENCE namespaces_id_seq
......@@ -26591,6 +26592,8 @@ CREATE INDEX index_oauth_openid_requests_on_access_grant_id ON oauth_openid_requ
CREATE UNIQUE INDEX index_on_deploy_keys_id_and_type_and_public ON keys USING btree (id, type) WHERE (public = true);
CREATE INDEX index_on_group_id_on_webhooks ON web_hooks USING btree (group_id);
CREATE INDEX index_on_identities_lower_extern_uid_and_provider ON identities USING btree (lower((extern_uid)::text), provider);
CREATE UNIQUE INDEX index_on_instance_statistics_recorded_at_and_identifier ON analytics_usage_trends_measurements USING btree (identifier, recorded_at);
......@@ -27769,6 +27772,8 @@ CREATE INDEX tmp_index_namespaces_empty_traversal_ids_with_child_namespaces ON n
CREATE INDEX tmp_index_namespaces_empty_traversal_ids_with_root_namespaces ON namespaces USING btree (id) WHERE ((parent_id IS NULL) AND (traversal_ids = '{}'::integer[]));
CREATE UNIQUE INDEX tmp_index_on_tmp_project_id_on_namespaces ON namespaces USING btree (tmp_project_id);
CREATE INDEX tmp_index_on_vulnerabilities_non_dismissed ON vulnerabilities USING btree (id) WHERE (state <> 2);
CREATE UNIQUE INDEX uniq_pkgs_deb_grp_architectures_on_distribution_id_and_name ON packages_debian_group_architectures USING btree (distribution_id, name);
......@@ -29012,6 +29017,9 @@ ALTER TABLE ONLY application_settings
ALTER TABLE ONLY merge_requests
ADD CONSTRAINT fk_6a5165a692 FOREIGN KEY (milestone_id) REFERENCES milestones(id) ON DELETE SET NULL;
ALTER TABLE ONLY namespaces
ADD CONSTRAINT fk_6a77f66919 FOREIGN KEY (tmp_project_id) REFERENCES projects(id) ON DELETE CASCADE;
ALTER TABLE ONLY geo_event_log
ADD CONSTRAINT fk_6ada82d42a FOREIGN KEY (container_repository_updated_event_id) REFERENCES geo_container_repository_updated_events(id) ON DELETE CASCADE;
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module ProjectNamespaces
# Back-fill project namespaces for projects that do not yet have a namespace.
#
# TODO: remove this comment when an actual backfill migration is added.
#
# This is first being added without an actual migration as we need to initially test
# if backfilling project namespaces affects performance in any significant way.
# rubocop: disable Metrics/ClassLength
# Creates ('up') or removes ('down') the `namespaces` rows of type 'Project'
# that back the projects in a given id range, optionally restricted to the
# projects living in one group hierarchy.
#
# All table access goes through the isolated models (IsolatedModels) or raw
# SQL, so the job does not depend on the application's Project/Namespace
# models (the application Namespace model ignores `tmp_project_id`).
class BackfillProjectNamespaces
  # Number of projects handled per insert/update round.
  BATCH_SIZE = 100
  # Number of projects whose project namespaces are deleted per DELETE statement.
  DELETE_BATCH_SIZE = 10
  # Value stored in `namespaces.type` for project namespaces.
  PROJECT_NAMESPACE_STI_NAME = 'Project'

  IsolatedModels = ::Gitlab::BackgroundMigration::ProjectNamespaces::Models

  # Entry point of the job.
  #
  # @param start_id [Integer] lowest project id of the range (inclusive)
  # @param end_id [Integer] highest project id of the range (inclusive)
  # @param namespace_id [Integer, nil] when given, only projects whose parent is
  #   this group or one of its descendant groups are processed
  # @param migration_type [String] 'up' to back-fill, 'down' to clean up
  # @raise [RuntimeError] when migration_type is neither 'up' nor 'down'
  def perform(start_id, end_id, namespace_id, migration_type = 'up')
    load_project_ids(start_id, end_id, namespace_id)

    case migration_type
    when 'up'
      backfill_project_namespaces(namespace_id)
      mark_job_as_succeeded(start_id, end_id, namespace_id, 'up')
    when 'down'
      cleanup_backfilled_project_namespaces(namespace_id)
      mark_job_as_succeeded(start_id, end_id, namespace_id, 'down')
    else
      raise "Unknown migration type"
    end
  end

  private

  # Ids of the projects selected by load_project_ids.
  attr_accessor :project_ids

  # Creates project namespaces for the loaded projects, BATCH_SIZE at a time.
  # `namespace_id` is accepted for symmetry with `perform` but not used here.
  def backfill_project_namespaces(namespace_id)
    project_ids.each_slice(BATCH_SIZE) do |batch_ids|
      # We need to lock these project records for the period when we create project namespaces
      # and link them to projects so that if a project is modified in the time between creating
      # project namespaces `batch_insert_namespaces` and linking them to projects `batch_update_projects`
      # we do not get them out of sync.
      #
      # see https://gitlab.com/gitlab-org/gitlab/-/merge_requests/72527#note_730679469
      IsolatedModels::Project.transaction do
        # A relation is lazy: without `load` no `SELECT ... FOR UPDATE` is sent
        # to the database and no row would actually be locked.
        IsolatedModels::Project.where(id: batch_ids).select(:id).lock('FOR UPDATE').load

        batch_insert_namespaces(batch_ids)
        batch_update_projects(batch_ids)
      end

      batch_update_project_namespaces_traversal_ids(batch_ids)
    end
  end

  # Removes previously back-filled project namespaces, BATCH_SIZE projects at a time.
  # `namespace_id` is accepted for symmetry with `perform` but not used here.
  def cleanup_backfilled_project_namespaces(namespace_id)
    project_ids.each_slice(BATCH_SIZE) do |batch_ids|
      # IMPORTANT: first nullify project_namespace_id in projects table to avoid removing projects when records
      # from namespaces are deleted due to FK/triggers
      nullify_project_namespaces_in_projects(batch_ids)
      delete_project_namespace_records(batch_ids)
    end
  end

  # Inserts one `namespaces` row per project with a single INSERT ... SELECT.
  # The SELECT column order must match the INSERT column list. Rows that would
  # violate a unique constraint (for instance the unique index on
  # `tmp_project_id`) are skipped, so re-running does not create duplicates.
  def batch_insert_namespaces(project_ids)
    projects = IsolatedModels::Project.where(id: project_ids)
      .select("projects.id, projects.name, projects.path, projects.namespace_id, projects.visibility_level, shared_runners_enabled, '#{PROJECT_NAMESPACE_STI_NAME}', now(), now()")

    ActiveRecord::Base.connection.execute <<~SQL
      INSERT INTO namespaces (tmp_project_id, name, path, parent_id, visibility_level, shared_runners_enabled, type, created_at, updated_at)
      #{projects.to_sql}
      ON CONFLICT DO NOTHING;
    SQL
  end

  # Points `projects.project_namespace_id` at the namespace whose
  # `tmp_project_id` equals the project id. Rows already holding the right
  # value are left untouched.
  def batch_update_projects(project_ids)
    projects = IsolatedModels::Project.where(id: project_ids)
      .joins("INNER JOIN namespaces ON projects.id = namespaces.tmp_project_id")
      .select("namespaces.id, namespaces.tmp_project_id")

    ActiveRecord::Base.connection.execute <<~SQL
      WITH cte(project_namespace_id, project_id) AS #{::Gitlab::Database::AsWithMaterialized.materialized_if_supported} (
        #{projects.to_sql}
      )
      UPDATE projects
      SET project_namespace_id = cte.project_namespace_id
      FROM cte
      WHERE id = cte.project_id AND projects.project_namespace_id IS DISTINCT FROM cte.project_namespace_id
    SQL
  end

  # Sets `traversal_ids` of each project namespace to the parent namespace's
  # `traversal_ids` with the project namespace's own id appended.
  def batch_update_project_namespaces_traversal_ids(project_ids)
    namespaces = IsolatedModels::Namespace.where(tmp_project_id: project_ids)
      .joins("INNER JOIN namespaces n2 ON namespaces.parent_id = n2.id")
      .select("namespaces.id as project_namespace_id, n2.traversal_ids")

    ActiveRecord::Base.connection.execute <<~SQL
      UPDATE namespaces
      SET traversal_ids = array_append(project_namespaces.traversal_ids, project_namespaces.project_namespace_id)
      FROM (#{namespaces.to_sql}) as project_namespaces(project_namespace_id, traversal_ids)
      WHERE id = project_namespaces.project_namespace_id
    SQL
  end

  # Clears `project_namespace_id` on the given projects.
  def nullify_project_namespaces_in_projects(project_ids)
    IsolatedModels::Project.where(id: project_ids).update_all(project_namespace_id: nil)
  end

  # Deletes the 'Project' namespaces linked (through `tmp_project_id`) to the
  # given projects, DELETE_BATCH_SIZE projects per statement.
  def delete_project_namespace_records(project_ids)
    project_ids.each_slice(DELETE_BATCH_SIZE) do |p_ids|
      IsolatedModels::Namespace.where(type: PROJECT_NAMESPACE_STI_NAME).where(tmp_project_id: p_ids).delete_all
    end
  end

  # Loads into @project_ids the ids of projects within start_id..end_id. When
  # namespace_id is present the set is narrowed to projects whose
  # `namespace_id` is that group or one of its descendant groups.
  def load_project_ids(start_id, end_id, namespace_id)
    projects = IsolatedModels::Project.arel_table
    relation = IsolatedModels::Project.where(projects[:id].between(start_id..end_id))
    relation = relation.where(projects[:namespace_id].in(Arel::Nodes::SqlLiteral.new(hierarchy_cte(namespace_id)))) if namespace_id

    @project_ids = relation.pluck(:id)
  end

  # Records the job identified by `arguments` as succeeded.
  def mark_job_as_succeeded(*arguments)
    ::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('BackfillProjectNamespaces', arguments)
  end

  # Returns SQL selecting the id of the given group and of all its descendant
  # groups. The id is coerced with `to_i` before being interpolated.
  def hierarchy_cte(root_namespace_id)
    <<-SQL
      WITH RECURSIVE "base_and_descendants" AS (
        (
          SELECT "namespaces"."id"
          FROM "namespaces"
          WHERE "namespaces"."type" = 'Group' AND "namespaces"."id" = #{root_namespace_id.to_i}
        )
        UNION
        (
          SELECT "namespaces"."id"
          FROM "namespaces", "base_and_descendants"
          WHERE "namespaces"."type" = 'Group' AND "namespaces"."parent_id" = "base_and_descendants"."id"
        )
      )
      SELECT "id" FROM "base_and_descendants" AS "namespaces"
    SQL
  end
end
# rubocop: enable Metrics/ClassLength
end
end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module ProjectNamespaces
module Models
# Isolated Namespace model: a bare ActiveRecord class bound to the `namespaces`
# table, used by the background migration instead of the application model.
class Namespace < ActiveRecord::Base
include EachBatch
self.table_name = 'namespaces'
# Use a placeholder inheritance column so the `type` column is read as plain
# data rather than used for single-table inheritance.
self.inheritance_column = :_type_disabled
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module ProjectNamespaces
module Models
# Isolated Project model: a bare ActiveRecord class bound to the `projects`
# table, used by the background migration instead of the application model.
class Project < ActiveRecord::Base
include EachBatch
self.table_name = 'projects'
end
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Exercises both directions of the project namespaces back-fill ('up' creates
# project namespaces, 'down' removes them), for all projects and for a single
# group hierarchy, with the default batch size and with BATCH_SIZE stubbed to 2.
RSpec.describe Gitlab::BackgroundMigration::ProjectNamespaces::BackfillProjectNamespaces, :migration do
include MigrationsHelpers
context 'when migrating data', :aggregate_failures do
let(:projects) { table(:projects) }
let(:namespaces) { table(:namespaces) }
# Two root groups; a tree of sub-groups and projects is generated below each of them.
let(:parent_group1) { namespaces.create!(name: 'parent_group1', path: 'parent_group1', visibility_level: 20, type: 'Group') }
let(:parent_group2) { namespaces.create!(name: 'test1', path: 'test1', runners_token: 'my-token1', project_creation_level: 1, visibility_level: 20, type: 'Group') }
# NOTE(review): these two projects are not referenced anywhere in this spec; `let` is lazy,
# so they are presumably never created. The expected count of 14 projects per hierarchy
# matches the generated tree alone - confirm whether they can be removed.
let(:parent_group1_project) { projects.create!(name: 'parent_group1_project', path: 'parent_group1_project', namespace_id: parent_group1.id, visibility_level: 20) }
let(:parent_group2_project) { projects.create!(name: 'parent_group2_project', path: 'parent_group2_project', namespace_id: parent_group2.id, visibility_level: 20) }
# With 2 children per group and 3 levels the generator creates 2 + 4 + 8 = 14 sub-groups
# under each root group, each sub-group holding exactly one project.
let(:child_nodes_count) { 2 }
let(:tree_depth) { 3 }
let(:backfilled_namespace) { nil }
before do
BackfillProjectNamespaces::TreeGenerator.new(namespaces, projects, [parent_group1, parent_group2], child_nodes_count, tree_depth).build_tree
end
describe '#up' do
# Shared by the single-batch and multi-batch contexts at the end of this describe.
shared_examples 'back-fill project namespaces' do
it 'back-fills all project namespaces' do
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
projects_count = ::Project.count
# Each private batch step is expected to run once per slice of BATCH_SIZE projects.
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_count = ::Namespace.where(type: 'Project').count
migration = described_class.new
expect(projects_count).not_to eq(project_namespaces_count)
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(Namespace.where(type: 'Project'), :count)
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
check_projects_in_sync_with(Namespace.where(type: 'Project'))
end
context 'when passing specific group as parameter' do
let(:backfilled_namespace) { parent_group1 }
it 'back-fills project namespaces for the specified group hierarchy' do
backfilled_namespace_projects = base_ancestor(backfilled_namespace).first.all_projects
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
expect(project_namespaces_in_hierarchy.count).to eq(0)
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect(group_projects_count).to eq(14)
expect(project_namespaces_in_hierarchy.count).to eq(0)
migration.perform(start_id, end_id, backfilled_namespace.id, 'up')
expect(project_namespaces_in_hierarchy.count).to eq(14)
check_projects_in_sync_with(project_namespaces_in_hierarchy)
end
end
context 'when projects already have project namespaces' do
# Back-fill only the first hierarchy up front, so the example below starts from a partially migrated state.
before do
hierarchy1_projects = base_ancestor(parent_group1).first.all_projects
start_id = hierarchy1_projects.minimum(:id)
end_id = hierarchy1_projects.maximum(:id)
described_class.new.perform(start_id, end_id, parent_group1.id, 'up')
end
it 'does not duplicate project namespaces' do
# check there are already some project namespaces but not for all
projects_count = ::Project.count
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces = ::Namespace.where(type: 'Project')
migration = described_class.new
expect(project_namespaces_in_hierarchy(base_ancestor(parent_group1)).count).to be >= 14
expect(project_namespaces_in_hierarchy(base_ancestor(parent_group2)).count).to eq(0)
expect(projects_count).not_to eq(project_namespaces.count)
# run migration again to test we do not generate extra project namespaces
expect(migration).to receive(:batch_insert_namespaces).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:batch_update_project_namespaces_traversal_ids).exactly(batches_count).and_call_original
expect { migration.perform(start_id, end_id, nil, 'up') }.to change(project_namespaces, :count).by(14)
expect(projects_count).to eq(project_namespaces.count)
end
end
end
# Sanity check of the generated fixture before any back-fill has run.
it 'checks no project namespaces exist in the defined hierarchies' do
hierarchy1_project_namespaces = project_namespaces_in_hierarchy(base_ancestor(parent_group1))
hierarchy2_project_namespaces = project_namespaces_in_hierarchy(base_ancestor(parent_group2))
hierarchy1_projects_count = base_ancestor(parent_group1).first.all_projects.count
hierarchy2_projects_count = base_ancestor(parent_group2).first.all_projects.count
expect(hierarchy1_project_namespaces).to be_empty
expect(hierarchy2_project_namespaces).to be_empty
expect(hierarchy1_projects_count).to eq(14)
expect(hierarchy2_projects_count).to eq(14)
end
context 'back-fill project namespaces in a single batch' do
it_behaves_like 'back-fill project namespaces'
end
context 'back-fill project namespaces in batches' do
# A batch size of 2 forces the fixture's projects to be processed in several slices.
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
end
it_behaves_like 'back-fill project namespaces'
end
end
describe '#down' do
before do
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
# back-fill first
described_class.new.perform(start_id, end_id, nil, 'up')
end
# Shared by the single-batch and multi-batch contexts at the end of this describe.
shared_examples 'cleanup project namespaces' do
it 'removes project namespaces' do
projects_count = ::Project.count
start_id = ::Project.minimum(:id)
end_id = ::Project.maximum(:id)
migration = described_class.new
batches_count = (projects_count / described_class::BATCH_SIZE.to_f).ceil
expect(projects_count).to be > 0
expect(projects_count).to eq(::Namespace.where(type: 'Project').count)
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, nil, 'down')
# Projects themselves must survive the removal of their project namespaces.
expect(::Project.count).to be > 0
expect(::Namespace.where(type: 'Project').count).to eq(0)
end
context 'when passing specific group as parameter' do
let(:backfilled_namespace) { parent_group1 }
it 'removes project namespaces only for the specific group hierarchy' do
backfilled_namespace_projects = base_ancestor(backfilled_namespace).first.all_projects
start_id = backfilled_namespace_projects.minimum(:id)
end_id = backfilled_namespace_projects.maximum(:id)
group_projects_count = backfilled_namespace_projects.count
batches_count = (group_projects_count / described_class::BATCH_SIZE.to_f).ceil
project_namespaces_in_hierarchy = project_namespaces_in_hierarchy(base_ancestor(backfilled_namespace))
migration = described_class.new
expect(project_namespaces_in_hierarchy.count).to eq(14)
expect(migration).to receive(:nullify_project_namespaces_in_projects).exactly(batches_count).and_call_original
expect(migration).to receive(:delete_project_namespace_records).exactly(batches_count).and_call_original
migration.perform(start_id, end_id, backfilled_namespace.id, 'down')
# Project namespaces of the other hierarchy must still be there.
expect(::Namespace.where(type: 'Project').count).to be > 0
expect(project_namespaces_in_hierarchy.count).to eq(0)
end
end
end
context 'cleanup project namespaces in a single batch' do
it_behaves_like 'cleanup project namespaces'
end
context 'cleanup project namespaces in batches' do
# A batch size of 2 forces the fixture's projects to be processed in several slices.
before do
stub_const("#{described_class.name}::BATCH_SIZE", 2)
end
it_behaves_like 'cleanup project namespaces'
end
end
end
# Returns a ::Namespace relation containing only the given record, usable as a hierarchy base.
def base_ancestor(ancestor)
::Namespace.where(id: ancestor.id)
end
# Returns the namespaces of type 'Project' found among base_node and its descendants.
def project_namespaces_in_hierarchy(base_node)
Gitlab::ObjectHierarchy.new(base_node).base_and_descendants.where(type: 'Project')
end
# Asserts that every given project namespace mirrors its project: the namespace's id, name, path,
# parent_id, visibility_level and shared_runners_enabled must equal the project's
# project_namespace_id, name, path, namespace_id, visibility_level and shared_runners_enabled.
def check_projects_in_sync_with(namespaces)
project_namespaces_attrs = namespaces.order(:id).pluck(:id, :name, :path, :parent_id, :visibility_level, :shared_runners_enabled)
corresponding_projects_attrs = Project.where(project_namespace_id: project_namespaces_attrs.map(&:first))
.order(:project_namespace_id).pluck(:project_namespace_id, :name, :path, :namespace_id, :visibility_level, :shared_runners_enabled)
expect(project_namespaces_attrs).to eq(corresponding_projects_attrs)
end
end
module BackfillProjectNamespaces
  # Spec helper that grows a tree of sub-groups below a set of root groups and
  # puts exactly one project into every sub-group it creates.
  class TreeGenerator
    # @param namespaces [#create!] receives the attributes of each new sub-group;
    #   the returned object must respond to `id`
    # @param projects [#create!] receives the attributes of each new project
    # @param parent_nodes [Array<#id>] root groups the tree is built under
    # @param child_nodes_count [Integer] sub-groups created below every group
    # @param tree_depth [Integer] number of levels generated below the roots
    def initialize(namespaces, projects, parent_nodes, child_nodes_count, tree_depth)
      @namespaces = namespaces
      @projects = projects
      @subgroups_depth = tree_depth
      @resource_count = child_nodes_count
      # Index 0 holds the root ids; index N holds the ids of groups created at level N.
      @all_groups = [parent_nodes.map(&:id)]
    end

    # Creates the sub-groups and projects level by level, from the roots down.
    def build_tree
      (1..@subgroups_depth).each do |level|
        # The previous level may not have produced any group (for instance when
        # child_nodes_count is 0); treat that as "nothing to expand" instead of
        # calling `each` on nil.
        parent_groups = @all_groups[level - 1] || []

        parent_groups.each do |parent_id|
          @resource_count.times do |i|
            group_path = "child#{i}_level#{level}"
            project_path = "project#{i}_level#{level}"

            sub_group = @namespaces.create!(name: group_path, path: group_path, parent_id: parent_id, visibility_level: 20, type: 'Group')
            @projects.create!(name: project_path, path: project_path, namespace_id: sub_group.id, visibility_level: 20)

            track_group_id(level, sub_group.id)
          end
        end
      end
    end

    # Remembers group_id as one of the groups created at depth_level so the
    # next level can use it as a parent.
    def track_group_id(depth_level, group_id)
      (@all_groups[depth_level] ||= []) << group_id
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment