Merge branch '338004-backfill-issue-work-item-type-id' into 'master'

Backfill work_item_type_id for all Issues See merge request gitlab-org/gitlab!71869

Merge branch '338004-backfill-issue-work-item-type-id' into 'master'
Backfill work_item_type_id for all Issues See merge request gitlab-org/gitlab!71869
98da9924 · Krasimir Angelov · cf1dc5fd · 1e078444 · 98da9924 · 98da9924
Commit 98da9924 authored Mar 23, 2022 by Krasimir Angelov
11 changed files
--- a/db/post_migrate/20220315181125_remove_issues_work_item_type_id_index.rb
+++ b/db/post_migrate/20220315181125_remove_issues_work_item_type_id_index.rb
+# frozen_string_literal: true
+
+class RemoveIssuesWorkItemTypeIdIndex < Gitlab::Database::Migration[1.0]
+  disable_ddl_transaction!
+
+  INDEX_NAME = 'index_issues_on_work_item_type_id'
+
+  def up
+    remove_concurrent_index_by_name :issues, name: INDEX_NAME
+  end
+
+  def down
+    add_concurrent_index :issues, :work_item_type_id, name: INDEX_NAME
+  end
+end
--- a/db/post_migrate/20220315181130_add_temporary_issue_type_index_for_work_item_types.rb
+++ b/db/post_migrate/20220315181130_add_temporary_issue_type_index_for_work_item_types.rb
+# frozen_string_literal: true
+
+class AddTemporaryIssueTypeIndexForWorkItemTypes < Gitlab::Database::Migration[1.0]
+  disable_ddl_transaction!
+
+  INDEX_NAME = 'tmp_index_issues_on_issue_type_and_id'
+
+  def up
+    add_concurrent_index :issues, [:issue_type, :id], name: INDEX_NAME
+  end
+
+  def down
+    remove_concurrent_index_by_name :issues, INDEX_NAME
+  end
+end
--- a/db/post_migrate/20220315181136_backfill_work_item_type_id_on_issues.rb
+++ b/db/post_migrate/20220315181136_backfill_work_item_type_id_on_issues.rb
+# frozen_string_literal: true
+
+class BackfillWorkItemTypeIdOnIssues < Gitlab::Database::Migration[1.0]
+  MIGRATION = 'BackfillWorkItemTypeIdForIssues'
+  BATCH_SIZE = 10_000
+  MAX_BATCH_SIZE = 30_000
+  SUB_BATCH_SIZE = 100
+  INTERVAL = 2.minutes
+
+  class MigrationWorkItemType < ApplicationRecord
+    self.table_name = 'work_item_types'
+
+    def self.id_by_type
+      where(namespace_id: nil).order(:base_type).pluck(:base_type, :id).to_h
+    end
+  end
+
+  def up
+    # We expect no more than 5 types. Only 3 of them are expected to have associated issues at the moment
+    MigrationWorkItemType.id_by_type.each do |base_type, type_id|
+      queue_batched_background_migration(
+        MIGRATION,
+        :issues,
+        :id,
+        base_type,
+        type_id,
+        job_interval: INTERVAL,
+        batch_size: BATCH_SIZE,
+        max_batch_size: MAX_BATCH_SIZE,
+        sub_batch_size: SUB_BATCH_SIZE
+      )
+    end
+  end
+
+  def down
+    Gitlab::Database::BackgroundMigration::BatchedMigration.where(job_class_name: MIGRATION).delete_all
+  end
+end
--- a/db/schema_migrations/20220315181125
+++ b/db/schema_migrations/20220315181125
+1276e86db3e2922cd1535e98f17768d1ab0dd653317f918fb8e6819760893a47
\ No newline at end of file
--- a/db/schema_migrations/20220315181130
+++ b/db/schema_migrations/20220315181130
+8ab13f76037450704a29220ca6af0174b373ce743620952e3bc197c952d00c1d
\ No newline at end of file
--- a/db/schema_migrations/20220315181136
+++ b/db/schema_migrations/20220315181136
+7dc3d314e76f26a539c0aa009bd45ea6ccc9c6108f798728909fd4b18e3c31b9
\ No newline at end of file
--- a/db/structure.sql
+++ b/db/structure.sql
@@ -27961,8 +27961,6 @@ CREATE INDEX index_issues_on_updated_at ON issues USING btree (updated_at);

 CREATE INDEX index_issues_on_updated_by_id ON issues USING btree (updated_by_id) WHERE (updated_by_id IS NOT NULL);

-CREATE INDEX index_issues_on_work_item_type_id ON issues USING btree (work_item_type_id);
-
 CREATE INDEX index_iterations_cadences_on_group_id ON iterations_cadences USING btree (group_id);

 CREATE UNIQUE INDEX index_jira_connect_installations_on_client_key ON jira_connect_installations USING btree (client_key);
@@ -29615,6 +29613,8 @@ CREATE INDEX tmp_index_for_namespace_id_migration_on_group_members ON members US

 CREATE INDEX tmp_index_for_namespace_id_migration_on_routes ON routes USING btree (id) WHERE ((namespace_id IS NULL) AND ((source_type)::text = 'Namespace'::text));

+CREATE INDEX tmp_index_issues_on_issue_type_and_id ON issues USING btree (issue_type, id);
+
 CREATE INDEX tmp_index_members_on_state ON members USING btree (state) WHERE (state = 2);

 CREATE INDEX tmp_index_namespaces_empty_traversal_ids_with_child_namespaces ON namespaces USING btree (id) WHERE ((parent_id IS NOT NULL) AND (traversal_ids = '{}'::integer[]));
--- a/lib/gitlab/background_migration/backfill_work_item_type_id_for_issues.rb
+++ b/lib/gitlab/background_migration/backfill_work_item_type_id_for_issues.rb
+# frozen_string_literal: true
+
+module Gitlab
+  module BackgroundMigration
+    # Backfills the `issues.work_item_type_id` column, replacing any
+    # instances of `NULL` with the appropriate `work_item_types.id` based on `issues.issue_type`
+    class BackfillWorkItemTypeIdForIssues
+      # Basic AR model for issues table
+      class MigrationIssue < ApplicationRecord
+        include ::EachBatch
+
+        self.table_name = 'issues'
+
+        scope :base_query, ->(base_type) { where(work_item_type_id: nil, issue_type: base_type) }
+      end
+
+      MAX_UPDATE_RETRIES = 3
+
+      def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, pause_ms, base_type, base_type_id)
+        parent_batch_relation = relation_scoped_to_range(batch_table, batch_column, start_id, end_id, base_type)
+
+        parent_batch_relation.each_batch(column: batch_column, of: sub_batch_size) do |sub_batch|
+          first, last = sub_batch.pluck(Arel.sql('min(id), max(id)')).first
+
+          # The query need to be reconstructed because .each_batch modifies the default scope
+          # See: https://gitlab.com/gitlab-org/gitlab/-/issues/330510
+          reconstructed_sub_batch = MigrationIssue.unscoped.base_query(base_type).where(id: first..last)
+
+          batch_metrics.time_operation(:update_all) do
+            update_with_retry(reconstructed_sub_batch, base_type_id)
+          end
+
+          pause_ms = 0 if pause_ms < 0
+          sleep(pause_ms * 0.001)
+        end
+      end
+
+      def batch_metrics
+        @batch_metrics ||= Gitlab::Database::BackgroundMigration::BatchMetrics.new
+      end
+
+      private
+
+      # Retry mechanism required as update statements on the issues table will randomly take longer than
+      # expected due to gin indexes https://gitlab.com/gitlab-org/gitlab/-/merge_requests/71869#note_775796352
+      def update_with_retry(sub_batch, base_type_id)
+        update_attempt = 1
+
+        begin
+          update_batch(sub_batch, base_type_id)
+        rescue ActiveRecord::StatementTimeout => e
+          update_attempt += 1
+
+          if update_attempt <= MAX_UPDATE_RETRIES
+            # sleeping 30 seconds as it might take a long time to clean the gin index pending list
+            sleep(30)
+            retry
+          end
+
+          raise e
+        end
+      end
+
+      def update_batch(sub_batch, base_type_id)
+        sub_batch.update_all(work_item_type_id: base_type_id)
+      end
+
+      def relation_scoped_to_range(source_table, source_key_column, start_id, end_id, base_type)
+        MigrationIssue.where(source_key_column => start_id..end_id).base_query(base_type)
+      end
+    end
+  end
+end
--- a/spec/db/schema_spec.rb
+++ b/spec/db/schema_spec.rb
@@ -10,6 +10,10 @@ RSpec.describe 'Database schema' do
  let(:tables) { connection.tables }
  let(:columns_name_with_jsonb) { retrieve_columns_name_with_jsonb }

+  IGNORED_INDEXES_ON_FKS = {
+    issues: %w[work_item_type_id]
+  }.with_indifferent_access.freeze
+
  # List of columns historically missing a FK, don't add more columns
  # See: https://docs.gitlab.com/ee/development/foreign_keys.html#naming-foreign-keys
  IGNORED_FK_COLUMNS = {
@@ -115,6 +119,7 @@ RSpec.describe 'Database schema' do
              columns.first.chomp
            end
            foreign_keys_columns = all_foreign_keys.map(&:column)
+            required_indexed_columns = foreign_keys_columns - ignored_index_columns(table)

            # Add the primary key column to the list of indexed columns because
            # postgres and mysql both automatically create an index on the primary
@@ -122,7 +127,7 @@ RSpec.describe 'Database schema' do
            # automatically generated indexes (like the primary key index).
            first_indexed_column.push(primary_key_column)

-            expect(first_indexed_column.uniq).to include(*foreign_keys_columns)
+            expect(first_indexed_column.uniq).to include(*required_indexed_columns)
          end
        end

@@ -305,8 +310,12 @@ RSpec.describe 'Database schema' do
    @models_by_table_name ||= ApplicationRecord.descendants.reject(&:abstract_class).group_by(&:table_name)
  end

-  def ignored_fk_columns(column)
-    IGNORED_FK_COLUMNS.fetch(column, [])
+  def ignored_fk_columns(table)
+    IGNORED_FK_COLUMNS.fetch(table, [])
+  end
+
+  def ignored_index_columns(table)
+    IGNORED_INDEXES_ON_FKS.fetch(table, [])
  end

  def ignored_limit_enums(model)

--- a/spec/lib/gitlab/background_migration/backfill_work_item_type_id_for_issues_spec.rb
+++ b/spec/lib/gitlab/background_migration/backfill_work_item_type_id_for_issues_spec.rb
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::BackgroundMigration::BackfillWorkItemTypeIdForIssues do
+  subject(:migrate) { migration.perform(start_id, end_id, batch_table, batch_column, sub_batch_size, pause_ms, issue_type_enum[:issue], issue_type.id) }
+
+  let(:migration) { described_class.new }
+
+  let(:batch_table) { 'issues' }
+  let(:batch_column) { 'id' }
+  let(:sub_batch_size) { 2 }
+  let(:pause_ms) { 0 }
+
+  # let_it_be can't be used in migration specs because all tables but `work_item_types` are deleted after each spec
+  let(:issue_type_enum) { { issue: 0, incident: 1, test_case: 2, requirement: 3, task: 4 } }
+  let(:namespace) { table(:namespaces).create!(name: 'namespace', path: 'namespace') }
+  let(:project) { table(:projects).create!(namespace_id: namespace.id) }
+  let(:issues_table) { table(:issues) }
+  let(:issue_type) { table(:work_item_types).find_by!(namespace_id: nil, base_type: issue_type_enum[:issue]) }
+
+  let(:issue1) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:issue]) }
+  let(:issue2) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:issue]) }
+  let(:issue3) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:issue]) }
+  let(:incident1) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:incident]) }
+  # test_case and requirement are EE only, but enum values exist on the FOSS model
+  let(:test_case1) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:test_case]) }
+  let(:requirement1) { issues_table.create!(project_id: project.id, issue_type: issue_type_enum[:requirement]) }
+
+  let(:start_id) { issue1.id }
+  let(:end_id) { requirement1.id }
+
+  let(:all_issues) { [issue1, issue2, issue3, incident1, test_case1, requirement1] }
+
+  it 'sets work_item_type_id only for the given type' do
+    expect(all_issues).to all(have_attributes(work_item_type_id: nil))
+
+    expect { migrate }.to make_queries_matching(/UPDATE \"issues\" SET "work_item_type_id"/, 2)
+    all_issues.each(&:reload)
+
+    expect([issue1, issue2, issue3]).to all(have_attributes(work_item_type_id: issue_type.id))
+    expect(all_issues - [issue1, issue2, issue3]).to all(have_attributes(work_item_type_id: nil))
+  end
+
+  it 'tracks timings of queries' do
+    expect(migration.batch_metrics.timings).to be_empty
+
+    expect { migrate }.to change { migration.batch_metrics.timings }
+  end
+
+  it 'retries on ActiveRecord::StatementTimeout' do
+    expect(migration).to receive(:update_batch).exactly(3).times.and_raise(ActiveRecord::StatementTimeout)
+    expect(migration).to receive(:sleep).with(30).twice
+
+    expect do
+      migrate
+    end.to raise_error(ActiveRecord::StatementTimeout)
+  end
+end
--- a/spec/migrations/backfill_work_item_type_id_on_issues_spec.rb
+++ b/spec/migrations/backfill_work_item_type_id_on_issues_spec.rb
+# frozen_string_literal: true
+
+require 'spec_helper'
+require_migration!
+
+RSpec.describe BackfillWorkItemTypeIdOnIssues, :migration do
+  let_it_be(:migration) { described_class::MIGRATION }
+  let_it_be(:interval) { 2.minutes }
+  let_it_be(:issue_type_enum) { { issue: 0, incident: 1, test_case: 2, requirement: 3, task: 4 } }
+  let_it_be(:base_work_item_type_ids) do
+    table(:work_item_types).where(namespace_id: nil).order(:base_type).each_with_object({}) do |type, hash|
+      hash[type.base_type] = type.id
+    end
+  end
+
+  describe '#up' do
+    it 'correctly schedules background migrations' do
+      Sidekiq::Testing.fake! do
+        freeze_time do
+          migrate!
+
+          scheduled_migrations = Gitlab::Database::BackgroundMigration::BatchedMigration.where(job_class_name: migration)
+          work_item_types = table(:work_item_types).where(namespace_id: nil)
+
+          expect(scheduled_migrations.count).to eq(work_item_types.count)
+
+          [:issue, :incident, :test_case, :requirement, :task].each do |issue_type|
+            expect(migration).to have_scheduled_batched_migration(
+              table_name: :issues,
+              column_name: :id,
+              job_arguments: [issue_type_enum[issue_type], base_work_item_type_ids[issue_type_enum[issue_type]]],
+              interval: interval,
+              batch_size: described_class::BATCH_SIZE,
+              max_batch_size: described_class::MAX_BATCH_SIZE,
+              sub_batch_size: described_class::SUB_BATCH_SIZE
+            )
+          end
+        end
+      end
+    end
+  end
+
+  describe '#down' do
+    it 'deletes all batched migration records' do
+      migrate!
+      schema_migrate_down!
+
+      expect(migration).not_to have_scheduled_batched_migration
+    end
+  end
+end