Commit 9b553e50 authored by Yorick Peterse's avatar Yorick Peterse

Fix MR commits with missing committers/authors

In MR https://gitlab.com/gitlab-org/gitlab/-/merge_requests/63669 we
introduced a new data format for storing merge request diff commit
authors and committers. As part of this work we made changes to the
import/export code to support this new format, and added a set of
migrations to migrate existing data to this new format. At this time we
supported reading and writing of data in both the old and new format,
allowing us to gradually migrate data over to the new format.

In https://gitlab.com/gitlab-org/gitlab/-/merge_requests/72219 we
ensured all migrations are done, stopped using the old data format, and
removed the columns storing this data.

Unfortunately, this chain of events uncovered a bug in our import/export
logic. Consider the following timeline of events:

1. You export project "Cooking Recipes" from a GitLab instance running a
   version earlier than 14.1 (e.g. 14.0).
2. The instance you intend to import this project into is running 14.1
   or newer. Existing data has been fully migrated already.
3. You import the project into this new instance.

At this point, the imported data is using the old format, not the
format. This is because we forgot to take into account users importing
exports using GitLab 14.0 or older, instead only covering exports
generated using GitLab 14.1 or newer. Because the background migrations
finished, or the data imported would fall in a "bucket" (= a chunk or
rows to migrate) that had already been migrated, the data would never be
updated to the new format.

In this commit we resolve this problem in two steps. First, we change
the import/export logic to support importing data in both the old and
new format. Exports still use the new format. In addition, we include a
background migration that processes all projects created using a GitLab
import/export since the first mentioned merge request was introduced.
For each such project we scan over the merge request diff commits and
fix any that are missing the commit author or committer details.

For small self-hosted instances this process is unlikely to take more
than a few minutes. On GitLab.com however we expect this process to take
a few days, as we have to process around 200 000 projects imported since
July. This means we'll likely need additional manual intervention
similar to the manual work needed for
https://gitlab.com/gitlab-org/gitlab/-/issues/334394.

See https://gitlab.com/gitlab-org/gitlab/-/issues/344080 for additional
details.

Changelog: fixed
parent e1d2685e
# frozen_string_literal: true
class ScheduleFixMergeRequestDiffCommitUsersMigration < Gitlab::Database::Migration[1.0]
disable_ddl_transaction!
MIGRATION_CLASS = 'FixMergeRequestDiffCommitUsers'
class Project < ApplicationRecord
include EachBatch
self.table_name = 'projects'
end
def up
# This is the day on which we merged
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/63669. Since the
# deploy of this MR we may have imported projects using the old format, but
# after their merge_request_diff_id range had been migrated by Sidekiq. As a
# result, there may be rows without a committer_id or commit_author_id
# field.
date = '2021-07-07 00:00:00'
transaction do
Project.each_batch(of: 10_000) do |batch|
time = Time.now.utc
rows = batch
.where('created_at >= ?', date)
.where(import_type: 'gitlab_project')
.pluck(:id)
.map do |id|
Gitlab::Database::BackgroundMigrationJob.new(
class_name: MIGRATION_CLASS,
arguments: [id],
created_at: time,
updated_at: time
)
end
Gitlab::Database::BackgroundMigrationJob
.bulk_insert!(rows, validate: false)
end
end
job = Gitlab::Database::BackgroundMigrationJob
.for_migration_class(MIGRATION_CLASS)
.pending
.first
migrate_in(2.minutes, MIGRATION_CLASS, job.arguments) if job
end
def down
# no-op
end
end
385c540b1f80c31a5ba009ae3507d2b855a737389bcb75c07a8872f26f8a9b44
\ No newline at end of file
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# Background migration for fixing merge_request_diff_commit rows that don't
# have committer/author details due to
# https://gitlab.com/gitlab-org/gitlab/-/issues/344080.
#
# This migration acts on a single project and corrects its data. Because
# this process needs Git/Gitaly access, and duplicating all that code is far
# too much, this migration relies on global models such as Project,
# MergeRequest, etc.
class FixMergeRequestDiffCommitUsers
def initialize
@commits = {}
@users = {}
end
def perform(project_id)
if (project = ::Project.find_by_id(project_id))
process(project)
end
::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(
'FixMergeRequestDiffCommitUsers',
[project_id]
)
schedule_next_job
end
def process(project)
# Loading everything using one big query may result in timeouts (e.g.
# for projects the size of gitlab-org/gitlab). So instead we query
# data on a per merge request basis.
project.merge_requests.each_batch do |mrs|
::MergeRequestDiffCommit
.select([
:merge_request_diff_id,
:relative_order,
:sha,
:committer_id,
:commit_author_id
])
.joins(merge_request_diff: :merge_request)
.where(merge_requests: { id: mrs.select(:id) })
.where('commit_author_id IS NULL OR committer_id IS NULL')
.each do |commit|
update_commit(project, commit)
end
end
end
# rubocop: disable Metrics/AbcSize
def update_commit(project, row)
commit = find_commit(project, row.sha)
updates = []
unless row.commit_author_id
author_id = find_or_create_user(commit, :author_name, :author_email)
updates << [arel_table[:commit_author_id], author_id] if author_id
end
unless row.committer_id
committer_id =
find_or_create_user(commit, :committer_name, :committer_email)
updates << [arel_table[:committer_id], committer_id] if committer_id
end
return if updates.empty?
update = Arel::UpdateManager
.new
.table(MergeRequestDiffCommit.arel_table)
.where(matches_row(row))
.set(updates)
.to_sql
MergeRequestDiffCommit.connection.execute(update)
end
# rubocop: enable Metrics/AbcSize
def schedule_next_job
job = Database::BackgroundMigrationJob
.for_migration_class('FixMergeRequestDiffCommitUsers')
.pending
.first
return unless job
BackgroundMigrationWorker.perform_in(
2.minutes,
'FixMergeRequestDiffCommitUsers',
job.arguments
)
end
def find_commit(project, sha)
@commits[sha] ||= (project.commit(sha)&.to_hash || {})
end
def find_or_create_user(commit, name_field, email_field)
name = commit[name_field]
email = commit[email_field]
return unless name && email
@users[[name, email]] ||=
MergeRequest::DiffCommitUser.find_or_create(name, email).id
end
def matches_row(row)
primary_key = Arel::Nodes::Grouping
.new([arel_table[:merge_request_diff_id], arel_table[:relative_order]])
primary_val = Arel::Nodes::Grouping
.new([row.merge_request_diff_id, row.relative_order])
primary_key.eq(primary_val)
end
def arel_table
MergeRequestDiffCommit.arel_table
end
end
end
end
......@@ -4,6 +4,7 @@ module Gitlab
module Database
class BackgroundMigrationJob < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord
include EachBatch
include BulkInsertSafe
self.table_name = :background_migration_jobs
......
......@@ -47,15 +47,15 @@ module Gitlab
attributes
end
private
def find_with_cache(key = cache_key)
return yield unless lru_cache && key
attr_reader :klass, :attributes, :lru_cache, :cache_key
lru_cache[key] ||= yield
end
def find_with_cache
return yield unless lru_cache && cache_key
private
lru_cache[cache_key] ||= yield
end
attr_reader :klass, :attributes, :lru_cache, :cache_key
def cache_from_request_store
Gitlab::SafeRequestStore[:lru_cache] ||= LruRedux::Cache.new(LRU_CACHE_SIZE)
......
......@@ -401,10 +401,6 @@ excluded_attributes:
- :verification_checksum
- :verification_failure
merge_request_diff_commits:
- :author_name
- :author_email
- :committer_name
- :committer_email
- :merge_request_diff_id
- :commit_author_id
- :committer_id
......
......@@ -29,6 +29,7 @@ module Gitlab
def find
return if epic? && group.nil?
return find_diff_commit_user if diff_commit_user?
return find_diff_commit if diff_commit?
super
end
......@@ -83,9 +84,38 @@ module Gitlab
end
def find_diff_commit_user
find_with_cache do
MergeRequest::DiffCommitUser
.find_or_create(@attributes['name'], @attributes['email'])
find_or_create_diff_commit_user(@attributes['name'], @attributes['email'])
end
def find_diff_commit
row = @attributes.dup
# Diff commits come in two formats:
#
# 1. The old format where author/committer details are separate fields
# 2. The new format where author/committer details are nested objects,
# and pre-processed by `find_diff_commit_user`.
#
# The code here ensures we support both the old and new format.
aname = row.delete('author_name')
amail = row.delete('author_email')
cname = row.delete('committer_name')
cmail = row.delete('committer_email')
author = row.delete('commit_author')
committer = row.delete('committer')
row['commit_author'] = author ||
find_or_create_diff_commit_user(aname, amail)
row['committer'] = committer ||
find_or_create_diff_commit_user(cname, cmail)
MergeRequestDiffCommit.new(row)
end
def find_or_create_diff_commit_user(name, email)
find_with_cache([MergeRequest::DiffCommitUser, name, email]) do
MergeRequest::DiffCommitUser.find_or_create(name, email)
end
end
......@@ -113,6 +143,10 @@ module Gitlab
klass == MergeRequest::DiffCommitUser
end
def diff_commit?
klass == MergeRequestDiffCommit
end
# If an existing group milestone used the IID
# claim the IID back and set the group milestone to use one available
# This is necessary to fix situations like the following:
......
......@@ -33,7 +33,8 @@ module Gitlab
links: 'Releases::Link',
metrics_setting: 'ProjectMetricsSetting',
commit_author: 'MergeRequest::DiffCommitUser',
committer: 'MergeRequest::DiffCommitUser' }.freeze
committer: 'MergeRequest::DiffCommitUser',
merge_request_diff_commits: 'MergeRequestDiffCommit' }.freeze
BUILD_MODELS = %i[Ci::Build commit_status].freeze
......@@ -59,6 +60,7 @@ module Gitlab
external_pull_requests
DesignManagement::Design
MergeRequest::DiffCommitUser
MergeRequestDiffCommit
].freeze
def create
......
# frozen_string_literal: true
require 'spec_helper'
# The underlying migration relies on the global models (e.g. Project). This
# means we also need to use FactoryBot factories to ensure everything is
# operating using the same types. If we use `table()` and similar methods we
# would have to duplicate a lot of logic just for these tests.
#
# rubocop: disable RSpec/FactoriesInMigrationSpecs
RSpec.describe Gitlab::BackgroundMigration::FixMergeRequestDiffCommitUsers do
let(:migration) { described_class.new }
describe '#perform' do
context 'when the project exists' do
it 'processes the project' do
project = create(:project)
expect(migration).to receive(:process).with(project)
expect(migration).to receive(:schedule_next_job)
migration.perform(project.id)
end
it 'marks the background job as finished' do
project = create(:project)
Gitlab::Database::BackgroundMigrationJob.create!(
class_name: 'FixMergeRequestDiffCommitUsers',
arguments: [project.id]
)
migration.perform(project.id)
job = Gitlab::Database::BackgroundMigrationJob
.find_by(class_name: 'FixMergeRequestDiffCommitUsers')
expect(job.status).to eq('succeeded')
end
end
context 'when the project does not exist' do
it 'does nothing' do
expect(migration).not_to receive(:process)
expect(migration).to receive(:schedule_next_job)
migration.perform(-1)
end
end
end
describe '#update_commit' do
let(:project) { create(:project, :repository) }
let(:mr) do
create(
:merge_request_with_diffs,
source_project: project,
target_project: project
)
end
let(:diff) { mr.merge_request_diffs.first }
let(:commit) { project.commit }
def update_row(migration, project, diff, row)
migration.update_commit(project, row)
diff
.merge_request_diff_commits
.find_by(sha: row.sha, relative_order: row.relative_order)
end
it 'populates missing commit authors' do
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.commit_author.name).to eq(commit.to_hash[:author_name])
expect(updated.commit_author.email).to eq(commit.to_hash[:author_email])
end
it 'populates missing committers' do
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.committer.name).to eq(commit.to_hash[:committer_name])
expect(updated.committer.email).to eq(commit.to_hash[:committer_email])
end
it 'leaves existing commit authors as-is' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
commit_author: user
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.commit_author).to eq(user)
end
it 'leaves existing committers as-is' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.committer).to eq(user)
end
it 'does nothing when both the author and committer are present' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user,
commit_author: user
)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
it 'does nothing if the commit does not exist in Git' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: 'kittens',
relative_order: 9000,
committer: user,
commit_author: user
)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
it 'does nothing when the committer/author are missing in the Git commit' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user,
commit_author: user
)
allow(migration).to receive(:find_or_create_user).and_return(nil)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
end
describe '#schedule_next_job' do
it 'schedules the next background migration' do
Gitlab::Database::BackgroundMigrationJob
.create!(class_name: 'FixMergeRequestDiffCommitUsers', arguments: [42])
expect(BackgroundMigrationWorker)
.to receive(:perform_in)
.with(2.minutes, 'FixMergeRequestDiffCommitUsers', [42])
migration.schedule_next_job
end
it 'does nothing when there are no jobs' do
expect(BackgroundMigrationWorker)
.not_to receive(:perform_in)
migration.schedule_next_job
end
end
describe '#find_commit' do
let(:project) { create(:project, :repository) }
it 'finds a commit using Git' do
commit = project.commit
found = migration.find_commit(project, commit.sha)
expect(found).to eq(commit.to_hash)
end
it 'caches the results' do
commit = project.commit
migration.find_commit(project, commit.sha)
expect { migration.find_commit(project, commit.sha) }
.not_to change { Gitlab::GitalyClient.get_request_count }
end
it 'returns an empty hash if the commit does not exist' do
expect(migration.find_commit(project, 'kittens')).to eq({})
end
end
describe '#find_or_create_user' do
let(:project) { create(:project, :repository) }
it 'creates missing users' do
commit = project.commit.to_hash
id = migration.find_or_create_user(commit, :author_name, :author_email)
expect(MergeRequest::DiffCommitUser.count).to eq(1)
created = MergeRequest::DiffCommitUser.first
expect(created.name).to eq(commit[:author_name])
expect(created.email).to eq(commit[:author_email])
expect(created.id).to eq(id)
end
it 'returns users that already exist' do
commit = project.commit.to_hash
user1 = migration.find_or_create_user(commit, :author_name, :author_email)
user2 = migration.find_or_create_user(commit, :author_name, :author_email)
expect(user1).to eq(user2)
end
it 'caches the results' do
commit = project.commit.to_hash
migration.find_or_create_user(commit, :author_name, :author_email)
recorder = ActiveRecord::QueryRecorder.new do
migration.find_or_create_user(commit, :author_name, :author_email)
end
expect(recorder.count).to be_zero
end
it 'returns nil if the commit details are missing' do
id = migration.find_or_create_user({}, :author_name, :author_email)
expect(id).to be_nil
end
end
describe '#matches_row' do
it 'returns the query matches for the composite primary key' do
row = double(:commit, merge_request_diff_id: 4, relative_order: 5)
arel = migration.matches_row(row)
expect(arel.to_sql).to eq(
'("merge_request_diff_commits"."merge_request_diff_id", "merge_request_diff_commits"."relative_order") = (4, 5)'
)
end
end
end
# rubocop: enable RSpec/FactoriesInMigrationSpecs
......@@ -176,4 +176,118 @@ RSpec.describe Gitlab::ImportExport::Project::ObjectBuilder do
expect(found.email).to eq('alice@example.com')
end
end
context 'merge request diff commits' do
context 'when the "committer" object is present' do
it 'uses this object as the committer' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer' => user,
'committer_name' => 'Bla',
'committer_email' => 'bla@example.com',
'author_name' => 'Bla',
'author_email' => 'bla@example.com'
}
)
expect(commit.committer).to eq(user)
end
end
context 'when the "committer" object is missing' do
it 'creates one from the committer name and Email' do
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'author_name' => 'Alice',
'author_email' => 'alice@example.com'
}
)
expect(commit.committer.name).to eq('Alice')
expect(commit.committer.email).to eq('alice@example.com')
end
end
context 'when the "commit_author" object is present' do
it 'uses this object as the author' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'commit_author' => user,
'author_name' => 'Bla',
'author_email' => 'bla@example.com'
}
)
expect(commit.commit_author).to eq(user)
end
end
context 'when the "commit_author" object is missing' do
it 'creates one from the author name and Email' do
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'author_name' => 'Alice',
'author_email' => 'alice@example.com'
}
)
expect(commit.commit_author.name).to eq('Alice')
expect(commit.commit_author.email).to eq('alice@example.com')
end
end
end
describe '#find_or_create_diff_commit_user' do
context 'when the user already exists' do
it 'returns the existing user' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
found = described_class
.new(MergeRequestDiffCommit, {})
.send(:find_or_create_diff_commit_user, user.name, user.email)
expect(found).to eq(user)
end
end
context 'when the user does not exist' do
it 'creates the user' do
found = described_class
.new(MergeRequestDiffCommit, {})
.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
expect(found.name).to eq('Alice')
expect(found.email).to eq('alice@example.com')
end
end
it 'caches the results' do
builder = described_class.new(MergeRequestDiffCommit, {})
builder.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
record = ActiveRecord::QueryRecorder.new do
builder.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
end
expect(record.count).to eq(1)
end
end
end
# frozen_string_literal: true
require 'spec_helper'
require_migration! 'schedule_fix_merge_request_diff_commit_users_migration'
RSpec.describe ScheduleFixMergeRequestDiffCommitUsersMigration, :migration do
let(:migration) { described_class.new }
let(:namespaces) { table(:namespaces) }
let(:projects) { table(:projects) }
let(:namespace) { namespaces.create!(name: 'foo', path: 'foo') }
describe '#up' do
it 'does nothing when there are no projects to correct' do
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
it 'schedules imported projects created after July' do
project = projects.create!(
namespace_id: namespace.id,
import_type: 'gitlab_project',
created_at: '2021-08-01'
)
expect(migration)
.to receive(:migrate_in)
.with(2.minutes, 'FixMergeRequestDiffCommitUsers', [project.id])
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to eq(1)
job = Gitlab::Database::BackgroundMigrationJob.first
expect(job.class_name).to eq('FixMergeRequestDiffCommitUsers')
expect(job.arguments).to eq([project.id])
end
it 'ignores projects imported before July' do
projects.create!(
namespace_id: namespace.id,
import_type: 'gitlab_project',
created_at: '2020-08-01'
)
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
it 'ignores projects that are not imported' do
projects.create!(
namespace_id: namespace.id,
created_at: '2021-08-01'
)
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment