diff --git a/changelogs/unreleased/216216-add-rake-task-for-external-diffs-cleanup.yml b/changelogs/unreleased/216216-add-rake-task-for-external-diffs-cleanup.yml new file mode 100644 index 0000000000000000000000000000000000000000..1cbbc96a730eb2612f1d6c2fab2a5a72b8882e2f --- /dev/null +++ b/changelogs/unreleased/216216-add-rake-task-for-external-diffs-cleanup.yml @@ -0,0 +1,5 @@ +--- +title: Add a Rake task to fix incorrectly-recorded external diffs +merge_request: 36353 +author: +type: fixed diff --git a/doc/administration/merge_request_diffs.md b/doc/administration/merge_request_diffs.md index 2223480daf843c6ecc2b06ffb4d9d4c98fec8dd7..3c0311018d1cc2c4efca9ed5962167f07ca84594 100644 --- a/doc/administration/merge_request_diffs.md +++ b/doc/administration/merge_request_diffs.md @@ -186,3 +186,51 @@ conditions become true: These rules strike a balance between space and performance by only storing frequently-accessed diffs in the database. Diffs that are less likely to be accessed are moved to external storage instead. + +## Correcting incorrectly-migrated diffs + +Versions of GitLab earlier than `v13.0.0` would incorrectly record the location +of some merge request diffs when [external diffs in object storage](#object-storage-settings) +were enabled. This mainly affected imported merge requests, and was resolved +with [this merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/31005). + +If you are using object storage, have never used on-disk storage for external +diffs, the "changes" tab for some merge requests fails to load with a 500 error, +and the exception for that error is of this form: + +```plain +Errno::ENOENT (No such file or directory @ rb_sysopen - /var/opt/gitlab/gitlab-rails/shared/external-diffs/merge_request_diffs/mr-6167082/diff-8199789) +``` + +Then you are affected by this issue. 
Since it's not possible to safely determine +all these conditions automatically, we've provided a Rake task in GitLab v13.2.0 +that you can run manually to correct the data: + +**In Omnibus installations:** + +```shell +sudo gitlab-rake gitlab:external_diffs:force_object_storage +``` + +**In installations from source:** + +```shell +sudo -u git -H bundle exec rake gitlab:external_diffs:force_object_storage RAILS_ENV=production +``` + +Environment variables can be provided to modify the behavior of the task. The +available variables are: + +| Name | Default value | Purpose | +| ---- | ------------- | ------- | +| `ANSI` | `true` | Use ANSI escape codes to make output more understandable | +| `BATCH_SIZE` | `1000` | Iterate through the table in batches of this size | +| `START_ID` | `nil` | If set, begin scanning at this ID | +| `END_ID` | `nil` | If set, stop scanning at this ID | +| `UPDATE_DELAY` | `1` | Number of seconds to sleep between updates | + +The `START_ID` and `END_ID` variables may be used to run the update in parallel, +by assigning different processes to different parts of the table. The `BATCH_SIZE` +and `UPDATE_DELAY` parameters allow the speed of the migration to be traded off +against concurrent access to the table. The `ANSI` parameter should be set to +false if your terminal does not support ANSI escape codes. diff --git a/lib/tasks/gitlab/external_diffs.rake b/lib/tasks/gitlab/external_diffs.rake new file mode 100644 index 0000000000000000000000000000000000000000..08f259140076dbefadaa17f83566c3e08ae8aa2e --- /dev/null +++ b/lib/tasks/gitlab/external_diffs.rake @@ -0,0 +1,35 @@ +namespace :gitlab do + namespace :external_diffs do + desc "Override external diffs in file storage to be in object storage instead. 
This does not change the actual location of the data" + task force_object_storage: :environment do |t, args| + ansi = Gitlab::Utils.to_boolean(ENV.fetch('ANSI', true)) + batch = ENV.fetch('BATCH_SIZE', 1000) + start_id = ENV.fetch('START_ID', nil) + end_id = ENV.fetch('END_ID', nil) + update_delay = args.fetch('UPDATE_DELAY', 1) + + # Use ANSI codes to overwrite the same line repeatedly if supported + newline = ansi ? "\x1B8\x1B[2K" : "\n" + + total = 0 + + # The only useful index on the table is by id, so scan through the whole + # table by that and filter out those we don't want on each relation + MergeRequestDiff.in_batches(of: batch, start: start_id, finish: end_id) do |relation| # rubocop:disable Cop/InBatches + count = relation + .except(:order) + .where(stored_externally: true, external_diff_store: ExternalDiffUploader::Store::LOCAL) + .update_all(external_diff_store: ExternalDiffUploader::Store::REMOTE) + + total += count + + if count > 0 + print "#{newline}#{total} updated..." + sleep(update_delay) if update_delay > 0 + end + end + + puts "done!" 
+    end
+  end
+end
diff --git a/spec/tasks/gitlab/external_diffs_rake_spec.rb b/spec/tasks/gitlab/external_diffs_rake_spec.rb
new file mode 100644
index 0000000000000000000000000000000000000000..66e555734b3ca0c3ddd2bb49b24e0b5f68399570
--- /dev/null
+++ b/spec/tasks/gitlab/external_diffs_rake_spec.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+require 'rake_helper'
+
+RSpec.describe 'gitlab:external_diffs rake tasks' do
+  before do
+    Rake.application.rake_require 'tasks/gitlab/external_diffs' # load the rake file under test
+  end
+
+  describe 'force_object_storage task' do
+    it 'forces externally stored merge request diffs to object storage' do
+      db = create(:merge_request).merge_request_diff # never marked as stored externally
+      file = create(:merge_request).merge_request_diff.tap { |o| o.update_columns(stored_externally: true, external_diff_store: 1) } # marked external, store 1
+      object = create(:merge_request).merge_request_diff.tap { |o| o.update_columns(stored_externally: true, external_diff_store: 2) } # marked external, store 2
+
+      run_rake_task('gitlab:external_diffs:force_object_storage')
+
+      expect(db.reload).not_to be_stored_externally
+      expect(file.reload).to be_stored_externally
+      expect(object.reload).to be_stored_externally
+
+      expect(file.external_diff_store).to eq(2) # store 1 is expected to become 2
+      expect(object.external_diff_store).to eq(2) # store 2 is expected to stay 2
+    end
+
+    it 'limits batches according to BATCH_SIZE, START_ID, and END_ID' do
+      stub_env('START_ID' => 'foo', 'END_ID' => 'bar', 'BATCH_SIZE' => 'baz') # placeholder strings; the expectation below checks they reach in_batches unchanged
+
+      expect(MergeRequestDiff).to receive(:in_batches).with(start: 'foo', finish: 'bar', of: 'baz')
+
+      run_rake_task('gitlab:external_diffs:force_object_storage')
+    end
+  end
+end