Commit 7336ccfa authored by David Fernandez's avatar David Fernandez

Merge branch '352968-guard-worker-long-running' into 'master'

Abort long running registry migrations

See merge request gitlab-org/gitlab!81938
parents a9703b65 a4615862
......@@ -9,10 +9,13 @@ class ContainerRepository < ApplicationRecord
WAITING_CLEANUP_STATUSES = %i[cleanup_scheduled cleanup_unfinished].freeze
REQUIRING_CLEANUP_STATUSES = %i[cleanup_unscheduled cleanup_scheduled].freeze
IDLE_MIGRATION_STATES = %w[default pre_import_done import_done import_aborted import_skipped].freeze
ACTIVE_MIGRATION_STATES = %w[pre_importing importing].freeze
ABORTABLE_MIGRATION_STATES = (ACTIVE_MIGRATION_STATES + %w[pre_import_done default]).freeze
MIGRATION_STATES = (IDLE_MIGRATION_STATES + ACTIVE_MIGRATION_STATES).freeze
ABORTABLE_MIGRATION_STATES = (ACTIVE_MIGRATION_STATES + %w[pre_import_done default]).freeze
IRRECONCILABLE_MIGRATIONS_STATUSES = %w[import_in_progress pre_import_in_progress pre_import_canceled import_canceled].freeze
MIGRATION_PHASE_1_STARTED_AT = Date.new(2021, 11, 4).freeze
......@@ -32,7 +35,7 @@ class ContainerRepository < ApplicationRecord
enum status: { delete_scheduled: 0, delete_failed: 1 }
enum expiration_policy_cleanup_status: { cleanup_unscheduled: 0, cleanup_scheduled: 1, cleanup_unfinished: 2, cleanup_ongoing: 3 }
enum migration_skipped_reason: { not_in_plan: 0, too_many_retries: 1, too_many_tags: 2, root_namespace_in_deny_list: 3 }
enum migration_skipped_reason: { not_in_plan: 0, too_many_retries: 1, too_many_tags: 2, root_namespace_in_deny_list: 3, migration_canceled: 4 }
delegate :client, :gitlab_api_client, to: :registry
......@@ -114,15 +117,15 @@ class ContainerRepository < ApplicationRecord
end
event :finish_pre_import do
transition %i[pre_importing import_aborted] => :pre_import_done
transition %i[pre_importing importing import_aborted] => :pre_import_done
end
event :start_import do
transition pre_import_done: :importing
transition %i[pre_import_done pre_importing importing import_aborted] => :importing
end
event :finish_import do
transition %i[importing import_aborted] => :import_done
transition %i[pre_importing importing import_aborted] => :import_done
end
event :already_migrated do
......@@ -138,11 +141,11 @@ class ContainerRepository < ApplicationRecord
end
event :retry_pre_import do
transition import_aborted: :pre_importing
transition %i[pre_importing importing import_aborted] => :pre_importing
end
event :retry_import do
transition import_aborted: :importing
transition %i[pre_importing importing import_aborted] => :importing
end
before_transition any => :pre_importing do |container_repository|
......@@ -276,24 +279,28 @@ class ContainerRepository < ApplicationRecord
def retry_aborted_migration
return unless migration_state == 'import_aborted'
case external_import_status
reconcile_import_status(external_import_status) do
# If the import_status request fails, use the timestamp to guess current state
migration_pre_import_done_at ? retry_import : retry_pre_import
end
end
def reconcile_import_status(status)
case status
when 'native'
raise NativeImportError
when 'import_in_progress'
when *IRRECONCILABLE_MIGRATIONS_STATUSES
nil
when 'import_complete'
finish_import
when 'import_failed'
retry_import
when 'pre_import_in_progress'
nil
when 'pre_import_complete'
finish_pre_import_and_start_import
when 'pre_import_failed'
retry_pre_import
else
# If the import_status request fails, use the timestamp to guess current state
migration_pre_import_done_at ? retry_import : retry_pre_import
yield
end
end
......@@ -450,6 +457,12 @@ class ContainerRepository < ApplicationRecord
response
end
def migration_cancel
return :error unless gitlab_api_client.supports_gitlab_api?
gitlab_api_client.cancel_repository_import(self.path)
end
def self.build_from_path(path)
self.new(project: path.repository_project,
name: path.repository_name)
......
......@@ -29,46 +29,45 @@ module ContainerRegistry
log_extra_metadata_on_done(:stale_migrations_count, repositories.to_a.size)
repositories.each do |repository|
if abortable?(repository)
if actively_importing?(repository)
# if a repository is actively importing but not yet long_running, do nothing
if long_running_migration?(repository)
long_running_migration_ids << repository.id
cancel_long_running_migration(repository)
aborts_count += 1
end
else
repository.abort_import
aborts_count += 1
else
long_running_migration_ids << repository.id if long_running_migration?(repository)
end
end
log_extra_metadata_on_done(:aborted_stale_migrations_count, aborts_count)
if long_running_migration_ids.any?
log_extra_metadata_on_done(:long_running_stale_migration_container_repository_ids, long_running_migration_ids)
log_extra_metadata_on_done(:aborted_long_running_migration_ids, long_running_migration_ids)
end
end
private
# This can ping the Container Registry API.
# We loop on a set of repositories to calls this function (see #perform)
# In the worst case scenario, we have a n+1 API calls situation here.
#
# This is reasonable because the maximum amount of repositories looped
# on is `25`. See ::ContainerRegistry::Migration.capacity.
#
# TODO We can remove this n+1 situation by having a Container Registry API
# endpoint that accepts multiple repository paths at once. This is issue
# A repository is actively_importing if it has an importing migration state
# and that state matches the state in the registry
# TODO We can have an API call n+1 situation here. It can be solved when the
# endpoint accepts multiple repository paths at once. This is issue
# https://gitlab.com/gitlab-org/container-registry/-/issues/582
def abortable?(repository)
# early return to save one Container Registry API request
return true unless repository.importing? || repository.pre_importing?
return true unless external_migration_in_progress?(repository)
def actively_importing?(repository)
return false unless repository.importing? || repository.pre_importing?
return false unless external_state_matches_migration_state?(repository)
false
true
end
def long_running_migration?(repository)
migration_start_timestamp(repository).before?(long_running_migration_threshold)
end
def external_migration_in_progress?(repository)
def external_state_matches_migration_state?(repository)
status = repository.external_import_status
(status == 'pre_import_in_progress' && repository.pre_importing?) ||
......@@ -96,6 +95,21 @@ module ContainerRegistry
def long_running_migration_threshold
@threshold ||= 30.minutes.ago
end
def cancel_long_running_migration(repository)
result = repository.migration_cancel
case result[:status]
when :ok
repository.skip_import(reason: :migration_canceled)
when :bad_request
repository.reconcile_import_status(result[:state]) do
repository.abort_import
end
else
repository.abort_import
end
end
end
end
end
......@@ -5,10 +5,12 @@ module ContainerRegistry
include Gitlab::Utils::StrongMemoize
JSON_TYPE = 'application/json'
CANCEL_RESPONSE_STATUS_HEADER = 'status'
IMPORT_RESPONSES = {
200 => :already_imported,
202 => :ok,
400 => :bad_request,
401 => :unauthorized,
404 => :not_found,
409 => :already_being_imported,
......@@ -50,6 +52,18 @@ module ContainerRegistry
IMPORT_RESPONSES.fetch(response.status, :error)
end
# https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#import-repository
def cancel_repository_import(path)
response = with_import_token_faraday do |faraday_client|
faraday_client.delete(import_url_for(path))
end
status = IMPORT_RESPONSES.fetch(response.status, :error)
actual_state = response.body[CANCEL_RESPONSE_STATUS_HEADER]
{ status: status, migration_state: actual_state }
end
# https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#get-repository-import-status
def import_status(path)
with_import_token_faraday do |faraday_client|
......
......@@ -62,6 +62,7 @@ RSpec.describe ContainerRegistry::GitlabApiClient do
where(:status_code, :expected_result) do
200 | :already_imported
202 | :ok
400 | :bad_request
401 | :unauthorized
404 | :not_found
409 | :already_being_imported
......@@ -86,6 +87,7 @@ RSpec.describe ContainerRegistry::GitlabApiClient do
where(:status_code, :expected_result) do
200 | :already_imported
202 | :ok
400 | :bad_request
401 | :unauthorized
404 | :not_found
409 | :already_being_imported
......@@ -104,6 +106,41 @@ RSpec.describe ContainerRegistry::GitlabApiClient do
end
end
describe '#cancel_repository_import' do
subject { client.cancel_repository_import(path) }
where(:status_code, :expected_result) do
200 | :already_imported
202 | :ok
400 | :bad_request
401 | :unauthorized
404 | :not_found
409 | :already_being_imported
418 | :error
424 | :pre_import_failed
425 | :already_being_imported
429 | :too_many_imports
end
with_them do
before do
stub_import_cancel(path, status_code)
end
it { is_expected.to eq({ status: expected_result, migration_state: nil }) }
end
context 'bad request' do
let(:status) { 'this_is_a_test' }
before do
stub_import_cancel(path, 400, status: status)
end
it { is_expected.to eq({ status: :bad_request, migration_state: status }) }
end
end
describe '#import_status' do
subject { client.import_status(path) }
......@@ -250,6 +287,22 @@ RSpec.describe ContainerRegistry::GitlabApiClient do
)
end
def stub_import_cancel(path, http_status, status: nil)
body = {}
if http_status == 400
body = { status: status }
end
stub_request(:delete, "#{registry_api_url}/gitlab/v1/import/#{path}/")
.with(headers: { 'Accept' => described_class::JSON_TYPE, 'Authorization' => "bearer #{import_token}" })
.to_return(
status: http_status,
body: body.to_json,
headers: { content_type: 'application/json' }
)
end
def stub_repository_details(path, with_size: true, status_code: 200, respond_with: {})
url = "#{registry_api_url}/gitlab/v1/repositories/#{path}/"
url += "?size=self" if with_size
......
......@@ -208,7 +208,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
end
it_behaves_like 'transitioning from allowed states', %w[import_aborted]
it_behaves_like 'transitioning from allowed states', %w[pre_importing importing import_aborted]
it_behaves_like 'transitioning to pre_importing'
it_behaves_like 'transitioning out of import_aborted'
end
......@@ -218,7 +218,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
subject { repository.finish_pre_import }
it_behaves_like 'transitioning from allowed states', %w[pre_importing import_aborted]
it_behaves_like 'transitioning from allowed states', %w[pre_importing importing import_aborted]
it 'sets migration_pre_import_done_at' do
expect { subject }.to change { repository.reload.migration_pre_import_done_at }
......@@ -238,7 +238,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
end
it_behaves_like 'transitioning from allowed states', %w[pre_import_done]
it_behaves_like 'transitioning from allowed states', %w[pre_import_done pre_importing importing import_aborted]
it_behaves_like 'transitioning to importing'
end
......@@ -253,7 +253,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
end
it_behaves_like 'transitioning from allowed states', %w[import_aborted]
it_behaves_like 'transitioning from allowed states', %w[pre_importing importing import_aborted]
it_behaves_like 'transitioning to importing'
it_behaves_like 'no action when feature flag is disabled'
end
......@@ -263,7 +263,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
subject { repository.finish_import }
it_behaves_like 'transitioning from allowed states', %w[importing import_aborted]
it_behaves_like 'transitioning from allowed states', %w[pre_importing importing import_aborted]
it_behaves_like 'queueing the next import'
it 'sets migration_import_done_at and queues the next import' do
......@@ -334,7 +334,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
end
it_behaves_like 'transitioning from allowed states', %w[pre_importing import_aborted]
it_behaves_like 'transitioning from allowed states', %w[pre_importing importing import_aborted]
it_behaves_like 'transitioning to importing'
end
end
......@@ -391,7 +391,7 @@ RSpec.describe ContainerRepository, :aggregate_failures do
describe '#retry_aborted_migration' do
subject { repository.retry_aborted_migration }
shared_examples 'no action' do
context 'when migration_state is not aborted' do
it 'does nothing' do
expect { subject }.not_to change { repository.reload.migration_state }
......@@ -399,104 +399,45 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
end
shared_examples 'retrying the pre_import' do
it 'retries the pre_import' do
expect(repository).to receive(:migration_pre_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('pre_importing')
end
end
shared_examples 'retrying the import' do
it 'retries the import' do
expect(repository).to receive(:migration_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('importing')
end
end
context 'when migration_state is not aborted' do
it_behaves_like 'no action'
end
context 'when migration_state is aborted' do
before do
repository.abort_import
allow(repository.gitlab_api_client)
.to receive(:import_status).with(repository.path).and_return(client_response)
.to receive(:import_status).with(repository.path).and_return(status)
end
context 'native response' do
let(:client_response) { 'native' }
it 'raises an error' do
expect { subject }.to raise_error(described_class::NativeImportError)
end
end
it_behaves_like 'reconciling migration_state' do
context 'error response' do
let(:status) { 'error' }
context 'import_in_progress response' do
let(:client_response) { 'import_in_progress' }
it_behaves_like 'no action'
end
context 'import_complete response' do
let(:client_response) { 'import_complete' }
it 'finishes the import' do
expect { subject }.to change { repository.reload.migration_state }.to('import_done')
end
end
context 'import_failed response' do
let(:client_response) { 'import_failed' }
it_behaves_like 'retrying the import'
end
context 'pre_import_in_progress response' do
let(:client_response) { 'pre_import_in_progress' }
it_behaves_like 'no action'
end
context 'migration_pre_import_done_at is NULL' do
it_behaves_like 'retrying the pre_import'
end
context 'pre_import_complete response' do
let(:client_response) { 'pre_import_complete' }
context 'migration_pre_import_done_at is not NULL' do
before do
repository.update_columns(
migration_pre_import_started_at: 5.minutes.ago,
migration_pre_import_done_at: Time.zone.now
)
end
it 'finishes the pre_import and starts the import' do
expect(repository).to receive(:finish_pre_import).and_call_original
expect(repository).to receive(:migration_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('importing')
it_behaves_like 'retrying the import'
end
end
end
end
end
context 'pre_import_failed response' do
let(:client_response) { 'pre_import_failed' }
it_behaves_like 'retrying the pre_import'
end
context 'error response' do
let(:client_response) { 'error' }
context 'migration_pre_import_done_at is NULL' do
it_behaves_like 'retrying the pre_import'
end
context 'migration_pre_import_done_at is not NULL' do
before do
repository.update_columns(
migration_pre_import_started_at: 5.minutes.ago,
migration_pre_import_done_at: Time.zone.now
)
end
describe '#reconcile_import_status' do
subject { repository.reconcile_import_status(status) }
it_behaves_like 'retrying the import'
end
end
before do
repository.abort_import
end
it_behaves_like 'reconciling migration_state'
end
describe '#tag' do
......@@ -722,12 +663,12 @@ RSpec.describe ContainerRepository, :aggregate_failures do
end
context 'registry migration' do
shared_examples 'handling the migration step' do |step|
let(:client_response) { :foobar }
before do
allow(repository.gitlab_api_client).to receive(:supports_gitlab_api?).and_return(true)
end
before do
allow(repository.gitlab_api_client).to receive(:supports_gitlab_api?).and_return(true)
end
shared_examples 'gitlab migration client request' do |step|
let(:client_response) { :foobar }
it 'returns the same response as the client' do
expect(repository.gitlab_api_client)
......@@ -746,6 +687,10 @@ RSpec.describe ContainerRepository, :aggregate_failures do
expect(subject).to eq(:error)
end
end
end
shared_examples 'handling the migration step' do |step|
it_behaves_like 'gitlab migration client request', step
context 'too many imports' do
it 'raises an error when it receives too_many_imports as a response' do
......@@ -767,6 +712,12 @@ RSpec.describe ContainerRepository, :aggregate_failures do
it_behaves_like 'handling the migration step', :import_repository
end
describe '#migration_cancel' do
subject { repository.migration_cancel }
it_behaves_like 'gitlab migration client request', :cancel_repository_import
end
end
describe '.build_from_path' do
......
......@@ -67,12 +67,17 @@ RSpec.describe API::Internal::ContainerRegistry::Migration do
it_behaves_like 'returning an error', with_message: "Couldn't transition from pre_importing to importing"
end
end
context 'with repository in importing migration state' do
let(:repository) { create(:container_repository, :importing) }
context 'with repository in importing migration state' do
let(:repository) { create(:container_repository, :importing) }
it 'returns ok and does not update the migration state' do
expect { subject }
.not_to change { repository.reload.migration_state }
it_behaves_like 'returning an error', with_message: "Couldn't transition from pre_importing to importing"
expect(response).to have_gitlab_http_status(:ok)
end
end
end
end
......@@ -101,7 +106,7 @@ RSpec.describe API::Internal::ContainerRegistry::Migration do
context 'with repository in pre_importing migration state' do
let(:repository) { create(:container_repository, :pre_importing) }
it_behaves_like 'returning an error', with_message: "Couldn't transition from importing to import_done"
it_behaves_like 'updating the repository migration status', from: 'pre_importing', to: 'import_done'
end
end
......
......@@ -116,3 +116,80 @@ RSpec.shared_examples 'not hitting graphql network errors with the container reg
expect_graphql_errors_to_be_empty
end
end
RSpec.shared_examples 'reconciling migration_state' do
shared_examples 'no action' do
it 'does nothing' do
expect { subject }.not_to change { repository.reload.migration_state }
expect(subject).to eq(nil)
end
end
shared_examples 'retrying the pre_import' do
it 'retries the pre_import' do
expect(repository).to receive(:migration_pre_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('pre_importing')
end
end
shared_examples 'retrying the import' do
it 'retries the import' do
expect(repository).to receive(:migration_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('importing')
end
end
context 'native response' do
let(:status) { 'native' }
it 'raises an error' do
expect { subject }.to raise_error(described_class::NativeImportError)
end
end
context 'import_in_progress response' do
let(:status) { 'import_in_progress' }
it_behaves_like 'no action'
end
context 'import_complete response' do
let(:status) { 'import_complete' }
it 'finishes the import' do
expect { subject }.to change { repository.reload.migration_state }.to('import_done')
end
end
context 'import_failed response' do
let(:status) { 'import_failed' }
it_behaves_like 'retrying the import'
end
context 'pre_import_in_progress response' do
let(:status) { 'pre_import_in_progress' }
it_behaves_like 'no action'
end
context 'pre_import_complete response' do
let(:status) { 'pre_import_complete' }
it 'finishes the pre_import and starts the import' do
expect(repository).to receive(:finish_pre_import).and_call_original
expect(repository).to receive(:migration_import).and_return(:ok)
expect { subject }.to change { repository.reload.migration_state }.to('importing')
end
end
context 'pre_import_failed response' do
let(:status) { 'pre_import_failed' }
it_behaves_like 'retrying the pre_import'
end
end
......@@ -3,8 +3,6 @@
require 'spec_helper'
RSpec.describe ContainerRegistry::Migration::GuardWorker, :aggregate_failures do
include_context 'container registry client'
let(:worker) { described_class.new }
describe '#perform' do
......@@ -13,11 +11,12 @@ RSpec.describe ContainerRegistry::Migration::GuardWorker, :aggregate_failures do
let(:importing_migrations) { ::ContainerRepository.with_migration_states(:importing) }
let(:import_aborted_migrations) { ::ContainerRepository.with_migration_states(:import_aborted) }
let(:import_done_migrations) { ::ContainerRepository.with_migration_states(:import_done) }
let(:import_skipped_migrations) { ::ContainerRepository.with_migration_states(:import_skipped) }
subject { worker.perform }
before do
stub_container_registry_config(enabled: true, api_url: registry_api_url, key: 'spec/fixtures/x509_certificate_pk.key')
stub_container_registry_config(enabled: true, api_url: 'http://container-registry', key: 'spec/fixtures/x509_certificate_pk.key')
allow(::ContainerRegistry::Migration).to receive(:max_step_duration).and_return(5.minutes)
end
......@@ -26,20 +25,57 @@ RSpec.describe ContainerRegistry::Migration::GuardWorker, :aggregate_failures do
allow(::Gitlab).to receive(:com?).and_return(true)
end
shared_examples 'not aborting any migration' do
it 'will not abort the migration' do
expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 0)
expect(worker).to receive(:log_extra_metadata_on_done).with(:long_running_stale_migration_container_repository_ids, [stale_migration.id])
shared_examples 'handling long running migrations' do
before do
allow_next_found_instance_of(ContainerRepository) do |repository|
allow(repository).to receive(:migration_cancel).and_return(migration_cancel_response)
end
end
expect { subject }
.to not_change(pre_importing_migrations, :count)
.and not_change(pre_import_done_migrations, :count)
.and not_change(importing_migrations, :count)
.and not_change(import_done_migrations, :count)
.and not_change(import_aborted_migrations, :count)
.and not_change { stale_migration.reload.migration_state }
.and not_change { ongoing_migration.migration_state }
context 'migration is canceled' do
let(:migration_cancel_response) { { status: :ok } }
it 'will not abort the migration' do
expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_long_running_migration_ids, [stale_migration.id])
expect { subject }
.to change(import_skipped_migrations, :count)
expect(stale_migration.reload.migration_state).to eq('import_skipped')
expect(stale_migration.reload.migration_skipped_reason).to eq('migration_canceled')
end
end
context 'migration cancelation fails with an error' do
let(:migration_cancel_response) { { status: :error } }
it 'will abort the migration' do
expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_long_running_migration_ids, [stale_migration.id])
expect { subject }
.to change(import_aborted_migrations, :count).by(1)
.and change { stale_migration.reload.migration_state }.to('import_aborted')
.and not_change { ongoing_migration.migration_state }
end
end
context 'migration receives bad request with a new status' do
let(:migration_cancel_response) { { status: :bad_request, migration_state: :import_done } }
it 'will abort the migration' do
expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_long_running_migration_ids, [stale_migration.id])
expect { subject }
.to change(import_aborted_migrations, :count).by(1)
.and change { stale_migration.reload.migration_state }.to('import_aborted')
.and not_change { ongoing_migration.migration_state }
end
end
end
......@@ -86,7 +122,7 @@ RSpec.describe ContainerRegistry::Migration::GuardWorker, :aggregate_failures do
context 'the client returns pre_import_in_progress' do
let(:import_status) { 'pre_import_in_progress' }
it_behaves_like 'not aborting any migration'
it_behaves_like 'handling long running migrations'
end
end
......@@ -141,7 +177,7 @@ RSpec.describe ContainerRegistry::Migration::GuardWorker, :aggregate_failures do
context 'the client returns import_in_progress' do
let(:import_status) { 'import_in_progress' }
it_behaves_like 'not aborting any migration'
it_behaves_like 'handling long running migrations'
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment