Commit 7c4f0c96 authored by Stan Hu

Merge branch '267828-remove-minimum_interval-from-backgroundmigrationworker' into 'master'

Remove possible infinite loop from BackgroundMigrationWorker

See merge request gitlab-org/gitlab!45298
parents c67a1159 ec76b503
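
As a hedged illustration of the fix (simplified, assumed names; not the production worker): the job now carries a lease_attempts counter, reschedules itself with the counter decremented whenever it cannot run, and gives up once the counter goes negative instead of rescheduling forever.

# A minimal, self-contained sketch (not the production worker) of the pattern
# this MR introduces: a job that cannot run requeues itself with a decremented
# attempts counter and gives up once the counter goes negative, so a
# permanently held lease can no longer cause an infinite reschedule loop.
def simulate(lease_attempts, lease_available:)
  attempts_left = lease_attempts - 1

  return :performed if lease_available
  return :gave_up if attempts_left < 0

  # The real worker calls `self.class.perform_in(interval, class_name,
  # arguments, attempts_left)` here; recursion stands in for the requeue.
  simulate(attempts_left, lease_available: lease_available)
end

simulate(5, lease_available: true)  # => :performed
simulate(5, lease_available: false) # => :gave_up after five reschedules
simulate(0, lease_available: false) # => :gave_up immediately, no reschedule

With the default of 5, a job that never obtains the lease is rescheduled five times and then dropped with a warning, which is what the specs further down verify.
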
@@ -24,10 +24,14 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
# class_name - The class name of the background migration to run.
# arguments - The arguments to pass to the migration class.
# lease_attempts - The number of times we will try to obtain an exclusive
# lease on the class before running anyway. Pass 0 to always run.
# lease on the class before giving up. See MR for more discussion.
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
def perform(class_name, arguments = [], lease_attempts = 5)
with_context(caller_id: class_name.to_s) do
should_perform, ttl = perform_and_ttl(class_name)
attempts_left = lease_attempts - 1
should_perform, ttl = perform_and_ttl(class_name, attempts_left)
break if should_perform.nil?
if should_perform
Gitlab::BackgroundMigration.perform(class_name, arguments)
@@ -37,32 +41,39 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
# we'll reschedule the job in such a way that it is picked up again around
# the time the lease expires.
self.class
.perform_in(ttl || self.class.minimum_interval, class_name, arguments)
.perform_in(ttl || self.class.minimum_interval, class_name, arguments, attempts_left)
end
end
end
def perform_and_ttl(class_name)
if always_perform?
# In test environments `perform_in` will run right away. This can then
# lead to stack level errors in the above `#perform`. To work around this
# we'll just perform the migration right away in the test environment.
[true, nil]
else
lease = lease_for(class_name)
perform = !!lease.try_obtain
# If we managed to acquire the lease but the DB is not healthy, then we
# want to simply reschedule our job and try again _after_ the lease
# expires.
if perform && !healthy_database?
database_unhealthy_counter.increment
perform = false
end
def perform_and_ttl(class_name, attempts_left)
# In test environments `perform_in` will run right away. This can then
# lead to stack level errors in the above `#perform`. To work around this
# we'll just perform the migration right away in the test environment.
return [true, nil] if always_perform?
lease = lease_for(class_name)
lease_obtained = !!lease.try_obtain
healthy_db = healthy_database?
perform = lease_obtained && healthy_db
database_unhealthy_counter.increment if lease_obtained && !healthy_db
[perform, lease.ttl]
# If we've tried several times to get a lease with a healthy DB without success, just give up.
# Otherwise we could end up in an infinite rescheduling loop.
if !perform && attempts_left < 0
msg = if !lease_obtained
'Job could not get an exclusive lease after several tries. Giving up.'
else
'Database was unhealthy after several tries. Giving up.'
end
Sidekiq.logger.warn(class: class_name, message: msg, job_id: jid)
return [nil, nil]
end
[perform, lease.ttl]
end
def lease_for(class_name)
......
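
As a usage note (an assumption based on the new signature above, not part of this MR's diff): existing callers keep enqueueing the worker without the extra argument, so lease_attempts starts at its default of 5, and only the worker's own reschedules pass the decremented value as a trailing argument.

# Hypothetical enqueue, reusing the 'Foo', [10, 20] example from the specs
# below; lease_attempts is omitted and therefore defaults to 5.
BackgroundMigrationWorker.perform_async('Foo', [10, 20])

# If the lease cannot be obtained (or the database is unhealthy), the worker
# requeues itself with the decremented counter appended, along the lines of:
#   BackgroundMigrationWorker.perform_in(interval, 'Foo', [10, 20], 4)
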
---
title: Limit number of times a background migration is rescheduled
merge_request: 45298
author:
type: fixed
@@ -12,45 +12,91 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end
describe '#perform' do
it 'performs a background migration' do
expect(Gitlab::BackgroundMigration)
.to receive(:perform)
.with('Foo', [10, 20])
before do
allow(worker).to receive(:jid).and_return(1)
expect(worker).to receive(:always_perform?).and_return(false)
end
worker.perform('Foo', [10, 20])
context 'when lease can be obtained' do
before do
expect(Gitlab::BackgroundMigration)
.to receive(:perform)
.with('Foo', [10, 20])
end
it 'performs a background migration' do
worker.perform('Foo', [10, 20])
end
context 'when lease_attempts is 1' do
it 'performs a background migration' do
worker.perform('Foo', [10, 20], 1)
end
end
end
it 'reschedules a migration if it was performed recently' do
expect(worker)
.to receive(:always_perform?)
.and_return(false)
context 'when lease not obtained (migration of same class was performed recently)' do
before do
expect(Gitlab::BackgroundMigration).not_to receive(:perform)
worker.lease_for('Foo').try_obtain
end
worker.lease_for('Foo').try_obtain
it 'reschedules the migration and decrements the lease_attempts' do
expect(described_class)
.to receive(:perform_in)
.with(a_kind_of(Numeric), 'Foo', [10, 20], 4)
expect(Gitlab::BackgroundMigration)
.not_to receive(:perform)
worker.perform('Foo', [10, 20], 5)
end
expect(described_class)
.to receive(:perform_in)
.with(a_kind_of(Numeric), 'Foo', [10, 20])
context 'when lease_attempts is 1' do
it 'reschedules the migration and decrements the lease_attempts' do
expect(described_class)
.to receive(:perform_in)
.with(a_kind_of(Numeric), 'Foo', [10, 20], 0)
worker.perform('Foo', [10, 20])
worker.perform('Foo', [10, 20], 1)
end
end
context 'when lease_attempts is 0' do
it 'gives up performing the migration' do
expect(described_class).not_to receive(:perform_in)
expect(Sidekiq.logger).to receive(:warn).with(
class: 'Foo',
message: 'Job could not get an exclusive lease after several tries. Giving up.',
job_id: 1)
worker.perform('Foo', [10, 20], 0)
end
end
end
it 'reschedules a migration if the database is not healthy' do
allow(worker)
.to receive(:always_perform?)
.and_return(false)
context 'when database is not healthy' do
before do
allow(worker).to receive(:healthy_database?).and_return(false)
end
allow(worker)
.to receive(:healthy_database?)
.and_return(false)
it 'reschedules a migration if the database is not healthy' do
expect(described_class)
.to receive(:perform_in)
.with(a_kind_of(Numeric), 'Foo', [10, 20], 4)
expect(described_class)
.to receive(:perform_in)
.with(a_kind_of(Numeric), 'Foo', [10, 20])
worker.perform('Foo', [10, 20])
end
worker.perform('Foo', [10, 20])
context 'when lease_attempts is 0' do
it 'gives up performing the migration' do
expect(described_class).not_to receive(:perform_in)
expect(Sidekiq.logger).to receive(:warn).with(
class: 'Foo',
message: 'Database was unhealthy after several tries. Giving up.',
job_id: 1)
worker.perform('Foo', [10, 20], 0)
end
end
end
it 'sets the class that will be executed as the caller_id' do
......