Refactor Geo LogCursor Daemon loop

- Rename run_once! to find_and_handle_events!
- Extract loop contents into run_once!
- Reuse sleep_break in arbitrary_sleep
- Remove now-redundant exit? check
- Use exit! to set @exit due to error
- Now we can avoid calling run! in specs to avoid excessive stubbing

Refactor Geo LogCursor Daemon loop
- Rename run_once! to find_and_handle_events!
- Extract loop contents into run_once!
- Reuse sleep_break in arbitrary_sleep
- Remove now-redundant exit? check
- Use exit! to set @exit due to error
- Now we can avoid calling run! in specs to avoid excessive stubbing
60117481 · Michael Kozono · da77dc56 · 60117481 · 60117481
Commit 60117481 authored Sep 10, 2019 by Michael Kozono
Showing 2 changed files with 79 additions and 91 deletions

ee/lib/gitlab/geo/log_cursor/daemon.rb ee/lib/gitlab/geo/log_cursor/daemon.rb +34 -26

ee/spec/lib/gitlab/geo/log_cursor/daemon_spec.rb ee/spec/lib/gitlab/geo/log_cursor/daemon_spec.rb +45 -65

No files found.
--- a/ee/lib/gitlab/geo/log_cursor/daemon.rb
+++ b/ee/lib/gitlab/geo/log_cursor/daemon.rb
@@ -21,28 +21,28 @@ module Gitlab
          logger.debug('#run!: start')
          trap_signals
-          until exit?
+          run_once! until exit?
-            # Prevent the node from processing events unless it's a secondary
-            unless Geo.secondary?
-              logger.debug("#run!: not a secondary, sleeping for #{SECONDARY_CHECK_INTERVAL} secs")
-              sleep_break(SECONDARY_CHECK_INTERVAL)
-              next
-            end
-            lease = Lease.try_obtain_with_ttl { run_once! }
+          logger.debug('#run!: finish')
+        end
-            handle_error(lease[:error])
+        def run_once!
+          # Prevent the node from processing events unless it's a secondary
+          unless Geo.secondary?
+            logger.debug("#run!: not a secondary, sleeping for #{SECONDARY_CHECK_INTERVAL} secs")
+            sleep_break(SECONDARY_CHECK_INTERVAL)
+            return
+          end
-            return if exit?
+          lease = Lease.try_obtain_with_ttl { find_and_handle_events! }
-            # When no new event is found sleep for a few moments
+          handle_error(lease[:error])
-            arbitrary_sleep(lease[:ttl])
-          end
-          logger.debug('#run!: finish')
+          # When no new event is found sleep for a few moments
+          arbitrary_sleep(lease[:ttl])
        end
-        def run_once!
+        def find_and_handle_events!
          gap_tracking.fill_gaps { |event_log| handle_single_event(event_log) }
          # Wrap this with the connection to make it possible to reconnect if
@@ -58,8 +58,7 @@ module Gitlab
          track_failing_since(did_error)
          if excessive_errors?
-            logger.error("#run!: Exiting due to consecutive errors for over #{MAX_ERROR_DURATION} seconds")
+            exit!("Consecutive errors for over #{MAX_ERROR_DURATION} seconds")
-            @exit = true
          end
        end
@@ -77,14 +76,6 @@ module Gitlab
          MAX_ERROR_DURATION < (Time.now - @failing_since)
        end
-        def sleep_break(seconds)
-          while seconds > 0
-            sleep(1)
-            seconds -= 1
-            break if exit?
-          end
-        end
        def handle_events(batch, previous_batch_last_id)
          logger.info("#handle_events:", first_id: batch.first.id, last_id: batch.last.id)
@@ -140,6 +131,12 @@ module Gitlab
          @exit = true
        end
+        def exit!(error_message)
+          logger.error("Exiting due to: #{error_message}") if error_message
+          @exit = true
+        end
        def exit?
          @exit
        end
@@ -160,7 +157,18 @@ module Gitlab
        # This allows multiple GeoLogCursors to randomly process a batch of events,
        # without favouring the shortest path (or latency).
        def arbitrary_sleep(delay)
-          sleep(delay + rand(1..20) * 0.1)
+          jitter = rand(1..20) * 0.1
+          sleep_break(delay + jitter)
+        end
+        def sleep_break(seconds)
+          while seconds > 0.0
+            to_sleep = seconds > 1.0 ? 1.0 : seconds
+            seconds -= to_sleep
+            sleep(to_sleep)
+            break if exit?
+          end
        end
        def gap_tracking

--- a/ee/spec/lib/gitlab/geo/log_cursor/daemon_spec.rb
+++ b/ee/spec/lib/gitlab/geo/log_cursor/daemon_spec.rb
@@ -22,16 +22,9 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
    allow(daemon).to receive(:arbitrary_sleep).and_return(0.1)
  end
-  # WARNINGS
+  # Warning: Ensure an exit condition for the main run! loop, or RSpec will not
-  #
+  # stop without an interrupt. You can use `ensure_exit_on` to specify the exact
-  # 1. Ensure an exit condition for the main run! loop, or RSpec will not stop
+  # number of calls to `exit?`, with the last call returning `true`.
-  #    without an interrupt.
-  #
-  #    I recommend using `ensure_exit_on`.
-  #
-  # 2. run! occasionally spawns git processes that run forever at 100% CPU.
-  #
-  #    I don't know why this happens.
  describe '#run!' do
    it 'traps signals' do
      ensure_exit_on(1)
@@ -41,12 +34,14 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
    end
    it 'delegates to #run_once! in a loop' do
-      ensure_exit_on(4)
+      ensure_exit_on(3)
      is_expected.to receive(:run_once!).twice
      daemon.run!
    end
+  end
+  describe '#run_once!' do
    it 'skips execution if cannot achieve a lease' do
      lease = stub_exclusive_lease_taken('geo_log_cursor_processed')
@@ -54,83 +49,68 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
      allow(lease).to receive(:same_uuid?).and_return(false)
      allow(Gitlab::Geo::LogCursor::Lease).to receive(:exclusive_lease).and_return(lease)
-      ensure_exit_on(2)
+      is_expected.not_to receive(:find_and_handle_events!)
-      is_expected.not_to receive(:run_once!)
-      daemon.run!
+      daemon.run_once!
    end
    it 'skips execution if not a Geo node' do
      stub_current_geo_node(nil)
-      ensure_exit_on(2)
      is_expected.to receive(:sleep_break).with(1.minute)
-      is_expected.not_to receive(:run_once!)
+      is_expected.not_to receive(:find_and_handle_events!)
-      daemon.run!
+      daemon.run_once!
    end
    it 'skips execution if the current node is a primary' do
      stub_current_geo_node(primary)
-      ensure_exit_on(2)
      is_expected.to receive(:sleep_break).with(1.minute)
-      is_expected.not_to receive(:run_once!)
+      is_expected.not_to receive(:find_and_handle_events!)
-      daemon.run!
+      daemon.run_once!
    end
-    context 'when run! has handled an error every call for over the allowed duration' do
+    context 'when the lease block rescues an error' do
-      it 'exits' do
+      context 'when this error is the final straw' do
-        # Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior.
+        it 'calls `#exit!`' do
-        # If `exit?` is called a third time, then the "exit if failing for too long" behavior is broken.
+          is_expected.to receive(:exit!)
-        expect(daemon).to receive(:exit?).and_call_original.twice
-        Timecop.freeze do
-          daemon.send(:handle_error, true)
-          Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
+          is_expected.to receive(:find_and_handle_events!).and_raise('any error').twice
-            expect(daemon).to receive(:gap_tracking).and_raise('boom')
-            is_expected.to receive(:run_once!).and_call_original.once
+          Timecop.freeze do
+            daemon.run_once!
-            daemon.run!
+            Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
+              daemon.run_once!
+            end
          end
        end
      end
-    end
-    context 'when run_once! has returned one call without raising an error before the allowed duration' do
-      it 'does not exit' do
-        # Can't use ensure_exit_on here since the logic we are testing depends on `exit?` behavior
-        expect(daemon).to receive(:exit?).and_call_original.exactly(5).times
-        # Force exit on 6th call
+      context 'when this error is not the final straw' do
-        # If `exit?` is not called 6 times, then the daemon stopped too early.
+        it 'does not call `#exit!`' do
-        expect(daemon).to receive(:exit?).and_return(true)
+          is_expected.not_to receive(:exit!)
-        Timecop.freeze do
+          Timecop.freeze do
-          # As if an error occurred
+            is_expected.to receive(:find_and_handle_events!).and_raise('any error')
-          daemon.send(:handle_error, true)
+            daemon.run_once!
-          Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
+            Timecop.travel(described_class::MAX_ERROR_DURATION + 1.second) do
-            # First, a successful run to reset the timer
+              is_expected.to receive(:find_and_handle_events!) # successful
-            expect(daemon).to receive(:gap_tracking).and_call_original
+              daemon.run_once!
-            # Then more errors
+              is_expected.to receive(:find_and_handle_events!).and_raise('any error')
-            expect(daemon).to receive(:gap_tracking).and_raise('boom').twice
+              daemon.run_once!
+            end
-            # It should continue running until we force it to exit
-            is_expected.to receive(:run_once!).and_call_original.exactly(3).times
-            daemon.run!
          end
        end
      end
    end
  end
-  describe '#run_once!' do
+  describe '#find_and_handle_events!' do
    context 'with some event logs' do
      let(:project) { create(:project) }
      let(:repository_updated_event) { create(:geo_repository_updated_event, project: project) }
@@ -141,7 +121,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
      it 'handles events' do
        expect(daemon).to receive(:handle_events).with(batch, anything)
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
      it 'calls #handle_gap_event for each gap the gap tracking finds' do
@@ -153,7 +133,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
        expect(daemon).to receive(:handle_single_event).with(event_log)
        expect(daemon).to receive(:handle_single_event).with(second_event_log)
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
    end
@@ -176,7 +156,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
        expect(Geo::ProjectSyncWorker).to receive(:perform_async).with(project.id, anything).once
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
      it 'does not replay events for projects that do not belong to selected namespaces to replicate' do
@@ -184,14 +164,14 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
        expect(Geo::ProjectSyncWorker).not_to receive(:perform_async).with(project.id, anything)
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
      it 'detects when an event was skipped' do
        updated_event = create(:geo_repository_updated_event, project: project)
        new_event = create(:geo_event_log, id: event_log.id + 2, repository_updated_event: updated_event)
-        daemon.run_once!
+        daemon.find_and_handle_events!
        create(:geo_event_log, id: event_log.id + 1)
@@ -204,11 +184,11 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
        updated_event = create(:geo_repository_updated_event, project: project)
        new_event = create(:geo_event_log, repository_updated_event: updated_event)
-        daemon.run_once!
+        daemon.find_and_handle_events!
        create(:geo_event_log, id: new_event.id + 3, repository_updated_event: updated_event)
-        daemon.run_once!
+        daemon.find_and_handle_events!
        create(:geo_event_log, id: new_event.id + 1, repository_updated_event: updated_event)
        create(:geo_event_log, id: new_event.id + 2, repository_updated_event: updated_event)
@@ -225,7 +205,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
                                                message: '#handle_single_event: unknown event',
                                                event_log_id: new_event.id))
-        daemon.run_once!
+        daemon.find_and_handle_events!
        expect(::Geo::EventLogState.last_processed.id).to eq(new_event.id)
      end
@@ -243,7 +223,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
                                                             event_type: 'Geo::RepositoryUpdatedEvent',
                                                             project_id: project.id))
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
      it 'does not replay events for projects that do not belong to selected shards to replicate' do
@@ -251,7 +231,7 @@ describe Gitlab::Geo::LogCursor::Daemon, :clean_gitlab_redis_shared_state do
        expect(Geo::ProjectSyncWorker).not_to receive(:perform_async).with(project.id, anything)
-        daemon.run_once!
+        daemon.find_and_handle_events!
      end
    end
  end