Commit 7b8e5b7d authored by Mark Lapierre

Add E2E test of backend node recovery

Includes changes to allow Resources to return enumerable results via
the API
parent 3d9857f7
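
A minimal sketch of the enumerable-results change described in the commit message, based on the `commits` method and the spec usage in the diff below; the stub class and sample JSON are illustrative, not the real Resource::Project:

require 'json'

class ProjectStub
  # Stand-in for Resource::Project#commits, which in the real resource calls
  # parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
  def commits
    parse_body('[{"id":"abc123","message":"Initial commit"}]')
  end

  private

  # Assumes parse_body returns hashes with symbolized keys, matching the
  # commit[:message] access used in the E2E spec
  def parse_body(json)
    JSON.parse(json, symbolize_names: true)
  end
end

# Tests can then treat the result as a plain enumerable:
ProjectStub.new.commits.map { |commit| commit[:message] }
# => ["Initial commit"]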
@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners"
end
def api_commits_path
"#{api_get_path}/repository/commits"
end
def api_repository_branches_path
"#{api_get_path}/repository/branches"
end
@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end
def commits
parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
end
def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
......
@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default'
end
def enable_writes
shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
# Executes the praefect `dataloss` command.
#
# @return [Boolean] whether dataloss has occurred
def dataloss?
wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end
def replicated?(project_id)
shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this:
#
# Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
#
# We want to confirm that the checksums are identical
break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
break line if line.start_with?("gitaly_cluster")
end
# We want to know if the checksums are identical
replicas.split('|').map(&:strip)[1..3].uniq.one?
end
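
As a worked example of the checksum comparison above, applied to the sample row from the comment (a standalone sketch, independent of the rake task):

row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'
checksums = row.split('|').map(&:strip)[1..3]
# => the three replica checksums, one per Gitaly node
checksums.uniq.one?
# => true, meaning every node reports the same checksum and the project is fully replicated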
def start_primary_node
start_node(@primary_node)
end
def start_praefect
@@ -51,40 +61,54 @@ module QA
end
def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
end
def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
CMD
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
<<~SQL
delete from replication_queue_job_lock;
delete from replication_queue_lock;
delete from replication_queue;
SQL
)
end
def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\""
CMD
QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell sql_to_docker_exec_cmd(
<<~SQL
update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';
SQL
)
end
# Reconciles the previous primary node with the current one
# I.e., it brings the previous primary node up-to-date
def reconcile_nodes
reconcile_node_with_node(@primary_node, current_primary_node)
end
def reconcile_node_with_node(target, reference)
QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
/FINISHED: \d+ repos were checked for consistency/,
sleep_interval: 5,
retry_on_exception: true
)
end
def replication_queue_lock_count
result = []
cmd = <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
CMD
shell cmd do |line|
shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
result << line
end
# The result looks like:
@@ -94,12 +118,36 @@ module QA
result[2].to_i
end
# Makes the original primary (gitaly1) the primary again by
# stopping the other nodes, waiting for gitaly1 to be made the
# primary again, and then starting the other nodes and enabling
# writes
def reset_primary_to_original
QA::Runtime::Logger.info("Checking primary node...")
return if @primary_node == current_primary_node
QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
start_node(@primary_node)
stop_node(@secondary_node)
stop_node(@tertiary_node)
wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
wait_for_health_check_all_nodes
wait_for_reliable_connection
end
def reset_cluster
QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
enable_writes
wait_for_health_check_all_nodes
end
def verify_storage_move(source_storage, destination_storage)
@@ -111,10 +159,33 @@ module QA
end
def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
)
# Praefect can fail to start if unable to dial one of the gitaly nodes
# See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
# We tail the logs so that, if tests fail, we can confirm whether that was the problem
shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed'
end
end
def wait_for_sql_ping
@@ -124,35 +195,86 @@ module QA
)
end
def wait_for_no_praefect_storage_error
# If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
# That is, there's no message shown in the Praefect logs when the healthcheck succeeds
# To work around that, we run the gitaly check rake task, wait a few seconds, and then confirm that no healthcheck errors appear
QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
Support::Waiter.wait_until(max_duration: 120) do
wait_for_gitaly_check
sleep 5
shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
break true if log['msg'] != 'error when pinging healthcheck'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
end
def wait_for_storage_nodes
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
wait_for_no_praefect_storage_error
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.info(line.chomp)
Support::Waiter.repeat_until(max_attempts: 3) do
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.debug(line.chomp)
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
end
nodes_confirmed.values.all?
end
end
end
nodes_confirmed.values.all?
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
wait_for_health_check(@tertiary_node)
end
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
def wait_for_gitaly_check
storage_ok = false
check_finished = false
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false
check_finished = false
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.info(line.chomp)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.debug(line.chomp)
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok && check_finished
storage_ok && check_finished
end
end
end
@@ -164,15 +286,36 @@ module QA
end
def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
wait_for_gitaly_check
wait_for_gitlab_shell_check
end
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
end
private
def current_primary_node
shell dataloss_command do |line|
QA::Runtime::Logger.debug(line.chomp)
match = line.match(/Primary: (.*)/)
break match[1] if match
end
end
def dataloss_command
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
end
def sql_to_docker_exec_cmd(sql)
Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
end
def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line)
@@ -203,21 +346,10 @@ module QA
end
end
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.info(line.chomp)
line =~ regex
def with_praefect_log
wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
yield JSON.parse(line)
end
end
end
......
@@ -33,6 +33,31 @@ module QA
end
end
end
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
<<~CMD
docker exec --env PGPASSWORD=#{password} #{container} \
bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
CMD
end
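
For illustration, with the arguments PraefectManager passes above ('postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', and the Praefect Postgres container held in @postgres), this helper renders the same command shape as the inline heredocs it replaces, for example:

docker exec --env PGPASSWORD=SQL_PASSWORD <postgres-container> \
bash -c "psql -U postgres -d praefect_production -h postgres.test -c \"select count(*) from replication_queue_lock where acquired = 't';\""

The container name placeholder stands in for whatever @postgres holds; the SQL is whatever the caller passes.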
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
end
end
end
end
end
# frozen_string_literal: true
module QA
RSpec.describe 'Create' do
context 'Gitaly' do
describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
let(:praefect_manager) { Service::PraefectManager.new }
let(:project) do
Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
project.initialize_with_readme = true
end
end
before do
# Reset the cluster in case previous tests left it in a bad state
praefect_manager.reset_primary_to_original
end
after do
# Leave the cluster in a suitable state for subsequent tests
praefect_manager.reset_primary_to_original
end
it 'recovers from dataloss' do
# Create a new project with a commit and wait for it to replicate
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.trigger_failover_by_stopping_primary_node
praefect_manager.wait_for_new_primary
praefect_manager.wait_for_health_check_current_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
commits.sha = 'master'
end
end
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.new_branch = false
push.commit_message = 'pushed after failover'
push.file_name = 'new_file'
push.file_content = 'new file'
end
# Start the old primary node again
praefect_manager.start_primary_node
praefect_manager.wait_for_health_check_current_primary_node
# Confirm dataloss (i.e., inconsistent nodes)
expect(praefect_manager.dataloss?).to be true
expect(praefect_manager.replicated?(project.id)).to be false
# Reconcile nodes to recover from dataloss
praefect_manager.reconcile_nodes
praefect_manager.wait_for_replication(project.id)
# Confirm that both commits are available after reconciliation
expect(project.commits.map { |commit| commit[:message].chomp })
.to include("Initial commit").and include("pushed after failover")
end
end
end
end
end
@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file)
end
praefect_manager.enable_writes
Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project
commit.add_files([
......