Commit 7b8e5b7d authored by Mark Lapierre

Add E2E test of backend node recovery

Includes changes to allow Resources to return enumerable results via
the API
parent 3d9857f7
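
A minimal sketch of the enumerable-results change described in the commit message, based on the `commits` method and the spec usage in the diff below; the stub class and sample JSON are illustrative, not the real Resource::Project:

require 'json'

class ProjectStub
  # Stand-in for Resource::Project#commits, which in the real resource calls
  # parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
  def commits
    parse_body('[{"id":"abc123","message":"Initial commit"}]')
  end

  private

  # Assumes parse_body returns hashes with symbolized keys, matching the
  # commit[:message] access used in the E2E spec
  def parse_body(json)
    JSON.parse(json, symbolize_names: true)
  end
end

# Tests can then treat the result as a plain enumerable:
ProjectStub.new.commits.map { |commit| commit[:message] }
# => ["Initial commit"]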
@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners"
end
def api_commits_path
"#{api_get_path}/repository/commits"
end
def api_repository_branches_path
"#{api_get_path}/repository/branches"
end
@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end
def commits
parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
end
def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
......
@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default'
end
def enable_writes
shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
# Executes the praefect `dataloss` command.
#
# @return [Boolean] whether dataloss has occurred
def dataloss?
wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end
def replicated?(project_id)
shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this:
#
# Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
#
# We want to confirm that the checksums are identical
break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
break line if line.start_with?("gitaly_cluster")
end
# We want to know if the checksums are identical
replicas.split('|').map(&:strip)[1..3].uniq.one?
end
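
As a worked example of the checksum comparison above, applied to the sample row from the comment (a standalone sketch, independent of the rake task):

row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'
checksums = row.split('|').map(&:strip)[1..3]
# => the three replica checksums, one per Gitaly node
checksums.uniq.one?
# => true, meaning every node reports the same checksum and the project is fully replicated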
def start_primary_node
start_node(@primary_node)
end
def start_praefect
@@ -51,40 +61,54 @@ module QA
end
def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
end
def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
CMD
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
<<~SQL
delete from replication_queue_job_lock;
delete from replication_queue_lock;
delete from replication_queue;
SQL
)
end
def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\""
CMD
QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell sql_to_docker_exec_cmd(
<<~SQL
update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';
SQL
)
end
# Reconciles the previous primary node with the current one
# I.e., it brings the previous primary node up-to-date
def reconcile_nodes
reconcile_node_with_node(@primary_node, current_primary_node)
end
def reconcile_node_with_node(target, reference)
QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
/FINISHED: \d+ repos were checked for consistency/,
sleep_interval: 5,
retry_on_exception: true
)
end
def replication_queue_lock_count
result = []
cmd = <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
CMD
shell cmd do |line|
shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
result << line
end
# The result looks like:
@@ -94,12 +118,36 @@ module QA
result[2].to_i
end
# Makes the original primary (gitaly1) the primary again by
# stopping the other nodes, waiting for gitaly1 to be made the
# primary again, and then starting the other nodes and enabling
# writes
def reset_primary_to_original
QA::Runtime::Logger.info("Checking primary node...")
return if @primary_node == current_primary_node
QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
start_node(@primary_node)
stop_node(@secondary_node)
stop_node(@tertiary_node)
wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
wait_for_health_check_all_nodes
wait_for_reliable_connection
end
def reset_cluster
QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
enable_writes
wait_for_health_check_all_nodes
end
def verify_storage_move(source_storage, destination_storage)
@@ -111,10 +159,33 @@ module QA
end
def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
)
# Praefect can fail to start if unable to dial one of the gitaly nodes
# See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
# We tail the logs so that, if tests fail, we can confirm whether that was the problem
shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed'
end
end
def wait_for_sql_ping
@@ -124,35 +195,86 @@ module QA
)
end
def wait_for_no_praefect_storage_error
# If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
# That is, there's no message shown in the Praefect logs when the healthcheck succeeds
# To work around that, we run the gitaly check rake task, wait a few seconds, and then confirm that no healthcheck errors appear
QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
Support::Waiter.wait_until(max_duration: 120) do
wait_for_gitaly_check
sleep 5
shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
break true if log['msg'] != 'error when pinging healthcheck'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
end
def wait_for_storage_nodes
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
wait_for_no_praefect_storage_error
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.info(line.chomp)
Support::Waiter.repeat_until(max_attempts: 3) do
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.debug(line.chomp)
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
end
nodes_confirmed.values.all?
end
end
end
nodes_confirmed.values.all?
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
wait_for_health_check(@tertiary_node)
end
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
def wait_for_gitaly_check
storage_ok = false
check_finished = false
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false
check_finished = false
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.info(line.chomp)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.debug(line.chomp)
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok && check_finished
storage_ok && check_finished
end
end
end
@@ -164,15 +286,36 @@ module QA
end
def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
wait_for_gitaly_check
wait_for_gitlab_shell_check
end
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
end
private
def current_primary_node
shell dataloss_command do |line|
QA::Runtime::Logger.debug(line.chomp)
match = line.match(/Primary: (.*)/)
break match[1] if match
end
end
def dataloss_command
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
end
def sql_to_docker_exec_cmd(sql)
Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
end
def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line)
@@ -203,21 +346,10 @@ module QA
end
end
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.info(line.chomp)
line =~ regex
def with_praefect_log
wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
yield JSON.parse(line)
end
end
end
......
@@ -33,6 +33,31 @@ module QA
end
end
end
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
<<~CMD
docker exec --env PGPASSWORD=#{password} #{container} \
bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
CMD
end
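
For illustration, with the arguments PraefectManager passes above ('postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', and the Praefect Postgres container held in @postgres), this helper renders the same command shape as the inline heredocs it replaces, for example:

docker exec --env PGPASSWORD=SQL_PASSWORD <postgres-container> \
bash -c "psql -U postgres -d praefect_production -h postgres.test -c \"select count(*) from replication_queue_lock where acquired = 't';\""

The container name placeholder stands in for whatever @postgres holds; the SQL is whatever the caller passes.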
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
end
end
end
end
end
# frozen_string_literal: true
module QA
RSpec.describe 'Create' do
context 'Gitaly' do
describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
let(:praefect_manager) { Service::PraefectManager.new }
let(:project) do
Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
project.initialize_with_readme = true
end
end
before do
# Reset the cluster in case previous tests left it in a bad state
praefect_manager.reset_primary_to_original
end
after do
# Leave the cluster in a suitable state for subsequent tests
praefect_manager.reset_primary_to_original
end
it 'recovers from dataloss' do
# Create a new project with a commit and wait for it to replicate
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.trigger_failover_by_stopping_primary_node
praefect_manager.wait_for_new_primary
praefect_manager.wait_for_health_check_current_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
commits.sha = 'master'
end
end
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.new_branch = false
push.commit_message = 'pushed after failover'
push.file_name = 'new_file'
push.file_content = 'new file'
end
# Start the old primary node again
praefect_manager.start_primary_node
praefect_manager.wait_for_health_check_current_primary_node
# Confirm dataloss (i.e., inconsistent nodes)
expect(praefect_manager.dataloss?).to be true
expect(praefect_manager.replicated?(project.id)).to be false
# Reconcile nodes to recover from dataloss
praefect_manager.reconcile_nodes
praefect_manager.wait_for_replication(project.id)
# Confirm that both commits are available after reconciliation
expect(project.commits.map { |commit| commit[:message].chomp })
.to include("Initial commit").and include("pushed after failover")
end
end
end
end
end
@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file)
end
praefect_manager.enable_writes
Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project
commit.add_files([
......