Commit f59dbe0c authored by Mark Lapierre, committed by Ramya Authappan

Add replication queue e2e test

- Allows PraefectManager to start and stop praefect and inspect the
  replication queue and gitaly-to-gitlab communication
parent fc512455
......@@ -118,6 +118,10 @@ module QA
run("git push #{uri} #{branch}", max_attempts: 3).to_s
end
# Runs `git push --all` so that every local branch is pushed in one command.
#
# @return [String] the result of the push command, converted with `to_s`
def push_all_branches
  push_result = run('git push --all')
  push_result.to_s
end
# Runs `git merge` for the given branch.
#
# @param branch [String] name of the branch to merge
# @return whatever `run` returns for the merge command
def merge(branch)
  merge_command = "git merge #{branch}"
  run(merge_command)
end
......
......@@ -6,25 +6,177 @@ module QA
include Service::Shellout
# Records the Docker container names and the Praefect virtual storage name
# that the other methods of this class interpolate into their shell commands.
def initialize
  # Containers running GitLab, Praefect and PostgreSQL.
  @gitlab = 'gitlab-gitaly-ha'
  @praefect = 'praefect'
  @postgres = 'postgres'

  # NOTE(review): @first_node/@second_node are not read by any code visible
  # here; they are kept in case something outside this view still uses them.
  @first_node = 'gitaly1'
  @second_node = 'gitaly2'

  # Gitaly storage node containers. Each is assigned exactly once (the
  # earlier duplicate assignments were dead stores).
  @primary_node = 'gitaly1'
  @secondary_node = 'gitaly2'
  @tertiary_node = 'gitaly3'

  @virtual_storage = 'default'
end
# Runs Praefect's `enable-writes` subcommand inside the Praefect container
# for the configured virtual storage.
#
# The command is issued once, using the container and storage names held in
# @praefect and @virtual_storage rather than hard-coded values.
#
# @return whatever `shell` returns for the command
def enable_writes
  shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
end
def stop_primary_node
shell "docker stop #{@primary_node}"
@secondary_node, @primary_node = @primary_node, @secondary_node
# Reports whether the repository of the given project has the same checksum
# on all Gitaly nodes, according to the `gitlab:praefect:replicas` rake task
# run inside the GitLab container.
#
# @param project_id [Integer, String] id interpolated into the rake task argument
# @return [Boolean] true when the three checksum columns are identical;
#   false when they differ or when no "gitaly_cluster" row is printed
def replicated?(project_id)
  replicated = false

  shell %(docker exec #{@gitlab} bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
    # The output of the rake task looks something like this:
    #
    # Project name                    | gitaly1 (primary)                        | gitaly2                                  | gitaly3
    # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
    # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
    #
    # We want to confirm that the checksums are identical
    next unless line.start_with?("gitaly_cluster")

    # Columns 1..3 are the per-node checksums; one unique value means all agree.
    replicated = line.split('|').map(&:strip)[1..3].uniq.one?
    break
  end

  # Returned explicitly so the result never depends on what `shell` returns
  # when no matching row was seen.
  replicated
end
def reset
shell "docker start #{@primary_node}"
shell "docker start #{@secondary_node}"
# Starts the Praefect container (the one named by @praefect).
#
# @return whatever `start_node` returns
def start_praefect
  start_node @praefect
end
# Stops the Praefect container (the one named by @praefect).
#
# @return whatever `stop_node` returns
def stop_praefect
  stop_node @praefect
end
# Starts a Docker container via `docker start`.
#
# @param name [String] name of the container to start
# @return whatever `shell` returns for the command
def start_node(name)
  start_command = "docker start #{name}"
  shell(start_command)
end
# Stops a Docker container via `docker stop`.
#
# @param name [String] name of the container to stop
# @return whatever `shell` returns for the command
def stop_node(name)
  stop_command = "docker stop #{name}"
  shell(stop_command)
end
# Stops the container named by @primary_node. As the method name says, the
# purpose is to provoke a failover; this method itself only stops the node.
#
# @return whatever `stop_node` returns
def trigger_failover_by_stopping_primary_node
  stop_node @primary_node
end
# Empties Praefect's replication queue.
#
# Runs psql inside the @postgres container against the praefect_production
# database and deletes every row from replication_queue_job_lock,
# replication_queue_lock and replication_queue, in that order.
def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue")
# The heredoc is a single shell command: the trailing backslashes join the
# lines, and \\" produces an escaped quote for the SQL passed to psql -c.
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
CMD
end
# Puts the replication queue into a "stalled" state by editing the Praefect
# database directly (psql inside the @postgres container).
#
# The SQL does three things:
# 1. sets the state of every replication_queue row to 'in_progress';
# 2. inserts a replication_queue_job_lock row for each in_progress job that
#    does not already have one (left join ... where rqjl.job_id is null);
# 3. marks every replication_queue_lock row as acquired.
def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
# The SQL spans several lines inside the quoted psql -c argument; only the
# first two lines of the command use backslash continuations.
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\""
CMD
end
# Counts the rows in replication_queue_lock that are currently acquired, by
# running a `select count(*)` through psql inside the @postgres container.
#
# @return [Integer] the number parsed from the third line of psql's output;
#   0 if fewer than three lines were produced (nil.to_i) or the line is not numeric
def replication_queue_lock_count
# Collects every output line yielded by `shell`.
result = []
cmd = <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
CMD
shell cmd do |line|
result << line
end
# The result looks like:
# count
# -----
# 1
# so the value is on the third line (index 2).
result[2].to_i
end
# Brings the cluster back to a working state: starts the Praefect container
# and the three Gitaly node containers (in that order), then re-enables writes.
#
# @return whatever `enable_writes` returns
def reset_cluster
  [@praefect, @primary_node, @secondary_node, @tertiary_node].each do |container|
    start_node(container)
  end

  enable_writes
end
# Waits until the Praefect log inside the Praefect container contains a
# "listening at tcp address" line.
#
# @return whatever `wait_until_shell_command_matches` returns
def wait_for_praefect
  read_log_command = "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'"
  listening_pattern = /listening at tcp address/

  wait_until_shell_command_matches(read_log_command, listening_pattern)
end
# Waits until Praefect's `sql-ping` subcommand, run inside the Praefect
# container, prints "praefect sql-ping: OK".
#
# @return whatever `wait_until_shell_command_matches` returns
def wait_for_sql_ping
  sql_ping_command = "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'"
  ping_ok_pattern = /praefect sql-ping: OK/

  wait_until_shell_command_matches(sql_ping_command, ping_ok_pattern)
end
# Waits until Praefect's `dial-nodes` subcommand has reported a SUCCESS line
# for each of the three Gitaly nodes in the configured virtual storage.
#
# Confirmations accumulate across output lines (and across retries of the
# command), so the nodes do not have to be confirmed in a single run.
#
# @return whatever `wait_until_shell_command` returns
def wait_for_storage_nodes
  # Nodes still awaiting confirmation; a node is dropped once its line is seen.
  unconfirmed = [@primary_node, @secondary_node, @tertiary_node].uniq
  dial_nodes_command = "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'"

  wait_until_shell_command(dial_nodes_command) do |output_line|
    QA::Runtime::Logger.info(output_line.chomp)

    unconfirmed.reject! do |node|
      output_line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
    end

    unconfirmed.empty?
  end
end
# Waits until `gitlab-rake gitlab:gitaly:check`, run inside the GitLab
# container, has printed both an OK line for the configured virtual storage
# and the "Checking Gitaly ... Finished" line.
#
# Matches accumulate across output lines (and across retries of the command).
#
# @return whatever `wait_until_shell_command` returns
def wait_for_gitaly_check
  # Patterns not yet seen in the output; each is removed once a line matches it.
  awaited = {
    storage_ok: /Gitaly: ... #{@virtual_storage} ... OK/,
    check_finished: /Checking Gitaly ... Finished/
  }
  check_command = "docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'"

  wait_until_shell_command(check_command) do |output_line|
    QA::Runtime::Logger.info(output_line.chomp)
    awaited.delete_if { |_name, pattern| output_line =~ pattern }
    awaited.empty?
  end
end
# Waits until `gitlab-rake gitlab:gitlab_shell:check`, run inside the GitLab
# container, prints "Checking GitLab Shell ... Finished".
#
# @return whatever `wait_until_shell_command_matches` returns
def wait_for_gitlab_shell_check
  check_command = "docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitlab_shell:check'"
  finished_pattern = /Checking GitLab Shell ... Finished/

  wait_until_shell_command_matches(check_command, finished_pattern)
end
# Runs every readiness check in sequence: Praefect listening, SQL ping,
# storage nodes dialable, Gitaly check, and finally the GitLab Shell check.
#
# @return whatever `wait_for_gitlab_shell_check` (the last check) returns
def wait_for_reliable_connection
  preliminary_checks = %i[
    wait_for_praefect
    wait_for_sql_ping
    wait_for_storage_nodes
    wait_for_gitaly_check
  ]
  preliminary_checks.each { |check| send(check) }

  wait_for_gitlab_shell_check
end
private
# Repeatedly runs a shell command (via Support::Waiter.wait_until) until the
# given block returns a truthy value for one of its output lines.
#
# @param cmd [String] the shell command to run on each attempt
# @yieldparam line [String] one line of the command's output
# @yieldreturn [Object] truthy when the line satisfies the caller's condition
# @return whatever Support::Waiter.wait_until returns
def wait_until_shell_command(cmd)
  Support::Waiter.wait_until do
    matched = false

    shell cmd do |line|
      next unless yield line

      matched = true
      # Stop reading output as soon as the condition is met.
      break
    end

    # An explicit flag, so an attempt with no matching line is never treated
    # as a success because of whatever `shell` happens to return.
    matched
  end
end
# Repeatedly runs a shell command until one of its output lines matches the
# given pattern. Every line examined is also written to the QA logger.
#
# @param cmd [String] the shell command to run
# @param regex [Regexp] pattern that a line must match
# @return whatever `wait_until_shell_command` returns
def wait_until_shell_command_matches(cmd, regex)
  wait_until_shell_command(cmd) do |output_line|
    QA::Runtime::Logger.info(output_line.chomp)
    regex =~ output_line
  end
end
end
end
......
......@@ -19,6 +19,13 @@ module QA
Open3.popen2e(*command) do |stdin, out, wait|
stdin.puts(stdin_data) if stdin_data
stdin.close if stdin_data
if block_given?
out.each do |line|
yield line
end
end
out.each_char { |char| print char }
if wait.value.exited? && wait.value.exitstatus.nonzero?
......
# frozen_string_literal: true
require 'parallel'
module QA
  # End-to-end check that replication for a new repository still works after
  # the Praefect replication queue has been left in a stalled state.
  RSpec.describe 'Create' do
    context 'Gitaly Cluster replication queue', :orchestrated, :gitaly_ha, :skip_live_env do
      let(:praefect_manager) { Service::PraefectManager.new }

      let(:project) do
        Resource::Project.fabricate! do |new_project|
          new_project.name = "gitaly_cluster"
          new_project.initialize_with_readme = true
        end
      end

      # Restart all containers and empty the queue so later specs start clean.
      after do
        praefect_manager.reset_cluster
        praefect_manager.clear_replication_queue
      end

      it 'allows replication of different repository after interruption' do
        # We want to fill the replication queue with 10 `in_progress` jobs,
        # while a lock has been acquired, which is when the problem occurred
        # as reported in https://gitlab.com/gitlab-org/gitaly/-/issues/2801
        #
        # We'll do this by creating 10 branches and pushing them all at once,
        # and then stop Praefect when a lock is acquired, set all the jobs
        # to `in_progress`, and create a job lock for each one.
        queue_size_target = 10

        Git::Repository.perform do |repo|
          repo.uri = project.repository_http_location.uri
          repo.use_default_credentials
          repo.clone
          repo.configure_identity('GitLab QA', 'root@gitlab.com')

          # One branch per queued job, each carrying a 10 MB random file.
          (1..queue_size_target).each do |branch_number|
            repo.checkout("branch#{branch_number}", new_branch: true)
            repo.commit_file("file#{branch_number}", SecureRandom.random_bytes(10_000_000), "Add file#{branch_number}")
          end

          repo.push_all_branches
        end

        # Poll until at least one replication queue lock has been acquired.
        # NOTE(review): this loop has no timeout or delay; if a lock is never
        # observed it will spin forever.
        lock_count = 0
        until lock_count >= 1
          lock_count = praefect_manager.replication_queue_lock_count
          QA::Runtime::Logger.debug("Lock count: #{lock_count}")
        end

        # Stall the queue while Praefect is down, then bring it back up.
        praefect_manager.stop_praefect
        praefect_manager.create_stalled_replication_queue
        praefect_manager.start_praefect
        praefect_manager.wait_for_reliable_connection

        # Create a new project, push to it, and check that replication occurs
        project_push = Resource::Repository::ProjectPush.fabricate! do |push|
          push.project_name = "gitaly_cluster"
        end

        expect(praefect_manager.replicated?(project_push.project.id)).to be true
      end
    end
  end
end
......@@ -18,7 +18,7 @@ module QA
end
after do
praefect_manager.reset
praefect_manager.reset_cluster
end
it 'makes sure that automatic failover is happening' do
......@@ -30,7 +30,7 @@ module QA
push.file_content = "This should exist on both nodes"
end
praefect_manager.stop_primary_node
praefect_manager.trigger_failover_by_stopping_primary_node
project.visit!
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment