Commit 26fc9aa9 authored by Stan Hu's avatar Stan Hu

Geo: Don't attempt to schedule a repository sync for downed Gitaly shards

We saw in the Geo testbed where a misconfigured Gitaly node would receive many
gRPC timeouts when a worker attempted to sync a repository on specific
shards. Until direct filesystem access is eliminated from GitLab, we now
perform an additional health check via a gRPC ping to ensure each shard has a
responsive Gitaly server.

Closes #4329
parent 1d9e53ed
---
title: 'Geo: Don''t attempt to schedule a repository sync for downed Gitaly shards'
merge_request:
author:
type: changed
...@@ -3,6 +3,11 @@ module Geo ...@@ -3,6 +3,11 @@ module Geo
include ApplicationWorker include ApplicationWorker
include CronjobQueue include CronjobQueue
HEALTHY_SHARD_CHECKS = [
Gitlab::HealthChecks::FsShardsCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
def perform def perform
return unless Gitlab::Geo.geo_database_configured? return unless Gitlab::Geo.geo_database_configured?
return unless Gitlab::Geo.secondary? return unless Gitlab::Geo.secondary?
...@@ -17,10 +22,13 @@ module Geo ...@@ -17,10 +22,13 @@ module Geo
end end
def healthy_shards def healthy_shards
Gitlab::HealthChecks::FsShardsCheck # For now, we need to perform both Gitaly and direct filesystem checks to ensure
.readiness # the shard is healthy. We take the intersection of the successful checks
.select(&:success) # as the healthy shards.
.map { |check| check.labels[:shard] } HEALTHY_SHARD_CHECKS.map(&:readiness)
.map { |check_result| check_result.select(&:success) }
.inject(:&)
.map { |check_result| check_result.labels[:shard] }
.compact .compact
.uniq .uniq
end end
......
...@@ -48,6 +48,27 @@ describe Geo::RepositorySyncWorker, :geo, :clean_gitlab_redis_cache do ...@@ -48,6 +48,27 @@ describe Geo::RepositorySyncWorker, :geo, :clean_gitlab_redis_cache do
Sidekiq::Testing.inline! { subject.perform } Sidekiq::Testing.inline! { subject.perform }
end end
it 'skips backfill for projects with downed Gitaly server' do
create(:project, group: synced_group, repository_storage: 'broken')
unhealthy_dirty = create(:project, group: synced_group, repository_storage: 'broken')
healthy_shard = project_in_synced_group.repository.storage
create(:geo_project_registry, :synced, :repository_dirty, project: unhealthy_dirty)
# Report only one healthy shard
allow(Gitlab::HealthChecks::FsShardsCheck).to receive(:readiness).and_return(
[Gitlab::HealthChecks::Result.new(true, nil, { shard: healthy_shard }),
Gitlab::HealthChecks::Result.new(true, nil, { shard: 'broken' })])
allow(Gitlab::HealthChecks::GitalyCheck).to receive(:readiness).and_return(
[Gitlab::HealthChecks::Result.new(true, nil, { shard: healthy_shard }),
Gitlab::HealthChecks::Result.new(false, nil, { shard: 'broken' })])
expect(Geo::RepositoryShardSyncWorker).to receive(:perform_async).with(healthy_shard)
expect(Geo::RepositoryShardSyncWorker).not_to receive(:perform_async).with('broken')
Sidekiq::Testing.inline! { subject.perform }
end
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment