Commit b8da28c1 authored by Stan Hu's avatar Stan Hu

Merge branch...

Merge branch '5195-geo-actively-try-to-correct-verification-failures-on-the-secondary' into 'master'

Resolve "Geo: Actively try to correct verification failures on the secondary"

Closes #5195

See merge request gitlab-org/gitlab-ee!6759
parents 8100c031 3ab1648e
......@@ -17,9 +17,9 @@ it from the primary to resolve the issue.
If verification succeeds on the **primary** but fails on the **secondary**,
this indicates that the object was corrupted during the replication process.
Until [issue #5195][ee-5195] is implemented, Geo won't automatically resolve
verification failures of this kind, so you should follow
[these instructions][reset-verification]
Geo actively try to correct verification failures marking the repository to
be resynced with a backoff period. If you want to reset the verification for
these failures, so you should follow [these instructions][reset-verification].
If verification is lagging significantly behind replication, consider giving
the node more time before scheduling a planned failover.
......@@ -88,9 +88,10 @@ for every node after every update to make sure that they are all in sync.
# Reset verification for projects where verification has failed
Until [issue #5195][ee-5195] is implemented, Geo won't automatically resolve
verification failures, so you should reset them manually. This rake task marks
projects where verification has failed or the checksum mismatch to be resynced:
Geo actively try to correct verification failures marking the repository to
be resynced with a backoff period. If you want to reset them manually, this
rake task marks projects where verification has failed or the checksum mismatch
to be resynced without the backoff period:
#### For repositories:
......
......@@ -355,12 +355,9 @@ module Geo
# @return [ActiveRecord::Relation<Geo::ProjectRegistry>] list of registries that need verification
def legacy_find_registries_to_verify(batch_size:)
repo_condition = local_repo_condition
wiki_condition = local_wiki_condition
registries = Geo::ProjectRegistry
.where(repo_condition.or(wiki_condition))
.pluck(:project_id, repo_condition.to_sql, wiki_condition.to_sql)
.where(local_repo_condition.or(local_wiki_condition))
.pluck(:project_id, local_repo_condition.to_sql, local_wiki_condition.to_sql)
return Geo::ProjectRegistry.none if registries.empty?
......@@ -410,12 +407,22 @@ module Geo
local_registry_table[:repository_verification_checksum_sha].eq(nil)
.and(local_registry_table[:last_repository_verification_failure].eq(nil))
.and(local_registry_table[:resync_repository].eq(false))
.and(repository_missing_on_primary_is_not_true)
end
def local_wiki_condition
local_registry_table[:wiki_verification_checksum_sha].eq(nil)
.and(local_registry_table[:last_wiki_verification_failure].eq(nil))
.and(local_registry_table[:resync_wiki].eq(false))
.and(wiki_missing_on_primary_is_not_true)
end
def repository_missing_on_primary_is_not_true
Arel::Nodes::SqlLiteral.new("project_registry.repository_missing_on_primary IS NOT TRUE")
end
def wiki_missing_on_primary_is_not_true
Arel::Nodes::SqlLiteral.new("project_registry.wiki_missing_on_primary IS NOT TRUE")
end
end
end
......@@ -115,7 +115,11 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
"#{type}_verification_checksum_sha" => nil,
"#{type}_checksum_mismatch" => false,
"last_#{type}_verification_failure" => nil,
"resync_#{type}_was_scheduled_at" => scheduled_at)
"#{type}_verification_retry_count" => nil,
"resync_#{type}_was_scheduled_at" => scheduled_at,
"#{type}_retry_count" => nil,
"#{type}_retry_at" => nil
)
end
def repository_sync_due?(scheduled_time)
......@@ -150,6 +154,10 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
retry_count(type) > RETRIES_BEFORE_REDOWNLOAD
end
def verification_retry_count(type)
public_send("#{type}_verification_retry_count").to_i # rubocop:disable GitlabSecurity/PublicSend
end
private
def fetches_since_gc_redis_key
......@@ -178,15 +186,6 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
last_wiki_synced_at && timestamp > last_wiki_synced_at
end
# To prevent the retry time from storing invalid dates in the database,
# cap the max time to a week plus some random jitter value.
def next_retry_time(retry_count)
proposed_time = Time.now + delay(retry_count).seconds
max_future_time = Time.now + 7.days + delay(1).seconds
[proposed_time, max_future_time].min
end
def retry_count(type)
public_send("#{type}_retry_count") || -1 # rubocop:disable GitlabSecurity/PublicSend
end
......
# frozen_string_literal: true
module Geo
class BaseRepositoryVerificationService
include Delay
include Gitlab::Geo::ProjectLogHelpers
def execute
raise NotImplementedError
end
private
def calculate_checksum(repository)
repository.checksum
rescue Gitlab::Git::Repository::NoRepository, Gitlab::Git::Repository::InvalidRepository
Gitlab::Git::Repository::EMPTY_REPOSITORY_CHECKSUM
end
def calculate_next_retry_attempt(resource, type)
retry_count = resource.public_send("#{type}_retry_count").to_i + 1 # rubocop:disable GitlabSecurity/PublicSend
[next_retry_time(retry_count), retry_count]
end
end
end
module Geo
class RepositoryVerificationPrimaryService
include Delay
include Gitlab::Geo::ProjectLogHelpers
# frozen_string_literal: true
module Geo
class RepositoryVerificationPrimaryService < BaseRepositoryVerificationService
def initialize(project)
@project = project
end
def execute
calculate_repository_checksum
calculate_wiki_checksum
verify_checksum(:repository, project.repository)
verify_checksum(:wiki, project.wiki.repository)
end
private
attr_reader :project
def calculate_repository_checksum
calculate_checksum(:repository, project.repository)
end
def calculate_wiki_checksum
calculate_checksum(:wiki, project.wiki.repository)
end
def calculate_checksum(type, repository)
update_repository_state!(type, checksum: repository.checksum)
rescue Gitlab::Git::Repository::NoRepository, Gitlab::Git::Repository::InvalidRepository
update_repository_state!(type, checksum: Gitlab::Git::Repository::EMPTY_REPOSITORY_CHECKSUM)
def verify_checksum(type, repository)
checksum = calculate_checksum(repository)
update_repository_state!(type, checksum: checksum)
rescue => e
log_error('Error calculating the repository checksum', e, type: type)
log_error("Error calculating the #{type} checksum", e, type: type)
update_repository_state!(type, failure: e.message)
end
def update_repository_state!(type, checksum: nil, failure: nil)
retry_at, retry_count =
if failure.present?
retry_count = repository_state.public_send("#{type}_retry_count").to_i + 1 # rubocop:disable GitlabSecurity/PublicSend
[next_retry_time(retry_count), retry_count]
calculate_next_retry_attempt(repository_state, type)
end
repository_state.update!(
......@@ -48,15 +37,6 @@ module Geo
)
end
# To prevent the retry time from storing invalid dates in the database,
# cap the max time to a week plus some random jitter value.
def next_retry_time(retry_count)
proposed_time = Time.now + delay(retry_count).seconds
max_future_time = Time.now + 7.days + delay(1).seconds
[proposed_time, max_future_time].min
end
def repository_state
@repository_state ||= project.repository_state || project.build_repository_state
end
......
......@@ -47,7 +47,9 @@ module Geo
"resync_#{type}" => true,
"#{type}_verification_checksum_sha" => nil,
"#{type}_checksum_mismatch" => false,
"last_#{type}_verification_failure" => nil
"last_#{type}_verification_failure" => nil,
"#{type}_verification_retry_count" => nil,
"#{type}_missing_on_primary" => nil
}
end
end
......
module Geo
class RepositoryVerificationSecondaryService
include Gitlab::Geo::ProjectLogHelpers
# frozen_string_literal: true
module Geo
class RepositoryVerificationSecondaryService < BaseRepositoryVerificationService
def initialize(registry, type)
@registry = registry
@type = type.to_sym
......@@ -23,8 +23,9 @@ module Geo
def should_verify_checksum?
return false if resync?
return false unless primary_checksum.present?
primary_checksum.present? && primary_checksum != secondary_checksum
mismatch?(secondary_checksum)
end
def resync?
......@@ -39,46 +40,47 @@ module Geo
registry.public_send("#{type}_verification_checksum_sha") # rubocop:disable GitlabSecurity/PublicSend
end
def mismatch?(checksum)
primary_checksum != checksum
end
def verify_checksum
checksum = calculate_checksum
checksum = calculate_checksum(repository)
if mismatch?(checksum)
update_registry!(mismatch: true, failure: "#{type.to_s.capitalize} checksum mismatch: #{repository.disk_path}")
update_registry!(mismatch: true, failure: "#{type.to_s.capitalize} checksum mismatch")
else
update_registry!(checksum: checksum)
end
rescue ::Gitlab::Git::Repository::ChecksumError, Timeout::Error => e
update_registry!(failure: "Error verifying #{type.to_s.capitalize} checksum: #{repository.disk_path}", exception: e)
rescue => e
update_registry!(failure: "Error calculating #{type} checksum", exception: e)
end
def calculate_checksum
repository.checksum
rescue Gitlab::Git::Repository::NoRepository, Gitlab::Git::Repository::InvalidRepository
Gitlab::Git::Repository::EMPTY_REPOSITORY_CHECKSUM
def update_registry!(checksum: nil, mismatch: false, failure: nil, exception: nil)
reverify, verification_retry_count =
if mismatch || failure.present?
log_error(failure, exception, type: type)
[true, registry.verification_retry_count(type) + 1]
else
[false, nil]
end
def mismatch?(checksum)
primary_checksum != checksum
resync_retry_at, resync_retry_count =
if reverify
[*calculate_next_retry_attempt(registry, type)]
end
def update_registry!(checksum: nil, mismatch: false, failure: nil, exception: nil, details: {})
attrs = {
registry.update!(
"#{type}_verification_checksum_sha" => checksum,
"#{type}_checksum_mismatch" => mismatch,
"last_#{type}_verification_failure" => failure
}
if failure
log_error(failure, exception,
type: type,
repository_shard: project.repository_storage,
repsitory_disk_path: repository.disk_path
"last_#{type}_verification_failure" => failure,
"#{type}_verification_retry_count" => verification_retry_count,
"resync_#{type}" => reverify,
"#{type}_retry_at" => resync_retry_at,
"#{type}_retry_count" => resync_retry_count
)
end
registry.update!(attrs)
end
def repository
@repository ||=
case type
......
---
title: Geo - Actively try to correct verification failures on the secondary
merge_request: 6759
author:
type: added
# frozen_string_literal: true
class AddRetryVerificationFieldsToProjectRegistry < ActiveRecord::Migration
def change
add_column :project_registry, :repository_verification_retry_count, :integer
add_column :project_registry, :wiki_verification_retry_count, :integer
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180727221937) do
ActiveRecord::Schema.define(version: 20180802215313) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -80,6 +80,8 @@ ActiveRecord::Schema.define(version: 20180727221937) do
t.datetime_with_timezone "resync_wiki_was_scheduled_at"
t.boolean "repository_missing_on_primary"
t.boolean "wiki_missing_on_primary"
t.integer "repository_verification_retry_count"
t.integer "wiki_verification_retry_count"
end
add_index "project_registry", ["last_repository_successful_sync_at"], name: "index_project_registry_on_last_repository_successful_sync_at", using: :btree
......
......@@ -3,4 +3,13 @@ module Delay
def delay(retry_count = 0)
(retry_count**4) + 15 + (rand(30) * (retry_count + 1))
end
# To prevent the retry time from storing invalid dates in the database,
# cap the max time to a week plus some random jitter value.
def next_retry_time(retry_count)
proposed_time = Time.now + delay(retry_count).seconds
max_future_time = Time.now + 7.days + delay(1).seconds
[proposed_time, max_future_time].min
end
end
......@@ -677,6 +677,20 @@ describe Geo::ProjectRegistryFinder, :geo do
expect(subject.find_registries_to_verify(batch_size: 100)).to be_empty
end
it 'does not return registries when the repository is missing on primary' do
project_verified = create(:repository_state, :repository_verified).project
create(:geo_project_registry, :synced, project: project_verified, repository_missing_on_primary: true)
expect(subject.find_registries_to_verify(batch_size: 100)).to be_empty
end
it 'does not return registries when the wiki is missing on primary' do
project_verified = create(:repository_state, :wiki_verified).project
create(:geo_project_registry, :synced, project: project_verified, wiki_missing_on_primary: true)
expect(subject.find_registries_to_verify(batch_size: 100)).to be_empty
end
end
end
......
This diff is collapsed.
......@@ -53,7 +53,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_repository: true,
repository_verification_checksum_sha: nil,
repository_checksum_mismatch: false,
last_repository_verification_failure: nil
last_repository_verification_failure: nil,
repository_verification_retry_count: nil,
repository_missing_on_primary: nil
)
end
......@@ -67,7 +69,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_repository: true,
repository_verification_checksum_sha: nil,
repository_checksum_mismatch: false,
last_repository_verification_failure: nil
last_repository_verification_failure: nil,
repository_verification_retry_count: nil,
repository_missing_on_primary: nil
)
end
......@@ -81,7 +85,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_repository: false,
repository_verification_checksum_sha: be_present,
repository_checksum_mismatch: false,
last_repository_verification_failure: nil
last_repository_verification_failure: nil,
repository_verification_retry_count: nil,
repository_missing_on_primary: nil
)
end
......@@ -95,7 +101,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_repository: false,
repository_verification_checksum_sha: nil,
repository_checksum_mismatch: false,
last_repository_verification_failure: nil
last_repository_verification_failure: nil,
repository_verification_retry_count: nil,
repository_missing_on_primary: nil
)
end
end
......@@ -121,7 +129,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_wiki: true,
wiki_verification_checksum_sha: nil,
wiki_checksum_mismatch: false,
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
wiki_verification_retry_count: nil,
wiki_missing_on_primary: nil
)
end
......@@ -135,7 +145,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_wiki: true,
wiki_verification_checksum_sha: nil,
wiki_checksum_mismatch: false,
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
wiki_verification_retry_count: nil,
wiki_missing_on_primary: nil
)
end
......@@ -149,7 +161,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_wiki: false,
wiki_verification_checksum_sha: be_present,
wiki_checksum_mismatch: false,
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
wiki_verification_retry_count: nil,
wiki_missing_on_primary: nil
)
end
......@@ -163,7 +177,9 @@ describe Geo::RepositoryVerificationReset, :geo do
resync_wiki: false,
wiki_verification_checksum_sha: nil,
wiki_checksum_mismatch: false,
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
wiki_verification_retry_count: nil,
wiki_missing_on_primary: nil
)
end
end
......
......@@ -3,19 +3,13 @@ require 'spec_helper'
describe Geo::RepositoryVerificationSecondaryService, :geo do
include ::EE::GeoHelpers
let(:secondary) { create(:geo_node) }
before do
stub_current_geo_node(secondary)
end
shared_examples 'verify checksums for repositories/wikis' do |type|
let(:repository) { find_repository(type) }
subject(:service) { described_class.new(registry, type) }
it 'does not calculate the checksum when not running on a secondary' do
allow(Gitlab::Geo).to receive(:secondary?) { false }
allow(Gitlab::Geo).to receive(:secondary?).and_return(false)
expect(repository).not_to receive(:checksum)
......@@ -38,7 +32,7 @@ describe Geo::RepositoryVerificationSecondaryService, :geo do
service.execute
end
it 'does not verify the checksum if the checksums already match' do
it 'does not verify the checksum if the current checksum matches' do
repository_state.assign_attributes("#{type}_verification_checksum" => 'my_checksum')
registry.assign_attributes("#{type}_verification_checksum_sha" => 'my_checksum')
......@@ -48,14 +42,18 @@ describe Geo::RepositoryVerificationSecondaryService, :geo do
end
it 'sets checksum when the checksum matches' do
expect(repository).to receive(:checksum).and_return('my_checksum')
allow(repository).to receive(:checksum).and_return('my_checksum')
service.execute
expect(registry).to have_attributes(
"#{type}_verification_checksum_sha" => 'my_checksum',
"#{type}_checksum_mismatch" => false,
"last_#{type}_verification_failure" => nil
"last_#{type}_verification_failure" => nil,
"#{type}_verification_retry_count" => nil,
"resync_#{type}" => false,
"#{type}_retry_at" => nil,
"#{type}_retry_count" => nil
)
end
......@@ -69,22 +67,78 @@ describe Geo::RepositoryVerificationSecondaryService, :geo do
expect(registry).to have_attributes(
"#{type}_verification_checksum_sha" => '0000000000000000000000000000000000000000',
"#{type}_checksum_mismatch" => false,
"last_#{type}_verification_failure" => nil
"last_#{type}_verification_failure" => nil,
"#{type}_verification_retry_count" => nil,
"resync_#{type}" => false,
"#{type}_retry_at" => nil,
"#{type}_retry_count" => nil
)
end
it 'keeps track of failure when the checksum mismatch' do
expect(repository).to receive(:checksum).and_return('other_checksum')
context 'when the checksum mismatch' do
before do
allow(repository).to receive(:checksum).and_return('other_checksum')
end
it 'keeps track of failures' do
service.execute
expect(registry).to have_attributes(
"#{type}_verification_checksum_sha" => nil,
"#{type}_checksum_mismatch" => true,
"last_#{type}_verification_failure" => /#{Regexp.quote(type.to_s.capitalize)} checksum mismatch/
"last_#{type}_verification_failure" => "#{type.to_s.capitalize} checksum mismatch",
"#{type}_verification_retry_count" => 1,
"resync_#{type}" => true,
"#{type}_retry_at" => be_present,
"#{type}_retry_count" => 1
)
end
it 'ensures the next retry time is capped properly' do
registry.update("#{type}_retry_count" => 30)
service.execute
expect(registry).to have_attributes(
"resync_#{type}" => true,
"#{type}_retry_at" => be_within(100.seconds).of(Time.now + 7.days),
"#{type}_retry_count" => 31
)
end
end
context 'when checksum calculation fails' do
before do
allow(repository).to receive(:checksum).and_raise("Something went wrong with #{type}")
end
it 'keeps track of failures' do
service.execute
expect(registry).to have_attributes(
"#{type}_verification_checksum_sha" => nil,
"#{type}_checksum_mismatch" => false,
"last_#{type}_verification_failure" => "Error calculating #{type} checksum",
"#{type}_verification_retry_count" => 1,
"resync_#{type}" => true,
"#{type}_retry_at" => be_present,
"#{type}_retry_count" => 1
)
end
it 'ensures the next retry time is capped properly' do
registry.update("#{type}_retry_count" => 30)
service.execute
expect(registry).to have_attributes(
"resync_#{type}" => true,
"#{type}_retry_at" => be_within(100.seconds).of(Time.now + 7.days),
"#{type}_retry_count" => 31
)
end
end
def find_repository(type)
case type
when :repository then project.repository
......@@ -93,16 +147,22 @@ describe Geo::RepositoryVerificationSecondaryService, :geo do
end
end
let(:secondary) { create(:geo_node) }
before do
stub_current_geo_node(secondary)
end
describe '#execute' do
let(:project) { create(:project, :repository, :wiki_repo) }
let!(:repository_state) { create(:repository_state, project: project, repository_verification_checksum: 'my_checksum', wiki_verification_checksum: 'my_checksum') }
let(:registry) { create(:geo_project_registry, :synced, project: project) }
context 'repository' do
context 'for a repository' do
include_examples 'verify checksums for repositories/wikis', :repository
end
context 'wiki' do
context 'for a wiki' do
include_examples 'verify checksums for repositories/wikis', :wiki
end
end
......
......@@ -52,6 +52,18 @@ describe Geo::RepositoryVerification::Secondary::ShardWorker, :postgresql, :clea
subject.perform(shard_name)
end
it 'does not schedule jobs for projects missing repositories on primary' do
other_project = create(:project)
create(:repository_state, :repository_verified, project: project)
create(:repository_state, :wiki_verified, project: other_project)
create(:geo_project_registry, :synced, project: project, repository_missing_on_primary: true)
create(:geo_project_registry, :synced, project: other_project, wiki_missing_on_primary: true)
expect(secondary_singleworker).not_to receive(:perform_async)
subject.perform(shard_name)
end
# test that when jobs are always moving forward and we're not querying the same things
# over and over
describe 'resource loading' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment