Commit e906f85f authored by Heinrich Lee Yu

Merge branch 'stomlinson/retry-service-discovery-on-failure' into 'master'

Retry service discovery when it fails

See merge request gitlab-org/gitlab!68672
parents a7e9c8ff 893b2da2
@@ -35,12 +35,6 @@ module Gitlab
       def hosts=(hosts)
         @mutex.synchronize do
-          ::Gitlab::Database::LoadBalancing::Logger.info(
-            event: :host_list_update,
-            message: "Updating the host list for service discovery",
-            host_list_length: hosts.length,
-            old_host_list_length: @hosts.length
-          )
           @hosts = hosts
           unsafe_shuffle
         end
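Note: `hosts=` swaps the entire host array inside `@mutex.synchronize`, so concurrent readers never observe a half-updated list; the logging that used to live here moves into `refresh_if_necessary` below, where both old and new list lengths are available. A minimal standalone sketch of the same mutex-guarded swap (illustrative only, not the GitLab class; the real code wraps addresses in Host objects and shuffles via `unsafe_shuffle`):

  # Illustrative sketch of a mutex-guarded host-list swap.
  class HostList
    def initialize(hosts = [])
      @mutex = Mutex.new
      @hosts = hosts
    end

    def hosts=(new_hosts)
      @mutex.synchronize do
        @hosts = new_hosts
        # Shuffle so each process walks the replicas in a different order,
        # spreading read load across them.
        @hosts.shuffle!
      end
    end

    def hosts
      @mutex.synchronize { @hosts.dup }
    end
  end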
@@ -13,11 +13,17 @@ module Gitlab
       # balancer with said hosts. Requests may continue to use the old hosts
       # until they complete.
       class ServiceDiscovery
+        EmptyDnsResponse = Class.new(StandardError)
+
         attr_reader :interval, :record, :record_type, :disconnect_timeout,
                     :load_balancer

         MAX_SLEEP_ADJUSTMENT = 10

+        MAX_DISCOVERY_RETRIES = 3
+
+        RETRY_DELAY_RANGE = (0.1..0.2).freeze
+
         RECORD_TYPES = {
           'A' => Net::DNS::A,
           'SRV' => Net::DNS::SRV
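Aside: `Class.new(StandardError)` is the idiomatic one-liner for declaring an error class with no extra behaviour; assigning the anonymous class to a constant gives it its name. A short illustrative snippet:

  # Defining and using a lightweight error class, as the diff does above.
  EmptyDnsResponse = Class.new(StandardError)

  begin
    raise EmptyDnsResponse, 'DNS returned no records'
  rescue EmptyDnsResponse => error
    puts error.class   # => EmptyDnsResponse
    puts error.message # => "DNS returned no records"
  end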
@@ -76,15 +82,21 @@ module Gitlab
        end

        def perform_service_discovery
-          refresh_if_necessary
-        rescue StandardError => error
-          # Any exceptions that might occur should be reported to
-          # Sentry, instead of silently terminating this thread.
-          Gitlab::ErrorTracking.track_exception(error)
-
-          Gitlab::AppLogger.error(
-            "Service discovery encountered an error: #{error.message}"
-          )
+          MAX_DISCOVERY_RETRIES.times do
+            return refresh_if_necessary
+          rescue StandardError => error
+            # Any exceptions that might occur should be reported to
+            # Sentry, instead of silently terminating this thread.
+            Gitlab::ErrorTracking.track_exception(error)
+
+            Gitlab::AppLogger.error(
+              "Service discovery encountered an error: #{error.message}"
+            )
+
+            # Slightly randomize the retry delay so that, in the case of a total
+            # dns outage, all starting services do not pressure the dns server at the same time.
+            sleep(rand(RETRY_DELAY_RANGE))
+          end

          interval
        end
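Two Ruby details make the loop above work: a `rescue` clause can sit directly inside a `do ... end` block (Ruby 2.6+), and `rand` given a Float range returns a Float within that range. A minimal sketch of the same bounded-retry-with-jitter shape, using illustrative names:

  MAX_TRIES   = 3
  DELAY_RANGE = (0.1..0.2).freeze

  # Runs the block up to MAX_TRIES times, sleeping a short random interval
  # after each failure. Returns the block's value on the first success, or
  # nil once every attempt has failed.
  def with_retries
    MAX_TRIES.times do
      return yield
    rescue StandardError => error
      warn("attempt failed: #{error.message}")
      # Jitter the delay so processes restarting together do not all hit
      # the upstream (here, a DNS server) in lockstep.
      sleep(rand(DELAY_RANGE))
    end

    nil
  end

  with_retries { 42 }           # => 42
  with_retries { raise 'boom' } # => nil, after three attempts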
@@ -99,7 +111,22 @@ module Gitlab
          current = addresses_from_load_balancer

-          replace_hosts(from_dns) if from_dns != current
+          if from_dns != current
+            ::Gitlab::Database::LoadBalancing::Logger.info(
+              event: :host_list_update,
+              message: "Updating the host list for service discovery",
+              host_list_length: from_dns.length,
+              old_host_list_length: current.length
+            )
+
+            replace_hosts(from_dns)
+          else
+            ::Gitlab::Database::LoadBalancing::Logger.info(
+              event: :host_list_unchanged,
+              message: "Unchanged host list for service discovery",
+              host_list_length: from_dns.length,
+              old_host_list_length: current.length
+            )
+          end

          interval
        end
@@ -141,6 +168,8 @@ module Gitlab
              addresses_from_srv_record(response)
            end

+          raise EmptyDnsResponse if addresses.empty?
+
          # Addresses are sorted so we can directly compare the old and new
          # addresses, without having to use any additional data structures.
          [new_wait_time_for(resources), addresses.sort]
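The `addresses.sort` matters because `refresh_if_necessary` detects changes with a plain `!=` between the DNS result and the load balancer's current list; sorting normalizes the arbitrary order in which DNS returns records. With illustrative string addresses (the real code sorts address objects):

  from_dns = ['10.0.0.2', '10.0.0.1'].sort # => ["10.0.0.1", "10.0.0.2"]
  current  = ['10.0.0.1', '10.0.0.2'].sort # => ["10.0.0.1", "10.0.0.2"]

  from_dns == current # => true, so the host list is left untouched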
@@ -69,18 +69,69 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
   end

   describe '#perform_service_discovery' do
-    it 'reports exceptions to Sentry' do
-      error = StandardError.new
-
-      expect(service)
-        .to receive(:refresh_if_necessary)
-        .and_raise(error)
-
-      expect(Gitlab::ErrorTracking)
-        .to receive(:track_exception)
-        .with(error)
-
-      service.perform_service_discovery
+    context 'without any failures' do
+      it 'runs once' do
+        expect(service)
+          .to receive(:refresh_if_necessary).once
+
+        expect(service).not_to receive(:sleep)
+
+        expect(Gitlab::ErrorTracking).not_to receive(:track_exception)
+
+        service.perform_service_discovery
+      end
+    end
+
+    context 'with failures' do
+      before do
+        allow(Gitlab::ErrorTracking).to receive(:track_exception)
+        allow(service).to receive(:sleep)
+      end
+
+      let(:valid_retry_sleep_duration) { satisfy { |val| described_class::RETRY_DELAY_RANGE.include?(val) } }
+
+      it 'retries service discovery when under the retry limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times.ordered
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES - 1).times
+        expect(service).to receive(:refresh_if_necessary).and_return(45).ordered
+
+        expect(service.perform_service_discovery).to eq(45)
+      end
+
+      it 'does not retry service discovery after exceeding the limit' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+        expect(service)
+          .to receive(:sleep).with(valid_retry_sleep_duration)
+          .exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
+
+      it 'reports exceptions to Sentry' do
+        error = StandardError.new
+
+        expect(service)
+          .to receive(:refresh_if_necessary)
+          .and_raise(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        expect(Gitlab::ErrorTracking)
+          .to receive(:track_exception)
+          .with(error).exactly(described_class::MAX_DISCOVERY_RETRIES).times
+
+        service.perform_service_discovery
+      end
     end
   end
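The `valid_retry_sleep_duration` helper above leans on RSpec's built-in `satisfy` matcher, which passes any value for which its block returns true; this makes it useful as an argument matcher when the exact value is randomized. A standalone sketch:

  # Matching a randomized argument with RSpec's `satisfy`.
  RSpec.describe 'jittered sleep duration' do
    let(:valid_delay) { satisfy { |val| (0.1..0.2).include?(val) } }

    it 'accepts any duration inside the range' do
      expect(0.15).to valid_delay
    end
  end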
@@ -224,6 +275,16 @@ RSpec.describe Gitlab::Database::LoadBalancing::ServiceDiscovery do
         expect(service.addresses_from_dns).to eq([90, addresses])
       end
     end
+
+    context 'when the resolver returns an empty response' do
+      let(:packet) { double(:packet, answer: []) }
+      let(:record_type) { 'A' }
+
+      it 'raises EmptyDnsResponse' do
+        expect { service.addresses_from_dns }.to raise_error(Gitlab::Database::LoadBalancing::ServiceDiscovery::EmptyDnsResponse)
+      end
+    end
   end

   describe '#new_wait_time_for' do