Commit 4bbdab76 authored by Douwe Maan's avatar Douwe Maan

Merge branch 'bvl-circuitbreaker-improvements' into 'master'

Make the circuitbreaker configurable at runtime

See merge request gitlab-org/gitlab-ce!14842
parents 79e88912 38af7c16
......@@ -108,6 +108,34 @@ module ApplicationSettingsHelper
options_for_select(Sidekiq::Queue.all.map(&:name), @application_setting.sidekiq_throttling_queues)
end
def circuitbreaker_failure_count_help_text
health_link = link_to(s_('AdminHealthPageLink|health page'), admin_health_check_path)
api_link = link_to(s_('CircuitBreakerApiLink|circuitbreaker api'), help_page_path("api/repository_storage_health"))
message = _("The number of failures of after which GitLab will completely "\
"prevent access to the storage. The number of failures can be "\
"reset in the admin interface: %{link_to_health_page} or using "\
"the %{api_documentation_link}.")
message = message % { link_to_health_page: health_link, api_documentation_link: api_link }
message.html_safe
end
def circuitbreaker_failure_wait_time_help_text
_("When access to a storage fails. GitLab will prevent access to the "\
"storage for the time specified here. This allows the filesystem to "\
"recover. Repositories on failing shards are temporarly unavailable")
end
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
end
def circuitbreaker_storage_timeout_help_text
_("The time in seconds GitLab will try to access storage. After this time a "\
"timeout error will be raised.")
end
def visible_attributes
[
:admin_notification_email,
......@@ -116,6 +144,10 @@ module ApplicationSettingsHelper
:akismet_api_key,
:akismet_enabled,
:auto_devops_enabled,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_failure_wait_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
:container_registry_token_expire_delay,
......
......@@ -151,6 +151,13 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
SUPPORTED_KEY_TYPES.each do |type|
validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type }
end
......
......@@ -530,6 +530,32 @@
= succeed "." do
= link_to "repository storages documentation", help_page_path("administration/repository_storages")
%fieldset
%legend Git Storage Circuitbreaker settings
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
.help-block
= circuitbreaker_failure_count_help_text
.form-group
= f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
.help-block
= circuitbreaker_failure_wait_time_help_text
.form-group
= f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
.help-block
= circuitbreaker_failure_reset_time_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.help-block
= circuitbreaker_storage_timeout_help_text
%fieldset
%legend Repository Checks
......
---
title: Store circuitbreaker settings in the database instead of config
merge_request: 14842
author:
type: changed
......@@ -522,11 +522,6 @@ production: &base
path: /home/git/repositories/
gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port)
# gitaly_token: 'special token' # Optional: override global gitaly.token for this storage.
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after an access failure before allowing access again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 30 # Time in seconds to wait before aborting a storage access attempt
## Backup settings
backup:
......@@ -659,9 +654,6 @@ test:
default:
path: tmp/tests/repositories/
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
failure_count_threshold: 999999
failure_wait_time: 0
storage_timeout: 30
broken:
path: tmp/tests/non-existent-repositories
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
......
......@@ -455,17 +455,6 @@ Settings.repositories.storages.each do |key, storage|
# Expand relative paths
storage['path'] = Settings.absolute(storage['path'])
# Set failure defaults
storage['failure_count_threshold'] ||= 10
storage['failure_wait_time'] ||= 30
storage['failure_reset_time'] ||= 1800
storage['storage_timeout'] ||= 5
# Set turn strings into numbers
storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i
storage['failure_wait_time'] = storage['failure_wait_time'].to_i
storage['failure_reset_time'] = storage['failure_reset_time'].to_i
# We might want to have a timeout shorter than 1 second.
storage['storage_timeout'] = storage['storage_timeout'].to_f
Settings.repositories.storages[key] = storage
end
......
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddCircuitBreakerPropertiesToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :application_settings,
:circuitbreaker_failure_count_threshold,
:integer,
default: 160
add_column :application_settings,
:circuitbreaker_failure_wait_time,
:integer,
default: 30
add_column :application_settings,
:circuitbreaker_failure_reset_time,
:integer,
default: 1800
add_column :application_settings,
:circuitbreaker_storage_timeout,
:integer,
default: 30
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171006091000) do
ActiveRecord::Schema.define(version: 20171012101043) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -134,6 +134,10 @@ ActiveRecord::Schema.define(version: 20171006091000) do
t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: false, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 160
t.integer "circuitbreaker_failure_wait_time", default: 30
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 30
end
create_table "audit_events", force: :cascade do |t|
......
......@@ -105,61 +105,26 @@ When GitLab detects access to the repositories storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
The configuration could look as follows:
This can be configured from the admin interface:
**For Omnibus installations**
1. Edit `/etc/gitlab/gitlab.rb`:
```ruby
git_data_dirs({
"default" => {
"path" => "/mnt/nfs-01/git-data",
"failure_count_threshold" => 10,
"failure_wait_time" => 30,
"failure_reset_time" => 1800,
"storage_timeout" => 5
}
})
```
1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect.
---
**For installations from source**
1. Edit `config/gitlab.yml`:
```yaml
repositories:
storages: # You must have at least a `default` storage path.
default:
path: /home/git/repositories/
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after last access failure before trying again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
```
1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect.
![circuitbreaker configuration](img/circuitbreaker_config.png)
**`failure_count_threshold`:** The number of failures of after which GitLab will
**Maximum git storage failures:** The number of failures of after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
the admin interface: `https://gitlab.example.com/admin/health_check` or using the
[api](../api/repository_storage_health.md) to allow access to the storage again.
**`failure_wait_time`:** When access to a storage fails. GitLab will prevent
access to the storage for the time specified here. This allows the filesystem to
recover without.
**Seconds to wait after a storage failure:** When access to a storage fails. GitLab
will prevent access to the storage for the time specified here. This allows the
filesystem to recover.
**`failure_reset_time`:** The time in seconds GitLab will keep failure
information. When no failures occur during this time, information about the
**Seconds before reseting failure information:** The time in seconds GitLab will
keep failure information. When no failures occur during this time, information about the
mount is reset.
**`storage_timeout`:** The time in seconds GitLab will try to access storage.
After this time a timeout error will be raised.
**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.
When storage failures occur, this will be visible in the admin interface like this:
......
......@@ -69,6 +69,10 @@ PUT /application/settings
| `after_sign_up_text` | string | no | Text shown to the user after signing up |
| `akismet_api_key` | string | no | API key for akismet spam protection |
| `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt |
| `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_dsn` is enabled |
| `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side |
| `container_registry_token_expire_delay` | integer | no | Container Registry token duration in minutes |
......
......@@ -2,15 +2,13 @@ module Gitlab
module Git
module Storage
class CircuitBreaker
include CircuitBreakerSettings
FailureInfo = Struct.new(:last_failure, :failure_count)
attr_reader :storage,
:hostname,
:storage_path,
:failure_count_threshold,
:failure_wait_time,
:failure_reset_time,
:storage_timeout
:storage_path
delegate :last_failure, :failure_count, to: :failure_info
......@@ -53,10 +51,6 @@ module Gitlab
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
@failure_count_threshold = config['failure_count_threshold']
@failure_wait_time = config['failure_wait_time']
@failure_reset_time = config['failure_reset_time']
@storage_timeout = config['storage_timeout']
end
def perform
......
module Gitlab
module Git
module Storage
module CircuitBreakerSettings
def failure_count_threshold
application_settings.circuitbreaker_failure_count_threshold
end
def failure_wait_time
application_settings.circuitbreaker_failure_wait_time
end
def failure_reset_time
application_settings.circuitbreaker_failure_reset_time
end
def storage_timeout
application_settings.circuitbreaker_storage_timeout
end
private
def application_settings
Gitlab::CurrentSettings.current_application_settings
end
end
end
end
end
......@@ -2,15 +2,14 @@ module Gitlab
module Git
module Storage
class NullCircuitBreaker
include CircuitBreakerSettings
# These will have actual values
attr_reader :storage,
:hostname
# These will always have nil values
attr_reader :storage_path,
:failure_wait_time,
:failure_reset_time,
:storage_timeout
attr_reader :storage_path
def initialize(storage, hostname, error: nil)
@storage = storage
......@@ -26,16 +25,12 @@ module Gitlab
!!@error
end
def failure_count_threshold
1
end
def last_failure
circuit_broken? ? Time.now : nil
end
def failure_count
circuit_broken? ? 1 : 0
circuit_broken? ? failure_count_threshold : 0
end
def failure_info
......
......@@ -65,9 +65,11 @@ feature "Admin Health Check", :feature, :broken_storage do
it 'shows storage failure information' do
hostname = Gitlab::Environment.hostname
maximum_failures = Gitlab::CurrentSettings.current_application_settings
.circuitbreaker_failure_count_threshold
expect(page).to have_content('broken: failed storage access attempt on host:')
expect(page).to have_content("#{hostname}: 1 of 10 failures.")
expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.")
end
it 'allows resetting storage failures' do
......
......@@ -18,26 +18,6 @@ describe Settings do
end
end
describe '#repositories' do
it 'assigns the default failure attributes' do
repository_settings = Gitlab.config.repositories.storages['broken']
expect(repository_settings['failure_count_threshold']).to eq(10)
expect(repository_settings['failure_wait_time']).to eq(30)
expect(repository_settings['failure_reset_time']).to eq(1800)
expect(repository_settings['storage_timeout']).to eq(5)
end
it 'can be accessed with dot syntax all the way down' do
expect(Gitlab.config.repositories.storages.broken.failure_count_threshold).to eq(10)
end
it 'can be accessed in a very specific way that breaks without reassigning each element with Settingslogic' do
storage_settings = Gitlab.config.repositories.storages['broken']
expect(storage_settings.failure_count_threshold).to eq(10)
end
end
describe '#host_without_www' do
context 'URL with protocol' do
it 'returns the host' do
......
......@@ -10,18 +10,10 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
# Override test-settings for the circuitbreaker with something more realistic
# for these specs.
stub_storage_settings('default' => {
'path' => TestEnv.repos_path,
'failure_count_threshold' => 10,
'failure_wait_time' => 30,
'failure_reset_time' => 1800,
'storage_timeout' => 5
'path' => TestEnv.repos_path
},
'broken' => {
'path' => 'tmp/tests/non-existent-repositories',
'failure_count_threshold' => 10,
'failure_wait_time' => 30,
'failure_reset_time' => 1800,
'storage_timeout' => 5
'path' => 'tmp/tests/non-existent-repositories'
},
'nopath' => { 'path' => nil }
)
......@@ -75,10 +67,39 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.hostname).to eq(hostname)
expect(circuit_breaker.storage).to eq('default')
expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
expect(circuit_breaker.failure_count_threshold).to eq(10)
expect(circuit_breaker.failure_wait_time).to eq(30)
expect(circuit_breaker.failure_reset_time).to eq(1800)
expect(circuit_breaker.storage_timeout).to eq(5)
end
end
context 'circuitbreaker settings' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2,
circuitbreaker_storage_timeout: 3)
end
describe '#failure_count_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_count_threshold).to eq(0)
end
end
describe '#failure_wait_time' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_wait_time).to eq(1)
end
end
describe '#failure_reset_time' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_reset_time).to eq(2)
end
end
describe '#storage_timeout' do
it 'reads the value from settings' do
expect(circuit_breaker.storage_timeout).to eq(3)
end
end
end
......@@ -151,10 +172,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
context 'the `failure_wait_time` is set to 0' do
before do
stub_storage_settings('default' => {
'failure_wait_time' => 0,
'path' => TestEnv.repos_path
})
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
it 'is working even when there is a recent failure' do
......
......@@ -54,6 +54,10 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
end
describe '#failure_count_threshold' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 1)
end
it { expect(breaker.failure_count_threshold).to eq(1) }
end
......
......@@ -114,6 +114,19 @@ describe ApplicationSetting do
it { expect(setting.repository_storages).to eq(['default']) }
end
context 'circuitbreaker settings' do
[:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout].each do |field|
it "Validates #{field} as number" do
is_expected.to validate_numericality_of(field)
.only_integer
.is_greater_than_or_equal_to(0)
end
end
end
context 'repository storages' do
before do
storages = {
......
......@@ -23,6 +23,7 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(0)
expect(json_response['ecdsa_key_restriction']).to eq(0)
expect(json_response['ed25519_key_restriction']).to eq(0)
expect(json_response['circuitbreaker_failure_count_threshold']).not_to be_nil
end
end
......@@ -52,7 +53,8 @@ describe API::Settings, 'Settings' do
rsa_key_restriction: ApplicationSetting::FORBIDDEN_KEY_VALUE,
dsa_key_restriction: 2048,
ecdsa_key_restriction: 384,
ed25519_key_restriction: 256
ed25519_key_restriction: 256,
circuitbreaker_failure_wait_time: 2
expect(response).to have_http_status(200)
expect(json_response['default_projects_limit']).to eq(3)
......@@ -73,6 +75,7 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(2048)
expect(json_response['ecdsa_key_restriction']).to eq(384)
expect(json_response['ed25519_key_restriction']).to eq(256)
expect(json_response['circuitbreaker_failure_wait_time']).to eq(2)
end
end
......
......@@ -43,10 +43,6 @@ module StubConfiguration
messages['default'] ||= Gitlab.config.repositories.storages.default
messages.each do |storage_name, storage_settings|
storage_settings['path'] = TestEnv.repos_path unless storage_settings.key?('path')
storage_settings['failure_count_threshold'] ||= 10
storage_settings['failure_wait_time'] ||= 30
storage_settings['failure_reset_time'] ||= 1800
storage_settings['storage_timeout'] ||= 5
end
allow(Gitlab.config.repositories).to receive(:storages).and_return(Settingslogic.new(messages))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment