Commit 57e9d641 authored by Douwe Maan's avatar Douwe Maan

Merge branch 'zj-circuit-breaker-removal-ee' into 'master'

Remove Git circuit breaker

See merge request gitlab-org/gitlab-ee!7845
parents 44346736 869a459e
......@@ -6,12 +6,5 @@ class Admin::HealthCheckController < Admin::ApplicationController
checks << 'geo' if Gitlab::Geo.secondary?
@errors = HealthCheck::Utils.process_checks(checks)
@failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages
end
def reset_storage_health
Gitlab::Git::Storage::FailureInfo.reset_all!
redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset')
end
end
......@@ -66,7 +66,7 @@ class ApplicationController < ActionController::Base
head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window
end
rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
rescue_from GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
log_exception(exception)
headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after)
......
# frozen_string_literal: true
class HealthController < ActionController::Base
protect_from_forgery with: :exception, except: :storage_check, prepend: true
protect_from_forgery with: :exception, prepend: true
include RequiresWhitelistedMonitoringClient
CHECKS = [
......@@ -25,15 +25,6 @@ class HealthController < ActionController::Base
render_check_results(results)
end
def storage_check
results = Gitlab::Git::Storage::Checker.check_all
render json: {
check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
results: results
}
end
private
def render_check_results(results)
......
......@@ -109,37 +109,6 @@ module ApplicationSettingsHelper
options_for_select(options, selected)
end
def circuitbreaker_failure_count_help_text
health_link = link_to(s_('AdminHealthPageLink|health page'), admin_health_check_path)
api_link = link_to(s_('CircuitBreakerApiLink|circuitbreaker api'), help_page_path("api/repository_storage_health"))
message = _("The number of failures after which GitLab will completely "\
"prevent access to the storage. The number of failures can be "\
"reset in the admin interface: %{link_to_health_page} or using "\
"the %{api_documentation_link}.")
message = message % { link_to_health_page: health_link, api_documentation_link: api_link }
message.html_safe
end
def circuitbreaker_access_retries_help_text
_('The number of attempts GitLab will make to access a storage.')
end
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
end
def circuitbreaker_storage_timeout_help_text
_("The time in seconds GitLab will try to access storage. After this time a "\
"timeout error will be raised.")
end
def circuitbreaker_check_interval_help_text
_("The time in seconds between storage checks. If a check did "\
"not complete yet, GitLab will skip the next check.")
end
def visible_attributes
[
:admin_notification_email,
......@@ -151,11 +120,6 @@ module ApplicationSettingsHelper
:authorized_keys_enabled,
:auto_devops_enabled,
:auto_devops_domain,
:circuitbreaker_access_retries,
:circuitbreaker_check_interval,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
:container_registry_token_expire_delay,
......
# frozen_string_literal: true
module StorageHealthHelper
def failing_storage_health_message(storage_health)
storage_name = content_tag(:strong, h(storage_health.storage_name))
host_names = h(storage_health.failing_on_hosts.to_sentence)
translation_params = { storage_name: storage_name,
host_names: host_names,
failed_attempts: storage_health.total_failures }
translation = n_('%{storage_name}: failed storage access attempt on host:',
'%{storage_name}: %{failed_attempts} failed storage access attempts:',
storage_health.total_failures) % translation_params
translation.html_safe
end
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures }
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params
end
end
end
......@@ -4,6 +4,7 @@ class ApplicationSetting < ActiveRecord::Base
include CacheableAttributes
include CacheMarkdownField
include TokenAuthenticatable
include IgnorableColumn
prepend EE::ApplicationSetting
add_authentication_token_field :runners_registration_token
......@@ -28,6 +29,12 @@ class ApplicationSetting < ActiveRecord::Base
serialize :domain_blacklist, Array # rubocop:disable Cop/ActiveRecordSerialize
serialize :repository_storages # rubocop:disable Cop/ActiveRecordSerialize
ignore_column :circuitbreaker_failure_count_threshold
ignore_column :circuitbreaker_failure_reset_time
ignore_column :circuitbreaker_storage_timeout
ignore_column :circuitbreaker_access_retries
ignore_column :circuitbreaker_check_interval
cache_markdown_field :sign_in_text
cache_markdown_field :help_page_text
cache_markdown_field :shared_runners_text, pipeline: :plain_markdown
......@@ -151,17 +158,6 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
:circuitbreaker_check_interval,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :circuitbreaker_access_retries,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates :gitaly_timeout_default,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......
......@@ -20,32 +20,5 @@
Manage repository storage paths. Learn more in the
= succeed "." do
= link_to "repository storages documentation", help_page_path("administration/repository_storage_paths")
.sub-section
%h4 Circuit breaker
.form-group
= f.label :circuitbreaker_check_interval, _('Check interval'), class: 'label-bold'
= f.number_field :circuitbreaker_check_interval, class: 'form-control'
.form-text.text-muted
= circuitbreaker_check_interval_help_text
.form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'label-bold'
= f.number_field :circuitbreaker_access_retries, class: 'form-control'
.form-text.text-muted
= circuitbreaker_access_retries_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'label-bold'
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.form-text.text-muted
= circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'label-bold'
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
.form-text.text-muted
= circuitbreaker_failure_count_help_text
.form-group
= f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'label-bold'
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
.form-text.text-muted
= circuitbreaker_failure_reset_time_help_text
= f.submit 'Save changes', class: "btn btn-success qa-save-changes-button"
......@@ -20,7 +20,7 @@
%button.btn.btn-default.js-settings-toggle{ type: 'button' }
= expanded_by_default? ? _('Collapse') : _('Expand')
%p
= _('Configure storage path and circuit breaker settings.')
= _('Configure storage path settings.')
.settings-content
= render 'repository_storage'
......
- if failing_storages.any?
= _('There are problems accessing Git storage: ')
%ul
- failing_storages.each do |storage_health|
%li
= failing_storage_health_message(storage_health)
%ul
- storage_health.failing_circuit_breakers.each do |circuit_breaker|
%li
#{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)}
= _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.")
.prepend-top-10
= button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path,
method: :post, class: 'btn btn-default'
- @no_container = true
- page_title _('Health Check')
- no_errors = @errors.blank? && @failing_storage_statuses.blank?
- no_errors = @errors.blank?
%div{ class: container_class }
%h3.page-title= page_title
......@@ -41,4 +41,3 @@
#{ s_('HealthCheck|No Health Problems Detected') }
- else
= @errors
= render partial: 'failing_storages', object: @failing_storage_statuses
#!/usr/bin/env ruby
require 'optparse'
require 'net/http'
require 'json'
require 'socket'
require 'logger'
require_relative '../lib/gitlab/storage_check'
Gitlab::StorageCheck::CLI.start!(ARGV)
---
title: Remove Git circuit breaker
merge_request: 22212
author:
type: removed
......@@ -68,7 +68,6 @@ Rails.application.routes.draw do
# '/-/health' implemented by BasicHealthMiddleware
get 'liveness' => 'health#liveness'
get 'readiness' => 'health#readiness'
post 'storage_check' => 'health#storage_check'
resources :metrics, only: [:index]
mount Peek::Railtie => '/peek', as: 'peek_routes'
......
......@@ -76,9 +76,7 @@ namespace :admin do
end
resource :logs, only: [:show]
resource :health_check, controller: 'health_check', only: [:show] do
post :reset_storage_health
end
resource :health_check, controller: 'health_check', only: [:show]
resource :background_jobs, controller: 'background_jobs', only: [:show]
## EE-specific
......
# frozen_string_literal: true
class RemoveCircuitBreaker < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
disable_ddl_transaction!
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT = {
circuitbreaker_failure_count_threshold: 3,
circuitbreaker_failure_reset_time: 1800,
circuitbreaker_storage_timeout: 15,
circuitbreaker_access_retries: 3,
circuitbreaker_check_interval: 1
}.freeze
def up
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT.keys.each do |column|
remove_column(:application_settings, column) if column_exists?(:application_settings, column)
end
end
def down
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT.each do |column, default|
add_column_with_default(:application_settings, column, :integer, default: default) unless column_exists?(:application_settings, column)
end
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20181008145359) do
ActiveRecord::Schema.define(version: 20181008200441) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -168,10 +168,6 @@ ActiveRecord::Schema.define(version: 20181008145359) do
t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: true, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 3
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 15
t.integer "circuitbreaker_access_retries", default: 3
t.boolean "throttle_unauthenticated_enabled", default: false, null: false
t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
......@@ -181,14 +177,13 @@ ActiveRecord::Schema.define(version: 20181008145359) do
t.boolean "throttle_authenticated_web_enabled", default: false, null: false
t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
t.integer "circuitbreaker_check_interval", default: 1, null: false
t.boolean "password_authentication_enabled_for_web"
t.boolean "password_authentication_enabled_for_git", default: true
t.integer "gitaly_timeout_default", default: 55, null: false
t.integer "gitaly_timeout_medium", default: 30, null: false
t.integer "gitaly_timeout_fast", default: 10, null: false
t.boolean "mirror_available", default: true, null: false
t.integer "default_project_creation", default: 2, null: false
t.boolean "password_authentication_enabled_for_web"
t.boolean "password_authentication_enabled_for_git", default: true, null: false
t.string "auto_devops_domain"
t.boolean "external_authorization_service_enabled", default: false, null: false
t.string "external_authorization_service_url"
......
......@@ -45,10 +45,7 @@ The following metrics are available:
| redis_ping_success | Gauge | 9.4 | Whether or not the last redis ping succeeded |
| redis_ping_latency_seconds | Gauge | 9.4 | Round trip time of the redis ping |
| user_session_logins_total | Counter | 9.4 | Counter of how many users have logged in |
| filesystem_circuitbreaker_latency_seconds | Gauge | 9.5 | Time spent validating if a storage is accessible |
| filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken or not |
| circuitbreaker_storage_check_duration_seconds | Histogram | 10.3 | Time a single storage probe took |
| upload_file_does_not_exist | Counter | 10.7 | Number of times an upload record could not find its file |
| upload_file_does_not_exist | Counter | 10.7 | Number of times an upload record could not find its file |
| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login |
| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login |
......
......@@ -97,55 +97,6 @@ be stored via the **Application Settings** in the Admin area.
Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be
randomly placed on one of the selected paths.
## Handling failing repository storage
> [Introduced][ce-11449] in GitLab 9.5.
When GitLab detects access to the repositories storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
This can be configured from the admin interface:
![circuitbreaker configuration](img/circuitbreaker_config.png)
**Number of access attempts**: The number of attempts GitLab will make to access a
storage when probing a shard.
**Number of failures before backing off**: The number of failures after which
GitLab will start temporarily disabling access to a storage shard on a host.
**Maximum git storage failures:** The number of failures of after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
the admin interface: `https://gitlab.example.com/admin/health_check` or using the
[api](../api/repository_storage_health.md) to allow access to the storage again.
**Seconds to wait after a storage failure:** When access to a storage fails. GitLab
will prevent access to the storage for the time specified here. This allows the
filesystem to recover.
**Seconds before reseting failure information:** The time in seconds GitLab will
keep failure information. When no failures occur during this time, information about the
mount is reset.
**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.
To enable the circuitbreaker for repository storage you can flip the feature flag from a rails console:
```
Feature.enable('git_storage_circuit_breaker')
```
Alternatively it can be enabled by setting `true` in the `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
This approach would be used when enabling the circuit breaker on a single host.
When storage failures occur, this will be visible in the admin interface like this:
![failing storage](img/failing_storage.png)
To allow access to all storages, click the `Reset git storage health information` button.
[ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578
[restart-gitlab]: restart_gitlab.md#installations-from-source
[reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
......
# Circuitbreaker API
> [Introduced][ce-11449] in GitLab 9.5.
The Circuitbreaker API is only accessible to administrators. All requests by
guests will respond with `401 Unauthorized`, and all requests by normal users
will respond with `403 Forbidden`.
## Repository Storages
### Get all storage information
Returns of all currently configured storages and their health information.
```
GET /circuit_breakers/repository_storage
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
```json
[
{
"storage_name": "default",
"failing_on_hosts": [],
"total_failures": 0
},
{
"storage_name": "broken",
"failing_on_hosts": [
"web01", "worker01"
],
"total_failures": 1
}
]
```
### Get failing storages
This returns a list of all currently failing storages.
```
GET /circuit_breakers/repository_storage/failing
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing
```
```json
[
{
"storage_name":"broken",
"failing_on_hosts":["web01", "worker01"],
"total_failures":2
}
]
```
## Reset failing storage information
Use this remove all failing storage information and allow access to the storage again.
```
DELETE /circuit_breakers/repository_storage
```
```bash
curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
NOTE: **Deprecated:**
Support of the circuit breaker is removed, as Gitaly can be configured to
to work without NFS and [communicate solely over HTTP](../administration/gitaly/index.md).
......@@ -146,11 +146,6 @@ are listed in the descriptions of the relevant settings.
| `auto_devops_domain` | string | no | Specify a domain to use by default for every project's Auto Review Apps and Auto Deploy stages. |
| `auto_devops_enabled` | boolean | no | Enable Auto DevOps for projects by default. It will automatically build, test, and deploy applications based on a predefined CI/CD configuration. |
| `check_namespace_plan` | boolean | no | **(Premium)** Enabling this will make only licensed EE features available to projects if the project namespace's plan includes the feature or if the project is public. |
| `circuitbreaker_access_retries` | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_check_interval` | integer | no | Number of seconds in between storage checks. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt. |
| `clientside_sentry_dsn` | string | required by: `clientside_sentry_enabled` | Clientside Sentry Data Source Name. |
| `clientside_sentry_enabled` | boolean | no | (**If enabled, requires:** `clientside_sentry_dsn`) Enable Sentry error reporting for the client side. |
| `container_registry_token_expire_delay` | integer | no | Container Registry token duration in minutes. |
......
......@@ -13,37 +13,24 @@ module API
end
resource ':type' do
namespace '', requirements: { type: 'repository_storage' } do
helpers do
def failing_storage_health
@failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
end
def storage_health
@storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
end
end
desc 'Get all git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get do
present storage_health, with: Entities::RepositoryStorageHealth
present []
end
desc 'Get all failing git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get 'failing' do
present failing_storage_health, with: Entities::RepositoryStorageHealth
present []
end
desc 'Reset all storage failures and open circuitbreaker' do
detail 'This feature was introduced in GitLab 9.5'
end
delete do
Gitlab::Git::Storage::FailureInfo.reset_all!
end
end
end
......
......@@ -1392,12 +1392,6 @@ module API
expose :submitted, as: :akismet_submitted
end
class RepositoryStorageHealth < Grape::Entity
expose :storage_name
expose :failing_on_hosts
expose :total_failures
end
class CustomAttribute < Grape::Entity
expose :key
expose :value
......
......@@ -96,10 +96,6 @@ module Gitlab
raise Gitlab::Git::CommandError.new(e.message)
end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
end
def exists?
gitaly_repository_client.exists?
end
......
module Gitlab
module Git
module Storage
class Inaccessible < StandardError
attr_reader :retry_after
def initialize(message = nil, retry_after = nil)
super(message)
@retry_after = retry_after
end
end
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
REDIS_KNOWN_KEYS = "#{REDIS_KEY_PREFIX}known_keys_set".freeze
def self.redis
Gitlab::Redis::SharedState
end
end
end
end
module Gitlab
module Git
module Storage
class Checker
include CircuitBreakerSettings
attr_reader :storage_path, :storage, :hostname, :logger
METRICS_MUTEX = Mutex.new
STORAGE_TIMING_BUCKETS = [0.1, 0.15, 0.25, 0.33, 0.5, 1, 1.5, 2.5, 5, 10, 15].freeze
def self.check_all(logger = Rails.logger)
threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
Thread.new do
Thread.current[:result] = new(storage_name, logger).check_with_lease
end
end
threads.map do |thread|
thread.join
thread[:result]
end
end
def self.check_histogram
@check_histogram ||=
METRICS_MUTEX.synchronize do
@check_histogram || Gitlab::Metrics.histogram(:circuitbreaker_storage_check_duration_seconds,
'Storage check time in seconds',
{},
STORAGE_TIMING_BUCKETS
)
end
end
def initialize(storage, logger = Rails.logger)
@storage = storage
config = Gitlab.config.repositories.storages[@storage]
@storage_path = Gitlab::GitalyClient::StorageSettings.allow_disk_access { config.legacy_disk_path }
@logger = logger
@hostname = Gitlab::Environment.hostname
end
def check_with_lease
lease_key = "storage_check:#{cache_key}"
lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
result = { storage: storage, success: nil }
if uuid = lease.try_obtain
result[:success] = check
Gitlab::ExclusiveLease.cancel(lease_key, uuid)
else
logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
end
result
end
def check
if perform_access_check
track_storage_accessible
true
else
track_storage_inaccessible
logger.error("#{hostname}: #{storage}: Not accessible.")
false
end
end
private
def perform_access_check
start_time = Gitlab::Metrics::System.monotonic_time
Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
ensure
execution_time = Gitlab::Metrics::System.monotonic_time - start_time
self.class.check_histogram.observe({ storage: storage }, execution_time)
end
def track_storage_inaccessible
first_failure = current_failure_info.first_failure || Time.now
last_failure = Time.now
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, nil)
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
maintain_known_keys(redis)
end
end
end
def maintain_known_keys(redis)
expire_time = Time.now.to_i + failure_reset_time
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
end
def current_failure_info
FailureInfo.load(cache_key)
end
end
end
end
end
module Gitlab
module Git
module Storage
class CircuitBreaker
include CircuitBreakerSettings
attr_reader :storage,
:hostname
delegate :last_failure, :failure_count, :no_failures?,
to: :failure_info
def self.for_storage(storage)
cached_circuitbreakers = Gitlab::SafeRequestStore.fetch(:circuitbreaker_cache) do
Hash.new do |hash, storage_name|
hash[storage_name] = build(storage_name)
end
end
cached_circuitbreakers[storage]
end
def self.build(storage, hostname = Gitlab::Environment.hostname)
config = Gitlab.config.repositories.storages[storage]
Gitlab::GitalyClient::StorageSettings.allow_disk_access do
if !config.present?
NullCircuitBreaker.new(storage, hostname, error: Misconfiguration.new("Storage '#{storage}' is not configured"))
elsif !config.legacy_disk_path.present?
NullCircuitBreaker.new(storage, hostname, error: Misconfiguration.new("Path for storage '#{storage}' is not configured"))
else
new(storage, hostname)
end
end
end
def initialize(storage, hostname)
@storage = storage
@hostname = hostname
end
def perform
return yield unless enabled?
check_storage_accessible!
yield
end
def circuit_broken?
return false if no_failures?
failure_count > failure_count_threshold
end
private
# The circuitbreaker can be enabled for the entire fleet using a Feature
# flag.
#
# Enabling it for a single host can be done setting the
# `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
def enabled?
ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker')
end
def failure_info
@failure_info ||= FailureInfo.load(cache_key)
end
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
end
end
end
end
end
module Gitlab
module Git
module Storage
module CircuitBreakerSettings
def failure_count_threshold
application_settings.circuitbreaker_failure_count_threshold
end
def failure_reset_time
application_settings.circuitbreaker_failure_reset_time
end
def storage_timeout
application_settings.circuitbreaker_storage_timeout
end
def access_retries
application_settings.circuitbreaker_access_retries
end
def check_interval
application_settings.circuitbreaker_check_interval
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
def application_settings
Gitlab::CurrentSettings.current_application_settings
end
end
end
end
end
module Gitlab
module Git
module Storage
class FailureInfo
attr_accessor :first_failure, :last_failure, :failure_count
def self.reset_all!
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
Gitlab::SafeRequestStore.delete(:circuitbreaker_cache)
end
def self.load(cache_key)
first_failure, last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :first_failure, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
first_failure = Time.at(first_failure.to_i) if first_failure.present?
new(first_failure, last_failure, failure_count.to_i)
end
def initialize(first_failure, last_failure, failure_count)
@first_failure = first_failure
@last_failure = last_failure
@failure_count = failure_count
end
def no_failures?
first_failure.blank? && last_failure.blank? && failure_count == 0
end
end
end
end
end
module Gitlab
module Git
module Storage
module ForkedStorageCheck
extend self
def storage_available?(path, timeout_seconds = 5, retries = 1)
partial_timeout = timeout_seconds / retries
status = timeout_check(path, partial_timeout)
# If the status check did not succeed the first time, we retry a few
# more times to avoid one-off failures
current_attempts = 1
while current_attempts < retries && !status.success?
status = timeout_check(path, partial_timeout)
current_attempts += 1
end
status.success?
end
def timeout_check(path, timeout_seconds)
filesystem_check_pid = check_filesystem_in_process(path)
deadline = timeout_seconds.seconds.from_now.utc
wait_time = 0.01
status = nil
while status.nil?
if deadline > Time.now.utc
sleep(wait_time)
_pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG)
else
Process.kill('KILL', filesystem_check_pid)
# Blocking wait, so we are sure the process is gone before continuing
_pid, status = Process.wait2(filesystem_check_pid)
end
end
status
end
# This will spawn a new 2 processes to do the check:
# The outer child (waiter) will spawn another child process (stater).
#
# The stater is the process is performing the actual filesystem check
# the check might hang if the filesystem is acting up.
# In this case we will send a `KILL` to the waiter, which will still
# be responsive while the stater is hanging.
def check_filesystem_in_process(path)
spawn('ruby', '-e', ruby_check, path, [:out, :err] => '/dev/null')
end
def ruby_check
<<~RUBY_FILESYSTEM_CHECK
inner_pid = fork { File.stat(ARGV.first) }
Process.waitpid(inner_pid)
exit $?.exitstatus
RUBY_FILESYSTEM_CHECK
end
end
end
end
end
module Gitlab
module Git
module Storage
class Health
attr_reader :storage_name, :info
def self.prefix_for_storage(storage_name)
"#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:"
end
def self.for_all_storages
storage_names = Gitlab.config.repositories.storages.keys
results_per_storage = nil
Gitlab::Git::Storage.redis.with do |redis|
keys_per_storage = all_keys_for_storages(storage_names, redis)
results_per_storage = load_for_keys(keys_per_storage, redis)
end
results_per_storage.map do |name, info|
info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
new(name, info)
end
end
private_class_method def self.all_keys_for_storages(storage_names, redis)
keys_per_storage = {}
all_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
storage_names.each do |storage_name|
prefix = prefix_for_storage(storage_name)
keys_per_storage[storage_name] = all_keys.select { |key| key.starts_with?(prefix) }
end
keys_per_storage
end
private_class_method def self.load_for_keys(keys_per_storage, redis)
info_for_keys = {}
redis.pipelined do
keys_per_storage.each do |storage_name, keys_future|
info_for_storage = keys_future.map do |key|
{ name: key, failure_count: redis.hget(key, :failure_count) }
end
info_for_keys[storage_name] = info_for_storage
end
end
info_for_keys
end
def self.for_failing_storages
for_all_storages.select(&:failing?)
end
def initialize(storage_name, info)
@storage_name = storage_name
@info = info
end
def failing_info
@failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 }
end
def failing?
failing_info.any?
end
def failing_on_hosts
@failing_on_hosts ||= failing_info.map do |info_for_host|
info_for_host[:name].split(':').last
end
end
def failing_circuit_breakers
@failing_circuit_breakers ||= failing_on_hosts.map do |hostname|
CircuitBreaker.build(storage_name, hostname)
end
end
# rubocop: disable CodeReuse/ActiveRecord
def total_failures
@total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] }
end
# rubocop: enable CodeReuse/ActiveRecord
end
end
end
end
module Gitlab
module Git
module Storage
class NullCircuitBreaker
include CircuitBreakerSettings
# These will have actual values
attr_reader :storage,
:hostname
# These will always have nil values
attr_reader :storage_path
delegate :last_failure, :failure_count, :no_failures?,
to: :failure_info
def initialize(storage, hostname, error: nil)
@storage = storage
@hostname = hostname
@error = error
end
def perform
@error ? raise(@error) : yield
end
def circuit_broken?
!!@error
end
def backing_off?
false
end
def failure_info
@failure_info ||=
if circuit_broken?
Gitlab::Git::Storage::FailureInfo.new(Time.now,
Time.now,
failure_count_threshold)
else
Gitlab::Git::Storage::FailureInfo.new(nil,
nil,
0)
end
end
end
end
end
end
require_relative 'storage_check/cli'
require_relative 'storage_check/gitlab_caller'
require_relative 'storage_check/option_parser'
require_relative 'storage_check/response'
module Gitlab
module StorageCheck
ENDPOINT = '/-/storage_check'.freeze
Options = Struct.new(:target, :token, :interval, :dryrun)
end
end
module Gitlab
module StorageCheck
class CLI
def self.start!(args)
runner = new(Gitlab::StorageCheck::OptionParser.parse!(args))
runner.start_loop
end
attr_reader :logger, :options
def initialize(options)
@options = options
@logger = Logger.new(STDOUT)
end
def start_loop
logger.info "Checking #{options.target} every #{options.interval} seconds"
if options.dryrun
logger.info "Dryrun, exiting..."
return
end
begin
loop do
response = GitlabCaller.new(options).call!
log_response(response)
update_settings(response)
sleep options.interval
end
rescue Interrupt
logger.info "Ending storage-check"
end
end
def update_settings(response)
previous_interval = options.interval
if response.valid?
options.interval = response.check_interval || previous_interval
end
if previous_interval != options.interval
logger.info "Interval changed: #{options.interval} seconds"
end
end
def log_response(response)
unless response.valid?
return logger.error("Invalid response checking nfs storage: #{response.http_response.inspect}")
end
if response.responsive_shards.any?
logger.debug("Responsive shards: #{response.responsive_shards.join(', ')}")
end
warnings = []
if response.skipped_shards.any?
warnings << "Skipped shards: #{response.skipped_shards.join(', ')}"
end
if response.failing_shards.any?
warnings << "Failing shards: #{response.failing_shards.join(', ')}"
end
logger.warn(warnings.join(' - ')) if warnings.any?
end
end
end
end
require 'excon'
module Gitlab
module StorageCheck
class GitlabCaller
def initialize(options)
@options = options
end
def call!
Gitlab::StorageCheck::Response.new(get_response)
rescue Errno::ECONNREFUSED, Excon::Error
# Server not ready, treated as invalid response.
Gitlab::StorageCheck::Response.new(nil)
end
def get_response
scheme, *other_parts = URI.split(@options.target)
socket_path = if scheme == 'unix'
other_parts.compact.join
end
connection = Excon.new(@options.target, socket: socket_path)
connection.post(path: Gitlab::StorageCheck::ENDPOINT,
headers: headers)
end
def headers
@headers ||= begin
headers = {}
headers['Content-Type'] = headers['Accept'] = 'application/json'
headers['TOKEN'] = @options.token if @options.token
headers
end
end
end
end
end
module Gitlab
module StorageCheck
class OptionParser
def self.parse!(args)
# Start out with some defaults
options = Gitlab::StorageCheck::Options.new(nil, nil, 1, false)
parser = ::OptionParser.new do |opts|
opts.banner = "Usage: bin/storage_check [options]"
opts.on('-t=string', '--target string', 'URL or socket to trigger storage check') do |value|
options.target = value
end
opts.on('-T=string', '--token string', 'Health token to use') { |value| options.token = value }
opts.on('-i=n', '--interval n', ::OptionParser::DecimalInteger, 'Seconds between checks') do |value|
options.interval = value
end
opts.on('-d', '--dryrun', "Output what will be performed, but don't start the process") do |value|
options.dryrun = value
end
end
parser.parse!(args)
unless options.target
raise ::OptionParser::InvalidArgument.new('Provide a URI to provide checks')
end
if URI.parse(options.target).scheme.nil?
raise ::OptionParser::InvalidArgument.new('Add the scheme to the target, `unix://`, `https://` or `http://` are supported')
end
options
end
end
end
end
require 'json'
module Gitlab
module StorageCheck
class Response
attr_reader :http_response
def initialize(http_response)
@http_response = http_response
end
def valid?
@http_response && (200...299).cover?(@http_response.status) &&
@http_response.headers['Content-Type'].include?('application/json') &&
parsed_response
end
def check_interval
return nil unless parsed_response
parsed_response['check_interval']
end
def responsive_shards
divided_results[:responsive_shards]
end
def skipped_shards
divided_results[:skipped_shards]
end
def failing_shards
divided_results[:failing_shards]
end
private
def results
return [] unless parsed_response
parsed_response['results']
end
def divided_results
return @divided_results if @divided_results
@divided_results = {}
@divided_results[:responsive_shards] = []
@divided_results[:skipped_shards] = []
@divided_results[:failing_shards] = []
results.each do |info|
name = info['storage']
case info['success']
when true
@divided_results[:responsive_shards] << name
when false
@divided_results[:failing_shards] << name
else
@divided_results[:skipped_shards] << name
end
end
@divided_results
end
def parsed_response
return @parsed_response if defined?(@parsed_response)
@parsed_response = JSON.parse(@http_response.body)
rescue JSON::JSONError
@parsed_response = nil
end
end
end
end
......@@ -148,23 +148,12 @@ msgstr ""
msgid "%{number_commits_behind} commits behind %{default_branch}, %{number_commits_ahead} commits ahead"
msgstr ""
msgid "%{number_of_failures} of %{maximum_failures} failures. GitLab will allow access on the next attempt."
msgstr ""
msgid "%{number_of_failures} of %{maximum_failures} failures. GitLab will not retry automatically. Reset storage information when the problem is resolved."
msgstr ""
msgid "%{openOrClose} %{noteable}"
msgstr ""
msgid "%{percent}%% complete"
msgstr ""
msgid "%{storage_name}: failed storage access attempt on host:"
msgid_plural "%{storage_name}: %{failed_attempts} failed storage access attempts:"
msgstr[0] ""
msgstr[1] ""
msgid "%{text} %{files}"
msgid_plural "%{text} %{files} files"
msgstr[0] ""
......@@ -363,9 +352,6 @@ msgstr ""
msgid "Access to '%{classification_label}' not allowed"
msgstr ""
msgid "Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again."
msgstr ""
msgid "Account"
msgstr ""
......@@ -468,9 +454,6 @@ msgstr ""
msgid "AdminArea|You’re about to stop all jobs.This will halt all current jobs that are running."
msgstr ""
msgid "AdminHealthPageLink|health page"
msgstr ""
msgid "AdminProjects| You’re about to permanently delete the project %{projectName}, its repository, and all related resources including issues, merge requests, etc.. Once you confirm and press %{strong_start}Delete project%{strong_end}, it cannot be undone or recovered."
msgstr ""
......@@ -1412,9 +1395,6 @@ msgstr ""
msgid "Chat"
msgstr ""
msgid "Check interval"
msgstr ""
msgid "Checking %{text} availability…"
msgstr ""
......@@ -1568,9 +1548,6 @@ msgstr ""
msgid "CiVariable|Validation failed"
msgstr ""
msgid "CircuitBreakerApiLink|circuitbreaker api"
msgstr ""
msgid "ClassificationLabelUnavailable|is unavailable: %{reason}"
msgstr ""
......@@ -2151,7 +2128,7 @@ msgstr ""
msgid "Configure push mirrors."
msgstr ""
msgid "Configure storage path and circuit breaker settings."
msgid "Configure storage path settings."
msgstr ""
msgid "Configure the %{link} integration."
......@@ -3803,9 +3780,6 @@ msgstr ""
msgid "Git revision"
msgstr ""
msgid "Git storage health information has been reset"
msgstr ""
msgid "Git strategy for pipelines"
msgstr ""
......@@ -4825,9 +4799,6 @@ msgstr ""
msgid "Max access level"
msgstr ""
msgid "Maximum git storage failures"
msgstr ""
msgid "Maximum job timeout"
msgstr ""
......@@ -5445,9 +5416,6 @@ msgstr ""
msgid "November"
msgstr ""
msgid "Number of access attempts"
msgstr ""
msgid "OK"
msgstr ""
......@@ -6634,9 +6602,6 @@ msgstr ""
msgid "Require all users to accept Terms of Service and Privacy Policy when they access GitLab."
msgstr ""
msgid "Reset git storage health information"
msgstr ""
msgid "Reset health check access token"
msgstr ""
......@@ -6891,12 +6856,6 @@ msgstr ""
msgid "SearchAutocomplete|in this project"
msgstr ""
msgid "Seconds before reseting failure information"
msgstr ""
msgid "Seconds to wait for a storage access attempt"
msgstr ""
msgid "Secret"
msgstr ""
......@@ -7624,12 +7583,6 @@ msgstr ""
msgid "The maximum file size allowed is 200KB."
msgstr ""
msgid "The number of attempts GitLab will make to access a storage."
msgstr ""
msgid "The number of failures after which GitLab will completely prevent access to the storage. The number of failures can be reset in the admin interface: %{link_to_health_page} or using the %{api_documentation_link}."
msgstr ""
msgid "The passphrase required to decrypt the private key. This is optional and the value is encrypted at rest."
msgstr ""
......@@ -7687,15 +7640,6 @@ msgstr ""
msgid "The testing stage shows the time GitLab CI takes to run every pipeline for the related merge request. The data will automatically be added after your first pipeline finishes running."
msgstr ""
msgid "The time in seconds GitLab will keep failure information. When no failures occur during this time, information about the mount is reset."
msgstr ""
msgid "The time in seconds GitLab will try to access storage. After this time a timeout error will be raised."
msgstr ""
msgid "The time in seconds between storage checks. If a check did not complete yet, GitLab will skip the next check."
msgstr ""
msgid "The time taken by each data entry gathered by that stage."
msgstr ""
......@@ -7735,9 +7679,6 @@ msgstr ""
msgid "There are no unstaged changes"
msgstr ""
msgid "There are problems accessing Git storage: "
msgstr ""
msgid "There was an error adding a todo."
msgstr ""
......
require 'spec_helper'
describe 'bin/storage_check' do
it 'is executable' do
command = %w[bin/storage_check -t unix://the/path/to/a/unix-socket.sock -i 10 -d]
expected_output = 'Checking unix://the/path/to/a/unix-socket.sock every 10 seconds'
output, status = Gitlab::Popen.popen(command, Rails.root.to_s)
expect(status).to eq(0)
expect(output).to include(expected_output)
end
end
......@@ -8,18 +8,10 @@ describe Admin::HealthCheckController do
end
describe 'GET show' do
it 'loads the git storage health information' do
it 'loads the health information' do
get :show
expect(assigns[:failing_storage_statuses]).not_to be_nil
end
end
describe 'POST reset_storage_health' do
it 'resets all storage health information' do
expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)
post :reset_storage_health
expect(assigns[:errors]).not_to be_nil
end
end
end
......@@ -190,30 +190,6 @@ describe ApplicationController do
end
end
describe 'rescue from Gitlab::Git::Storage::Inaccessible' do
controller(described_class) do
def index
raise Gitlab::Git::Storage::Inaccessible.new('broken', 100)
end
end
it 'renders a 503 when storage is not available' do
sign_in(create(:user))
get :index
expect(response.status).to eq(503)
end
it 'renders includes a Retry-After header' do
sign_in(create(:user))
get :index
expect(response.headers['Retry-After']).to eq(100)
end
end
describe 'response format' do
controller(described_class) do
def index
......
......@@ -14,48 +14,6 @@ describe HealthController do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
end
describe '#storage_check' do
before do
allow(Gitlab::RequestContext).to receive(:client_ip).and_return(whitelisted_ip)
end
subject { post :storage_check }
it 'checks all the configured storages' do
expect(Gitlab::Git::Storage::Checker).to receive(:check_all).and_call_original
subject
end
it 'returns the check interval' do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'true')
stub_application_setting(circuitbreaker_check_interval: 10)
subject
expect(json_response['check_interval']).to eq(10)
end
context 'with failing storages', :broken_storage do
before do
stub_storage_settings(
broken: { path: 'tmp/tests/non-existent-repositories' }
)
end
it 'includes the failure information' do
subject
expected_results = [
{ 'storage' => 'broken', 'success' => false },
{ 'storage' => 'default', 'success' => true }
]
expect(json_response['results']).to eq(expected_results)
end
end
end
describe '#readiness' do
shared_context 'endpoint responding with readiness data' do
let(:request_params) { {} }
......
......@@ -2,10 +2,11 @@ require 'spec_helper'
describe "Admin Health Check", :feature do
include StubENV
set(:admin) { create(:admin) }
before do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
sign_in(create(:admin))
sign_in(admin)
end
describe '#show' do
......@@ -56,27 +57,4 @@ describe "Admin Health Check", :feature do
expect(page).to have_content('The server is on fire')
end
end
context 'with repository storage failures', :broken_storage do
before do
visit admin_health_check_path
end
it 'shows storage failure information' do
hostname = Gitlab::Environment.hostname
maximum_failures = Gitlab::CurrentSettings.current_application_settings
.circuitbreaker_failure_count_threshold
number_of_failures = maximum_failures + 1
expect(page).to have_content("broken: #{number_of_failures} failed storage access attempts:")
expect(page).to have_content("#{hostname}: #{number_of_failures} of #{maximum_failures} failures.")
end
it 'allows resetting storage failures' do
click_button 'Reset git storage health information'
expect(page).to have_content('Git storage health information has been reset')
expect(page).not_to have_content('failed storage access attempt')
end
end
end
require 'spec_helper'
describe StorageHealthHelper do
describe '#failing_storage_health_message' do
let(:health) do
Gitlab::Git::Storage::Health.new(
"<script>alert('storage name');)</script>",
[]
)
end
it 'escapes storage names' do
escaped_storage_name = '&lt;script&gt;alert(&#39;storage name&#39;);)&lt;/script&gt;'
result = helper.failing_storage_health_message(health)
expect(result).to include(escaped_storage_name)
end
end
end
......@@ -49,8 +49,6 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
end
it 'only connects to redis twice' do
# Stub circuitbreaker so it doesn't count the redis connections in there
stub_circuit_breaker(project_without_status)
expect(Gitlab::Redis::Cache).to receive(:with).exactly(2).and_call_original
described_class.load_in_batch_for_projects([project_without_status])
......@@ -302,13 +300,4 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
end
end
end
def stub_circuit_breaker(project)
fake_circuitbreaker = double
allow(fake_circuitbreaker).to receive(:perform).and_yield
allow(project.repository.raw_repository)
.to receive(:circuit_breaker).and_return(fake_circuitbreaker)
allow(project.repository)
.to receive(:circuit_breaker).and_return(fake_circuitbreaker)
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::Checker, :clean_gitlab_redis_shared_state do
let(:storage_name) { 'default' }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
subject(:checker) { described_class.new(storage_name) }
def value_from_redis(name)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmset(cache_key, name, value)
end.first
end
describe '.check_all' do
it 'calls a check for each storage' do
fake_checker_default = double
fake_checker_broken = double
fake_logger = fake_logger
expect(described_class).to receive(:new).with('default', fake_logger) { fake_checker_default }
expect(described_class).to receive(:new).with('broken', fake_logger) { fake_checker_broken }
expect(fake_checker_default).to receive(:check_with_lease)
expect(fake_checker_broken).to receive(:check_with_lease)
described_class.check_all(fake_logger)
end
context 'with broken storage', :broken_storage do
it 'returns the results' do
expected_result = [
{ storage: 'default', success: true },
{ storage: 'broken', success: false }
]
expect(described_class.check_all).to eq(expected_result)
end
end
end
describe '#initialize' do
it 'assigns the settings' do
expect(checker.hostname).to eq(hostname)
expect(checker.storage).to eq('default')
expect(checker.storage_path).to eq(TestEnv.repos_path)
end
end
describe '#check_with_lease' do
it 'only allows one check at a time' do
expect(checker).to receive(:check).once { sleep 1 }
thread = Thread.new { checker.check_with_lease }
checker.check_with_lease
thread.join
end
it 'returns a result hash' do
expect(checker.check_with_lease).to eq(storage: 'default', success: true)
end
end
describe '#check' do
it 'tracks that the storage was accessible' do
set_in_redis(:failure_count, 10)
set_in_redis(:last_failure, Time.now.to_f)
checker.check
expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(value_from_redis(:first_failure)).to be_empty
end
it 'calls the check with the correct arguments' do
stub_application_setting(circuitbreaker_storage_timeout: 30,
circuitbreaker_access_retries: 3)
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
.and_call_original
checker.check
end
it 'returns `true`' do
expect(checker.check).to eq(true)
end
it 'maintains known storage keys' do
Timecop.freeze do
# Insert an old key to expire
old_entry = Time.now.to_i - 3.days.to_i
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed')
end
checker.check
known_keys = Gitlab::Git::Storage.redis.with do |redis|
redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
end
expect(known_keys).to contain_exactly(cache_key)
end
end
context 'the storage is not available', :broken_storage do
let(:storage_name) { 'broken' }
it 'tracks that the storage was inaccessible' do
Timecop.freeze do
expect { checker.check }.to change { value_from_redis(:failure_count).to_i }.by(1)
expect(value_from_redis(:last_failure)).not_to be_empty
expect(value_from_redis(:first_failure)).not_to be_empty
end
end
it 'returns `false`' do
expect(checker.check).to eq(false)
end
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::CircuitBreaker, :broken_storage do
let(:storage_name) { 'default' }
let(:circuit_breaker) { described_class.new(storage_name, hostname) }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, name, value)
end.first
end
before do
# Override test-settings for the circuitbreaker with something more realistic
# for these specs.
stub_storage_settings('default' => {
'path' => TestEnv.repos_path
},
'broken' => {
'path' => 'tmp/tests/non-existent-repositories'
},
'nopath' => { 'path' => nil }
)
end
describe '.for_storage', :request_store do
it 'only builds a single circuitbreaker per storage' do
expect(described_class).to receive(:new).once.and_call_original
breaker = described_class.for_storage('default')
expect(breaker).to be_a(described_class)
expect(described_class.for_storage('default')).to eq(breaker)
end
it 'returns a broken circuit breaker for an unknown storage' do
expect(described_class.for_storage('unknown').circuit_broken?).to be_truthy
end
it 'returns a broken circuit breaker when the path is not set' do
expect(described_class.for_storage('nopath').circuit_broken?).to be_truthy
end
end
describe '#initialize' do
it 'assigns the settings' do
expect(circuit_breaker.hostname).to eq(hostname)
expect(circuit_breaker.storage).to eq('default')
end
end
context 'circuitbreaker settings' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2,
circuitbreaker_storage_timeout: 3,
circuitbreaker_access_retries: 4,
circuitbreaker_backoff_threshold: 5)
end
describe '#failure_count_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_count_threshold).to eq(0)
end
end
describe '#check_interval' do
it 'reads the value from settings' do
expect(circuit_breaker.check_interval).to eq(1)
end
end
describe '#failure_reset_time' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_reset_time).to eq(2)
end
end
describe '#storage_timeout' do
it 'reads the value from settings' do
expect(circuit_breaker.storage_timeout).to eq(3)
end
end
describe '#access_retries' do
it 'reads the value from settings' do
expect(circuit_breaker.access_retries).to eq(4)
end
end
end
describe '#perform' do
it 'raises the correct exception when the circuit is open' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 999)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(1800)
end
end
it 'yields the block' do
expect { |b| circuit_breaker.perform(&b) }
.to yield_control
end
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
.and_call_original
circuit_breaker.perform { 'hello world' }
end
it 'returns the value of the block' do
result = circuit_breaker.perform { 'return value' }
expect(result).to eq('return value')
end
it 'raises possible errors' do
expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } }
.to raise_error(Rugged::OSError)
end
context 'with the feature disabled' do
before do
stub_feature_flags(git_storage_circuit_breaker: false)
end
it 'returns the block without checking accessibility' do
expect(circuit_breaker).not_to receive(:check_storage_accessible!)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
it 'allows enabling the feature using an ENV var' do
stub_env('GIT_STORAGE_CIRCUIT_BREAKER', 'true')
expect(circuit_breaker).to receive(:check_storage_accessible!)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
end
end
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
describe '#last_failure' do
it 'returns the last failure time' do
time = Time.parse("2017-05-26 17:52:30")
set_in_redis(:last_failure, time.to_i)
expect(circuit_breaker.last_failure).to eq(time)
end
end
describe '#failure_count' do
it 'returns the failure count' do
set_in_redis(:failure_count, 7)
expect(circuit_breaker.failure_count).to eq(7)
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::FailureInfo, :broken_storage do
let(:storage_name) { 'default' }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
def value_from_redis(name)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, name, value)
end.first
end
describe '.reset_all!' do
it 'clears all entries form redis' do
set_in_redis(:failure_count, 10)
described_class.reset_all!
key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
expect(key_exists).to be_falsey
end
it 'does not break when there are no keys in redis' do
expect { described_class.reset_all! }.not_to raise_error
end
end
describe '.load' do
it 'loads failure information for a storage on a host' do
first_failure = Time.parse("2017-11-14 17:52:30")
last_failure = Time.parse("2017-11-14 18:54:37")
failure_count = 11
set_in_redis(:first_failure, first_failure.to_i)
set_in_redis(:last_failure, last_failure.to_i)
set_in_redis(:failure_count, failure_count.to_i)
info = described_class.load(cache_key)
expect(info.first_failure).to eq(first_failure)
expect(info.last_failure).to eq(last_failure)
expect(info.failure_count).to eq(failure_count)
end
end
describe '#no_failures?' do
it 'is true when there are no failures' do
info = described_class.new(nil, nil, 0)
expect(info.no_failures?).to be_truthy
end
it 'is false when there are failures' do
info = described_class.new(Time.parse("2017-11-14 17:52:30"),
Time.parse("2017-11-14 18:54:37"),
20)
expect(info.no_failures?).to be_falsy
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::ForkedStorageCheck, broken_storage: true, skip_database_cleaner: true do
let(:existing_path) do
existing_path = TestEnv.repos_path
FileUtils.mkdir_p(existing_path)
existing_path
end
describe '.storage_accessible?' do
it 'detects when a storage is not available' do
expect(described_class.storage_available?('/non/existant/path')).to be_falsey
end
it 'detects when a storage is available' do
expect(described_class.storage_available?(existing_path)).to be_truthy
end
it 'returns false when the check takes to long' do
# We're forking a process here that takes too long
# It will be killed it's parent process will be killed by it's parent
# and waited for inside `Gitlab::Git::Storage::ForkedStorageCheck.timeout_check`
allow(described_class).to receive(:check_filesystem_in_process) do
Process.spawn("sleep 10")
end
result = true
runtime = Benchmark.realtime do
result = described_class.storage_available?(existing_path, 0.5)
end
expect(result).to be_falsey
expect(runtime).to be < 1.0
end
it 'will try the specified amount of times before failing' do
allow(described_class).to receive(:check_filesystem_in_process) do
Process.spawn("sleep 10")
end
expect(Process).to receive(:spawn).with('sleep 10').twice
.and_call_original
runtime = Benchmark.realtime do
described_class.storage_available?(existing_path, 0.5, 2)
end
expect(runtime).to be < 1.0
end
describe 'when using paths with spaces' do
let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') }
let(:path_with_spaces) { File.join(test_dir, 'path with spaces') }
around do |example|
FileUtils.mkdir_p(path_with_spaces)
example.run
FileUtils.rm_r(test_dir)
end
it 'works for paths with spaces' do
expect(described_class.storage_available?(path_with_spaces)).to be_truthy
end
it 'works for a realpath with spaces' do
symlink_location = File.join(test_dir, 'a symlink')
FileUtils.ln_s(path_with_spaces, symlink_location)
expect(described_class.storage_available?(symlink_location)).to be_truthy
end
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::Health, broken_storage: true do
let(:host1_key) { 'storage_accessible:broken:web01' }
let(:host2_key) { 'storage_accessible:default:kiq01' }
def set_in_redis(cache_key, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, :failure_count, value)
end.first
end
describe '.for_failing_storages' do
it 'only includes health status for failures' do
set_in_redis(host1_key, 10)
set_in_redis(host2_key, 0)
expect(described_class.for_failing_storages.map(&:storage_name))
.to contain_exactly('broken')
end
end
describe '.for_all_storages' do
it 'loads health status for all configured storages' do
healths = described_class.for_all_storages
expect(healths.map(&:storage_name)).to contain_exactly('default', 'broken')
end
end
describe '#failing_info' do
it 'only contains storages that have failures' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 0 },
{ name: host2_key, failure_count: 3 }])
expect(health.failing_info).to contain_exactly({ name: host2_key, failure_count: 3 })
end
end
describe '#total_failures' do
it 'sums up all the failures' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
{ name: host2_key, failure_count: 3 }])
expect(health.total_failures).to eq(5)
end
end
describe '#failing_on_hosts' do
it 'collects only the failing hostnames' do
health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
{ name: host2_key, failure_count: 0 }])
expect(health.failing_on_hosts).to contain_exactly('web01')
end
end
end
require 'spec_helper'
describe Gitlab::Git::Storage::NullCircuitBreaker do
let(:storage) { 'default' }
let(:hostname) { 'localhost' }
let(:error) { nil }
subject(:breaker) { described_class.new(storage, hostname, error: error) }
context 'with an error' do
let(:error) { Gitlab::Git::Storage::Misconfiguration.new('error') }
describe '#perform' do
it { expect { breaker.perform { 'ok' } }.to raise_error(error) }
end
describe '#circuit_broken?' do
it { expect(breaker.circuit_broken?).to be_truthy }
end
describe '#last_failure' do
it { Timecop.freeze { expect(breaker.last_failure).to eq(Time.now) } }
end
describe '#failure_count' do
it { expect(breaker.failure_count).to eq(breaker.failure_count_threshold) }
end
describe '#failure_info' do
it { expect(breaker.failure_info.no_failures?).to be_falsy }
end
end
context 'not broken' do
describe '#perform' do
it { expect(breaker.perform { 'ok' }).to eq('ok') }
end
describe '#circuit_broken?' do
it { expect(breaker.circuit_broken?).to be_falsy }
end
describe '#last_failure' do
it { expect(breaker.last_failure).to be_nil }
end
describe '#failure_count' do
it { expect(breaker.failure_count).to eq(0) }
end
describe '#failure_info' do
it { expect(breaker.failure_info.no_failures?).to be_truthy }
end
end
describe '#failure_count_threshold' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 1)
end
it { expect(breaker.failure_count_threshold).to eq(1) }
end
it 'implements the CircuitBreaker interface' do
ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
expect(theirs - ours).to be_empty
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::CLI do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, 1, false) }
subject(:runner) { described_class.new(options) }
describe '#update_settings' do
it 'updates the interval when changed in a valid response and logs the change' do
fake_response = double
expect(fake_response).to receive(:valid?).and_return(true)
expect(fake_response).to receive(:check_interval).and_return(42)
expect(runner.logger).to receive(:info)
runner.update_settings(fake_response)
expect(options.interval).to eq(42)
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::GitlabCaller do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, nil, false) }
subject(:gitlab_caller) { described_class.new(options) }
describe '#call!' do
context 'when a socket is given' do
it 'calls a socket' do
fake_connection = double
expect(fake_connection).to receive(:post)
expect(Excon).to receive(:new).with('unix://tmp/socket.sock', socket: "tmp/socket.sock") { fake_connection }
gitlab_caller.call!
end
end
context 'when a host is given' do
let(:options) { Gitlab::StorageCheck::Options.new('http://localhost:8080', nil, nil, false) }
it 'it calls a http response' do
fake_connection = double
expect(Excon).to receive(:new).with('http://localhost:8080', socket: nil) { fake_connection }
expect(fake_connection).to receive(:post)
gitlab_caller.call!
end
end
end
describe '#headers' do
it 'Adds the JSON header' do
headers = gitlab_caller.headers
expect(headers['Content-Type']).to eq('application/json')
end
context 'when a token was provided' do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', 'atoken', nil, false) }
it 'adds it to the headers' do
expect(gitlab_caller.headers['TOKEN']).to eq('atoken')
end
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::OptionParser do
describe '.parse!' do
it 'assigns all options' do
args = %w(--target unix://tmp/hello/world.sock --token thetoken --interval 42)
options = described_class.parse!(args)
expect(options.token).to eq('thetoken')
expect(options.interval).to eq(42)
expect(options.target).to eq('unix://tmp/hello/world.sock')
end
it 'requires the interval to be a number' do
args = %w(--target unix://tmp/hello/world.sock --interval fortytwo)
expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
end
it 'raises an error if the scheme is not included' do
args = %w(--target tmp/hello/world.sock)
expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
end
it 'raises an error if both socket and host are missing' do
expect { described_class.parse!([]) }.to raise_error(OptionParser::InvalidArgument)
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::Response do
let(:fake_json) do
{
check_interval: 42,
results: [
{ storage: 'working', success: true },
{ storage: 'skipped', success: nil },
{ storage: 'failing', success: false }
]
}.to_json
end
let(:fake_http_response) do
fake_response = instance_double("Excon::Response - Status check")
allow(fake_response).to receive(:status).and_return(200)
allow(fake_response).to receive(:body).and_return(fake_json)
allow(fake_response).to receive(:headers).and_return('Content-Type' => 'application/json')
fake_response
end
let(:response) { described_class.new(fake_http_response) }
describe '#valid?' do
it 'is valid for a success response with parseable JSON' do
expect(response).to be_valid
end
end
describe '#check_interval' do
it 'returns the result from the JSON' do
expect(response.check_interval).to eq(42)
end
end
describe '#responsive_shards' do
it 'contains the names of working shards' do
expect(response.responsive_shards).to contain_exactly('working')
end
end
describe '#skipped_shards' do
it 'contains the names of skipped shards' do
expect(response.skipped_shards).to contain_exactly('skipped')
end
end
describe '#failing_shards' do
it 'contains the name of failing shards' do
expect(response.failing_shards).to contain_exactly('failing')
end
end
end
......@@ -141,19 +141,6 @@ describe ApplicationSetting do
end
end
context 'circuitbreaker settings' do
[:circuitbreaker_failure_count_threshold,
:circuitbreaker_check_interval,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout].each do |field|
it "Validates #{field} as number" do
is_expected.to validate_numericality_of(field)
.only_integer
.is_greater_than_or_equal_to(0)
end
end
end
context 'repository storages' do
before do
storages = {
......
......@@ -30,7 +30,7 @@ describe Repository do
def expect_to_raise_storage_error
expect { yield }.to raise_error do |exception|
storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable]
storage_exceptions = [Gitlab::Git::CommandError, GRPC::Unavailable]
known_exception = storage_exceptions.select { |e| exception.is_a?(e) }
expect(known_exception).not_to be_nil
......
require 'spec_helper'
describe API::CircuitBreakers do
let(:user) { create(:user) }
let(:admin) { create(:admin) }
set(:user) { create(:user) }
set(:admin) { create(:admin) }
describe 'GET circuit_breakers/repository_storage' do
it 'returns a 401 for anonymous users' do
......@@ -18,37 +18,26 @@ describe API::CircuitBreakers do
end
it 'returns an Array of storages' do
expect(Gitlab::Git::Storage::Health).to receive(:for_all_storages) do
[Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
end
get api('/circuit_breakers/repository_storage', admin)
expect(response).to have_gitlab_http_status(200)
expect(json_response).to be_kind_of(Array)
expect(json_response.first['storage_name']).to eq('broken')
expect(json_response.first['failing_on_hosts']).to eq(['web01'])
expect(json_response.first['total_failures']).to eq(4)
expect(json_response).to be_empty
end
describe 'GET circuit_breakers/repository_storage/failing' do
it 'returns an array of failing storages' do
expect(Gitlab::Git::Storage::Health).to receive(:for_failing_storages) do
[Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
end
get api('/circuit_breakers/repository_storage/failing', admin)
expect(response).to have_gitlab_http_status(200)
expect(json_response).to be_kind_of(Array)
expect(json_response).to be_empty
end
end
end
describe 'DELETE circuit_breakers/repository_storage' do
it 'clears all circuit_breakers' do
expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)
delete api('/circuit_breakers/repository_storage', admin)
expect(response).to have_gitlab_http_status(204)
......
......@@ -2,7 +2,7 @@ require 'spec_helper'
describe API::Settings, 'Settings' do
let(:user) { create(:user) }
let(:admin) { create(:admin) }
set(:admin) { create(:admin) }
describe "GET /application/settings" do
it "returns application settings" do
......@@ -25,7 +25,6 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(0)
expect(json_response['ecdsa_key_restriction']).to eq(0)
expect(json_response['ed25519_key_restriction']).to eq(0)
expect(json_response['circuitbreaker_failure_count_threshold']).not_to be_nil
expect(json_response['performance_bar_allowed_group_id']).to be_nil
expect(json_response['instance_statistics_visibility_private']).to be(false)
expect(json_response).not_to have_key('performance_bar_allowed_group_path')
......@@ -64,7 +63,6 @@ describe API::Settings, 'Settings' do
dsa_key_restriction: 2048,
ecdsa_key_restriction: 384,
ed25519_key_restriction: 256,
circuitbreaker_check_interval: 2,
enforce_terms: true,
terms: 'Hello world!',
performance_bar_allowed_group_path: group.full_path,
......@@ -90,7 +88,6 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(2048)
expect(json_response['ecdsa_key_restriction']).to eq(384)
expect(json_response['ed25519_key_restriction']).to eq(256)
expect(json_response['circuitbreaker_check_interval']).to eq(2)
expect(json_response['enforce_terms']).to be(true)
expect(json_response['terms']).to eq('Hello world!')
expect(json_response['performance_bar_allowed_group_id']).to eq(group.id)
......
......@@ -7,24 +7,5 @@ RSpec.configure do |config|
allow(Gitlab::GitalyClient).to receive(:call) do
raise GRPC::Unavailable.new('Gitaly broken in this spec')
end
# Track the maximum number of failures
first_failure = Time.parse("2017-11-14 17:52:30")
last_failure = Time.parse("2017-11-14 18:54:37")
failure_count = Gitlab::CurrentSettings.circuitbreaker_failure_count_threshold + 1
cache_key = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}broken:#{Gitlab::Environment.hostname}"
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hset(cache_key, :failure_count, failure_count.to_i)
end
end
end
config.after(:each, :broken_storage) do
Gitlab::Git::Storage.redis.with(&:flushall)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment