Commit a447bf13 authored by Allison Browne, committed by Mayra Cabrera

Add prometheus health check service

Add a service that takes a Cluster and sends an alert
if Prometheus is newly unhealthy, where "newly unhealthy"
means it was healthy at the time of the last health
check but no longer is.
parent 3066bdd6
# frozen_string_literal: true
module Clusters
  module Applications
    # Checks the health of the Prometheus application attached to a cluster,
    # stores the result on the application record, and sends an alert to the
    # cluster's project when Prometheus becomes unhealthy.
    #
    # "Became unhealthy" means the previously stored health was true or had
    # never been recorded (nil), and the current check reports unhealthy.
    class PrometheusHealthCheckService
      include Gitlab::Utils::StrongMemoize
      include Gitlab::Routing

      # @param cluster the cluster whose Prometheus application is checked;
      #   #execute only accepts clusters for which `project_type?` is true
      def initialize(cluster)
        @cluster = cluster
        @logger = Gitlab::AppJsonLogger.build
      end

      # Runs the health check, logs the outcome, notifies the project if
      # Prometheus became unhealthy, and persists the new health value when it
      # differs from the stored one.
      #
      # @raise [RuntimeError] when the cluster is not a project-type cluster
      # @return [nil] when the cluster has no installed Prometheus application;
      #   otherwise the result of the final conditional update
      def execute
        raise 'Invalid cluster type. Only project types are allowed.' unless @cluster.project_type?

        # A cluster may have no Prometheus application at all; skip it the same
        # way as one that is not installed instead of failing on nil.
        return unless prometheus_application&.installed?

        project = @cluster.clusterable

        @logger.info(
          message: 'Prometheus health check',
          cluster_id: @cluster.id,
          newly_unhealthy: became_unhealthy?,
          currently_healthy: currently_healthy?,
          was_healthy: was_healthy?
        )

        send_notification(project) if became_unhealthy?

        # update_columns is only issued when the stored value is stale.
        prometheus_application.update_columns(healthy: currently_healthy?) if health_changed?
      end

      private

      # Memoized Prometheus application of the cluster (may be nil).
      def prometheus_application
        strong_memoize(:prometheus_application) do
          @cluster.application_prometheus
        end
      end

      # Memoized result of asking the Prometheus client for its health, so the
      # remote check is performed at most once per service instance.
      def currently_healthy?
        strong_memoize(:currently_healthy) do
          prometheus_application.prometheus_client.healthy?
        end
      end

      # True when the stored health was true or never recorded (nil) and the
      # current check reports unhealthy.
      def became_unhealthy?
        strong_memoize(:became_unhealthy) do
          (was_healthy? || was_healthy?.nil?) && !currently_healthy?
        end
      end

      # Memoized health value stored on the application before this check;
      # nil means no value has been stored yet.
      def was_healthy?
        strong_memoize(:was_healthy) do
          prometheus_application.healthy
        end
      end

      # True when the stored health differs from the current one. A stored nil
      # always counts as changed because the current value is compared with it.
      def health_changed?
        was_healthy? != currently_healthy?
      end

      # Sends the "Prometheus is unhealthy" alert through the project's
      # alerting notify service, authenticated with the alerts service token.
      #
      # NOTE(review): the return value of NotifyService#execute is not
      # inspected, so the success log line is written whenever the call
      # returns without raising - confirm this is intended.
      def send_notification(project)
        notification_payload = build_notification_payload(project)
        token = project.alerts_service.data.token

        Projects::Alerting::NotifyService.new(project, nil, notification_payload).execute(token)

        @logger.info(message: 'Successfully notified of Prometheus newly unhealthy', cluster_id: @cluster.id, project_id: project.id)
      end

      # Builds the alert payload: a title naming the cluster and a description
      # with a Markdown link to the cluster page of the project.
      def build_notification_payload(project)
        cluster_path = namespace_project_cluster_path(
          project_id: project.path,
          namespace_id: project.namespace.path,
          id: @cluster.id
        )

        {
          title: "Prometheus is Unhealthy. Cluster Name: #{@cluster.name}",
          description: "Prometheus is unhealthy for the cluster: [#{@cluster.name}](#{cluster_path}) attached to project #{project.name}."
        }
      end
    end
  end
end
---
title: Add healthy column to clusters_applications_prometheus table
merge_request: 26168
author:
type: added
# frozen_string_literal: true
# Adds a boolean `healthy` column to the clusters_applications_prometheus table.
class AddHealthyToClustersApplicationsPrometheus < ActiveRecord::Migration[6.0]
  DOWNTIME = false

  def up
    # No default is given, so the column starts out NULL: that marks a
    # Prometheus application for which no health check has run yet.
    # For now, health checks only run on monitor demo projects.
    add_column(:clusters_applications_prometheus, :healthy, :boolean)
  end

  def down
    remove_column(:clusters_applications_prometheus, :healthy)
  end
end
...@@ -1748,7 +1748,8 @@ CREATE TABLE public.clusters_applications_prometheus ( ...@@ -1748,7 +1748,8 @@ CREATE TABLE public.clusters_applications_prometheus (
updated_at timestamp with time zone NOT NULL, updated_at timestamp with time zone NOT NULL,
last_update_started_at timestamp with time zone, last_update_started_at timestamp with time zone,
encrypted_alert_manager_token character varying, encrypted_alert_manager_token character varying,
encrypted_alert_manager_token_iv character varying encrypted_alert_manager_token_iv character varying,
healthy boolean
); );
CREATE SEQUENCE public.clusters_applications_prometheus_id_seq CREATE SEQUENCE public.clusters_applications_prometheus_id_seq
...@@ -12731,6 +12732,7 @@ COPY "schema_migrations" (version) FROM STDIN; ...@@ -12731,6 +12732,7 @@ COPY "schema_migrations" (version) FROM STDIN;
20200302152516 20200302152516
20200303055348 20200303055348
20200303074328 20200303074328
20200303181648
20200304085423 20200304085423
20200304090155 20200304090155
20200304121828 20200304121828
......
...@@ -6,6 +6,7 @@ module Gitlab ...@@ -6,6 +6,7 @@ module Gitlab
include Gitlab::Utils::StrongMemoize include Gitlab::Utils::StrongMemoize
Error = Class.new(StandardError) Error = Class.new(StandardError)
QueryError = Class.new(Gitlab::PrometheusClient::Error) QueryError = Class.new(Gitlab::PrometheusClient::Error)
HEALTHY_RESPONSE = "Prometheus is Healthy.\n"
# Target number of data points for `query_range`. # Target number of data points for `query_range`.
# Please don't exceed the limit of 11000 data points # Please don't exceed the limit of 11000 data points
...@@ -32,13 +33,20 @@ module Gitlab ...@@ -32,13 +33,20 @@ module Gitlab
json_api_get('query', query: '1') json_api_get('query', query: '1')
end end
# Reports whether Prometheus answers its health endpoint with the expected
# healthy message.
#
# Per the Prometheus docs, the endpoint always returns 200 and is meant for
# checking Prometheus health, so the verdict is based on the response body.
# A non-200 response is turned into an error by
# handle_management_api_response.
#
# @return [Boolean] true only when the body equals HEALTHY_RESPONSE
def healthy?
  health_response = get(health_url, {})

  handle_management_api_response(health_response) == HEALTHY_RESPONSE
end
def proxy(type, args) def proxy(type, args)
path = api_path(type) path = api_path(type)
get(path, args) get(path, args)
rescue Gitlab::HTTP::ResponseError => ex rescue Gitlab::HTTP::ResponseError => ex
raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code) raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
handle_response(ex.response) handle_querying_api_response(ex.response)
end end
def query(query, time: Time.now) def query(query, time: Time.now)
...@@ -79,6 +87,10 @@ module Gitlab ...@@ -79,6 +87,10 @@ module Gitlab
[QUERY_RANGE_MIN_STEP, step].max [QUERY_RANGE_MIN_STEP, step].max
end end
# URL of the `-/healthy` endpoint, built by appending it to api_url.
def health_url
  "#{api_url}/-/healthy"
end
private private
def api_path(type) def api_path(type)
...@@ -88,11 +100,11 @@ module Gitlab ...@@ -88,11 +100,11 @@ module Gitlab
def json_api_get(type, args = {}) def json_api_get(type, args = {})
path = api_path(type) path = api_path(type)
response = get(path, args) response = get(path, args)
handle_response(response) handle_querying_api_response(response)
rescue Gitlab::HTTP::ResponseError => ex rescue Gitlab::HTTP::ResponseError => ex
raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code) raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
handle_response(ex.response) handle_querying_api_response(ex.response)
end end
def gitlab_http_key(key) def gitlab_http_key(key)
...@@ -119,7 +131,15 @@ module Gitlab ...@@ -119,7 +131,15 @@ module Gitlab
raise PrometheusClient::Error, 'Connection refused' raise PrometheusClient::Error, 'Connection refused'
end end
# Unwraps a response from a Prometheus management endpoint.
#
# @param response an HTTP response exposing `code` and `body`
# @return the response body when the status code is 200
# @raise [PrometheusClient::Error] with "<code> - <body>" for any other status
def handle_management_api_response(response)
  return response.body if response.code == 200

  raise PrometheusClient::Error, "#{response.code} - #{response.body}"
end
def handle_querying_api_response(response)
response_code = response.try(:code) response_code = response.try(:code)
response_body = response.try(:body) response_body = response.try(:body)
......
...@@ -16,6 +16,26 @@ describe Gitlab::PrometheusClient do ...@@ -16,6 +16,26 @@ describe Gitlab::PrometheusClient do
end end
end end
describe '#healthy?' do
  # Stubs the client's health endpoint with the given HTTP status and body.
  def stub_health_endpoint(status:, body:)
    stub_request(:get, subject.health_url).to_return(status: status, body: body)
  end

  it 'returns true when status code is 200 and healthy response body' do
    stub_health_endpoint(status: 200, body: described_class::HEALTHY_RESPONSE)

    expect(subject.healthy?).to eq(true)
  end

  it 'returns false when status code is 200 and unhealthy response body' do
    stub_health_endpoint(status: 200, body: '')

    expect(subject.healthy?).to eq(false)
  end

  it 'raises error when status code not 200' do
    stub_health_endpoint(status: 500, body: '')

    expect { subject.healthy? }.to raise_error(Gitlab::PrometheusClient::Error)
  end
end
# This shared examples expect: # This shared examples expect:
# - query_url: A query URL # - query_url: A query URL
# - execute_query: A query call # - execute_query: A query call
......
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Applications::PrometheusHealthCheckService, '#execute' do
  let(:service) { described_class.new(cluster) }

  subject { service.execute }

  # The shared examples below are declared with the scoped `shared_examples`
  # DSL so their generic names stay local to this describe block instead of
  # being registered globally for the whole suite.

  # Expects no alerting notify service to be instantiated.
  shared_examples 'no alert' do
    it 'does not send alert' do
      expect(Projects::Alerting::NotifyService).not_to receive(:new)

      subject
    end
  end

  # Expects the notify service to be executed with the alerts service token.
  shared_examples 'sends alert' do
    it 'sends an alert' do
      expect_next_instance_of(Projects::Alerting::NotifyService) do |notify_service|
        expect(notify_service).to receive(:execute).with(alerts_service.token)
      end

      subject
    end
  end

  # Expects the stored health to equal what the Prometheus client reported.
  shared_examples 'correct health stored' do
    it 'stores the correct health of prometheus app' do
      subject

      expect(prometheus.healthy).to eq(client_healthy)
    end
  end

  context 'when cluster is not project_type' do
    let(:cluster) { create(:cluster, :instance) }

    it { expect { subject }.to raise_error(RuntimeError, 'Invalid cluster type. Only project types are allowed.') }
  end

  context 'when cluster is project_type' do
    let_it_be(:alerts_service) { create(:alerts_service) }
    let_it_be(:project) { create(:project, alerts_service: alerts_service) }

    # Health value stored on the application before the check runs.
    let(:applications_prometheus_healthy) { true }
    let(:prometheus) { create(:clusters_applications_prometheus, status: prometheus_status_value, healthy: applications_prometheus_healthy) }
    let(:cluster) { create(:cluster, :project, application_prometheus: prometheus, projects: [project]) }

    context 'when prometheus not installed' do
      let(:prometheus_status_value) { Clusters::Applications::Prometheus.state_machine.states[:installing].value }

      it { expect(subject).to be_nil }

      include_examples 'no alert'
    end

    context 'when prometheus installed' do
      let(:prometheus_status_value) { Clusters::Applications::Prometheus.state_machine.states[:installed].value }

      before do
        # `client_healthy` is the health the stubbed Prometheus client reports.
        client = instance_double('PrometheusClient', healthy?: client_healthy)
        expect(prometheus).to receive(:prometheus_client).and_return(client)
      end

      context 'when newly unhealthy' do
        let(:applications_prometheus_healthy) { true }
        let(:client_healthy) { false }

        include_examples 'sends alert'
        include_examples 'correct health stored'
      end

      context 'when newly healthy' do
        let(:applications_prometheus_healthy) { false }
        let(:client_healthy) { true }

        include_examples 'no alert'
        include_examples 'correct health stored'
      end

      context 'when continuously unhealthy' do
        let(:applications_prometheus_healthy) { false }
        let(:client_healthy) { false }

        include_examples 'no alert'
        include_examples 'correct health stored'
      end

      context 'when continuously healthy' do
        let(:applications_prometheus_healthy) { true }
        let(:client_healthy) { true }

        include_examples 'no alert'
        include_examples 'correct health stored'
      end

      # A stored nil means no health check has been recorded yet.
      context 'when first health check and healthy' do
        let(:applications_prometheus_healthy) { nil }
        let(:client_healthy) { true }

        include_examples 'no alert'
        include_examples 'correct health stored'
      end

      context 'when first health check and not healthy' do
        let(:applications_prometheus_healthy) { nil }
        let(:client_healthy) { false }

        include_examples 'sends alert'
        include_examples 'correct health stored'
      end
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment