Commit 7e40d9ef authored by Douglas Barbosa Alexandre's avatar Douglas Barbosa Alexandre

Merge branch '235739-fix-prometheus-connection-error-usage-ping' into 'master'

Fix Prometheus connection error in usage ping

Closes #235739

See merge request gitlab-org/gitlab!40971
parents 4f6645fb 814adee3
......@@ -35,19 +35,12 @@ module Gitlab
[service_address, service_port]
end
def discover_prometheus_uri
def discover_prometheus_server_address
service_address, service_port = discover_service(service_name: 'prometheus')
return unless service_address && service_port
# There really is not a way to discover whether a Prometheus connection is using TLS or not
# Try TLS first because HTTPS will return fast if failed.
%w[https http].find do |scheme|
connection_url = "#{scheme}://#{service_address}:#{service_port}"
break connection_url if Gitlab::PrometheusClient.new(connection_url, allow_local_requests: true).healthy?
rescue
nil
end
"#{service_address}:#{service_port}"
end
private
......
......@@ -25,6 +25,10 @@ module Gitlab
end
end
def self.server_address
uri&.strip&.sub(/^http[s]?:\/\//, '')
end
def self.listen_address
Gitlab.config.prometheus.listen_address.to_s if Gitlab.config.prometheus
rescue Settingslogic::MissingSetting
......
......@@ -42,6 +42,15 @@ module Gitlab
response_body == HEALTHY_RESPONSE
end
def ready?
response = get(ready_url, {})
# From Prometheus docs: This endpoint returns 200 when Prometheus is ready to serve traffic (i.e. respond to queries).
response.code == 200
rescue => e
raise PrometheusClient::UnexpectedResponseError, "#{e.message}"
end
def proxy(type, args)
path = api_path(type)
get(path, args)
......@@ -103,7 +112,11 @@ module Gitlab
end
def health_url
[api_url, '-/healthy'].join('/')
"#{api_url}/-/healthy"
end
def ready_url
"#{api_url}/-/ready"
end
private
......
......@@ -40,7 +40,7 @@ module Gitlab
private
def topology_fetch_all_data
with_prometheus_client(fallback: {}) do |client|
with_prometheus_client(fallback: {}, verify: false) do |client|
{
application_requests_per_hour: topology_app_requests_per_hour(client),
query_apdex_weekly_average: topology_query_apdex_weekly_average(client),
......
......@@ -83,11 +83,11 @@ module Gitlab
end
end
def with_prometheus_client(fallback: nil)
api_url = prometheus_api_url
return fallback unless api_url
def with_prometheus_client(fallback: nil, verify: true)
client = prometheus_client(verify: verify)
return fallback unless client
yield Gitlab::PrometheusClient.new(api_url, allow_local_requests: true)
yield client
end
def measure_duration
......@@ -111,11 +111,27 @@ module Gitlab
private
def prometheus_api_url
def prometheus_client(verify:)
server_address = prometheus_server_address
return unless server_address
# There really is not a way to discover whether a Prometheus connection is using TLS or not
# Try TLS first because HTTPS will return fast if failed.
%w[https http].find do |scheme|
api_url = "#{scheme}://#{server_address}"
client = Gitlab::PrometheusClient.new(api_url, allow_local_requests: true, verify: verify)
break client if client.ready?
rescue
nil
end
end
def prometheus_server_address
if Gitlab::Prometheus::Internal.prometheus_enabled?
Gitlab::Prometheus::Internal.uri
Gitlab::Prometheus::Internal.server_address
elsif Gitlab::Consul::Internal.api_url
Gitlab::Consul::Internal.discover_prometheus_uri
Gitlab::Consul::Internal.discover_prometheus_server_address
end
end
......
......@@ -116,44 +116,16 @@ RSpec.describe Gitlab::Consul::Internal do
it_behaves_like 'handles failure response'
end
describe '.discover_prometheus_uri' do
subject { described_class.discover_prometheus_uri }
describe '.discover_prometheus_server_address' do
subject { described_class.discover_prometheus_server_address }
before do
stub_consul_discover_prometheus
.to_return(status: 200, body: '[{"ServiceAddress":"prom.net","ServicePort":9090}]')
stub_request(:get, /\/-\/healthy/)
.to_return(status: 200, body: Gitlab::PrometheusClient::HEALTHY_RESPONSE)
end
context 'both TLS and non-TLS connection are healthy' do
it 'returns https uri' do
is_expected.to eq('https://prom.net:9090')
end
end
context 'TLS connection is not healthy' do
before do
stub_request(:get, /https:\/\/.*\/-\/healthy/)
.to_return(status: 200, body: 'failed')
end
it 'returns http uri' do
is_expected.to eq('http://prom.net:9090')
end
end
context 'neither TLS nor non-TLS connection is healthy' do
before do
stub_request(:get, /https:\/\/.*\/-\/healthy/)
.to_return(status: 200, body: 'failed')
stub_request(:get, /http:\/\/.*\/-\/healthy/)
.to_return(status: 200, body: 'failed')
end
it 'returns nil' do
is_expected.to be_nil
end
it 'returns the server address' do
is_expected.to eq('prom.net:9090')
end
it_behaves_like 'returns nil given blank value of', :api_url
......
......@@ -48,7 +48,7 @@ RSpec.describe Gitlab::Prometheus::Internal do
let(:listen_address) { nil }
it 'does not fail' do
expect(described_class.uri).to eq(nil)
expect(described_class.uri).to be_nil
end
end
......@@ -56,7 +56,27 @@ RSpec.describe Gitlab::Prometheus::Internal do
let(:listen_address) { '' }
it 'does not configure prometheus' do
expect(described_class.uri).to eq(nil)
expect(described_class.uri).to be_nil
end
end
end
describe '.server_address' do
context 'self.uri returns valid uri' do
['http://localhost:9090', 'https://localhost:9090 '].each do |valid_uri|
it 'returns correct server address' do
expect(described_class).to receive(:uri).and_return(valid_uri)
expect(described_class.server_address).to eq('localhost:9090')
end
end
end
context 'self.uri returns nil' do
it 'returns nil' do
expect(described_class).to receive(:uri).and_return(nil)
expect(described_class.server_address).to be_nil
end
end
end
......@@ -101,7 +121,7 @@ RSpec.describe Gitlab::Prometheus::Internal do
end
it 'does not fail' do
expect(described_class.listen_address).to eq(nil)
expect(described_class.listen_address).to be_nil
end
end
end
......
......@@ -36,6 +36,28 @@ RSpec.describe Gitlab::PrometheusClient do
end
end
describe '#ready?' do
it 'returns true when status code is 200' do
stub_request(:get, subject.ready_url).to_return(status: 200, body: 'Prometheus is Ready.\n')
expect(subject.ready?).to eq(true)
end
it 'returns false when status code is not 200' do
[503, 500].each do |code|
stub_request(:get, subject.ready_url).to_return(status: code, body: 'Service Unavailable')
expect(subject.ready?).to eq(false)
end
end
it 'raises error when ready api throws exception' do
stub_request(:get, subject.ready_url).to_raise(Net::OpenTimeout)
expect { subject.ready? }.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError)
end
end
# This shared examples expect:
# - query_url: A query URL
# - execute_query: A query call
......
......@@ -17,6 +17,7 @@ RSpec.describe Gitlab::UsageData::Topology do
context 'tracking node metrics' do
it 'contains node level metrics for each instance' do
expect_prometheus_api_to(
receive_ready_check_query,
receive_app_request_volume_query,
receive_query_apdex_ratio_query,
receive_node_memory_query,
......@@ -103,6 +104,7 @@ RSpec.describe Gitlab::UsageData::Topology do
context 'and some node memory metrics are missing' do
it 'removes the respective entries and includes the failures' do
expect_prometheus_api_to(
receive_ready_check_query,
receive_app_request_volume_query(result: []),
receive_query_apdex_ratio_query(result: []),
receive_node_memory_query(result: []),
......@@ -243,6 +245,7 @@ RSpec.describe Gitlab::UsageData::Topology do
it 'normalizes equivalent instance values and maps them to the same node' do
expect_prometheus_api_to(
receive_ready_check_query,
receive_app_request_volume_query(result: []),
receive_query_apdex_ratio_query(result: []),
receive_node_memory_query(result: node_memory_response),
......@@ -309,6 +312,7 @@ RSpec.describe Gitlab::UsageData::Topology do
context 'and node metrics are missing but service metrics exist' do
it 'still reports service metrics' do
expect_prometheus_api_to(
receive_ready_check_query,
receive_app_request_volume_query(result: []),
receive_query_apdex_ratio_query(result: []),
receive_node_memory_query(result: []),
......@@ -384,6 +388,7 @@ RSpec.describe Gitlab::UsageData::Topology do
it 'filters out unknown service data and reports the unknown services as a failure' do
expect_prometheus_api_to(
receive_ready_check_query,
receive_app_request_volume_query(result: []),
receive_query_apdex_ratio_query(result: []),
receive_node_memory_query(result: []),
......@@ -408,25 +413,26 @@ RSpec.describe Gitlab::UsageData::Topology do
context 'and an error is raised when querying Prometheus' do
context 'without timeout failures' do
it 'returns empty result and executes subsequent queries as usual' do
expect_prometheus_api_to receive(:query)
.at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
expect_prometheus_api_to(
receive_ready_check_query,
receive(:query).at_least(:once).and_raise(Gitlab::PrometheusClient::UnexpectedResponseError)
)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'query_apdex' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
{ 'app_requests' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'query_apdex' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
{ 'service_workers' => 'Gitlab::PrometheusClient::UnexpectedResponseError' }
],
nodes: []
})
......@@ -440,8 +446,10 @@ RSpec.describe Gitlab::UsageData::Topology do
with_them do
it 'returns empty result and cancelled subsequent queries' do
expect_prometheus_api_to receive(:query)
.and_raise(exception)
expect_prometheus_api_to(
receive_ready_check_query,
receive(:query).and_raise(exception)
)
expect(subject[:topology]).to eq({
duration_s: 0,
......@@ -467,35 +475,69 @@ RSpec.describe Gitlab::UsageData::Topology do
end
end
context 'when Prometheus is available from Prometheus settings' do
shared_examples 'returns empty result with no failures' do
it do
expect(subject[:topology]).to eq({
duration_s: 0,
failures: []
})
end
end
shared_examples 'try to query Prometheus with given address' do
context 'Prometheus is ready' do
it_behaves_like 'query topology data from Prometheus'
end
context 'Prometheus is not ready' do
before do
# readiness check over HTTPS connection returns false
expect_prometheus_api_to(receive_ready_check_query(result: false))
# readiness check over HTTP connection also returns false
expect_prometheus_api_to(receive_ready_check_query(result: false))
end
it_behaves_like 'returns empty result with no failures'
end
context 'Prometheus is not reachable' do
before do
# HTTPS connection is not reachable
expect_prometheus_api_to(receive_ready_check_query(raise_error: Errno::ECONNREFUSED))
# HTTP connection is also not reachable
expect_prometheus_api_to(receive_ready_check_query(raise_error: Errno::ECONNREFUSED))
end
it_behaves_like 'returns empty result with no failures'
end
end
context 'when Prometheus server address is available from Prometheus settings' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
end
include_examples 'query topology data from Prometheus'
include_examples 'try to query Prometheus with given address'
end
context 'when Prometheus is available from Consul service discovery' do
context 'when Prometheus server address is available from Consul service discovery' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(Gitlab::Consul::Internal).to receive(:api_url).and_return('http://127.0.0.1:8500')
expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_uri).and_return('http://prom.net:9090')
expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_server_address).and_return('prom.net:9090')
end
include_examples 'query topology data from Prometheus'
include_examples 'try to query Prometheus with given address'
end
context 'when Prometheus is not available' do
it 'returns empty result with no failures' do
context 'when Prometheus server address is not available' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(Gitlab::Consul::Internal).to receive(:api_url).and_return(nil)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: []
})
end
include_examples 'returns empty result with no failures'
end
context 'when top-level function raises error' do
......@@ -512,6 +554,14 @@ RSpec.describe Gitlab::UsageData::Topology do
end
end
def receive_ready_check_query(result: nil, raise_error: nil)
if raise_error.nil?
receive(:ready?).and_return(result.nil? ? true : result)
else
receive(:ready?).and_raise(raise_error)
end
end
def receive_app_request_volume_query(result: nil)
receive(:query)
.with(/gitlab_usage_ping:ops:rate/)
......
......@@ -106,42 +106,86 @@ RSpec.describe Gitlab::Utils::UsageData do
end
end
context 'when Prometheus is available from settings' do
shared_examples 'does not query data from Prometheus' do
it 'returns nil by default' do
result = described_class.with_prometheus_client { |client| client }
expect(result).to be_nil
end
it 'returns fallback if provided' do
result = described_class.with_prometheus_client(fallback: []) { |client| client }
expect(result).to eq([])
end
end
shared_examples 'try to query Prometheus with given address' do
context 'Prometheus is ready' do
before do
stub_request(:get, /\/-\/ready/)
.to_return(status: 200, body: 'Prometheus is Ready.\n')
end
context 'Prometheus is reachable through HTTPS' do
it_behaves_like 'query data from Prometheus'
end
context 'Prometheus is not reachable through HTTPS' do
before do
stub_request(:get, /https:\/\/.*/).to_raise(Errno::ECONNREFUSED)
end
context 'Prometheus is reachable through HTTP' do
it_behaves_like 'query data from Prometheus'
end
context 'Prometheus is not reachable through HTTP' do
before do
stub_request(:get, /http:\/\/.*/).to_raise(Errno::ECONNREFUSED)
end
it_behaves_like 'does not query data from Prometheus'
end
end
end
context 'Prometheus is not ready' do
before do
stub_request(:get, /\/-\/ready/)
.to_return(status: 503, body: 'Service Unavailable')
end
it_behaves_like 'does not query data from Prometheus'
end
end
context 'when Prometheus server address is available from settings' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
expect(Gitlab::Prometheus::Internal).to receive(:server_address).and_return('prom:9090')
end
it_behaves_like 'query data from Prometheus'
it_behaves_like 'try to query Prometheus with given address'
end
context 'when Prometheus is available from Consul service discovery' do
context 'when Prometheus server address is available from Consul service discovery' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(Gitlab::Consul::Internal).to receive(:api_url).and_return('http://localhost:8500')
expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_uri).and_return('http://prom:9090')
expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_server_address).and_return('prom:9090')
end
it_behaves_like 'query data from Prometheus'
it_behaves_like 'try to query Prometheus with given address'
end
context 'when Prometheus is not available' do
context 'when Prometheus server address is not available' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(Gitlab::Consul::Internal).to receive(:api_url).and_return(nil)
end
it 'returns nil by default' do
result = described_class.with_prometheus_client { |client| client }
expect(result).to be nil
end
it 'returns fallback if provided' do
result = described_class.with_prometheus_client(fallback: []) { |client| client }
expect(result).to eq([])
end
it_behaves_like 'does not query data from Prometheus'
end
end
......
......@@ -6,7 +6,14 @@ RSpec.describe 'gitlab:usage data take tasks' do
before do
Rake.application.rake_require 'tasks/gitlab/usage_data'
# stub prometheus external http calls https://gitlab.com/gitlab-org/gitlab/-/issues/245277
stub_request(:get, %r{^http://::1:9090/api/v1/query\?query=.*})
stub_request(:get, %r{^http[s]?://::1:9090/-/ready})
.to_return(
status: 200,
body: [{}].to_json,
headers: { 'Content-Type' => 'application/json' }
)
stub_request(:get, %r{^http[s]?://::1:9090/api/v1/query\?query=.*})
.to_return(
status: 200,
body: [{}].to_json,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment