Merge branch '235739-fix-prometheus-connection-error-usage-ping' into 'master'

Fix Prometheus connection error in usage ping Closes #235739 See merge request gitlab-org/gitlab!40971

Merge branch '235739-fix-prometheus-connection-error-usage-ping' into 'master'
Fix Prometheus connection error in usage ping Closes #235739 See merge request gitlab-org/gitlab!40971
7e40d9ef · Douglas Barbosa Alexandre · 4f6645fb · 814adee3 · 7e40d9ef · 7e40d9ef
Commit 7e40d9ef authored Sep 16, 2020 by Douglas Barbosa Alexandre
11 changed files
--- a/lib/gitlab/consul/internal.rb
+++ b/lib/gitlab/consul/internal.rb
@@ -35,19 +35,12 @@ module Gitlab
          [service_address, service_port]
        end

-        def discover_prometheus_uri
+        def discover_prometheus_server_address
          service_address, service_port = discover_service(service_name: 'prometheus')

          return unless service_address && service_port

-          # There really is not a way to discover whether a Prometheus connection is using TLS or not
-          # Try TLS first because HTTPS will return fast if failed.
-          %w[https http].find do |scheme|
-            connection_url = "#{scheme}://#{service_address}:#{service_port}"
-            break connection_url if Gitlab::PrometheusClient.new(connection_url, allow_local_requests: true).healthy?
-          rescue
-            nil
-          end
+          "#{service_address}:#{service_port}"
        end

        private

--- a/lib/gitlab/prometheus/internal.rb
+++ b/lib/gitlab/prometheus/internal.rb
@@ -25,6 +25,10 @@ module Gitlab
        end
      end

+      def self.server_address
+        uri&.strip&.sub(/^http[s]?:\/\//, '')
+      end
+
      def self.listen_address
        Gitlab.config.prometheus.listen_address.to_s if Gitlab.config.prometheus
      rescue Settingslogic::MissingSetting

--- a/lib/gitlab/prometheus_client.rb
+++ b/lib/gitlab/prometheus_client.rb
@@ -42,6 +42,15 @@ module Gitlab
      response_body == HEALTHY_RESPONSE
    end

+    def ready?
+      response = get(ready_url, {})
+
+      # From Prometheus docs: This endpoint returns 200 when Prometheus is ready to serve traffic (i.e. respond to queries).
+      response.code == 200
+    rescue => e
+      raise PrometheusClient::UnexpectedResponseError, "#{e.message}"
+    end
+
    def proxy(type, args)
      path = api_path(type)
      get(path, args)
@@ -103,7 +112,11 @@ module Gitlab
    end

    def health_url
-      [api_url, '-/healthy'].join('/')
+      "#{api_url}/-/healthy"
+    end
+
+    def ready_url
+      "#{api_url}/-/ready"
    end

    private

--- a/lib/gitlab/usage_data/topology.rb
+++ b/lib/gitlab/usage_data/topology.rb
@@ -40,7 +40,7 @@ module Gitlab
      private

      def topology_fetch_all_data
-        with_prometheus_client(fallback: {}) do |client|
+        with_prometheus_client(fallback: {}, verify: false) do |client|
          {
            application_requests_per_hour: topology_app_requests_per_hour(client),
            query_apdex_weekly_average: topology_query_apdex_weekly_average(client),

--- a/lib/gitlab/utils/usage_data.rb
+++ b/lib/gitlab/utils/usage_data.rb
@@ -83,11 +83,11 @@ module Gitlab
        end
      end

-      def with_prometheus_client(fallback: nil)
-        api_url = prometheus_api_url
-        return fallback unless api_url
+      def with_prometheus_client(fallback: nil, verify: true)
+        client = prometheus_client(verify: verify)
+        return fallback unless client

-        yield Gitlab::PrometheusClient.new(api_url, allow_local_requests: true)
+        yield client
      end

      def measure_duration
@@ -111,11 +111,27 @@ module Gitlab

      private

-      def prometheus_api_url
+      def prometheus_client(verify:)
+        server_address = prometheus_server_address
+
+        return unless server_address
+
+        # There really is not a way to discover whether a Prometheus connection is using TLS or not
+        # Try TLS first because HTTPS will return fast if failed.
+        %w[https http].find do |scheme|
+          api_url = "#{scheme}://#{server_address}"
+          client = Gitlab::PrometheusClient.new(api_url, allow_local_requests: true, verify: verify)
+          break client if client.ready?
+        rescue
+          nil
+        end
+      end
+
+      def prometheus_server_address
        if Gitlab::Prometheus::Internal.prometheus_enabled?
-          Gitlab::Prometheus::Internal.uri
+          Gitlab::Prometheus::Internal.server_address
        elsif Gitlab::Consul::Internal.api_url
-          Gitlab::Consul::Internal.discover_prometheus_uri
+          Gitlab::Consul::Internal.discover_prometheus_server_address
        end
      end


--- a/spec/lib/gitlab/consul/internal_spec.rb
+++ b/spec/lib/gitlab/consul/internal_spec.rb
@@ -116,44 +116,16 @@ RSpec.describe Gitlab::Consul::Internal do
    it_behaves_like 'handles failure response'
  end

-  describe '.discover_prometheus_uri' do
-    subject { described_class.discover_prometheus_uri }
+  describe '.discover_prometheus_server_address' do
+    subject { described_class.discover_prometheus_server_address }

    before do
      stub_consul_discover_prometheus
        .to_return(status: 200, body: '[{"ServiceAddress":"prom.net","ServicePort":9090}]')
-      stub_request(:get, /\/-\/healthy/)
-        .to_return(status: 200, body: Gitlab::PrometheusClient::HEALTHY_RESPONSE)
    end

-    context 'both TLS and non-TLS connection are healthy' do
-      it 'returns https uri' do
-        is_expected.to eq('https://prom.net:9090')
-      end
-    end
-
-    context 'TLS connection is not healthy' do
-      before do
-        stub_request(:get, /https:\/\/.*\/-\/healthy/)
-            .to_return(status: 200, body: 'failed')
-      end
-
-      it 'returns http uri' do
-        is_expected.to eq('http://prom.net:9090')
-      end
-    end
-
-    context 'neither TLS nor non-TLS connection is healthy' do
-      before do
-        stub_request(:get, /https:\/\/.*\/-\/healthy/)
-            .to_return(status: 200, body: 'failed')
-        stub_request(:get, /http:\/\/.*\/-\/healthy/)
-          .to_return(status: 200, body: 'failed')
-      end
-
-      it 'returns nil' do
-        is_expected.to be_nil
-      end
+    it 'returns the server address' do
+      is_expected.to eq('prom.net:9090')
    end

    it_behaves_like 'returns nil given blank value of', :api_url

--- a/spec/lib/gitlab/prometheus/internal_spec.rb
+++ b/spec/lib/gitlab/prometheus/internal_spec.rb
@@ -48,7 +48,7 @@ RSpec.describe Gitlab::Prometheus::Internal do
      let(:listen_address) { nil }

      it 'does not fail' do
-        expect(described_class.uri).to eq(nil)
+        expect(described_class.uri).to be_nil
      end
    end

@@ -56,7 +56,27 @@ RSpec.describe Gitlab::Prometheus::Internal do
      let(:listen_address) { '' }

      it 'does not configure prometheus' do
-        expect(described_class.uri).to eq(nil)
+        expect(described_class.uri).to be_nil
+      end
+    end
+  end
+
+  describe '.server_address' do
+    context 'self.uri returns valid uri' do
+      ['http://localhost:9090', 'https://localhost:9090 '].each do |valid_uri|
+        it 'returns correct server address' do
+          expect(described_class).to receive(:uri).and_return(valid_uri)
+
+          expect(described_class.server_address).to eq('localhost:9090')
+        end
+      end
+    end
+
+    context 'self.uri returns nil' do
+      it 'returns nil' do
+        expect(described_class).to receive(:uri).and_return(nil)
+
+        expect(described_class.server_address).to be_nil
      end
    end
  end
@@ -101,7 +121,7 @@ RSpec.describe Gitlab::Prometheus::Internal do
      end

      it 'does not fail' do
-        expect(described_class.listen_address).to eq(nil)
+        expect(described_class.listen_address).to be_nil
      end
    end
  end

--- a/spec/lib/gitlab/prometheus_client_spec.rb
+++ b/spec/lib/gitlab/prometheus_client_spec.rb
@@ -36,6 +36,28 @@ RSpec.describe Gitlab::PrometheusClient do
    end
  end

+  describe '#ready?' do
+    it 'returns true when status code is 200' do
+      stub_request(:get, subject.ready_url).to_return(status: 200, body: 'Prometheus is Ready.\n')
+
+      expect(subject.ready?).to eq(true)
+    end
+
+    it 'returns false when status code is not 200' do
+      [503, 500].each do |code|
+        stub_request(:get, subject.ready_url).to_return(status: code, body: 'Service Unavailable')
+
+        expect(subject.ready?).to eq(false)
+      end
+    end
+
+    it 'raises error when ready api throws exception' do
+      stub_request(:get, subject.ready_url).to_raise(Net::OpenTimeout)
+
+      expect { subject.ready? }.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError)
+    end
+  end
+
  # This shared examples expect:
  # - query_url: A query URL
  # - execute_query: A query call

--- a/spec/lib/gitlab/usage_data/topology_spec.rb
+++ b/spec/lib/gitlab/usage_data/topology_spec.rb
@@ -17,6 +17,7 @@ RSpec.describe Gitlab::UsageData::Topology do
      context 'tracking node metrics' do
        it 'contains node level metrics for each instance' do
          expect_prometheus_api_to(
+            receive_ready_check_query,
            receive_app_request_volume_query,
            receive_query_apdex_ratio_query,
            receive_node_memory_query,
@@ -103,6 +104,7 @@ RSpec.describe Gitlab::UsageData::Topology do
      context 'and some node memory metrics are missing' do
        it 'removes the respective entries and includes the failures' do
          expect_prometheus_api_to(
+            receive_ready_check_query,
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
@@ -243,6 +245,7 @@ RSpec.describe Gitlab::UsageData::Topology do

        it 'normalizes equivalent instance values and maps them to the same node' do
          expect_prometheus_api_to(
+            receive_ready_check_query,
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: node_memory_response),
@@ -309,6 +312,7 @@ RSpec.describe Gitlab::UsageData::Topology do
      context 'and node metrics are missing but service metrics exist' do
        it 'still reports service metrics' do
          expect_prometheus_api_to(
+            receive_ready_check_query,
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
@@ -384,6 +388,7 @@ RSpec.describe Gitlab::UsageData::Topology do

        it 'filters out unknown service data and reports the unknown services as a failure' do
          expect_prometheus_api_to(
+            receive_ready_check_query,
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
@@ -408,25 +413,26 @@ RSpec.describe Gitlab::UsageData::Topology do
      context 'and an error is raised when querying Prometheus' do
        context 'without timeout failures' do
          it 'returns empty result and executes subsequent queries as usual' do
-            expect_prometheus_api_to receive(:query)
-              .at_least(:once)
-              .and_raise(Gitlab::PrometheusClient::ConnectionError)
+            expect_prometheus_api_to(
+              receive_ready_check_query,
+              receive(:query).at_least(:once).and_raise(Gitlab::PrometheusClient::UnexpectedResponseError)
+            )

            expect(subject[:topology]).to eq({
              duration_s: 0,
              failures: [
-                { 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'query_apdex' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
-                { 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
+                { 'app_requests' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'query_apdex' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'node_memory' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'node_memory_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'node_cpus' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'node_uname_info' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'service_rss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'service_uss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'service_pss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'service_process_count' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
+                { 'service_workers' => 'Gitlab::PrometheusClient::UnexpectedResponseError' }
              ],
              nodes: []
            })
@@ -440,8 +446,10 @@ RSpec.describe Gitlab::UsageData::Topology do

          with_them do
            it 'returns empty result and cancelled subsequent queries' do
-              expect_prometheus_api_to receive(:query)
-                .and_raise(exception)
+              expect_prometheus_api_to(
+                receive_ready_check_query,
+                receive(:query).and_raise(exception)
+              )

              expect(subject[:topology]).to eq({
                duration_s: 0,
@@ -467,35 +475,69 @@ RSpec.describe Gitlab::UsageData::Topology do
      end
    end

-    context 'when Prometheus is available from Prometheus settings' do
+    shared_examples 'returns empty result with no failures' do
+      it do
+        expect(subject[:topology]).to eq({
+          duration_s: 0,
+          failures: []
+        })
+      end
+    end
+
+    shared_examples 'try to query Prometheus with given address' do
+      context 'Prometheus is ready' do
+        it_behaves_like 'query topology data from Prometheus'
+      end
+
+      context 'Prometheus is not ready' do
+        before do
+          # readiness check over HTTPS connection returns false
+          expect_prometheus_api_to(receive_ready_check_query(result: false))
+          # readiness check over HTTP connection also returns false
+          expect_prometheus_api_to(receive_ready_check_query(result: false))
+        end
+
+        it_behaves_like 'returns empty result with no failures'
+      end
+
+      context 'Prometheus is not reachable' do
+        before do
+          # HTTPS connection is not reachable
+          expect_prometheus_api_to(receive_ready_check_query(raise_error: Errno::ECONNREFUSED))
+          # HTTP connection is also not reachable
+          expect_prometheus_api_to(receive_ready_check_query(raise_error: Errno::ECONNREFUSED))
+        end
+
+        it_behaves_like 'returns empty result with no failures'
+      end
+    end
+
+    context 'when Prometheus server address is available from Prometheus settings' do
      before do
        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
        expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
      end

-      include_examples 'query topology data from Prometheus'
+      include_examples 'try to query Prometheus with given address'
    end

-    context 'when Prometheus is available from Consul service discovery' do
+    context 'when Prometheus server address is available from Consul service discovery' do
      before do
        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return('http://127.0.0.1:8500')
-        expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_uri).and_return('http://prom.net:9090')
+        expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_server_address).and_return('prom.net:9090')
      end

-      include_examples 'query topology data from Prometheus'
+      include_examples 'try to query Prometheus with given address'
    end

-    context 'when Prometheus is not available' do
-      it 'returns empty result with no failures' do
+    context 'when Prometheus server address is not available' do
+      before do
        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return(nil)
-
-        expect(subject[:topology]).to eq({
-          duration_s: 0,
-          failures: []
-        })
      end
+
+      include_examples 'returns empty result with no failures'
    end

    context 'when top-level function raises error' do
@@ -512,6 +554,14 @@ RSpec.describe Gitlab::UsageData::Topology do
    end
  end

+  def receive_ready_check_query(result: nil, raise_error: nil)
+    if raise_error.nil?
+      receive(:ready?).and_return(result.nil? ? true : result)
+    else
+      receive(:ready?).and_raise(raise_error)
+    end
+  end
+
  def receive_app_request_volume_query(result: nil)
    receive(:query)
      .with(/gitlab_usage_ping:ops:rate/)

--- a/spec/lib/gitlab/utils/usage_data_spec.rb
+++ b/spec/lib/gitlab/utils/usage_data_spec.rb
@@ -106,42 +106,86 @@ RSpec.describe Gitlab::Utils::UsageData do
      end
    end

-    context 'when Prometheus is available from settings' do
+    shared_examples 'does not query data from Prometheus' do
+      it 'returns nil by default' do
+        result = described_class.with_prometheus_client { |client| client }
+
+        expect(result).to be_nil
+      end
+
+      it 'returns fallback if provided' do
+        result = described_class.with_prometheus_client(fallback: []) { |client| client }
+
+        expect(result).to eq([])
+      end
+    end
+
+    shared_examples 'try to query Prometheus with given address' do
+      context 'Prometheus is ready' do
        before do
-        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
-        expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
+          stub_request(:get, /\/-\/ready/)
+              .to_return(status: 200, body: 'Prometheus is Ready.\n')
        end

+        context 'Prometheus is reachable through HTTPS' do
          it_behaves_like 'query data from Prometheus'
        end

-    context 'when Prometheus is available from Consul service discovery' do
+        context 'Prometheus is not reachable through HTTPS' do
          before do
-        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
-        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return('http://localhost:8500')
-        expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_uri).and_return('http://prom:9090')
+            stub_request(:get, /https:\/\/.*/).to_raise(Errno::ECONNREFUSED)
          end

+          context 'Prometheus is reachable through HTTP' do
            it_behaves_like 'query data from Prometheus'
          end

-    context 'when Prometheus is not available' do
+          context 'Prometheus is not reachable through HTTP' do
            before do
-        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
-        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return(nil)
+              stub_request(:get, /http:\/\/.*/).to_raise(Errno::ECONNREFUSED)
            end

-      it 'returns nil by default' do
-        result = described_class.with_prometheus_client { |client| client }
+            it_behaves_like 'does not query data from Prometheus'
+          end
+        end
+      end

-        expect(result).to be nil
+      context 'Prometheus is not ready' do
+        before do
+          stub_request(:get, /\/-\/ready/)
+              .to_return(status: 503, body: 'Service Unavailable')
        end

-      it 'returns fallback if provided' do
-        result = described_class.with_prometheus_client(fallback: []) { |client| client }
+        it_behaves_like 'does not query data from Prometheus'
+      end
+    end

-        expect(result).to eq([])
+    context 'when Prometheus server address is available from settings' do
+      before do
+        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
+        expect(Gitlab::Prometheus::Internal).to receive(:server_address).and_return('prom:9090')
+      end
+
+      it_behaves_like 'try to query Prometheus with given address'
+    end
+
+    context 'when Prometheus server address is available from Consul service discovery' do
+      before do
+        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
+        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return('http://localhost:8500')
+        expect(Gitlab::Consul::Internal).to receive(:discover_prometheus_server_address).and_return('prom:9090')
      end
+
+      it_behaves_like 'try to query Prometheus with given address'
+    end
+
+    context 'when Prometheus server address is not available' do
+      before do
+        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
+        expect(Gitlab::Consul::Internal).to receive(:api_url).and_return(nil)
+      end
+
+      it_behaves_like 'does not query data from Prometheus'
    end
  end


--- a/spec/tasks/gitlab/usage_data_rake_spec.rb
+++ b/spec/tasks/gitlab/usage_data_rake_spec.rb
@@ -6,7 +6,14 @@ RSpec.describe 'gitlab:usage data take tasks' do
  before do
    Rake.application.rake_require 'tasks/gitlab/usage_data'
    # stub prometheus external http calls https://gitlab.com/gitlab-org/gitlab/-/issues/245277
-    stub_request(:get, %r{^http://::1:9090/api/v1/query\?query=.*})
+    stub_request(:get, %r{^http[s]?://::1:9090/-/ready})
+      .to_return(
+        status: 200,
+        body: [{}].to_json,
+        headers: { 'Content-Type' => 'application/json' }
+      )
+
+    stub_request(:get, %r{^http[s]?://::1:9090/api/v1/query\?query=.*})
      .to_return(
        status: 200,
        body: [{}].to_json,