Commit bae6efb8 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Record counters for request apdex

With this, we'll emit 2 new counters from web processes that can be
used to monitor apdex.

The `gitlab_sli:rails_request_apdex:total` counter is incremented for
every successful (not a 500) that is not to a health endpoint.

The `gitlab_sli:rails_request_apdex:success_total` is incremented when
the request took less than 1 second. We intend to customize this value
per endpoint in the future.

Both these counters are labelled with `feature_category` and
`endpoint_id` from the context.

The metrics would also be initialized on the first scrape. This means
that a 0 would be available for every set of labels, avoiding bugs in
calculations with these metrics.

To get to all of the `feature_category`s and `endpoint_id`s for the
initialization, we had to move some code that iterates all endpoints
that was only used in tests to the application.

We know this would initialize about 2 * 2500 metrics per pod
running a web server. So we'd like to roll this out in a controlled
fashion, to make sure this doesn't impact our monitoring. Which is why
this is feature flagged.

This also limits the initialization of these metrics to just
web-processes. So they don't get generated for consoles or runner
processes.

This also includes a developer-api to define SLIs and encourages
initializing them with the known label sets.
parent 4c04855e
......@@ -69,6 +69,10 @@ class ApplicationController < ActionController::Base
# concerns due to caching private data.
DEFAULT_GITLAB_CACHE_CONTROL = "#{ActionDispatch::Http::Cache::Response::DEFAULT_CACHE_CONTROL}, no-store"
def self.endpoint_id_for_action(action_name)
"#{self.name}##{action_name}"
end
rescue_from Encoding::CompatibilityError do |exception|
log_exception(exception)
render "errors/encoding", layout: "errors", status: :internal_server_error
......@@ -456,7 +460,7 @@ class ApplicationController < ActionController::Base
user: -> { context_user },
project: -> { @project if @project&.persisted? },
namespace: -> { @group if @group&.persisted? },
caller_id: caller_id,
caller_id: self.class.endpoint_id_for_action(action_name),
remote_ip: request.ip,
feature_category: feature_category
)
......@@ -542,10 +546,6 @@ class ApplicationController < ActionController::Base
auth_user if strong_memoized?(:auth_user)
end
def caller_id
"#{self.class.name}##{action_name}"
end
def feature_category
self.class.feature_category_for_action(action_name).to_s
end
......
......@@ -7,6 +7,7 @@ class MetricsController < ActionController::Base
def index
response = if Gitlab::Metrics.prometheus_metrics_enabled?
Gitlab::Metrics::RailsSlis.initialize_request_slis_if_needed!
metrics_service.metrics_text
else
help_page = help_page_url('administration/monitoring/prometheus/gitlab_metrics',
......
---
name: request_apdex_counters
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/69154
rollout_issue_url: https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1099
milestone: '14.3'
type: development
group: team::Scalability
default_enabled: false
......@@ -48,7 +48,7 @@ if !Rails.env.test? && Gitlab::Metrics.prometheus_metrics_enabled?
Gitlab::Metrics.gauge(:deployments, 'GitLab Version', {}, :max).set({ version: Gitlab::VERSION, revision: Gitlab.revision }, 1)
unless Gitlab::Runtime.sidekiq?
if Gitlab::Runtime.web_server?
Gitlab::Metrics::RequestsRackMiddleware.initialize_metrics
end
......
......@@ -13,6 +13,10 @@ module API
normalize_path(app.namespace, app.options[:path].first)
end
def endpoint_id_for_route(route)
"#{route.request_method} #{route.origin}"
end
def route(methods, paths = ['/'], route_options = {}, &block)
if category = route_options.delete(:feature_category)
feature_category(category, Array(paths).map { |path| normalize_path(namespace, path) })
......
......@@ -34,7 +34,7 @@ module API
end
def endpoint_id
"#{request.request_method} #{route.origin}"
::API::Base.endpoint_id_for_route(route)
end
end
end
......
......@@ -15,6 +15,14 @@ module Gitlab
end
end
RailsMetricsInitializer = Struct.new(:app) do
def call(env)
Gitlab::Metrics::RailsSlis.initialize_request_slis_if_needed!
app.call(env)
end
end
attr_reader :running
# This exporter is always run on master process
......@@ -45,6 +53,15 @@ module Gitlab
private
def rack_app
app = super
Rack::Builder.app do
use RailsMetricsInitializer
run app
end
end
def start_working
@running = true
super
......
# frozen_string_literal: true
module Gitlab
module Metrics
module RailsSlis
class << self
def request_apdex_counters_enabled?
Feature.enabled?(:request_apdex_counters)
end
def initialize_request_slis_if_needed!
return unless request_apdex_counters_enabled?
return if Gitlab::Metrics::Sli.initialized?(:rails_request_apdex)
Gitlab::Metrics::Sli.initialize_sli(:rails_request_apdex, possible_request_labels)
end
def request_apdex
Gitlab::Metrics::Sli[:rails_request_apdex]
end
private
def possible_request_labels
possible_controller_labels + possible_api_labels
end
def possible_api_labels
Gitlab::RequestEndpoints.all_api_endpoints.map do |route|
endpoint_id = API::Base.endpoint_id_for_route(route)
route_class = route.app.options[:for]
feature_category = route_class.feature_category_for_app(route.app)
{
endpoint_id: endpoint_id,
feature_category: feature_category
}
end
end
def possible_controller_labels
Gitlab::RequestEndpoints.all_controller_actions.map do |controller, action|
{
endpoint_id: controller.endpoint_id_for_action(action),
feature_category: controller.feature_category_for_action(action)
}
end
end
end
end
end
end
......@@ -16,6 +16,7 @@ module Gitlab
HEALTH_ENDPOINT = %r{^/-/(liveness|readiness|health|metrics)/?$}.freeze
FEATURE_CATEGORY_DEFAULT = 'unknown'
ENDPOINT_MISSING = 'unknown'
# These were the top 5 categories at a point in time, chosen as a
# reasonable default. If we initialize every category we'll end up
......@@ -77,6 +78,8 @@ module Gitlab
if !health_endpoint && ::Gitlab::Metrics.record_duration_for_status?(status)
self.class.http_request_duration_seconds.observe({ method: method }, elapsed)
record_apdex_if_needed(elapsed)
end
[status, headers, body]
......@@ -105,6 +108,28 @@ module Gitlab
def feature_category
::Gitlab::ApplicationContext.current_context_attribute(:feature_category)
end
def endpoint_id
::Gitlab::ApplicationContext.current_context_attribute(:caller_id)
end
def record_apdex_if_needed(elapsed)
return unless Gitlab::Metrics::RailsSlis.request_apdex_counters_enabled?
Gitlab::Metrics::Sli[:rails_request_apdex].increment(
labels: labels_from_context,
# hardcoded 1s here will be replaced by a per-endpoint value.
# https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1223
success: elapsed < 1
)
end
def labels_from_context
{
feature_category: feature_category.presence || FEATURE_CATEGORY_DEFAULT,
endpoint_id: endpoint_id.presence || ENDPOINT_MISSING
}
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Metrics
class Sli
SliNotInitializedError = Class.new(StandardError)
COUNTER_PREFIX = 'gitlab_sli'
class << self
INITIALIZATION_MUTEX = Mutex.new
def [](name)
known_slis[name] || initialize_sli(name, [])
end
def initialize_sli(name, possible_label_combinations)
INITIALIZATION_MUTEX.synchronize do
sli = new(name)
sli.initialize_counters(possible_label_combinations)
known_slis[name] = sli
end
end
def initialized?(name)
known_slis.key?(name) && known_slis[name].initialized?
end
private
def known_slis
@known_slis ||= {}
end
end
attr_reader :name
def initialize(name)
@name = name
@initialized_with_combinations = false
end
def initialize_counters(possible_label_combinations)
@initialized_with_combinations = possible_label_combinations.any?
possible_label_combinations.each do |label_combination|
total_counter.get(label_combination)
success_counter.get(label_combination)
end
end
def increment(labels:, success:)
total_counter.increment(labels)
success_counter.increment(labels) if success
end
def initialized?
@initialized_with_combinations
end
private
def total_counter
prometheus.counter(total_counter_name.to_sym, "Total number of measurements for #{name}")
end
def success_counter
prometheus.counter(success_counter_name.to_sym, "Number of successful measurements for #{name}")
end
def total_counter_name
"#{COUNTER_PREFIX}:#{name}:total"
end
def success_counter_name
"#{COUNTER_PREFIX}:#{name}:success_total"
end
def prometheus
Gitlab::Metrics
end
end
end
end
# frozen_string_literal: true
module Gitlab
module RequestEndpoints
class << self
def all_api_endpoints
# This compile does not do anything if the routes were already built
# but if they weren't, the routes will be drawn and available for the rest of
# application.
API::API.compile!
API::API.routes.select { |route| route.app.options[:for] < API::Base }
end
def all_controller_actions
# This will return tuples of all controller actions defined in the routes
# Only for controllers inheriting ApplicationController
# Excluding controllers from gems (OAuth, Sidekiq)
Rails.application.routes.routes.filter_map do |route|
route_info = route.required_defaults.presence
next unless route_info
next if route_info[:controller].blank? || route_info[:action].blank?
controller = constantize_controller(route_info[:controller])
next unless controller&.include?(::Gitlab::WithFeatureCategory)
next if controller == ApplicationController
next if controller == Devise::UnlocksController
[controller, route_info[:action]]
end
end
private
def constantize_controller(name)
"#{name.camelize}Controller".constantize
rescue NameError
nil # some controllers, like the omniauth ones are dynamic
end
end
end
end
......@@ -967,6 +967,14 @@ RSpec.describe ApplicationController do
end
end
describe '.endpoint_id_for_action' do
controller(described_class) { }
it 'returns an expected endpoint id' do
expect(controller.class.endpoint_id_for_action('hello')).to eq('AnonymousController#hello')
end
end
describe '#current_user' do
controller(described_class) do
def index; end
......
......@@ -9,16 +9,7 @@ RSpec.describe "Every controller" do
end
let_it_be(:controller_actions) do
# This will return tuples of all controller actions defined in the routes
# Only for controllers inheriting ApplicationController
# Excluding controllers from gems (OAuth, Sidekiq)
Rails.application.routes.routes
.map { |route| route.required_defaults.presence }
.compact
.select { |route| route[:controller].present? && route[:action].present? }
.map { |route| [constantize_controller(route[:controller]), route[:action]] }
.select { |(controller, action)| controller&.include?(::Gitlab::WithFeatureCategory) }
.reject { |(controller, action)| controller == ApplicationController || controller == Devise::UnlocksController }
Gitlab::RequestEndpoints.all_controller_actions
end
let_it_be(:routes_without_category) do
......
......@@ -67,6 +67,12 @@ RSpec.describe MetricsController, :request_store do
expect(response.body).to match(/^prometheus_counter 1$/)
end
it 'initializes the rails request SLIs' do
expect(Gitlab::Metrics::RailsSlis).to receive(:initialize_request_slis_if_needed!).and_call_original
get :index
end
context 'prometheus metrics are disabled' do
before do
allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(false)
......
......@@ -9,7 +9,7 @@ RSpec.describe 'Every API endpoint' do
end
let_it_be(:api_endpoints) do
API::API.routes.map do |route|
Gitlab::RequestEndpoints.all_api_endpoints.map do |route|
[route.app.options[:for], API::Base.path_for_app(route.app)]
end
end
......
......@@ -30,6 +30,15 @@ RSpec.describe Gitlab::Metrics::Exporter::WebExporter do
expect(readiness_probe.json).to include(status: 'ok')
expect(readiness_probe.json).to include('web_exporter' => [{ 'status': 'ok' }])
end
it 'initializes request metrics', :prometheus do
expect(Gitlab::Metrics::RailsSlis).to receive(:initialize_request_slis_if_needed!).and_call_original
http = Net::HTTP.new(exporter.server.config[:BindAddress], exporter.server.config[:Port])
response = http.request(Net::HTTP::Get.new('/metrics'))
expect(response.body).to include('gitlab_sli:rails_request_apdex')
end
end
describe '#mark_as_not_running!' do
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Metrics::RailsSlis do
# Limit what routes we'll initialize so we don't have to load the entire thing
before do
api_route = API::API.routes.find do |route|
API::Base.endpoint_id_for_route(route) == "GET /api/:version/version"
end
allow(Gitlab::RequestEndpoints).to receive(:all_api_endpoints).and_return([api_route])
allow(Gitlab::RequestEndpoints).to receive(:all_controller_actions).and_return([[ProjectsController, 'show']])
end
describe '.initialize_request_slis_if_needed!' do
it "initializes the SLI for all possible endpoints if they weren't" do
possible_labels = [
{
endpoint_id: "GET /api/:version/version",
feature_category: :not_owned
},
{
endpoint_id: "ProjectsController#show",
feature_category: :projects
}
]
expect(Gitlab::Metrics::Sli).to receive(:initialized?).with(:rails_request_apdex) { false }
expect(Gitlab::Metrics::Sli).to receive(:initialize_sli).with(:rails_request_apdex, array_including(*possible_labels)).and_call_original
described_class.initialize_request_slis_if_needed!
end
it 'does not initialize the SLI if they were initialized already' do
expect(Gitlab::Metrics::Sli).to receive(:initialized?).with(:rails_request_apdex) { true }
expect(Gitlab::Metrics::Sli).not_to receive(:initialize_sli)
described_class.initialize_request_slis_if_needed!
end
it 'does not initialize anything if the feature flag is disabled' do
stub_feature_flags(request_apdex_counters: false)
expect(Gitlab::Metrics::Sli).not_to receive(:initialize_sli)
expect(Gitlab::Metrics::Sli).not_to receive(:initialized?)
described_class.initialize_request_slis_if_needed!
end
end
describe '.request_apdex' do
it 'returns the initialized request apdex SLI object' do
described_class.initialize_request_slis_if_needed!
expect(described_class.request_apdex).to be_initialized
end
end
end
......@@ -36,6 +36,7 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
it 'tracks request count and duration' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: '200', feature_category: 'unknown')
expect(described_class).to receive_message_chain(:http_request_duration_seconds, :observe).with({ method: 'get' }, a_positive_execution_time)
expect(Gitlab::Metrics::RailsSlis.request_apdex).to receive(:increment).with(labels: { feature_category: 'unknown', endpoint_id: 'unknown' }, success: true)
subject.call(env)
end
......@@ -82,9 +83,10 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
context '@app.call returns an error code' do
let(:status) { '500' }
it 'tracks count but not duration' do
it 'tracks count but not duration or apdex' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: '500', feature_category: 'unknown')
expect(described_class).not_to receive(:http_request_duration_seconds)
expect(Gitlab::Metrics::RailsSlis).not_to receive(:request_apdex)
subject.call(env)
end
......@@ -104,20 +106,23 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
expect(described_class).to receive_message_chain(:rack_uncaught_errors_count, :increment)
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: 'undefined', feature_category: 'unknown')
expect(described_class.http_request_duration_seconds).not_to receive(:observe)
expect(Gitlab::Metrics::RailsSlis).not_to receive(:request_apdex)
expect { subject.call(env) }.to raise_error(StandardError)
end
end
context 'feature category header' do
context 'when a feature category context is present' do
context 'application context' do
context 'when a context is present' do
before do
::Gitlab::ApplicationContext.push(feature_category: 'issue_tracking')
::Gitlab::ApplicationContext.push(feature_category: 'issue_tracking', caller_id: 'IssuesController#show')
end
it 'adds the feature category to the labels for http_requests_total' do
it 'adds the feature category to the labels for required metrics' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: '200', feature_category: 'issue_tracking')
expect(described_class).not_to receive(:http_health_requests_total)
expect(Gitlab::Metrics::RailsSlis.request_apdex)
.to receive(:increment).with(labels: { feature_category: 'issue_tracking', endpoint_id: 'IssuesController#show' }, success: true)
subject.call(env)
end
......@@ -127,6 +132,7 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
expect(described_class).to receive_message_chain(:http_health_requests_total, :increment).with(method: 'get', status: '200')
expect(described_class).not_to receive(:http_requests_total)
expect(Gitlab::Metrics::RailsSlis).not_to receive(:request_apdex)
subject.call(env)
end
......@@ -140,15 +146,17 @@ RSpec.describe Gitlab::Metrics::RequestsRackMiddleware, :aggregate_failures do
it 'adds the feature category to the labels for http_requests_total' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: 'undefined', feature_category: 'issue_tracking')
expect(Gitlab::Metrics::RailsSlis).not_to receive(:request_apdex)
expect { subject.call(env) }.to raise_error(StandardError)
end
end
context 'when the feature category context is not available' do
it 'sets the feature category to unknown' do
context 'when the context is not available' do
it 'sets the required labels to unknown' do
expect(described_class).to receive_message_chain(:http_requests_total, :increment).with(method: 'get', status: '200', feature_category: 'unknown')
expect(described_class).not_to receive(:http_health_requests_total)
expect(Gitlab::Metrics::RailsSlis.request_apdex).to receive(:increment).with(labels: { feature_category: 'unknown', endpoint_id: 'unknown' }, success: true)
subject.call(env)
end
......
# frozen_string_literal: true
require 'fast_spec_helper'
RSpec.describe Gitlab::Metrics::Sli do
let(:prometheus) { double("prometheus") }
before do
stub_const("Gitlab::Metrics", prometheus)
end
describe 'Class methods' do
before do
described_class.instance_variable_set(:@known_slis, nil)
end
describe '.[]' do
it 'warns about an uninitialized SLI but returns and stores a new one' do
sli = described_class[:bar]
expect(described_class[:bar]).to be(sli)
end
it 'returns the same object for multiple accesses' do
sli = described_class.initialize_sli(:huzzah, [])
2.times do
expect(described_class[:huzzah]).to be(sli)
end
end
end
describe '.initialized?' do
before do
fake_total_counter(:boom)
fake_success_counter(:boom)
end
it 'is true when an SLI was initialized with labels' do
expect { described_class.initialize_sli(:boom, [{ hello: :world }]) }
.to change { described_class.initialized?(:boom) }.from(false).to(true)
end
it 'is false when an SLI was not initialized with labels' do
expect { described_class.initialize_sli(:boom, []) }
.not_to change { described_class.initialized?(:boom) }.from(false)
end
end
end
describe '#initialize_counters' do
it 'initializes counters for the passed label combinations' do
counters = [fake_total_counter(:hey), fake_success_counter(:hey)]
described_class.new(:hey).initialize_counters([{ foo: 'bar' }, { foo: 'baz' }])
expect(counters).to all(have_received(:get).with({ foo: 'bar' }))
expect(counters).to all(have_received(:get).with({ foo: 'baz' }))
end
end
describe "#increment" do
let!(:sli) { described_class.new(:heyo) }
let!(:total_counter) { fake_total_counter(:heyo) }
let!(:success_counter) { fake_success_counter(:heyo) }
it 'increments both counters for labels successes' do
sli.increment(labels: { hello: "world" }, success: true)
expect(total_counter).to have_received(:increment).with({ hello: 'world' })
expect(success_counter).to have_received(:increment).with({ hello: 'world' })
end
it 'only increments the total counters for labels when not successful' do
sli.increment(labels: { hello: "world" }, success: false)
expect(total_counter).to have_received(:increment).with({ hello: 'world' })
expect(success_counter).not_to have_received(:increment).with({ hello: 'world' })
end
end
def fake_prometheus_counter(name)
fake_counter = double("prometheus counter: #{name}")
allow(fake_counter).to receive(:get)
allow(fake_counter).to receive(:increment)
allow(prometheus).to receive(:counter).with(name.to_sym, anything).and_return(fake_counter)
fake_counter
end
def fake_total_counter(name)
fake_prometheus_counter("gitlab_sli:#{name}:total")
end
def fake_success_counter(name)
fake_prometheus_counter("gitlab_sli:#{name}:success_total")
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::RequestEndpoints do
describe '.all_api_endpoints' do
it 'selects all feature API classes' do
api_classes = described_class.all_api_endpoints.map { |route| route.app.options[:for] }
expect(api_classes).to all(include(Gitlab::WithFeatureCategory))
end
end
describe '.all_controller_actions' do
it 'selects all feature controllers and action names' do
all_controller_actions = described_class.all_controller_actions
controller_classes = all_controller_actions.map(&:first)
all_actions = all_controller_actions.map(&:last)
expect(controller_classes).to all(include(Gitlab::WithFeatureCategory))
expect(controller_classes).not_to include(ApplicationController, Devise::UnlocksController)
expect(all_actions).to all(be_a(String))
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment