Commit abc0e8e6 authored by Matthias Käppler, committed by Ash McKenzie

Query recorded metrics instead of ad-hoc queries

See https://gitlab.com/gitlab-org/omnibus-gitlab/-/merge_requests/4343
parent 731bd999
......@@ -18,55 +18,70 @@ module Gitlab
def topology_usage_data
topology_data, duration = measure_duration do
alt_usage_data(fallback: {}) do
{
nodes: topology_node_data
}.compact
end
alt_usage_data(fallback: {}) { topology_fetch_all_data }
end
{ topology: topology_data.merge(duration_s: duration) }
end
private
def topology_node_data
with_prometheus_client do |client|
# node-level data
by_instance_mem = topology_node_memory(client)
by_instance_cpus = topology_node_cpus(client)
# service-level data
by_instance_by_job_by_metric_memory = topology_all_service_memory(client)
by_instance_by_job_process_count = topology_all_service_process_count(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys)
instances.map do |instance|
{
node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance],
node_services:
topology_node_services(instance, by_instance_by_job_process_count, by_instance_by_job_by_metric_memory)
}.compact
end
# Gathers every topology section that needs a Prometheus client.
#
# The work is delegated to `with_prometheus_client` with an empty hash as
# fallback, so whatever that helper returns without yielding (it is handed
# `{}` for that case) becomes the result. When a client is yielded, the
# result is a hash with a single `:nodes` entry built by `topology_node_data`.
def topology_fetch_all_data
  with_prometheus_client(fallback: {}) do |prometheus|
    node_data = topology_node_data(prometheus)
    { nodes: node_data }
  end
end
# Builds one entry per instance that reported node-level data.
#
# client - the Prometheus client handed on to each query helper.
#
# The set of instances is the union of the keys returned by the memory and
# CPU helpers. Each entry carries `:node_memory_total_bytes`, `:node_cpus`
# and `:node_services`; keys whose value is nil are removed via `compact`.
# Returns an Array of Hashes.
def topology_node_data(client)
  # node-level data
  memory_by_instance = topology_node_memory(client)
  cpus_by_instance = topology_node_cpus(client)
  # service-level data
  service_memory_by_type = topology_all_service_memory(client)
  service_process_counts = topology_all_service_process_count(client)

  known_instances = Set.new(memory_by_instance.keys + cpus_by_instance.keys)
  known_instances.map do |node|
    node_entry = {
      node_memory_total_bytes: memory_by_instance[node],
      node_cpus: cpus_by_instance[node]
    }
    node_entry[:node_services] =
      topology_node_services(node, service_process_counts, service_memory_by_type)
    node_entry.compact
  end
end
def topology_node_memory(client)
aggregate_single(client, 'avg (node_memory_MemTotal_bytes) by (instance)')
aggregate_by_instance(client, 'gitlab_usage_ping:node_memory_total_bytes:avg')
end
def topology_node_cpus(client)
aggregate_single(client, 'count (node_cpu_seconds_total{mode="idle"}) by (instance)')
aggregate_by_instance(client, 'gitlab_usage_ping:node_cpus:count')
end
def topology_all_service_memory(client)
aggregate_many(
client,
'avg ({__name__ =~ "(ruby_){0,1}process_(resident|unique|proportional)_memory_bytes", job != "gitlab_exporter_process"}) by (instance, job, __name__)'
)
{
rss: topology_service_memory_rss(client),
uss: topology_service_memory_uss(client),
pss: topology_service_memory_pss(client)
}
end
# Fetches the resident-memory series for services by handing the metric
# name below to `aggregate_by_labels`, whose result is returned unchanged.
def topology_service_memory_rss(client)
  metric_name = 'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg'
  aggregate_by_labels(client, metric_name)
end
# Fetches the unique-memory series for services by handing the metric
# name below to `aggregate_by_labels`, whose result is returned unchanged.
def topology_service_memory_uss(client)
  metric_name = 'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg'
  aggregate_by_labels(client, metric_name)
end
# Fetches the proportional-memory series for services by handing the metric
# name below to `aggregate_by_labels`, whose result is returned unchanged.
def topology_service_memory_pss(client)
  metric_name = 'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg'
  aggregate_by_labels(client, metric_name)
end
def topology_all_service_process_count(client)
aggregate_many(client, 'count ({__name__ =~ "(ruby_){0,1}process_start_time_seconds", job != "gitlab_exporter_process"}) by (instance, job)')
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process:count')
end
def topology_node_services(instance, all_process_counts, all_process_memory)
......@@ -92,24 +107,21 @@ module Gitlab
end
end
def topology_instance_service_memory(instance, all_instance_data)
topology_data_for_instance(instance, all_instance_data).each_with_object({}) do |entry, hash|
metric, memory = entry
job = metric['job']
key =
case metric['__name__']
when match_process_memory_metric_for_type('resident') then :process_memory_rss
when match_process_memory_metric_for_type('unique') then :process_memory_uss
when match_process_memory_metric_for_type('proportional') then :process_memory_pss
end
hash[job] ||= {}
hash[job][key] ||= memory
# Given a hash mapping memory set types to Prometheus response data, returns a hash
# mapping the given instance's service jobs to their memory use in bytes, keyed by memory type
def topology_instance_service_memory(instance, instance_data_by_type)
result = {}
instance_data_by_type.each do |memory_type, instance_data|
topology_data_for_instance(instance, instance_data).each do |metric, memory_bytes|
job = metric['job']
key = "process_memory_#{memory_type}".to_sym
result[job] ||= {}
result[job][key] ||= memory_bytes
end
end
end
def match_process_memory_metric_for_type(type)
/(ruby_){0,1}process_#{type}_memory_bytes/
result
end
def topology_data_for_instance(instance, all_instance_data)
......@@ -120,14 +132,17 @@ module Gitlab
instance.gsub(/:.+$/, '')
end
# Will retain a single `instance` key that values are mapped to
def aggregate_single(client, query)
client.aggregate(query) { |metric| drop_port(metric['instance']) }
# Wraps `query` in an `avg_over_time` call over a `[1w]` range selector
# and returns the resulting query string.
def one_week_average(query)
  range_selector = "#{query}[1w]"
  "avg_over_time (#{range_selector})"
end
# Sends the one-week-averaged form of `query` to `client.aggregate`.
# The block given to `aggregate` turns each metric into its `instance`
# label after `drop_port` has been applied to it.
def aggregate_by_instance(client, query)
  averaged_query = one_week_average(query)
  client.aggregate(averaged_query) do |labels|
    drop_port(labels['instance'])
  end
end
# Will retain a composite key that values are mapped to
def aggregate_many(client, query)
client.aggregate(query) do |metric|
def aggregate_by_labels(client, query)
client.aggregate(one_week_average(query)) do |metric|
metric['instance'] = drop_port(metric['instance'])
metric
end
......
......@@ -77,11 +77,11 @@ module Gitlab
end
end
def with_prometheus_client
if Gitlab::Prometheus::Internal.prometheus_enabled?
prometheus_address = Gitlab::Prometheus::Internal.uri
yield Gitlab::PrometheusClient.new(prometheus_address, allow_local_requests: true)
end
# Yields a Prometheus client pointed at the internal Prometheus address.
#
# fallback - value returned instead of yielding when
#            `Gitlab::Prometheus::Internal.prometheus_enabled?` is falsy
#            (default: nil).
#
# Returns the block's result when Prometheus is enabled, `fallback` otherwise.
def with_prometheus_client(fallback: nil)
  if Gitlab::Prometheus::Internal.prometheus_enabled?
    address = Gitlab::Prometheus::Internal.uri
    yield Gitlab::PrometheusClient.new(address, allow_local_requests: true)
  else
    fallback
  end
end
def measure_duration
......
......@@ -19,18 +19,14 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
end
# Smoke check: the payload under test exposes a :topology key.
# NOTE(review): `allow_prometheus_queries` is defined outside this hunk;
# presumably it permits the Prometheus API calls without asserting on them.
it 'contains a topology element' do
allow_prometheus_queries
expect(subject).to have_key(:topology)
end
context 'tracking node metrics' do
it 'contains node level metrics for each instance' do
expect_prometheus_api_to(
receive_node_memory_query,
receive_node_cpu_count_query,
receive_node_service_memory_query,
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query,
receive_node_service_memory_pss_query,
receive_node_service_process_count_query
)
......@@ -82,19 +78,51 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect_prometheus_api_to(
receive_node_memory_query(result: []),
receive_node_cpu_count_query,
receive_node_service_memory_query,
receive_node_service_memory_rss_query(result: []),
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query,
receive_node_service_process_count_query
)
keys = subject[:topology][:nodes].flat_map(&:keys)
expect(keys).not_to include(:node_memory_total_bytes)
expect(keys).to include(:node_cpus, :node_services)
expect(subject[:topology]).to eq({
duration_s: 0,
nodes: [
{
node_cpus: 16,
node_services: [
{
name: 'sidekiq',
process_count: 15,
process_memory_pss: 401
},
{
name: 'redis',
process_count: 1
}
]
},
{
node_cpus: 8,
node_services: [
{
name: 'web',
process_count: 10,
process_memory_pss: 302
},
{
name: 'sidekiq',
process_count: 5
}
]
}
]
})
end
end
context 'and no results are found' do
it 'does not report anything' do
expect_prometheus_api_to receive(:aggregate).at_least(:once).and_return({})
expect_prometheus_api_to receive(:query).at_least(:once).and_return({})
expect(subject[:topology]).to eq({
duration_s: 0,
......@@ -105,7 +133,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
context 'and a connection error is raised' do
it 'does not report anything' do
expect_prometheus_api_to receive(:aggregate).and_raise('Connection failed')
expect_prometheus_api_to receive(:query).and_raise('Connection failed')
expect(subject[:topology]).to eq({ duration_s: 0 })
end
......@@ -123,7 +151,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
def receive_node_memory_query(result: nil)
receive(:query)
.with(/node_memory_MemTotal_bytes/, an_instance_of(Hash))
.with(/node_memory_total_bytes/, an_instance_of(Hash))
.and_return(result || [
{
'metric' => { 'instance' => 'instance1:8080' },
......@@ -138,7 +166,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
def receive_node_cpu_count_query(result: nil)
receive(:query)
.with(/node_cpu_seconds_total/, an_instance_of(Hash))
.with(/node_cpus/, an_instance_of(Hash))
.and_return(result || [
{
'metric' => { 'instance' => 'instance2:8090' },
......@@ -151,46 +179,59 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
])
end
def receive_node_service_memory_query(result: nil)
def receive_node_service_memory_rss_query(result: nil)
receive(:query)
.with(/process_.+_memory_bytes/, an_instance_of(Hash))
.with(/process_resident_memory_bytes/, an_instance_of(Hash))
.and_return(result || [
# instance 1: runs Puma + a small Sidekiq
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_resident_memory_bytes' },
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '300']
},
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_unique_memory_bytes' },
'value' => [1000, '301']
},
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_proportional_memory_bytes' },
'value' => [1000, '302']
},
{
'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_resident_memory_bytes' },
'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '303']
},
# instance 2: runs a dedicated Sidekiq + Redis (which uses a different metric name)
{
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_resident_memory_bytes' },
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '400']
},
{
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_proportional_memory_bytes' },
'value' => [1000, '401']
'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
'value' => [1000, '402']
}
])
end
# Builds the message expectation for the unique-memory (USS) query.
#
# result - optional canned response; when omitted, a single gitlab-rails
#          sample on instance1:8080 with value '301' is returned.
def receive_node_service_memory_uss_query(result: nil)
  default_response = [
    {
      'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
      'value' => [1000, '301']
    }
  ]

  receive(:query)
    .with(/process_unique_memory_bytes/, an_instance_of(Hash))
    .and_return(result || default_response)
end
def receive_node_service_memory_pss_query(result: nil)
receive(:query)
.with(/process_proportional_memory_bytes/, an_instance_of(Hash))
.and_return(result || [
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '302']
},
{
'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis', '__name__' => 'process_resident_memory_bytes' },
'value' => [1000, '402']
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '401']
}
])
end
def receive_node_service_process_count_query(result: nil)
receive(:query)
.with(/process_start_time_seconds/, an_instance_of(Hash))
.with(/service_process:count/, an_instance_of(Hash))
.and_return(result || [
# instance 1
{
......
......@@ -88,13 +88,21 @@ RSpec.describe Gitlab::Utils::UsageData do
end
context 'when Prometheus is disabled' do
it 'returns nil' do
before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
end
# Calls with_prometheus_client without a fallback argument; the block
# would hand back the client, so a nil result shows nothing was yielded.
it 'returns nil by default' do
result = described_class.with_prometheus_client { |client| client }
expect(result).to be nil
end
# Passes an explicit fallback ([]) and expects it back in place of the
# block's result.
it 'returns fallback if provided' do
result = described_class.with_prometheus_client(fallback: []) { |client| client }
expect(result).to eq([])
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment