Commit abc0e8e6 authored by Matthias Käppler, committed by Ash McKenzie

Query recorded metrics instead of ad-hoc

See https://gitlab.com/gitlab-org/omnibus-gitlab/-/merge_requests/4343
parent 731bd999
...@@ -18,55 +18,70 @@ module Gitlab ...@@ -18,55 +18,70 @@ module Gitlab
def topology_usage_data def topology_usage_data
topology_data, duration = measure_duration do topology_data, duration = measure_duration do
alt_usage_data(fallback: {}) do alt_usage_data(fallback: {}) { topology_fetch_all_data }
{
nodes: topology_node_data
}.compact
end
end end
{ topology: topology_data.merge(duration_s: duration) } { topology: topology_data.merge(duration_s: duration) }
end end
private private
def topology_node_data def topology_fetch_all_data
with_prometheus_client do |client| with_prometheus_client(fallback: {}) do |client|
# node-level data {
by_instance_mem = topology_node_memory(client) nodes: topology_node_data(client)
by_instance_cpus = topology_node_cpus(client) }
# service-level data end
by_instance_by_job_by_metric_memory = topology_all_service_memory(client) end
by_instance_by_job_process_count = topology_all_service_process_count(client)
def topology_node_data(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys) # node-level data
instances.map do |instance| by_instance_mem = topology_node_memory(client)
{ by_instance_cpus = topology_node_cpus(client)
node_memory_total_bytes: by_instance_mem[instance], # service-level data
node_cpus: by_instance_cpus[instance], by_instance_by_job_by_type_memory = topology_all_service_memory(client)
node_services: by_instance_by_job_process_count = topology_all_service_process_count(client)
topology_node_services(instance, by_instance_by_job_process_count, by_instance_by_job_by_metric_memory)
}.compact instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys)
end instances.map do |instance|
{
node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance],
node_services:
topology_node_services(instance, by_instance_by_job_process_count, by_instance_by_job_by_type_memory)
}.compact
end end
end end
def topology_node_memory(client) def topology_node_memory(client)
aggregate_single(client, 'avg (node_memory_MemTotal_bytes) by (instance)') aggregate_by_instance(client, 'gitlab_usage_ping:node_memory_total_bytes:avg')
end end
def topology_node_cpus(client) def topology_node_cpus(client)
aggregate_single(client, 'count (node_cpu_seconds_total{mode="idle"}) by (instance)') aggregate_by_instance(client, 'gitlab_usage_ping:node_cpus:count')
end end
def topology_all_service_memory(client) def topology_all_service_memory(client)
aggregate_many( {
client, rss: topology_service_memory_rss(client),
'avg ({__name__ =~ "(ruby_){0,1}process_(resident|unique|proportional)_memory_bytes", job != "gitlab_exporter_process"}) by (instance, job, __name__)' uss: topology_service_memory_uss(client),
) pss: topology_service_memory_pss(client)
}
end
def topology_service_memory_rss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg')
end
def topology_service_memory_uss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg')
end
def topology_service_memory_pss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg')
end end
def topology_all_service_process_count(client) def topology_all_service_process_count(client)
aggregate_many(client, 'count ({__name__ =~ "(ruby_){0,1}process_start_time_seconds", job != "gitlab_exporter_process"}) by (instance, job)') aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process:count')
end end
def topology_node_services(instance, all_process_counts, all_process_memory) def topology_node_services(instance, all_process_counts, all_process_memory)
...@@ -92,24 +107,21 @@ module Gitlab ...@@ -92,24 +107,21 @@ module Gitlab
end end
end end
def topology_instance_service_memory(instance, all_instance_data) # Given a hash mapping memory set types to Prometheus response data, returns a hash
topology_data_for_instance(instance, all_instance_data).each_with_object({}) do |entry, hash| # mapping instance/node names to services and their respective memory use in bytes
metric, memory = entry def topology_instance_service_memory(instance, instance_data_by_type)
job = metric['job'] result = {}
key = instance_data_by_type.each do |memory_type, instance_data|
case metric['__name__'] topology_data_for_instance(instance, instance_data).each do |metric, memory_bytes|
when match_process_memory_metric_for_type('resident') then :process_memory_rss job = metric['job']
when match_process_memory_metric_for_type('unique') then :process_memory_uss key = "process_memory_#{memory_type}".to_sym
when match_process_memory_metric_for_type('proportional') then :process_memory_pss
end result[job] ||= {}
result[job][key] ||= memory_bytes
hash[job] ||= {} end
hash[job][key] ||= memory
end end
end
def match_process_memory_metric_for_type(type) result
/(ruby_){0,1}process_#{type}_memory_bytes/
end end
def topology_data_for_instance(instance, all_instance_data) def topology_data_for_instance(instance, all_instance_data)
...@@ -120,14 +132,17 @@ module Gitlab ...@@ -120,14 +132,17 @@ module Gitlab
instance.gsub(/:.+$/, '') instance.gsub(/:.+$/, '')
end end
# Will retain a single `instance` key that values are mapped to def one_week_average(query)
def aggregate_single(client, query) "avg_over_time (#{query}[1w])"
client.aggregate(query) { |metric| drop_port(metric['instance']) } end
def aggregate_by_instance(client, query)
client.aggregate(one_week_average(query)) { |metric| drop_port(metric['instance']) }
end end
# Will retain a composite key that values are mapped to # Will retain a composite key that values are mapped to
def aggregate_many(client, query) def aggregate_by_labels(client, query)
client.aggregate(query) do |metric| client.aggregate(one_week_average(query)) do |metric|
metric['instance'] = drop_port(metric['instance']) metric['instance'] = drop_port(metric['instance'])
metric metric
end end
......
...@@ -77,11 +77,11 @@ module Gitlab ...@@ -77,11 +77,11 @@ module Gitlab
end end
end end
def with_prometheus_client def with_prometheus_client(fallback: nil)
if Gitlab::Prometheus::Internal.prometheus_enabled? return fallback unless Gitlab::Prometheus::Internal.prometheus_enabled?
prometheus_address = Gitlab::Prometheus::Internal.uri
yield Gitlab::PrometheusClient.new(prometheus_address, allow_local_requests: true) prometheus_address = Gitlab::Prometheus::Internal.uri
end yield Gitlab::PrometheusClient.new(prometheus_address, allow_local_requests: true)
end end
def measure_duration def measure_duration
......
...@@ -19,18 +19,14 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -19,18 +19,14 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090') expect(Gitlab::Prometheus::Internal).to receive(:uri).and_return('http://prom:9090')
end end
it 'contains a topology element' do
allow_prometheus_queries
expect(subject).to have_key(:topology)
end
context 'tracking node metrics' do context 'tracking node metrics' do
it 'contains node level metrics for each instance' do it 'contains node level metrics for each instance' do
expect_prometheus_api_to( expect_prometheus_api_to(
receive_node_memory_query, receive_node_memory_query,
receive_node_cpu_count_query, receive_node_cpu_count_query,
receive_node_service_memory_query, receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query,
receive_node_service_memory_pss_query,
receive_node_service_process_count_query receive_node_service_process_count_query
) )
...@@ -82,19 +78,51 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -82,19 +78,51 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect_prometheus_api_to( expect_prometheus_api_to(
receive_node_memory_query(result: []), receive_node_memory_query(result: []),
receive_node_cpu_count_query, receive_node_cpu_count_query,
receive_node_service_memory_query, receive_node_service_memory_rss_query(result: []),
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query,
receive_node_service_process_count_query receive_node_service_process_count_query
) )
keys = subject[:topology][:nodes].flat_map(&:keys) expect(subject[:topology]).to eq({
expect(keys).not_to include(:node_memory_total_bytes) duration_s: 0,
expect(keys).to include(:node_cpus, :node_services) nodes: [
{
node_cpus: 16,
node_services: [
{
name: 'sidekiq',
process_count: 15,
process_memory_pss: 401
},
{
name: 'redis',
process_count: 1
}
]
},
{
node_cpus: 8,
node_services: [
{
name: 'web',
process_count: 10,
process_memory_pss: 302
},
{
name: 'sidekiq',
process_count: 5
}
]
}
]
})
end end
end end
context 'and no results are found' do context 'and no results are found' do
it 'does not report anything' do it 'does not report anything' do
expect_prometheus_api_to receive(:aggregate).at_least(:once).and_return({}) expect_prometheus_api_to receive(:query).at_least(:once).and_return({})
expect(subject[:topology]).to eq({ expect(subject[:topology]).to eq({
duration_s: 0, duration_s: 0,
...@@ -105,7 +133,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -105,7 +133,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
context 'and a connection error is raised' do context 'and a connection error is raised' do
it 'does not report anything' do it 'does not report anything' do
expect_prometheus_api_to receive(:aggregate).and_raise('Connection failed') expect_prometheus_api_to receive(:query).and_raise('Connection failed')
expect(subject[:topology]).to eq({ duration_s: 0 }) expect(subject[:topology]).to eq({ duration_s: 0 })
end end
...@@ -123,7 +151,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -123,7 +151,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
def receive_node_memory_query(result: nil) def receive_node_memory_query(result: nil)
receive(:query) receive(:query)
.with(/node_memory_MemTotal_bytes/, an_instance_of(Hash)) .with(/node_memory_total_bytes/, an_instance_of(Hash))
.and_return(result || [ .and_return(result || [
{ {
'metric' => { 'instance' => 'instance1:8080' }, 'metric' => { 'instance' => 'instance1:8080' },
...@@ -138,7 +166,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -138,7 +166,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
def receive_node_cpu_count_query(result: nil) def receive_node_cpu_count_query(result: nil)
receive(:query) receive(:query)
.with(/node_cpu_seconds_total/, an_instance_of(Hash)) .with(/node_cpus/, an_instance_of(Hash))
.and_return(result || [ .and_return(result || [
{ {
'metric' => { 'instance' => 'instance2:8090' }, 'metric' => { 'instance' => 'instance2:8090' },
...@@ -151,46 +179,59 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do ...@@ -151,46 +179,59 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
]) ])
end end
def receive_node_service_memory_query(result: nil) def receive_node_service_memory_rss_query(result: nil)
receive(:query) receive(:query)
.with(/process_.+_memory_bytes/, an_instance_of(Hash)) .with(/process_resident_memory_bytes/, an_instance_of(Hash))
.and_return(result || [ .and_return(result || [
# instance 1: runs Puma + a small Sidekiq
{ {
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_resident_memory_bytes' }, 'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '300'] 'value' => [1000, '300']
}, },
{ {
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_unique_memory_bytes' }, 'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '301']
},
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', '__name__' => 'ruby_process_proportional_memory_bytes' },
'value' => [1000, '302']
},
{
'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_resident_memory_bytes' },
'value' => [1000, '303'] 'value' => [1000, '303']
}, },
# instance 2: runs a dedicated Sidekiq + Redis (which uses a different metric name) # instance 2: runs a dedicated Sidekiq + Redis (which uses a different metric name)
{ {
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_resident_memory_bytes' }, 'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '400'] 'value' => [1000, '400']
}, },
{ {
'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq', '__name__' => 'ruby_process_proportional_memory_bytes' }, 'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
'value' => [1000, '401'] 'value' => [1000, '402']
}
])
end
def receive_node_service_memory_uss_query(result: nil)
receive(:query)
.with(/process_unique_memory_bytes/, an_instance_of(Hash))
.and_return(result || [
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '301']
}
])
end
def receive_node_service_memory_pss_query(result: nil)
receive(:query)
.with(/process_proportional_memory_bytes/, an_instance_of(Hash))
.and_return(result || [
{
'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '302']
}, },
{ {
'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis', '__name__' => 'process_resident_memory_bytes' }, 'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '402'] 'value' => [1000, '401']
} }
]) ])
end end
def receive_node_service_process_count_query(result: nil) def receive_node_service_process_count_query(result: nil)
receive(:query) receive(:query)
.with(/process_start_time_seconds/, an_instance_of(Hash)) .with(/service_process:count/, an_instance_of(Hash))
.and_return(result || [ .and_return(result || [
# instance 1 # instance 1
{ {
......
...@@ -88,13 +88,21 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -88,13 +88,21 @@ RSpec.describe Gitlab::Utils::UsageData do
end end
context 'when Prometheus is disabled' do context 'when Prometheus is disabled' do
it 'returns nil' do before do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false) expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
end
it 'returns nil by default' do
result = described_class.with_prometheus_client { |client| client } result = described_class.with_prometheus_client { |client| client }
expect(result).to be nil expect(result).to be nil
end end
it 'returns fallback if provided' do
result = described_class.with_prometheus_client(fallback: []) { |client| client }
expect(result).to eq([])
end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment