Commit 27a9888e authored by Matthias Käppler's avatar Matthias Käppler Committed by Heinrich Lee Yu

Normalize instance label values

This is so that services binding to e.g. 0.0.0.0, or reporting
IP addresses rather than 'localhost', won't be considered to
run on separate nodes.
parent ebf56669
...@@ -24,6 +24,7 @@ module Gitlab ...@@ -24,6 +24,7 @@ module Gitlab
def topology_usage_data def topology_usage_data
@failures = [] @failures = []
@instances = Set[]
topology_data, duration = measure_duration { topology_fetch_all_data } topology_data, duration = measure_duration { topology_fetch_all_data }
{ {
topology: topology_data topology: topology_data
...@@ -68,8 +69,7 @@ module Gitlab ...@@ -68,8 +69,7 @@ module Gitlab
by_instance_by_job_process_count = topology_all_service_process_count(client) by_instance_by_job_process_count = topology_all_service_process_count(client)
by_instance_by_job_server_types = topology_all_service_server_types(client) by_instance_by_job_server_types = topology_all_service_server_types(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys) @instances.map do |instance|
instances.map do |instance|
{ {
node_memory_total_bytes: by_instance_mem[instance], node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance], node_cpus: by_instance_cpus[instance],
...@@ -203,8 +203,28 @@ module Gitlab ...@@ -203,8 +203,28 @@ module Gitlab
all_instance_data.filter { |metric, _value| metric['instance'] == instance } all_instance_data.filter { |metric, _value| metric['instance'] == instance }
end end
# Canonical form of a Prometheus 'instance' label: drops any trailing
# port suffix, then collapses loopback/unspecified IP addresses to
# 'localhost' so equivalent addresses map onto the same node.
def normalize_instance_label(instance)
  normalize_localhost_address(drop_port_number(instance))
end
# Maps loopback ('127.0.0.1', '::1') and unspecified ('0.0.0.0', '::')
# addresses to the canonical name 'localhost'. Anything that does not
# parse as an IP address — i.e. a plain host name — is returned as-is.
def normalize_localhost_address(instance)
  addr = IPAddr.new(instance)
  return 'localhost' if addr.loopback? || addr.to_i.zero?

  instance
rescue IPAddr::InvalidAddressError
  # Not an IP address at all; assume it is already a host name.
  instance
end
# Strips a trailing ':<port>' suffix from an instance label,
# e.g. 'localhost:9100' => 'localhost', '[::1]:8080' => '[::1]'.
#
# Anchors with \z (end of string) instead of $ (end of line) so a
# multi-line string cannot match mid-input, and uses sub rather than
# gsub since an end-anchored pattern can occur at most once.
#
# NOTE(review): a bare, unbracketed IPv6 address such as '::1' would
# lose its final group here; Prometheus instance labels bracket IPv6
# hosts (cf. '[::1]:1234' in the specs), so this should not arise.
def drop_port_number(instance)
  instance.sub(/:\d+\z/, '')
end
# Normalizes the given instance label and records the result in the
# @instances set, which accumulates every distinct node seen across
# queries. Returns the normalized label.
def normalize_and_track_instance(instance)
  normalized = normalize_instance_label(instance)
  @instances << normalized
  normalized
end
def one_week_average(query) def one_week_average(query)
...@@ -212,13 +232,13 @@ module Gitlab ...@@ -212,13 +232,13 @@ module Gitlab
end end
# Aggregates query results keyed by normalized instance label, so that
# metrics reported under equivalent addresses (e.g. '127.0.0.1:9100'
# and 'localhost:9100') roll up into a single node entry.
def aggregate_by_instance(client, query)
  client.aggregate(query) { |metric| normalize_and_track_instance(metric['instance']) }
end
# Will retain a composite key that values are mapped to: the metric's
# label set is kept intact, with only its 'instance' label rewritten
# to the normalized form so equivalent addresses group together.
def aggregate_by_labels(client, query)
  client.aggregate(query) do |metric|
    metric['instance'] = normalize_and_track_instance(metric['instance'])
    metric
  end
end
...@@ -227,7 +247,7 @@ module Gitlab ...@@ -227,7 +247,7 @@ module Gitlab
# @return [Hash] mapping instance to a hash of target labels key/value, or the empty hash if input empty vector
def map_instance_labels(query_result_vector, target_labels)
  query_result_vector.to_h do |result|
    # Key by the normalized instance so label data joins up with the
    # per-instance aggregates computed elsewhere.
    key = normalize_and_track_instance(result['metric']['instance'])
    value = result['metric'].slice(*target_labels).symbolize_keys
    [key, value]
  end
end
......
...@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do ...@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do
end end
end end
context 'and services run on the same node but report different instance values' do
  # Node memory is reported under 'localhost:9100'...
  let(:node_memory_response) do
    [
      {
        'metric' => { 'instance' => 'localhost:9100' },
        'value' => [1000, '512']
      }
    ]
  end

  # ...while uname info comes in under '127.0.0.1:9100' — both must
  # normalize to the same node.
  let(:node_uname_info_response) do
    [
      {
        "metric" => {
          "__name__" => "node_uname_info",
          "domainname" => "(none)",
          "instance" => "127.0.0.1:9100",
          "job" => "node_exporter",
          "machine" => "x86_64",
          "nodename" => "127.0.0.1",
          "release" => "4.19.76-linuxkit",
          "sysname" => "Linux"
        },
        "value" => [1592463033.359, "1"]
      }
    ]
  end

  # The services in this response should all be mapped to localhost i.e. the same node
  let(:service_memory_response) do
    [
      {
        'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
        'value' => [1000, '10']
      },
      {
        'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
        'value' => [1000, '11']
      },
      {
        'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
        'value' => [1000, '12']
      },
      {
        'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
        'value' => [1000, '13']
      },
      {
        'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
        'value' => [1000, '14']
      }
    ]
  end

  it 'normalizes equivalent instance values and maps them to the same node' do
    expect_prometheus_api_to(
      receive_app_request_volume_query(result: []),
      receive_node_memory_query(result: node_memory_response),
      receive_node_cpu_count_query(result: []),
      receive_node_uname_info_query(result: node_uname_info_response),
      receive_node_service_memory_rss_query(result: service_memory_response),
      receive_node_service_memory_uss_query(result: []),
      receive_node_service_memory_pss_query(result: []),
      receive_node_service_process_count_query(result: []),
      receive_node_service_app_server_workers_query(result: [])
    )

    # All five services land under a single node entry despite their
    # five distinct instance labels.
    expect(subject[:topology]).to eq({
      duration_s: 0,
      failures: [
        { 'app_requests' => 'empty_result' },
        { 'node_cpus' => 'empty_result' },
        { 'service_uss' => 'empty_result' },
        { 'service_pss' => 'empty_result' },
        { 'service_process_count' => 'empty_result' },
        { 'service_workers' => 'empty_result' }
      ],
      nodes: [
        {
          node_memory_total_bytes: 512,
          node_uname_info: {
            machine: 'x86_64',
            sysname: 'Linux',
            release: '4.19.76-linuxkit'
          },
          node_services: [
            { name: 'web', process_memory_rss: 10 },
            { name: 'sidekiq', process_memory_rss: 11 },
            { name: 'prometheus', process_memory_rss: 12 },
            { name: 'redis', process_memory_rss: 13 },
            { name: 'postgres', process_memory_rss: 14 }
          ]
        }
      ]
    })
  end
end
context 'and node metrics are missing but service metrics exist' do
  it 'still reports service metrics' do
    # Only the service RSS query returns data (its helper default);
    # every node-level query comes back empty.
    expect_prometheus_api_to(
      receive_app_request_volume_query(result: []),
      receive_node_memory_query(result: []),
      receive_node_cpu_count_query(result: []),
      receive_node_uname_info_query(result: []),
      receive_node_service_memory_rss_query,
      receive_node_service_memory_uss_query(result: []),
      receive_node_service_memory_pss_query(result: []),
      receive_node_service_process_count_query(result: []),
      receive_node_service_app_server_workers_query(result: [])
    )

    # Node entries are still emitted, carrying service data only.
    expect(subject[:topology]).to eq({
      duration_s: 0,
      failures: [
        { 'app_requests' => 'empty_result' },
        { 'node_memory' => 'empty_result' },
        { 'node_cpus' => 'empty_result' },
        { 'node_uname_info' => 'empty_result' },
        { 'service_uss' => 'empty_result' },
        { 'service_pss' => 'empty_result' },
        { 'service_process_count' => 'empty_result' },
        { 'service_workers' => 'empty_result' }
      ],
      nodes: [
        {
          node_services: [
            { name: 'web', process_memory_rss: 300 },
            { name: 'sidekiq', process_memory_rss: 303 }
          ]
        },
        {
          node_services: [
            { name: 'sidekiq', process_memory_rss: 400 },
            { name: 'redis', process_memory_rss: 402 }
          ]
        }
      ]
    })
  end
end
context 'and an error is raised when querying Prometheus' do context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query) expect_prometheus_api_to receive(:query)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment