Commit 27a9888e authored by Matthias Käppler's avatar Matthias Käppler Committed by Heinrich Lee Yu

Normalize instance label values

This is so that services that bind to e.g. 0.0.0.0 or report
IP addresses instead of 'localhost' won't be considered to
run on separate nodes.
parent ebf56669
......@@ -24,6 +24,7 @@ module Gitlab
def topology_usage_data
@failures = []
@instances = Set[]
topology_data, duration = measure_duration { topology_fetch_all_data }
{
topology: topology_data
......@@ -68,8 +69,7 @@ module Gitlab
by_instance_by_job_process_count = topology_all_service_process_count(client)
by_instance_by_job_server_types = topology_all_service_server_types(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys)
instances.map do |instance|
@instances.map do |instance|
{
node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance],
......@@ -203,8 +203,28 @@ module Gitlab
all_instance_data.filter { |metric, _value| metric['instance'] == instance }
end
def drop_port(instance)
instance.gsub(/:.+$/, '')
# Canonicalizes a Prometheus instance label: strips any trailing port
# number, then folds loopback/wildcard addresses into 'localhost' so
# that services on the same host map to the same node.
def normalize_instance_label(instance)
  without_port = drop_port_number(instance)
  normalize_localhost_address(without_port)
end
# Maps loopback and wildcard addresses to the canonical 'localhost' label.
#
# Services on one host may report their instance as 'localhost',
# '127.0.0.1', '::1', '0.0.0.0' or '::'; all of these should collapse
# into the same node. Values that do not parse as an IP address
# (most likely host names) are returned unchanged.
def normalize_localhost_address(instance)
  parsed = IPAddr.new(instance)
  if parsed.loopback? || parsed.to_i.zero?
    'localhost'
  else
    instance
  end
rescue IPAddr::InvalidAddressError
  # Not an IP address at all; keep the original label.
  instance
end
# Strips a trailing ':<port>' from an instance label, e.g.
# 'localhost:9100' => 'localhost'. Bracketed IPv6 labels such as
# '[::1]:1234' keep their address part ('[::1]'); non-numeric
# suffixes are left untouched.
#
# Uses sub with \z (absolute end of string) rather than gsub with $
# (end of LINE): $ could also match before an embedded newline, and a
# single anchored match can occur at most once, so gsub was redundant.
def drop_port_number(instance)
  instance.sub(/:\d+\z/, '')
end
# Normalizes the given instance label and records it in @instances so
# the set of distinct nodes seen across all queries is accumulated.
# Returns the normalized label.
def normalize_and_track_instance(instance)
  normalized = normalize_instance_label(instance)
  @instances << normalized
  normalized
end
def one_week_average(query)
......@@ -212,13 +232,13 @@ module Gitlab
end
# Aggregates the query result keyed by the normalized instance label
# (tracking each instance seen), so equivalent addresses such as
# '127.0.0.1:9100' and 'localhost:9100' collapse into a single key.
#
# Fix: the flattened diff left BOTH the pre-change call (using the
# removed `drop_port` helper, which would raise NoMethodError) and its
# replacement; only the current call is kept.
def aggregate_by_instance(client, query)
  client.aggregate(query) { |metric| normalize_and_track_instance(metric['instance']) }
end
# Will retain a composite key that values are mapped to.
# Rewrites the 'instance' label in place to its normalized form (also
# tracking it) before the client aggregates by the full label set.
#
# Fix: removed the stale pre-change assignment calling the removed
# `drop_port` helper that the flattened diff had left in place.
def aggregate_by_labels(client, query)
  client.aggregate(query) do |metric|
    metric['instance'] = normalize_and_track_instance(metric['instance'])
    metric
  end
end
......@@ -227,7 +247,7 @@ module Gitlab
# @return [Hash] mapping instance to a hash of target labels key/value, or the empty hash if input empty vector
#
# The key is the normalized (and tracked) instance label; the value is
# the subset of the metric's labels named in target_labels, with
# symbolized keys (ActiveSupport's Hash#symbolize_keys).
#
# Fix: removed the stale pre-change `key = drop_port(...)` line that the
# flattened diff had left in place — it was immediately overwritten and
# referenced a helper that no longer exists.
def map_instance_labels(query_result_vector, target_labels)
  query_result_vector.to_h do |result|
    key = normalize_and_track_instance(result['metric']['instance'])
    value = result['metric'].slice(*target_labels).symbolize_keys
    [key, value]
  end
end
......
......@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do
end
end
# Regression coverage for instance label normalization: every stubbed
# response below reports the same physical host under a different
# address form, and the expectation is a single node keyed 'localhost'.
context 'and services run on the same node but report different instance values' do
# Node memory reported under 'localhost:9100'.
let(:node_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:9100' },
'value' => [1000, '512']
}
]
end
# uname info reported under '127.0.0.1:9100' — must merge with the
# 'localhost' node above after normalization.
let(:node_uname_info_response) do
[
{
"metric" => {
"__name__" => "node_uname_info",
"domainname" => "(none)",
"instance" => "127.0.0.1:9100",
"job" => "node_exporter",
"machine" => "x86_64",
"nodename" => "127.0.0.1",
"release" => "4.19.76-linuxkit",
"sysname" => "Linux"
},
"value" => [1592463033.359, "1"]
}
]
end
# The services in this response should all be mapped to localhost i.e. the same node
# (covers host name, IPv4 loopback, IPv4 wildcard, bracketed IPv6
# loopback and bracketed IPv6 wildcard, each with a port suffix).
let(:service_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '10']
},
{
'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '11']
},
{
'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
'value' => [1000, '12']
},
{
'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
'value' => [1000, '13']
},
{
'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
'value' => [1000, '14']
}
]
end
it 'normalizes equivalent instance values and maps them to the same node' do
# Queries without fixtures return empty results and are expected to
# show up as 'empty_result' failures below.
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: node_memory_response),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: node_uname_info_response),
receive_node_service_memory_rss_query(result: service_memory_response),
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
# A single node entry: memory, uname info and all five services were
# folded together despite their differing instance labels.
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_memory_total_bytes: 512,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
release: '4.19.76-linuxkit'
},
node_services: [
{
name: 'web',
process_memory_rss: 10
},
{
name: 'sidekiq',
process_memory_rss: 11
},
{
name: 'prometheus',
process_memory_rss: 12
},
{
name: 'redis',
process_memory_rss: 13
},
{
name: 'postgres',
process_memory_rss: 14
}
]
}
]
})
end
end
# Nodes known only through service metrics (no memory/CPU/uname data)
# must still be reported, with just their node_services populated.
context 'and node metrics are missing but service metrics exist' do
it 'still reports service metrics' do
# Only the RSS query returns data (its default fixture); all node-level
# queries are stubbed empty and surface as 'empty_result' failures.
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: []),
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
# Two nodes appear, each carrying only the service data that was
# observed for it — no node_memory_total_bytes / node_uname_info keys.
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'node_uname_info' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_services: [
{
name: 'web',
process_memory_rss: 300
},
{
name: 'sidekiq',
process_memory_rss: 303
}
]
},
{
node_services: [
{
name: 'sidekiq',
process_memory_rss: 400
},
{
name: 'redis',
process_memory_rss: 402
}
]
}
]
})
end
end
context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment