Commit fbc32fad authored by Heinrich Lee Yu

Merge branch '222530-improve-instance-label-handling' into 'master'

Increase reliability of node to service mappings

Closes #222530

See merge request gitlab-org/gitlab!35756
parents 3a2073f9 27a9888e
@@ -24,6 +24,7 @@ module Gitlab
def topology_usage_data
@failures = []
@instances = Set[]
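# @instances accumulates every normalized instance label seen while the
# service metrics are aggregated; the per-node entries are later built by
# iterating it (see the @instances.map change below).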
topology_data, duration = measure_duration { topology_fetch_all_data }
{
topology: topology_data
@@ -68,8 +69,7 @@ module Gitlab
by_instance_by_job_process_count = topology_all_service_process_count(client)
by_instance_by_job_server_types = topology_all_service_server_types(client)
instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys)
instances.map do |instance|
@instances.map do |instance|
{
node_memory_total_bytes: by_instance_mem[instance],
node_cpus: by_instance_cpus[instance],
@@ -203,8 +203,28 @@ module Gitlab
all_instance_data.filter { |metric, _value| metric['instance'] == instance }
end
def drop_port(instance)
instance.gsub(/:.+$/, '')
end

def normalize_instance_label(instance)
normalize_localhost_address(drop_port_number(instance))
end
def normalize_localhost_address(instance)
ip_addr = IPAddr.new(instance)
is_local_ip = ip_addr.loopback? || ip_addr.to_i.zero?
is_local_ip ? 'localhost' : instance
rescue IPAddr::InvalidAddressError
# This most likely means it was a host name, not an IP address
instance
end
def drop_port_number(instance)
instance.gsub(/:\d+$/, '')
end
def normalize_and_track_instance(instance)
normalize_instance_label(instance).tap do |normalized_instance|
@instances << normalized_instance
end
end
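# Illustrative sketch (not part of the change): expected results of the
# normalization above, based on the instance values exercised in the spec
# below; 'gitlab.example.com' is a hypothetical host name added for contrast.
#
#   normalize_instance_label('localhost:8080')         # => "localhost"
#   normalize_instance_label('127.0.0.1:9100')         # => "localhost" (loopback IP)
#   normalize_instance_label('0.0.0.0:9090')           # => "localhost" (unspecified address, to_i.zero?)
#   normalize_instance_label('[::1]:1234')             # => "localhost" (IPAddr accepts the bracketed form)
#   normalize_instance_label('gitlab.example.com:443') # => "gitlab.example.com" (InvalidAddressError rescued)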
def one_week_average(query)
@@ -212,13 +232,13 @@ module Gitlab
end
def aggregate_by_instance(client, query)
client.aggregate(query) { |metric| drop_port(metric['instance']) }
client.aggregate(query) { |metric| normalize_and_track_instance(metric['instance']) }
end
# Retains a composite key (the metric's full label set) that values are mapped to
def aggregate_by_labels(client, query)
client.aggregate(query) do |metric|
metric['instance'] = drop_port(metric['instance'])
metric['instance'] = normalize_and_track_instance(metric['instance'])
metric
end
end
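# Illustrative sketch (not part of the change), assuming the Prometheus
# client's aggregate helper keys each query result's value by whatever the
# block returns; node_memory_query and service_memory_query stand in for the
# real query strings. With the responses stubbed in the spec below:
#
#   aggregate_by_instance(client, node_memory_query)
#   # => { 'localhost' => 512 }
#
#   aggregate_by_labels(client, service_memory_query)
#   # => { { 'instance' => 'localhost', 'job' => 'gitlab-rails' } => 10, ... }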
@@ -227,7 +247,7 @@ module Gitlab
# @return [Hash] mapping each normalized instance to a hash of target label key/value pairs, or an empty hash if the input vector is empty
def map_instance_labels(query_result_vector, target_labels)
query_result_vector.to_h do |result|
key = drop_port(result['metric']['instance'])
key = normalize_and_track_instance(result['metric']['instance'])
value = result['metric'].slice(*target_labels).symbolize_keys
[key, value]
end
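# Illustrative sketch (not part of the change): given the node_uname_info
# response stubbed in the spec below, and target_labels of
# %w[machine sysname release] (inferred from the expected node_uname_info
# output), this would return:
#
#   { 'localhost' => { machine: 'x86_64', sysname: 'Linux', release: '4.19.76-linuxkit' } }
#
# because '127.0.0.1:9100' normalizes to 'localhost'.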
......
@@ -160,6 +160,173 @@ RSpec.describe Gitlab::UsageData::Topology do
end
end
context 'and services run on the same node but report different instance values' do
let(:node_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:9100' },
'value' => [1000, '512']
}
]
end
let(:node_uname_info_response) do
[
{
"metric" => {
"__name__" => "node_uname_info",
"domainname" => "(none)",
"instance" => "127.0.0.1:9100",
"job" => "node_exporter",
"machine" => "x86_64",
"nodename" => "127.0.0.1",
"release" => "4.19.76-linuxkit",
"sysname" => "Linux"
},
"value" => [1592463033.359, "1"]
}
]
end
# The services in this response should all be mapped to localhost, i.e. the same node
let(:service_memory_response) do
[
{
'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
'value' => [1000, '10']
},
{
'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
'value' => [1000, '11']
},
{
'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
'value' => [1000, '12']
},
{
'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
'value' => [1000, '13']
},
{
'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
'value' => [1000, '14']
}
]
end
it 'normalizes equivalent instance values and maps them to the same node' do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: node_memory_response),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: node_uname_info_response),
receive_node_service_memory_rss_query(result: service_memory_response),
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_memory_total_bytes: 512,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
release: '4.19.76-linuxkit'
},
node_services: [
{
name: 'web',
process_memory_rss: 10
},
{
name: 'sidekiq',
process_memory_rss: 11
},
{
name: 'prometheus',
process_memory_rss: 12
},
{
name: 'redis',
process_memory_rss: 13
},
{
name: 'postgres',
process_memory_rss: 14
}
]
}
]
})
end
end
context 'and node metrics are missing but service metrics exist' do
it 'still reports service metrics' do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_cpu_count_query(result: []),
receive_node_uname_info_query(result: []),
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query(result: []),
receive_node_service_memory_pss_query(result: []),
receive_node_service_process_count_query(result: []),
receive_node_service_app_server_workers_query(result: [])
)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'node_uname_info' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
],
nodes: [
{
node_services: [
{
name: 'web',
process_memory_rss: 300
},
{
name: 'sidekiq',
process_memory_rss: 303
}
]
},
{
node_services: [
{
name: 'sidekiq',
process_memory_rss: 400
},
{
name: 'redis',
process_memory_rss: 402
}
]
}
]
})
end
end
context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query)
......