Commit 44c184e9 authored by Heinrich Lee Yu's avatar Heinrich Lee Yu

Merge branch...

Merge branch 'mwaw/273391-extract-product-analytics-metrics-aggregation-out-of-hllrediscounter-module' into 'master'

Extract Product Analytics metrics aggregation out of HLLRedisCounter module

See merge request gitlab-org/gitlab!52334
parents 0224e857 5ce4b4d2
# frozen_string_literal: true
module Gitlab
module Usage
module Metrics
module Aggregates
UNION_OF_AGGREGATED_METRICS = 'OR'
INTERSECTION_OF_AGGREGATED_METRICS = 'AND'
ALLOWED_METRICS_AGGREGATIONS = [UNION_OF_AGGREGATED_METRICS, INTERSECTION_OF_AGGREGATED_METRICS].freeze
AGGREGATED_METRICS_PATH = Rails.root.join('lib/gitlab/usage_data_counters/aggregated_metrics/*.yml')
UnknownAggregationOperator = Class.new(StandardError)
class Aggregate
delegate :calculate_events_union,
:weekly_time_range,
:monthly_time_range,
to: Gitlab::UsageDataCounters::HLLRedisCounter
def initialize
@aggregated_metrics = load_events(AGGREGATED_METRICS_PATH)
end
def monthly_data
aggregated_metrics_data(**monthly_time_range)
end
def weekly_data
aggregated_metrics_data(**weekly_time_range)
end
private
attr_accessor :aggregated_metrics
def aggregated_metrics_data(start_date:, end_date:)
aggregated_metrics.each_with_object({}) do |aggregation, weekly_data|
next if aggregation[:feature_flag] && Feature.disabled?(aggregation[:feature_flag], default_enabled: false, type: :development)
weekly_data[aggregation[:name]] = calculate_count_for_aggregation(aggregation, start_date: start_date, end_date: end_date)
end
end
def calculate_count_for_aggregation(aggregation, start_date:, end_date:)
case aggregation[:operator]
when UNION_OF_AGGREGATED_METRICS
calculate_events_union(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
when INTERSECTION_OF_AGGREGATED_METRICS
calculate_events_intersections(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
else
Gitlab::ErrorTracking
.track_and_raise_for_dev_exception(UnknownAggregationOperator.new("Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}"))
Gitlab::Utils::UsageData::FALLBACK
end
rescue Gitlab::UsageDataCounters::HLLRedisCounter::EventError => error
Gitlab::ErrorTracking.track_and_raise_for_dev_exception(error)
Gitlab::Utils::UsageData::FALLBACK
end
# calculate intersection of 'n' sets based on inclusion exclusion principle https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
# this method will be extracted to dedicated module with https://gitlab.com/gitlab-org/gitlab/-/issues/273391
def calculate_events_intersections(event_names:, start_date:, end_date:, subset_powers_cache: Hash.new({}))
# calculate power of intersection of all given metrics from inclusion exclusion principle
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C|) =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
# calculate each components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ...
subset_powers_data = subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
# calculate last component of the equation |A & B & C & D| = .... - |A + B + C + D|
power_of_union_of_all_events = begin
subset_powers_cache[event_names.size][event_names.join('_+_')] ||= \
calculate_events_union(event_names: event_names, start_date: start_date, end_date: end_date)
end
# in order to determine if part of equation (|A & B & C|, |A & B & C & D|), that represents the intersection that we need to calculate,
# is positive or negative in particular equation we need to determine if number of subsets is even or odd. Please take a look at two examples below
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + |A & B & C| =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
subset_powers_size_even = subset_powers_data.size.even?
# sum all components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ... =>
sum_of_all_subset_powers = sum_subset_powers(subset_powers_data, subset_powers_size_even)
# add last component of the equation |A & B & C & D| = sum_of_all_subset_powers - |A + B + C + D|
sum_of_all_subset_powers + (subset_powers_size_even ? power_of_union_of_all_events : -power_of_union_of_all_events)
end
def sum_subset_powers(subset_powers_data, subset_powers_size_even)
sum_without_sign = subset_powers_data.to_enum.with_index.sum do |value, index|
(index + 1).odd? ? value : -value
end
(subset_powers_size_even ? -1 : 1) * sum_without_sign
end
def subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
subset_sizes = (1..(event_names.size - 1))
subset_sizes.map do |subset_size|
if subset_size > 1
# calculate sum of powers of intersection between each subset (with given size) of metrics: #|A + B + C + D| = ... - (|A & B| + |A & C| + .. + |C & D|)
event_names.combination(subset_size).sum do |events_subset|
subset_powers_cache[subset_size][events_subset.join('_&_')] ||= \
calculate_events_intersections(event_names: events_subset, start_date: start_date, end_date: end_date, subset_powers_cache: subset_powers_cache)
end
else
# calculate sum of powers of each set (metric) alone #|A + B + C + D| = (|A| + |B| + |C| + |D|) - ...
event_names.sum do |event|
subset_powers_cache[subset_size][event] ||= \
calculate_events_union(event_names: event, start_date: start_date, end_date: end_date)
end
end
end
end
def load_events(wildcard)
Dir[wildcard].each_with_object([]) do |path, events|
events.push(*load_yaml_from_path(path))
end
end
def load_yaml_from_path(path)
YAML.safe_load(File.read(path))&.map(&:with_indifferent_access)
end
end
end
end
end
end
...@@ -23,6 +23,7 @@ module Gitlab ...@@ -23,6 +23,7 @@ module Gitlab
deployment_minimum_id deployment_minimum_id
deployment_maximum_id deployment_maximum_id
auth_providers auth_providers
aggregated_metrics
recorded_at recorded_at
).freeze ).freeze
...@@ -691,13 +692,13 @@ module Gitlab ...@@ -691,13 +692,13 @@ module Gitlab
def aggregated_metrics_monthly def aggregated_metrics_monthly
{ {
aggregated_metrics: ::Gitlab::UsageDataCounters::HLLRedisCounter.aggregated_metrics_monthly_data aggregated_metrics: aggregated_metrics.monthly_data
} }
end end
def aggregated_metrics_weekly def aggregated_metrics_weekly
{ {
aggregated_metrics: ::Gitlab::UsageDataCounters::HLLRedisCounter.aggregated_metrics_weekly_data aggregated_metrics: aggregated_metrics.weekly_data
} }
end end
...@@ -742,6 +743,10 @@ module Gitlab ...@@ -742,6 +743,10 @@ module Gitlab
private private
def aggregated_metrics
@aggregated_metrics ||= ::Gitlab::Usage::Metrics::Aggregates::Aggregate.new
end
def event_monthly_active_users(date_range) def event_monthly_active_users(date_range)
data = { data = {
action_monthly_active_users_project_repo: Gitlab::UsageDataCounters::TrackUniqueEvents::PUSH_ACTION, action_monthly_active_users_project_repo: Gitlab::UsageDataCounters::TrackUniqueEvents::PUSH_ACTION,
......
...@@ -13,15 +13,10 @@ module Gitlab ...@@ -13,15 +13,10 @@ module Gitlab
AggregationMismatch = Class.new(EventError) AggregationMismatch = Class.new(EventError)
SlotMismatch = Class.new(EventError) SlotMismatch = Class.new(EventError)
CategoryMismatch = Class.new(EventError) CategoryMismatch = Class.new(EventError)
UnknownAggregationOperator = Class.new(EventError)
InvalidContext = Class.new(EventError) InvalidContext = Class.new(EventError)
KNOWN_EVENTS_PATH = File.expand_path('known_events/*.yml', __dir__) KNOWN_EVENTS_PATH = File.expand_path('known_events/*.yml', __dir__)
ALLOWED_AGGREGATIONS = %i(daily weekly).freeze ALLOWED_AGGREGATIONS = %i(daily weekly).freeze
UNION_OF_AGGREGATED_METRICS = 'OR'
INTERSECTION_OF_AGGREGATED_METRICS = 'AND'
ALLOWED_METRICS_AGGREGATIONS = [UNION_OF_AGGREGATED_METRICS, INTERSECTION_OF_AGGREGATED_METRICS].freeze
AGGREGATED_METRICS_PATH = File.expand_path('aggregated_metrics/*.yml', __dir__)
# Track event on entity_id # Track event on entity_id
# Increment a Redis HLL counter for unique event_name and entity_id # Increment a Redis HLL counter for unique event_name and entity_id
...@@ -90,37 +85,40 @@ module Gitlab ...@@ -90,37 +85,40 @@ module Gitlab
events_names = events_for_category(category) events_names = events_for_category(category)
event_results = events_names.each_with_object({}) do |event, hash| event_results = events_names.each_with_object({}) do |event, hash|
hash["#{event}_weekly"] = unique_events(event_names: [event], start_date: 7.days.ago.to_date, end_date: Date.current) hash["#{event}_weekly"] = unique_events(**weekly_time_range.merge(event_names: [event]))
hash["#{event}_monthly"] = unique_events(event_names: [event], start_date: 4.weeks.ago.to_date, end_date: Date.current) hash["#{event}_monthly"] = unique_events(**monthly_time_range.merge(event_names: [event]))
end end
if eligible_for_totals?(events_names) if eligible_for_totals?(events_names)
event_results["#{category}_total_unique_counts_weekly"] = unique_events(event_names: events_names, start_date: 7.days.ago.to_date, end_date: Date.current) event_results["#{category}_total_unique_counts_weekly"] = unique_events(**weekly_time_range.merge(event_names: events_names))
event_results["#{category}_total_unique_counts_monthly"] = unique_events(event_names: events_names, start_date: 4.weeks.ago.to_date, end_date: Date.current) event_results["#{category}_total_unique_counts_monthly"] = unique_events(**monthly_time_range.merge(event_names: events_names))
end end
category_results["#{category}"] = event_results category_results["#{category}"] = event_results
end end
end end
def known_event?(event_name) def weekly_time_range
event_for(event_name).present? { start_date: 7.days.ago.to_date, end_date: Date.current }
end end
def aggregated_metrics_monthly_data def monthly_time_range
aggregated_metrics_data(4.weeks.ago.to_date) { start_date: 4.weeks.ago.to_date, end_date: Date.current }
end end
def aggregated_metrics_weekly_data def known_event?(event_name)
aggregated_metrics_data(7.days.ago.to_date) event_for(event_name).present?
end end
def known_events def known_events
@known_events ||= load_events(KNOWN_EVENTS_PATH) @known_events ||= load_events(KNOWN_EVENTS_PATH)
end end
def aggregated_metrics def calculate_events_union(event_names:, start_date:, end_date:)
@aggregated_metrics ||= load_events(AGGREGATED_METRICS_PATH) count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date) do |events|
raise SlotMismatch, events unless events_in_same_slot?(events)
raise AggregationMismatch, events unless events_same_aggregation?(events)
end
end end
private private
...@@ -139,93 +137,6 @@ module Gitlab ...@@ -139,93 +137,6 @@ module Gitlab
Plan.all_plans Plan.all_plans
end end
def aggregated_metrics_data(start_date)
aggregated_metrics.each_with_object({}) do |aggregation, weekly_data|
next if aggregation[:feature_flag] && Feature.disabled?(aggregation[:feature_flag], default_enabled: false, type: :development)
weekly_data[aggregation[:name]] = calculate_count_for_aggregation(aggregation, start_date: start_date, end_date: Date.current)
end
end
def calculate_count_for_aggregation(aggregation, start_date:, end_date:)
case aggregation[:operator]
when UNION_OF_AGGREGATED_METRICS
calculate_events_union(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
when INTERSECTION_OF_AGGREGATED_METRICS
calculate_events_intersections(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
else
raise UnknownAggregationOperator, "Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}"
end
end
# calculate intersection of 'n' sets based on inclusion exclusion principle https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
# this method will be extracted to dedicated module with https://gitlab.com/gitlab-org/gitlab/-/issues/273391
def calculate_events_intersections(event_names:, start_date:, end_date:, subset_powers_cache: Hash.new({}))
# calculate power of intersection of all given metrics from inclusion exclusion principle
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C|) =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
# calculate each components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ...
subset_powers_data = subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
# calculate last component of the equation |A & B & C & D| = .... - |A + B + C + D|
power_of_union_of_all_events = begin
subset_powers_cache[event_names.size][event_names.join('_+_')] ||= \
calculate_events_union(event_names: event_names, start_date: start_date, end_date: end_date)
end
# in order to determine if part of equation (|A & B & C|, |A & B & C & D|), that represents the intersection that we need to calculate,
# is positive or negative in particular equation we need to determine if number of subsets is even or odd. Please take a look at two examples below
# |A + B + C| = (|A| + |B| + |C|) - (|A & B| + |A & C| + .. + |C & D|) + |A & B & C| =>
# |A & B & C| = - (|A| + |B| + |C|) + (|A & B| + |A & C| + .. + |C & D|) + |A + B + C|
# |A + B + C + D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A & B & C & D| =>
# |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - |A + B + C + D|
subset_powers_size_even = subset_powers_data.size.even?
# sum all components of equation except for the last one |A & B & C & D| = (|A| + |B| + |C| + |D|) - (|A & B| + |A & C| + .. + |C & D|) + (|A & B & C| + |B & C & D|) - ... =>
sum_of_all_subset_powers = sum_subset_powers(subset_powers_data, subset_powers_size_even)
# add last component of the equation |A & B & C & D| = sum_of_all_subset_powers - |A + B + C + D|
sum_of_all_subset_powers + (subset_powers_size_even ? power_of_union_of_all_events : -power_of_union_of_all_events)
end
def sum_subset_powers(subset_powers_data, subset_powers_size_even)
sum_without_sign = subset_powers_data.to_enum.with_index.sum do |value, index|
(index + 1).odd? ? value : -value
end
(subset_powers_size_even ? -1 : 1) * sum_without_sign
end
def subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
subset_sizes = (1..(event_names.size - 1))
subset_sizes.map do |subset_size|
if subset_size > 1
# calculate sum of powers of intersection between each subset (with given size) of metrics: #|A + B + C + D| = ... - (|A & B| + |A & C| + .. + |C & D|)
event_names.combination(subset_size).sum do |events_subset|
subset_powers_cache[subset_size][events_subset.join('_&_')] ||= \
calculate_events_intersections(event_names: events_subset, start_date: start_date, end_date: end_date, subset_powers_cache: subset_powers_cache)
end
else
# calculate sum of powers of each set (metric) alone #|A + B + C + D| = (|A| + |B| + |C| + |D|) - ...
event_names.sum do |event|
subset_powers_cache[subset_size][event] ||= \
unique_events(event_names: event, start_date: start_date, end_date: end_date)
end
end
end
end
def calculate_events_union(event_names:, start_date:, end_date:)
count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date) do |events|
raise SlotMismatch, events unless events_in_same_slot?(events)
raise AggregationMismatch, events unless events_same_aggregation?(events)
end
end
def count_unique_events(event_names:, start_date:, end_date:, context: '') def count_unique_events(event_names:, start_date:, end_date:, context: '')
events = events_for(Array(event_names).map(&:to_s)) events = events_for(Array(event_names).map(&:to_s))
...@@ -340,12 +251,6 @@ module Gitlab ...@@ -340,12 +251,6 @@ module Gitlab
end.flatten end.flatten
end end
def validate_aggregation_operator!(operator)
return true if ALLOWED_METRICS_AGGREGATIONS.include?(operator)
raise UnknownAggregationOperator.new("Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}")
end
def weekly_redis_keys(events:, start_date:, end_date:, context: '') def weekly_redis_keys(events:, start_date:, end_date:, context: '')
end_date = end_date.end_of_week - 1.week end_date = end_date.end_of_week - 1.week
(start_date.to_date..end_date.to_date).map do |date| (start_date.to_date..end_date.to_date).map do |date|
......
This diff is collapsed.
...@@ -17,7 +17,7 @@ RSpec.describe 'aggregated metrics' do ...@@ -17,7 +17,7 @@ RSpec.describe 'aggregated metrics' do
Gitlab::UsageDataCounters::HLLRedisCounter.known_events Gitlab::UsageDataCounters::HLLRedisCounter.known_events
end end
Gitlab::UsageDataCounters::HLLRedisCounter.aggregated_metrics.tap do |aggregated_metrics| Gitlab::Usage::Metrics::Aggregates::Aggregate.new.send(:aggregated_metrics).tap do |aggregated_metrics|
it 'all events has unique name' do it 'all events has unique name' do
event_names = aggregated_metrics&.map { |event| event[:name] } event_names = aggregated_metrics&.map { |event| event[:name] }
...@@ -37,7 +37,7 @@ RSpec.describe 'aggregated metrics' do ...@@ -37,7 +37,7 @@ RSpec.describe 'aggregated metrics' do
end end
it "uses allowed aggregation operators" do it "uses allowed aggregation operators" do
expect(Gitlab::UsageDataCounters::HLLRedisCounter::ALLOWED_METRICS_AGGREGATIONS).to include aggregate[:operator] expect(Gitlab::Usage::Metrics::Aggregates::ALLOWED_METRICS_AGGREGATIONS).to include aggregate[:operator]
end end
it "uses events from the same Redis slot" do it "uses events from the same Redis slot" do
......
...@@ -1286,8 +1286,10 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do ...@@ -1286,8 +1286,10 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do
describe '.aggregated_metrics_weekly' do describe '.aggregated_metrics_weekly' do
subject(:aggregated_metrics_payload) { described_class.aggregated_metrics_weekly } subject(:aggregated_metrics_payload) { described_class.aggregated_metrics_weekly }
it 'uses ::Gitlab::UsageDataCounters::HLLRedisCounter#aggregated_metrics_data', :aggregate_failures do it 'uses ::Gitlab::Usage::Metrics::Aggregates::Aggregate#weekly_data', :aggregate_failures do
expect(::Gitlab::UsageDataCounters::HLLRedisCounter).to receive(:aggregated_metrics_weekly_data).and_return(global_search_gmau: 123) expect_next_instance_of(::Gitlab::Usage::Metrics::Aggregates::Aggregate) do |instance|
expect(instance).to receive(:weekly_data).and_return(global_search_gmau: 123)
end
expect(aggregated_metrics_payload).to eq(aggregated_metrics: { global_search_gmau: 123 }) expect(aggregated_metrics_payload).to eq(aggregated_metrics: { global_search_gmau: 123 })
end end
end end
...@@ -1295,8 +1297,10 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do ...@@ -1295,8 +1297,10 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do
describe '.aggregated_metrics_monthly' do describe '.aggregated_metrics_monthly' do
subject(:aggregated_metrics_payload) { described_class.aggregated_metrics_monthly } subject(:aggregated_metrics_payload) { described_class.aggregated_metrics_monthly }
it 'uses ::Gitlab::UsageDataCounters::HLLRedisCounter#aggregated_metrics_data', :aggregate_failures do it 'uses ::Gitlab::Usage::Metrics::Aggregates::Aggregate#monthly_data', :aggregate_failures do
expect(::Gitlab::UsageDataCounters::HLLRedisCounter).to receive(:aggregated_metrics_monthly_data).and_return(global_search_gmau: 123) expect_next_instance_of(::Gitlab::Usage::Metrics::Aggregates::Aggregate) do |instance|
expect(instance).to receive(:monthly_data).and_return(global_search_gmau: 123)
end
expect(aggregated_metrics_payload).to eq(aggregated_metrics: { global_search_gmau: 123 }) expect(aggregated_metrics_payload).to eq(aggregated_metrics: { global_search_gmau: 123 })
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment