Commit 3e99032d authored by Catalin Irimie's avatar Catalin Irimie

Update formula for PG-HyperLogLog small cardinality estimates

Based on the original HLL paper and referenced linear probabilistic
counting paper, this should be m*log(m/V) instead of using the
alpha(m) constant for small cardinality, as linear counting is
used instead.
parent 6bd1a6f6
......@@ -65,8 +65,7 @@ module Gitlab
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
TOTAL_BUCKETS * Math.log(TOTAL_BUCKETS.to_f / num_zero_buckets)
else
num_uniques
end
......
......@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_1) { 'metric_1' }
let(:metric_2) { 'metric_2' }
let(:metric_names) { [metric_1, metric_2] }
let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
describe 'metric calculations' do
before do
......@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end
it 'returns the number of unique events in the union of all metrics' do
expect(calculate_metrics_union.round(2)).to eq(3.12)
expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(3)
end
context 'when there is no aggregated data saved' do
......@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] }
it 'returns the number of unique events for that metric' do
expect(calculate_metrics_union.round(2)).to eq(2.08)
expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(2)
end
end
end
......@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
end
it 'returns the number of common events in the intersection of all metrics' do
expect(calculate_metrics_intersections.round(2)).to eq(1.04)
expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(1)
end
context 'when there is no aggregated data saved' do
......@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
let(:metric_names) { [metric_1] }
it 'returns the number of common/unique events for the intersection of that metric' do
expect(calculate_metrics_intersections.round(2)).to eq(2.08)
expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(2)
end
end
end
......
......@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do
end.new(time_frame: 'all')
end
it 'calculates a correct result', quarantine: 'https://gitlab.com/gitlab-org/gitlab/-/issues/348139' do
it 'calculates a correct result' do
expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3)
end
......
......@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do
# build_needs set: ['1', '2', '3', '4', '5']
# ci_build set ['a', 'b']
# with them, current implementation is expected to consistently report
# 5.217656147118495 and 2.0809220082170614 values
# the same static values
# This test suite is expected to assure, that HyperLogLog implementation
# behaves consistently between changes made to other parts of codebase.
# In case of fine tuning or changes to HyperLogLog algorithm implementation
......@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do
let(:model) { Ci::BuildNeed }
let(:column) { :name }
let(:build_needs_estimated_cardinality) { 5.217656147118495 }
let(:ci_builds_estimated_cardinality) { 2.0809220082170614 }
let(:build_needs_estimated_cardinality) { 5.024574181542231 }
let(:ci_builds_estimated_cardinality) { 2.003916452421793 }
before do
allow(model.connection).to receive(:transaction_open?).and_return(false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment