Update formula for PG-HyperLogLog small cardinality estimates

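Based on the original HLL paper and the linear probabilistic counting paper it references, the small-cardinality estimate should be m*ln(m/V) (natural logarithm, where m is the total number of buckets and V is the number of empty buckets), without the alpha(m) constant, because linear counting is what is used in that range.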

Update formula for PG-HyperLogLog small cardinality estimates
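Based on the original HLL paper and the linear probabilistic counting paper it references, the small-cardinality estimate should be m*ln(m/V) (natural logarithm, where m is the total number of buckets and V is the number of empty buckets), without the alpha(m) constant, because linear counting is what is used in that range.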
3e99032d · Catalin Irimie · 6bd1a6f6 · 3e99032d · 3e99032d · 3e99032d
Commit 3e99032d authored Dec 12, 2021 by Catalin Irimie
4 changed files
--- a/lib/gitlab/database/postgres_hll/buckets.rb
+++ b/lib/gitlab/database/postgres_hll/buckets.rb
@@ -65,8 +65,7 @@ module Gitlab
          ).to_i

          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
-            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
-              Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
+            TOTAL_BUCKETS * Math.log(TOTAL_BUCKETS.to_f / num_zero_buckets)
          else
            num_uniques
          end

--- a/spec/lib/gitlab/usage/metrics/aggregates/sources/postgres_hll_spec.rb
+++ b/spec/lib/gitlab/usage/metrics/aggregates/sources/postgres_hll_spec.rb
@@ -11,6 +11,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
  let(:metric_1) { 'metric_1' }
  let(:metric_2) { 'metric_2' }
  let(:metric_names) { [metric_1, metric_2] }
+  let(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }

  describe 'metric calculations' do
    before do
@@ -38,7 +39,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
      end

      it 'returns the number of unique events in the union of all metrics' do
-        expect(calculate_metrics_union.round(2)).to eq(3.12)
+        expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(3)
      end

      context 'when there is no aggregated data saved' do
@@ -53,7 +54,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
        let(:metric_names) { [metric_1] }

        it 'returns the number of unique events for that metric' do
-          expect(calculate_metrics_union.round(2)).to eq(2.08)
+          expect(calculate_metrics_union.round(2)).to be_within(error_rate).percent_of(2)
        end
      end
    end
@@ -64,7 +65,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
      end

      it 'returns the number of common events in the intersection of all metrics' do
-        expect(calculate_metrics_intersections.round(2)).to eq(1.04)
+        expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(1)
      end

      context 'when there is no aggregated data saved' do
@@ -79,7 +80,7 @@ RSpec.describe Gitlab::Usage::Metrics::Aggregates::Sources::PostgresHll, :clean_
        let(:metric_names) { [metric_1] }

        it 'returns the number of common/unique events for the intersection of that metric' do
-          expect(calculate_metrics_intersections.round(2)).to eq(2.08)
+          expect(calculate_metrics_intersections.round(2)).to be_within(error_rate).percent_of(2)
        end
      end
    end

--- a/spec/lib/gitlab/usage/metrics/instrumentations/database_metric_spec.rb
+++ b/spec/lib/gitlab/usage/metrics/instrumentations/database_metric_spec.rb
@@ -82,7 +82,7 @@ RSpec.describe Gitlab::Usage::Metrics::Instrumentations::DatabaseMetric do
        end.new(time_frame: 'all')
      end

-      it 'calculates a correct result', quarantine: 'https://gitlab.com/gitlab-org/gitlab/-/issues/348139' do
+      it 'calculates a correct result' do
        expect(subject.value).to be_within(Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE).percent_of(3)
      end


--- a/spec/lib/gitlab/utils/usage_data_spec.rb
+++ b/spec/lib/gitlab/utils/usage_data_spec.rb
@@ -118,7 +118,7 @@ RSpec.describe Gitlab::Utils::UsageData do
      # build_needs set: ['1', '2', '3', '4', '5']
      # ci_build set ['a', 'b']
      # with them, current implementation is expected to consistently report
-      # 5.217656147118495 and 2.0809220082170614 values
+      # the same static values
      # This test suite is expected to assure, that HyperLogLog implementation
      # behaves consistently between changes made to other parts of codebase.
      # In case of fine tuning or changes to HyperLogLog algorithm implementation
@@ -130,8 +130,8 @@ RSpec.describe Gitlab::Utils::UsageData do

      let(:model) { Ci::BuildNeed }
      let(:column) { :name }
-      let(:build_needs_estimated_cardinality) { 5.217656147118495 }
-      let(:ci_builds_estimated_cardinality) { 2.0809220082170614 }
+      let(:build_needs_estimated_cardinality) { 5.024574181542231 }
+      let(:ci_builds_estimated_cardinality) { 2.003916452421793 }

      before do
        allow(model.connection).to receive(:transaction_open?).and_return(false)