Create PostgresHll namespace

To provide a designated namespace to house future classes that will be broken out of the counter, we need to add a dedicated namespace.

Create PostgresHll namespace
To provide a designated namespace to house future classes that will be broken out of the counter, we need to add a dedicated namespace.
b7f5a84a · Mikolaj Wawrzyniak · 7ead5a08 · b7f5a84a · b7f5a84a · 7ead5a08
Commit b7f5a84a authored Dec 03, 2020 by Mikolaj Wawrzyniak
7 changed files
--- a/ee/spec/lib/ee/gitlab/usage_data_spec.rb
+++ b/ee/spec/lib/ee/gitlab/usage_data_spec.rb
@@ -504,7 +504,7 @@ RSpec.describe Gitlab::UsageData do
  end

  describe 'usage_activity_by_stage_secure' do
-    let_it_be(:error_rate) { Gitlab::Database::PostgresHllBatchDistinctCounter::ERROR_RATE }
+    let_it_be(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
    let_it_be(:days_ago_within_monthly_time_period) { 3.days.ago }
    let_it_be(:user) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
    let_it_be(:user2) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
@@ -693,7 +693,7 @@ RSpec.describe Gitlab::UsageData do

      allow(Gitlab::Database::BatchCount).to receive(:batch_distinct_count).and_raise(ActiveRecord::StatementInvalid)
      allow(Gitlab::Database::BatchCount).to receive(:batch_count).and_raise(ActiveRecord::StatementInvalid)
-      allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
+      allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
      allow(::Ci::Build).to receive(:distinct_count_by).and_raise(ActiveRecord::StatementInvalid)

      expect(described_class.usage_activity_by_stage_secure(described_class.last_28_days_time_period)).to include(

--- a/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll/batch_distinct_counter.rb
+# frozen_string_literal: true
+
+module Gitlab
+  module Database
+    module PostgresHll
+      # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
+      # Implements a distinct batch counter based on HyperLogLog algorithm
+      # Needs indexes on the column below to calculate max, min and range queries
+      # For larger tables just set higher batch_size with index optimization
+      #
+      # To avoid a possibly complex, time-consuming query when calculating min and max values,
+      # start and finish can be passed explicitly. They should contain the min and max values of the PRIMARY KEY of the
+      # relation (in most cases the `id` column) rather than of the counted attribute, e.g.:
+      # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
+      #
+      # Grouped relations are NOT supported yet.
+      #
+      # @example Usage
+      #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count
+      #  ::Gitlab::Database::PostgresHll::BatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period))
+      #    .estimate_distinct_count(
+      #      batch_size: 1_000,
+      #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
+      #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
+      #    )
+      #
+      # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation.
+      #  Like all probabilistic algorithms, it has an ERROR RATE margin that can affect values;
+      #  for this implementation no error higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
+      #  and in most cases the error is lower. However, if an exact value is necessary, other tools have to be used.
+      class BatchDistinctCounter
+        ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
+        FALLBACK = -1
+        MIN_REQUIRED_BATCH_SIZE = 750
+        SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
+        MAX_DATA_VOLUME = 4_000_000_000
+
+        # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
+        DEFAULT_BATCH_SIZE = 10_000
+
+        BIT_31_MASK = "B'0#{'1' * 31}'"
+        BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
+        # @example source_query
+        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+        #   FROM %{relation}
+        #   WHERE %{pkey} >= %{batch_start}
+        #   AND %{pkey} < %{batch_end}
+        #   AND %{column} IS NOT NULL
+        BUCKETED_DATA_SQL = <<~SQL
+          WITH hashed_attributes AS (%{source_query})
+          SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
+            (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
+          FROM hashed_attributes
+          GROUP BY 1
+        SQL
+
+        TOTAL_BUCKETS_NUMBER = 512
+
+        def initialize(relation, column = nil)
+          @relation = relation
+          @column = column || relation.primary_key
+        end
+
+        def unwanted_configuration?(finish, batch_size, start)
+          batch_size <= MIN_REQUIRED_BATCH_SIZE ||
+            (finish - start) >= MAX_DATA_VOLUME ||
+            start > finish
+        end
+
+        def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
+          raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
+
+          batch_size ||= DEFAULT_BATCH_SIZE
+
+          start = actual_start(start)
+          finish = actual_finish(finish)
+
+          raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
+          return FALLBACK if unwanted_configuration?(finish, batch_size, start)
+
+          batch_start = start
+          hll_blob = {}
+
+          while batch_start <= finish
+            begin
+              hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
+              batch_start += batch_size
+            end
+            sleep(SLEEP_TIME_IN_SECONDS)
+          end
+
+          estimate_cardinality(hll_blob)
+        end
+
+        private
+
+        # The arbitrary values present in #estimate_cardinality
+        # are sourced from the https://www.sisense.com/blog/hyperloglog-in-pure-sql/
+        # article; they do not represent any entity and serve as tuning values
+        # for the whole equation.
+        def estimate_cardinality(hll_blob)
+          num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
+
+          num_uniques = (
+            ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
+              (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
+          ).to_i
+
+          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
+            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
+              Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
+          else
+            num_uniques
+          end
+        end
+
+        def hll_blob_for_batch(start, finish)
+          @relation
+            .connection
+            .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
+            .map(&:values)
+            .to_h
+        end
+
+        # Generate the source query SQL snippet for the provided id range
+        #
+        # @example SQL query template
+        #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
+        #   FROM %{relation}
+        #   WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
+        #   AND %{column} IS NOT NULL
+        #
+        # @param start initial id range
+        # @param finish final id range
+        # @return [String] SQL query fragment
+        def source_query(start, finish)
+          col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
+          col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
+          md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
+          md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
+          bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
+
+          @relation
+            .where(@relation.primary_key => (start...finish))
+            .where(col_as_arel.not_eq(nil))
+            .select(bits.as('attr_hash_32_bits')).to_sql
+        end
+
+        def actual_start(start)
+          start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
+        end
+
+        def actual_finish(finish)
+          finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
+++ b/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
-# frozen_string_literal: true
-
-module Gitlab
-  module Database
-    # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-    # Implements a distinct batch counter based on HyperLogLog algorithm
-    # Needs indexes on the column below to calculate max, min and range queries
-    # For larger tables just set higher batch_size with index optimization
-    #
-    # In order to not use a possible complex time consuming query when calculating min and max values,
-    # the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
-    # relation (most cases `id` column) rather than counted attribute eg:
-    # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
-    #
-    # Grouped relations are NOT supported yet.
-    #
-    # @example Usage
-    #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
-    #  ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
-    #    .estimate_distinct_count(
-    #      batch_size: 1_000,
-    #      start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
-    #      finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
-    #    )
-    #
-    # @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
-    #  Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
-    #  for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
-    #  for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
-    class PostgresHllBatchDistinctCounter
-      ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
-      FALLBACK = -1
-      MIN_REQUIRED_BATCH_SIZE = 750
-      SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
-      MAX_DATA_VOLUME = 4_000_000_000
-
-      # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
-      DEFAULT_BATCH_SIZE = 10_000
-
-      BIT_31_MASK = "B'0#{'1' * 31}'"
-      BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
-      # @example source_query
-      #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
-      #   FROM %{relation}
-      #   WHERE %{pkey} >= %{batch_start}
-      #   AND %{pkey} < %{batch_end}
-      #   AND %{column} IS NOT NULL
-      BUCKETED_DATA_SQL = <<~SQL
-        WITH hashed_attributes AS (%{source_query})
-        SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
-          (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
-        FROM hashed_attributes
-        GROUP BY 1
-      SQL
-
-      TOTAL_BUCKETS_NUMBER = 512
-
-      def initialize(relation, column = nil)
-        @relation = relation
-        @column = column || relation.primary_key
-      end
-
-      def unwanted_configuration?(finish, batch_size, start)
-        batch_size <= MIN_REQUIRED_BATCH_SIZE ||
-          (finish - start) >= MAX_DATA_VOLUME ||
-          start > finish
-      end
-
-      def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
-        raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
-
-        batch_size ||= DEFAULT_BATCH_SIZE
-
-        start = actual_start(start)
-        finish = actual_finish(finish)
-
-        raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
-        return FALLBACK if unwanted_configuration?(finish, batch_size, start)
-
-        batch_start = start
-        hll_blob = {}
-
-        while batch_start <= finish
-          begin
-            hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
-            batch_start += batch_size
-          end
-          sleep(SLEEP_TIME_IN_SECONDS)
-        end
-
-        estimate_cardinality(hll_blob)
-      end
-
-      private
-
-      # arbitrary values that are present in #estimate_cardinality
-      # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
-      # article, they are not representing any entity and serves as tune value
-      # for the whole equation
-      def estimate_cardinality(hll_blob)
-        num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
-
-        num_uniques = (
-          ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
-            (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
-        ).to_i
-
-        if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
-          ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
-            Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
-        else
-          num_uniques
-        end
-      end
-
-      def hll_blob_for_batch(start, finish)
-        @relation
-          .connection
-          .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
-          .map(&:values)
-          .to_h
-      end
-
-      # Generate the source query SQL snippet for the provided id range
-      #
-      # @example SQL query template
-      #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
-      #   FROM %{relation}
-      #   WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
-      #   AND %{column} IS NOT NULL
-      #
-      # @param start initial id range
-      # @param finish final id range
-      # @return [String] SQL query fragment
-      def source_query(start, finish)
-        col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
-        col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
-        md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
-        md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
-        bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
-
-        @relation
-          .where(@relation.primary_key => (start...finish))
-          .where(col_as_arel.not_eq(nil))
-          .select(bits.as('attr_hash_32_bits')).to_sql
-      end
-
-      def actual_start(start)
-        start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
-      end
-
-      def actual_finish(finish)
-        finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
-      end
-    end
-  end
-end
--- a/lib/gitlab/usage_data_queries.rb
+++ b/lib/gitlab/usage_data_queries.rb
@@ -27,7 +27,7 @@ module Gitlab

      # For estimated distinct count use exact query instead of hll
      # buckets query, because it can't be used to obtain estimations without
-      # supplementary ruby code present in Gitlab::Database::PostgresHllBatchDistinctCounter
+      # supplementary ruby code present in Gitlab::Database::PostgresHll::BatchDistinctCounter
      def estimate_batch_distinct_count(relation, column = nil, *rest)
        raw_sql(relation, column, :distinct)
      end

--- a/lib/gitlab/utils/usage_data.rb
+++ b/lib/gitlab/utils/usage_data.rb
@@ -61,7 +61,7 @@ module Gitlab
      end

      def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
-        Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
+        Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
      rescue ActiveRecord::StatementInvalid
        FALLBACK
      # catch all rescue should be removed as a part of feature flag rollout issue

--- a/spec/lib/gitlab/database/postgres_hll_batch_distinct_counter_spec.rb
+++ b/spec/lib/gitlab/database/postgres_hll_batch_distinct_counter_spec.rb
@@ -2,7 +2,7 @@

 require 'spec_helper'

-RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
+RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
  let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
  let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
  let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
@@ -85,7 +85,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
      end

      it 'counts with different number of batches and aggregates total result' do
-        stub_const('Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
+        stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)

        [1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
      end
@@ -94,9 +94,9 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
        expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
      end

-      it "defaults the batch size to #{Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
+      it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
        min_id = model.minimum(:id)
-        batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE)
+        batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)

        expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original

@@ -112,14 +112,14 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
      end

      context 'disallowed configurations' do
-        let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
+        let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }

        it 'returns fallback if start is bigger than finish' do
          expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
        end

        it 'returns fallback if data volume exceeds upper limit' do
-          large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
+          large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
          expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
        end


--- a/spec/lib/gitlab/utils/usage_data_spec.rb
+++ b/spec/lib/gitlab/utils/usage_data_spec.rb
@@ -41,7 +41,7 @@ RSpec.describe Gitlab::Utils::UsageData do
    let(:relation) { double(:relation) }

    it 'delegates counting to counter class instance' do
-      expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |instance|
+      expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
        expect(instance).to receive(:estimate_distinct_count)
                              .with(batch_size: nil, start: nil, finish: nil)
                              .and_return(5)
@@ -52,7 +52,7 @@ RSpec.describe Gitlab::Utils::UsageData do

    it 'returns default fallback value when counting fails due to database error' do
      stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
-      allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
+      allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))

      expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
    end
@@ -60,7 +60,7 @@ RSpec.describe Gitlab::Utils::UsageData do
    it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
      error = StandardError.new('')
      stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
-      allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(error)
+      allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)

      expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
      expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)