Commit b7f5a84a authored by Mikolaj Wawrzyniak's avatar Mikolaj Wawrzyniak

Create PostgresHll namespace

In order to provide designated namespace to house future
clases that will be break apart from counter we need to add dedicated
namespace.
parent 7ead5a08
......@@ -504,7 +504,7 @@ RSpec.describe Gitlab::UsageData do
end
describe 'usage_activity_by_stage_secure' do
let_it_be(:error_rate) { Gitlab::Database::PostgresHllBatchDistinctCounter::ERROR_RATE }
let_it_be(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
let_it_be(:days_ago_within_monthly_time_period) { 3.days.ago }
let_it_be(:user) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
let_it_be(:user2) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
......@@ -693,7 +693,7 @@ RSpec.describe Gitlab::UsageData do
allow(Gitlab::Database::BatchCount).to receive(:batch_distinct_count).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::BatchCount).to receive(:batch_count).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
allow(::Ci::Build).to receive(:distinct_count_by).and_raise(ActiveRecord::StatementInvalid)
expect(described_class.usage_activity_by_stage_secure(described_class.last_28_days_time_period)).to include(
......
# frozen_string_literal: true
module Gitlab
module Database
module PostgresHll
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct batch counter based on HyperLogLog algorithm
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables just set higher batch_size with index optimization
#
# In order to not use a possible complex time consuming query when calculating min and max values,
# the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
# relation (most cases `id` column) rather than counted attribute eg:
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
# Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start}
# AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
batch_start = start
hll_blob = {}
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
end
def hll_blob_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
.map(&:values)
.to_h
end
# Generate the source query SQL snippet for the provided id range
#
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
#
# @param start initial id range
# @param finish final id range
# @return [String] SQL query fragment
def source_query(start, finish)
col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
@relation
.where(@relation.primary_key => (start...finish))
.where(col_as_arel.not_eq(nil))
.select(bits.as('attr_hash_32_bits')).to_sql
end
def actual_start(start)
start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
end
def actual_finish(finish)
finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct batch counter based on HyperLogLog algorithm
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables just set higher batch_size with index optimization
#
# In order to not use a possible complex time consuming query when calculating min and max values,
# the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
# relation (most cases `id` column) rather than counted attribute eg:
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
# Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class PostgresHllBatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start}
# AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
batch_start = start
hll_blob = {}
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
end
def hll_blob_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
.map(&:values)
.to_h
end
# Generate the source query SQL snippet for the provided id range
#
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
#
# @param start initial id range
# @param finish final id range
# @return [String] SQL query fragment
def source_query(start, finish)
col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
@relation
.where(@relation.primary_key => (start...finish))
.where(col_as_arel.not_eq(nil))
.select(bits.as('attr_hash_32_bits')).to_sql
end
def actual_start(start)
start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
end
def actual_finish(finish)
finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
end
end
end
end
......@@ -27,7 +27,7 @@ module Gitlab
# For estimated distinct count use exact query instead of hll
# buckets query, because it can't be used to obtain estimations without
# supplementary ruby code present in Gitlab::Database::PostgresHllBatchDistinctCounter
# supplementary ruby code present in Gitlab::Database::PostgresHll::BatchDistinctCounter
def estimate_batch_distinct_count(relation, column = nil, *rest)
raw_sql(relation, column, :distinct)
end
......
......@@ -61,7 +61,7 @@ module Gitlab
end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
......
......@@ -2,7 +2,7 @@
require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
......@@ -85,7 +85,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
end
......@@ -94,9 +94,9 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
end
it "defaults the batch size to #{Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
min_id = model.minimum(:id)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)
expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
......@@ -112,14 +112,14 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
end
context 'disallowed configurations' do
let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }
it 'returns fallback if start is bigger than finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end
it 'returns fallback if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end
......
......@@ -41,7 +41,7 @@ RSpec.describe Gitlab::Utils::UsageData do
let(:relation) { double(:relation) }
it 'delegates counting to counter class instance' do
expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |instance|
expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
......@@ -52,7 +52,7 @@ RSpec.describe Gitlab::Utils::UsageData do
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
......@@ -60,7 +60,7 @@ RSpec.describe Gitlab::Utils::UsageData do
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(error)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment