Commit 6c107b90 authored by Luke Duncalfe's avatar Luke Duncalfe

Merge branch 'mwaw/extract_postgres_hll_namespace' into 'master'

Extract postgres hll namespace

See merge request gitlab-org/gitlab!49128
parents cf337d65 b7f5a84a
......@@ -504,7 +504,7 @@ RSpec.describe Gitlab::UsageData do
end
describe 'usage_activity_by_stage_secure' do
let_it_be(:error_rate) { Gitlab::Database::PostgresHllBatchDistinctCounter::ERROR_RATE }
let_it_be(:error_rate) { Gitlab::Database::PostgresHll::BatchDistinctCounter::ERROR_RATE }
let_it_be(:days_ago_within_monthly_time_period) { 3.days.ago }
let_it_be(:user) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
let_it_be(:user2) { create(:user, group_view: :security_dashboard, created_at: days_ago_within_monthly_time_period) }
......@@ -693,7 +693,7 @@ RSpec.describe Gitlab::UsageData do
allow(Gitlab::Database::BatchCount).to receive(:batch_distinct_count).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::BatchCount).to receive(:batch_count).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid)
allow(::Ci::Build).to receive(:distinct_count_by).and_raise(ActiveRecord::StatementInvalid)
expect(described_class.usage_activity_by_stage_secure(described_class.last_28_days_time_period)).to include(
......
# frozen_string_literal: true
module Gitlab
module Database
module PostgresHll
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct batch counter based on HyperLogLog algorithm
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables just set higher batch_size with index optimization
#
# In order to not use a possible complex time consuming query when calculating min and max values,
# the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
# relation (most cases `id` column) rather than counted attribute eg:
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
# Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class BatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start}
# AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
batch_start = start
hll_blob = {}
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
end
def hll_blob_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
.map(&:values)
.to_h
end
# Generate the source query SQL snippet for the provided id range
#
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
#
# @param start initial id range
# @param finish final id range
# @return [String] SQL query fragment
def source_query(start, finish)
col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
@relation
.where(@relation.primary_key => (start...finish))
.where(col_as_arel.not_eq(nil))
.select(bits.as('attr_hash_32_bits')).to_sql
end
def actual_start(start)
start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
end
def actual_finish(finish)
finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct batch counter based on HyperLogLog algorithm
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables just set higher batch_size with index optimization
#
# In order to not use a possible complex time consuming query when calculating min and max values,
# the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
# relation (most cases `id` column) rather than counted attribute eg:
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCount.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
# Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class PostgresHllBatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start}
# AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
end
def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) >= MAX_DATA_VOLUME ||
start > finish
end
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
start = actual_start(start)
finish = actual_finish(finish)
raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
return FALLBACK if unwanted_configuration?(finish, batch_size, start)
batch_start = start
hll_blob = {}
while batch_start <= finish
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
batch_start += batch_size
end
sleep(SLEEP_TIME_IN_SECONDS)
end
estimate_cardinality(hll_blob)
end
private
# arbitrary values that are present in #estimate_cardinality
# are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article, they are not representing any entity and serves as tune value
# for the whole equation
def estimate_cardinality(hll_blob)
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
end
def hll_blob_for_batch(start, finish)
@relation
.connection
.execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
.map(&:values)
.to_h
end
# Generate the source query SQL snippet for the provided id range
#
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
#
# @param start initial id range
# @param finish final id range
# @return [String] SQL query fragment
def source_query(start, finish)
col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])
@relation
.where(@relation.primary_key => (start...finish))
.where(col_as_arel.not_eq(nil))
.select(bits.as('attr_hash_32_bits')).to_sql
end
def actual_start(start)
start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
end
def actual_finish(finish)
finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
end
end
end
end
......@@ -27,7 +27,7 @@ module Gitlab
# For estimated distinct count use exact query instead of hll
# buckets query, because it can't be used to obtain estimations without
# supplementary ruby code present in Gitlab::Database::PostgresHllBatchDistinctCounter
# supplementary ruby code present in Gitlab::Database::PostgresHll::BatchDistinctCounter
def estimate_batch_distinct_count(relation, column = nil, *rest)
raw_sql(relation, column, :distinct)
end
......
......@@ -61,7 +61,7 @@ module Gitlab
end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
Gitlab::Database::PostgresHll::BatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
......
......@@ -2,7 +2,7 @@
require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
RSpec.describe Gitlab::Database::PostgresHll::BatchDistinctCounter do
let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
......@@ -85,7 +85,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
stub_const('Gitlab::Database::PostgresHll::BatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
end
......@@ -94,9 +94,9 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
end
it "defaults the batch size to #{Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
it "defaults the batch size to #{Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
min_id = model.minimum(:id)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE)
batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE)
expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
......@@ -112,14 +112,14 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
end
context 'disallowed configurations' do
let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
let(:default_batch_size) { Gitlab::Database::PostgresHll::BatchDistinctCounter::DEFAULT_BATCH_SIZE }
it 'returns fallback if start is bigger than finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end
it 'returns fallback if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
large_finish = Gitlab::Database::PostgresHll::BatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end
......
......@@ -41,7 +41,7 @@ RSpec.describe Gitlab::Utils::UsageData do
let(:relation) { double(:relation) }
it 'delegates counting to counter class instance' do
expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |instance|
expect_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
......@@ -52,7 +52,7 @@ RSpec.describe Gitlab::Utils::UsageData do
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
......@@ -60,7 +60,7 @@ RSpec.describe Gitlab::Utils::UsageData do
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(error)
allow(Gitlab::Database::PostgresHll::BatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment