Commit 93f8c757 authored by Mikolaj Wawrzyniak

Remove fallback batch size reduction mechanism

parent 636fb4c9
@@ -2,16 +2,31 @@
module Gitlab
module Database
module PostgresHllBatchDistinctCount
def batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
PostgresHllBatchDistinctCounter.new(relation, column: column).count(batch_size: batch_size, start: start, finish: finish)
end
class << self
include PostgresHllBatchDistinctCount
end
end
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# Implements a distinct batch counter based on the HyperLogLog algorithm
# Needs indexes on the column below to calculate max, min and range queries
# For larger tables, just set a higher batch_size with index optimization
#
# In order to avoid a possibly complex and time-consuming query when calculating min and max values,
# start and finish can be passed explicitly. They should contain the min and max values of the PRIMARY KEY of
# the relation (in most cases the `id` column) rather than of the counted attribute, eg:
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute value for the supplied relation.
#   Like all probabilistic algorithms it has an ERROR RATE margin that can affect the reported values;
#   for this implementation no error higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation),
#   and in most cases the value is lower. However, if an exact value is necessary, other tools have to be used.
class PostgresHllBatchDistinctCounter
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 1_250
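# FALLBACK (-1) is returned instead of an estimate whenever the counter is asked to run with a
# disallowed configuration (see #unwanted_configuration?); MIN_REQUIRED_BATCH_SIZE is the smallest
# batch size accepted before falling back.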
@@ -23,11 +38,11 @@ module Gitlab
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
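# Bit-string masks applied to the 32-bit hashes produced by source_query: BIT_9_MASK keeps the
# lowest 9 bits (2^9 = 512 values, matching TOTAL_BUCKETS_NUMBER below) and BIT_31_MASK keeps the
# lower 31 bits, dropping the top bit; BUCKETED_DATA_SQL uses them to assign each hash to a bucket
# and to derive the per-bucket value consumed by #estimate_cardinality.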
# source_query:
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# @example source_query
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start}
# AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL = <<~SQL
WITH hashed_attributes AS (%{source_query})
@@ -37,10 +52,11 @@ module Gitlab
GROUP BY 1 ORDER BY 1
SQL
def initialize(relation, column: nil, operation_args: nil)
TOTAL_BUCKETS_NUMBER = 512
def initialize(relation, column = nil)
@relation = relation
@column = column || relation.primary_key
@operation_args = operation_args
end
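# A configuration is considered unwanted when the batch size is below MIN_REQUIRED_BATCH_SIZE,
# when the start..finish range would require more than MAX_ALLOWED_LOOPS batches, or when start is
# greater than finish; for any of these, #estimate_distinct_count returns FALLBACK instead of an
# estimate (as exercised by the specs further down).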
def unwanted_configuration?(finish, batch_size, start)
@@ -49,7 +65,7 @@ module Gitlab
start > finish
end
def count(batch_size: nil, start: nil, finish: nil)
def estimate_distinct_count(batch_size: nil, start: nil, finish: nil)
raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?
batch_size ||= DEFAULT_BATCH_SIZE
@@ -67,13 +83,6 @@ module Gitlab
begin
hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old }
batch_start += batch_size
rescue ActiveRecord::QueryCanceled
# retry with a safe batch size & warmer cache
if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
batch_size /= 2
else
return FALLBACK
end
end
sleep(SLEEP_TIME_IN_SECONDS)
end
@@ -83,17 +92,21 @@ module Gitlab
private
# The arbitrary values present in #estimate_cardinality are sourced from the
# https://www.sisense.com/blog/hyperloglog-in-pure-sql/ article;
# they do not represent any entity and serve as tuning values for the whole equation.
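# In HyperLogLog terms, the method below computes, for m = TOTAL_BUCKETS_NUMBER buckets, the raw
# estimate E = alpha_m * m^2 / (V + sum(2^(-bucket_hash))), where V is the number of empty buckets
# and alpha_m = 0.7213 / (1 + 1.079 / m). When V > 0 and E < 2.5 * m, it switches to the
# small-range correction alpha_m * m * log2(m / V), mirroring the article above.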
def estimate_cardinality(hll_blob)
num_zero_buckets = 512 - hll_blob.size
num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
num_uniques = (
((512**2) * (0.7213 / (1 + 1.079 / 512))) /
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * 512
((0.7213 / (1 + 1.079 / 512)) * (512 *
Math.log2(512.0 / num_zero_buckets)))
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER *
Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets)))
else
num_uniques
end
@@ -107,10 +120,17 @@ module Gitlab
.to_h
end
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# Generate the source query SQL snippet for the provided id range
#
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
#
# @param start initial value of the id range
# @param finish final value of the id range
# @return [String] SQL query fragment
def source_query(start, finish)
col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
......
@@ -2,7 +2,7 @@
require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
let_it_be(:error_rate) { 4.9 } # HyperLogLog is a probabilistic algorithm, which provides estimates within a given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(::Gitlab::Database::BatchCounter::MIN_REQUIRED_BATCH_SIZE) }
@@ -35,7 +35,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
end
it 'counts table' do
expect(described_class.batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(10)
end
end
end
@@ -47,86 +47,51 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
create_list(:issue, 2, author: another_user)
end
shared_examples 'disallowed configurations' do |method|
it 'returns fallback if start is bigger than finish' do
expect(described_class.public_send(method, *args, start: 1, finish: 0)).to eq(fallback)
end
it 'returns fallback if loops more than allowed' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1
expect(described_class.public_send(method, *args, start: 1, finish: large_finish)).to eq(fallback)
end
it 'returns fallback if batch size is less than min required' do
expect(described_class.public_send(method, *args, batch_size: small_batch_size)).to eq(fallback)
end
end
shared_examples 'when a transaction is open' do
let(:in_transaction) { true }
it 'raises an error' do
expect { subject }.to raise_error('BatchCount can not be run inside a transaction')
end
end
shared_examples 'when batch fetch query is canceled' do
let(:batch_size) { 22_000 }
it 'reduces batch size by half and retry fetch' do
allow(model).to receive(:where).with("id" => 0..calculate_batch_size(batch_size)).and_raise(ActiveRecord::QueryCanceled)
expect(model).to receive(:where).with("id" => 0..calculate_batch_size(batch_size / 2)).and_call_original
subject.call(model, column, batch_size: batch_size, start: 0)
end
end
describe '#batch_distinct_count' do
describe '#estimate_distinct_count' do
it 'counts table' do
expect(described_class.batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
expect(described_class.new(model).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with column field' do
expect(described_class.batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
expect(described_class.new(model, column).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :id field' do
expect(described_class.batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
expect(described_class.new(model, :id).estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with "id" field' do
expect(described_class.batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
expect(described_class.new(model, "id").estimate_distinct_count).to be_within(error_rate).percent_of(5)
end
it 'counts with table.column field' do
expect(described_class.batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
expect(described_class.new(model, "#{model.table_name}.#{column}").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with Arel column' do
expect(described_class.batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
expect(described_class.new(model, model.arel_table[column]).estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts over joined relations' do
expect(described_class.batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
expect(described_class.new(model.joins(:author), "users.email").estimate_distinct_count).to be_within(error_rate).percent_of(2)
end
it 'counts with :column field with batch_size of 50K' do
expect(described_class.batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
expect(described_class.new(model, column).estimate_distinct_count(batch_size: 50_000)).to be_within(error_rate).percent_of(2)
end
it 'will not count table with a batch size less than allowed' do
expect(described_class.batch_distinct_count(model, column, batch_size: small_batch_size)).to eq(fallback)
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
end
it 'counts with different number of batches and aggregates total result' do
stub_const('Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)
[1, 2, 4, 5, 6].each { |i| expect(described_class.batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
[1, 2, 4, 5, 6].each { |i| expect(described_class.new(model).estimate_distinct_count(batch_size: i)).to be_within(error_rate).percent_of(5) }
end
it 'counts with a start and finish' do
expect(described_class.batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
expect(described_class.new(model, column).estimate_distinct_count(start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
end
it "defaults the batch size to #{Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
@@ -135,27 +100,32 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original
described_class.batch_distinct_count(model)
described_class.new(model).estimate_distinct_count
end
it_behaves_like 'when a transaction is open' do
subject { described_class.batch_distinct_count(model, column) }
context 'when a transaction is open' do
let(:in_transaction) { true }
it 'raises an error' do
expect { described_class.new(model, column).estimate_distinct_count }.to raise_error('BatchCount can not be run inside a transaction')
end
end
context 'disallowed configurations' do
include_examples 'disallowed configurations', :batch_distinct_count do
let(:args) { [model, column] }
let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
it 'returns fallback if start is bigger than finish' do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end
end
it_behaves_like 'when batch fetch query is canceled' do
let(:mode) { :distinct }
let(:operation) { :count }
let(:operation_args) { nil }
let(:column) { nil }
it 'returns fallback if loops more than allowed' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end
subject { described_class.method(:batch_distinct_count) }
it 'returns fallback if batch size is less than min required' do
expect(described_class.new(model, column).estimate_distinct_count(batch_size: small_batch_size)).to eq(fallback)
end
end
end
end
......