Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
gitlab-ce
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
gitlab-ce
Commits
93f8c757
Commit
93f8c757
authored
Nov 10, 2020
by
Mikolaj Wawrzyniak
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Remove fallback batch size reduce mechanism
parent
636fb4c9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
185 additions
and
33 deletions
+185
-33
lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
lib/gitlab/database/postgres_hll_batch_distinct_counter.rb
+53
-33
spec/lib/gitlab/database/postgres_hll_batch_distinct_counter_spec.rb
...tlab/database/postgres_hll_batch_distinct_counter_spec.rb
+132
-0
No files found.
lib/gitlab/database/postgres_hll_batch_distinct_count.rb
→
lib/gitlab/database/postgres_hll_batch_distinct_count
er
.rb
View file @
93f8c757
...
@@ -2,16 +2,31 @@
...
@@ -2,16 +2,31 @@
module
Gitlab
module
Gitlab
module
Database
module
Database
module
PostgresHllBatchDistinctCount
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
def
batch_distinct_count
(
relation
,
column
=
nil
,
batch_size:
nil
,
start:
nil
,
finish:
nil
)
# Implements a distinct batch counter based on HyperLogLog algorithm
PostgresHllBatchDistinctCounter
.
new
(
relation
,
column:
column
).
count
(
batch_size:
batch_size
,
start:
start
,
finish:
finish
)
# Needs indexes on the column below to calculate max, min and range queries
end
# For larger tables just set higher batch_size with index optimization
#
class
<<
self
# In order to avoid a potentially complex, time-consuming query when calculating min and max values,
include
PostgresHllBatchDistinctCount
# the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of
end
# relation (most cases `id` column) rather than counted attribute eg:
end
# estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id))
#
# Grouped relations are NOT supported yet.
#
# @example Usage
# ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count
# ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period))
# .estimate_distinct_count(
# batch_size: 1_000,
# start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id),
# finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id)
# )
#
# @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES the distinct count of a given attribute's values for the supplied relation.
# Like all probabilistic algorithms, it has an ERROR RATE margin that can affect the returned values.
# For the given implementation, no error rate higher than 5.3% was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation);
# in most cases the error is lower. However, if an exact value is necessary, other tools have to be used.
class
PostgresHllBatchDistinctCounter
class
PostgresHllBatchDistinctCounter
FALLBACK
=
-
1
FALLBACK
=
-
1
MIN_REQUIRED_BATCH_SIZE
=
1_250
MIN_REQUIRED_BATCH_SIZE
=
1_250
...
@@ -23,11 +38,11 @@ module Gitlab
...
@@ -23,11 +38,11 @@ module Gitlab
BIT_31_MASK
=
"B'0
#{
'1'
*
31
}
'"
BIT_31_MASK
=
"B'0
#{
'1'
*
31
}
'"
BIT_9_MASK
=
"B'
#{
'0'
*
23
}#{
'1'
*
9
}
'"
BIT_9_MASK
=
"B'
#{
'0'
*
23
}#{
'1'
*
9
}
'"
# @example source_query
#
source_query:
#
SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
#
SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
#
FROM %{relation}
#
FROM %{relation
}
#
WHERE %{pkey} >= %{batch_start
}
#
WHERE %{pkey} >= %{batch_start}
AND %{pkey} < %{batch_end}
#
AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
# AND %{column} IS NOT NULL
BUCKETED_DATA_SQL
=
<<~
SQL
BUCKETED_DATA_SQL
=
<<~
SQL
WITH hashed_attributes AS (%{source_query})
WITH hashed_attributes AS (%{source_query})
...
@@ -37,10 +52,11 @@ module Gitlab
...
@@ -37,10 +52,11 @@ module Gitlab
GROUP BY 1 ORDER BY 1
GROUP BY 1 ORDER BY 1
SQL
SQL
def
initialize
(
relation
,
column:
nil
,
operation_args:
nil
)
TOTAL_BUCKETS_NUMBER
=
512
def
initialize
(
relation
,
column
=
nil
)
@relation
=
relation
@relation
=
relation
@column
=
column
||
relation
.
primary_key
@column
=
column
||
relation
.
primary_key
@operation_args
=
operation_args
end
end
def
unwanted_configuration?
(
finish
,
batch_size
,
start
)
def
unwanted_configuration?
(
finish
,
batch_size
,
start
)
...
@@ -49,7 +65,7 @@ module Gitlab
...
@@ -49,7 +65,7 @@ module Gitlab
start
>
finish
start
>
finish
end
end
def
count
(
batch_size:
nil
,
start:
nil
,
finish:
nil
)
def
estimate_distinct_
count
(
batch_size:
nil
,
start:
nil
,
finish:
nil
)
raise
'BatchCount can not be run inside a transaction'
if
ActiveRecord
::
Base
.
connection
.
transaction_open?
raise
'BatchCount can not be run inside a transaction'
if
ActiveRecord
::
Base
.
connection
.
transaction_open?
batch_size
||=
DEFAULT_BATCH_SIZE
batch_size
||=
DEFAULT_BATCH_SIZE
...
@@ -67,13 +83,6 @@ module Gitlab
...
@@ -67,13 +83,6 @@ module Gitlab
begin
begin
hll_blob
.
merge!
(
hll_blob_for_batch
(
batch_start
,
batch_start
+
batch_size
))
{
|
_key
,
old
,
new
|
new
>
old
?
new
:
old
}
hll_blob
.
merge!
(
hll_blob_for_batch
(
batch_start
,
batch_start
+
batch_size
))
{
|
_key
,
old
,
new
|
new
>
old
?
new
:
old
}
batch_start
+=
batch_size
batch_start
+=
batch_size
rescue
ActiveRecord
::
QueryCanceled
# retry with a safe batch size & warmer cache
if
batch_size
>=
2
*
MIN_REQUIRED_BATCH_SIZE
batch_size
/=
2
else
return
FALLBACK
end
end
end
sleep
(
SLEEP_TIME_IN_SECONDS
)
sleep
(
SLEEP_TIME_IN_SECONDS
)
end
end
...
@@ -83,17 +92,21 @@ module Gitlab
...
@@ -83,17 +92,21 @@ module Gitlab
private
private
# The arbitrary values present in #estimate_cardinality
# are sourced from the https://www.sisense.com/blog/hyperloglog-in-pure-sql/
# article; they do not represent any entity and serve as tuning values
# for the whole equation
def
estimate_cardinality
(
hll_blob
)
def
estimate_cardinality
(
hll_blob
)
num_zero_buckets
=
512
-
hll_blob
.
size
num_zero_buckets
=
TOTAL_BUCKETS_NUMBER
-
hll_blob
.
size
num_uniques
=
(
num_uniques
=
(
((
512
**
2
)
*
(
0.7213
/
(
1
+
1.079
/
512
)))
/
((
TOTAL_BUCKETS_NUMBER
**
2
)
*
(
0.7213
/
(
1
+
1.079
/
TOTAL_BUCKETS_NUMBER
)))
/
(
num_zero_buckets
+
hll_blob
.
values
.
sum
{
|
bucket_hash
,
_
|
2
**
(
-
1
*
bucket_hash
)}
)
(
num_zero_buckets
+
hll_blob
.
values
.
sum
{
|
bucket_hash
,
_
|
2
**
(
-
1
*
bucket_hash
)}
)
).
to_i
).
to_i
if
num_zero_buckets
>
0
&&
num_uniques
<
2.5
*
512
if
num_zero_buckets
>
0
&&
num_uniques
<
2.5
*
TOTAL_BUCKETS_NUMBER
((
0.7213
/
(
1
+
1.079
/
512
))
*
(
512
*
((
0.7213
/
(
1
+
1.079
/
TOTAL_BUCKETS_NUMBER
))
*
(
TOTAL_BUCKETS_NUMBER
*
Math
.
log2
(
512.0
/
num_zero_buckets
)))
Math
.
log2
(
TOTAL_BUCKETS_NUMBER
.
to_f
/
num_zero_buckets
)))
else
else
num_uniques
num_uniques
end
end
...
@@ -107,10 +120,17 @@ module Gitlab
...
@@ -107,10 +120,17 @@ module Gitlab
.
to_h
.
to_h
end
end
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# Generate the source query SQL snippet for the provided id range
# FROM %{relation}
#
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# @example SQL query template
# SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
# FROM %{relation}
# WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
# AND %{column} IS NOT NULL
# AND %{column} IS NOT NULL
#
# @param start initial id range
# @param finish final id range
# @return [String] SQL query fragment
def
source_query
(
start
,
finish
)
def
source_query
(
start
,
finish
)
col_as_arel
=
@column
.
is_a?
(
Arel
::
Attributes
::
Attribute
)
?
@column
:
Arel
.
sql
(
@column
.
to_s
)
col_as_arel
=
@column
.
is_a?
(
Arel
::
Attributes
::
Attribute
)
?
@column
:
Arel
.
sql
(
@column
.
to_s
)
col_as_text
=
Arel
::
Nodes
::
NamedFunction
.
new
(
'CAST'
,
[
col_as_arel
.
as
(
'text'
)])
col_as_text
=
Arel
::
Nodes
::
NamedFunction
.
new
(
'CAST'
,
[
col_as_arel
.
as
(
'text'
)])
...
...
spec/lib/gitlab/database/postgres_hll_batch_distinct_count_spec.rb
→
spec/lib/gitlab/database/postgres_hll_batch_distinct_count
er
_spec.rb
View file @
93f8c757
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
require
'spec_helper'
require
'spec_helper'
RSpec
.
describe
Gitlab
::
Database
::
PostgresHllBatchDistinctCount
do
RSpec
.
describe
Gitlab
::
Database
::
PostgresHllBatchDistinctCount
er
do
let_it_be
(
:error_rate
)
{
4.9
}
# HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be
(
:error_rate
)
{
4.9
}
# HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be
(
:fallback
)
{
::
Gitlab
::
Database
::
BatchCounter
::
FALLBACK
}
let_it_be
(
:fallback
)
{
::
Gitlab
::
Database
::
BatchCounter
::
FALLBACK
}
let_it_be
(
:small_batch_size
)
{
calculate_batch_size
(
::
Gitlab
::
Database
::
BatchCounter
::
MIN_REQUIRED_BATCH_SIZE
)
}
let_it_be
(
:small_batch_size
)
{
calculate_batch_size
(
::
Gitlab
::
Database
::
BatchCounter
::
MIN_REQUIRED_BATCH_SIZE
)
}
...
@@ -35,7 +35,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
...
@@ -35,7 +35,7 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
end
end
it
'counts table'
do
it
'counts table'
do
expect
(
described_class
.
batch_distinct_count
(
model
)
).
to
be_within
(
error_rate
).
percent_of
(
10
)
expect
(
described_class
.
new
(
model
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
10
)
end
end
end
end
end
end
...
@@ -47,86 +47,51 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
...
@@ -47,86 +47,51 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
create_list
(
:issue
,
2
,
author:
another_user
)
create_list
(
:issue
,
2
,
author:
another_user
)
end
end
shared_examples
'disallowed configurations'
do
|
method
|
describe
'#estimate_distinct_count'
do
it
'returns fallback if start is bigger than finish'
do
expect
(
described_class
.
public_send
(
method
,
*
args
,
start:
1
,
finish:
0
)).
to
eq
(
fallback
)
end
it
'returns fallback if loops more than allowed'
do
large_finish
=
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
MAX_ALLOWED_LOOPS
*
default_batch_size
+
1
expect
(
described_class
.
public_send
(
method
,
*
args
,
start:
1
,
finish:
large_finish
)).
to
eq
(
fallback
)
end
it
'returns fallback if batch size is less than min required'
do
expect
(
described_class
.
public_send
(
method
,
*
args
,
batch_size:
small_batch_size
)).
to
eq
(
fallback
)
end
end
shared_examples
'when a transaction is open'
do
let
(
:in_transaction
)
{
true
}
it
'raises an error'
do
expect
{
subject
}.
to
raise_error
(
'BatchCount can not be run inside a transaction'
)
end
end
shared_examples
'when batch fetch query is canceled'
do
let
(
:batch_size
)
{
22_000
}
it
'reduces batch size by half and retry fetch'
do
allow
(
model
).
to
receive
(
:where
).
with
(
"id"
=>
0
..
calculate_batch_size
(
batch_size
)).
and_raise
(
ActiveRecord
::
QueryCanceled
)
expect
(
model
).
to
receive
(
:where
).
with
(
"id"
=>
0
..
calculate_batch_size
(
batch_size
/
2
)).
and_call_original
subject
.
call
(
model
,
column
,
batch_size:
batch_size
,
start:
0
)
end
end
describe
'#batch_distinct_count'
do
it
'counts table'
do
it
'counts table'
do
expect
(
described_class
.
batch_distinct_count
(
model
)
).
to
be_within
(
error_rate
).
percent_of
(
5
)
expect
(
described_class
.
new
(
model
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
5
)
end
end
it
'counts with column field'
do
it
'counts with column field'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
column
)
).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
'counts with :id field'
do
it
'counts with :id field'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
:id
)
).
to
be_within
(
error_rate
).
percent_of
(
5
)
expect
(
described_class
.
new
(
model
,
:id
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
5
)
end
end
it
'counts with "id" field'
do
it
'counts with "id" field'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
"id"
)
).
to
be_within
(
error_rate
).
percent_of
(
5
)
expect
(
described_class
.
new
(
model
,
"id"
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
5
)
end
end
it
'counts with table.column field'
do
it
'counts with table.column field'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
"
#{
model
.
table_name
}
.
#{
column
}
"
)
).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
,
"
#{
model
.
table_name
}
.
#{
column
}
"
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
'counts with Arel column'
do
it
'counts with Arel column'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
model
.
arel_table
[
column
])
).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
,
model
.
arel_table
[
column
]).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
'counts over joined relations'
do
it
'counts over joined relations'
do
expect
(
described_class
.
batch_distinct_count
(
model
.
joins
(
:author
),
"users.email"
)
).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
.
joins
(
:author
),
"users.email"
).
estimate_distinct_count
).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
'counts with :column field with batch_size of 50K'
do
it
'counts with :column field with batch_size of 50K'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
column
,
batch_size:
50_000
)).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
batch_size:
50_000
)).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
'will not count table with a batch size less than allowed'
do
it
'will not count table with a batch size less than allowed'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
column
,
batch_size:
small_batch_size
)).
to
eq
(
fallback
)
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
batch_size:
small_batch_size
)).
to
eq
(
fallback
)
end
end
it
'counts with different number of batches and aggregates total result'
do
it
'counts with different number of batches and aggregates total result'
do
stub_const
(
'Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE'
,
0
)
stub_const
(
'Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE'
,
0
)
[
1
,
2
,
4
,
5
,
6
].
each
{
|
i
|
expect
(
described_class
.
batch_distinct_count
(
model
,
batch_size:
i
)).
to
be_within
(
error_rate
).
percent_of
(
5
)
}
[
1
,
2
,
4
,
5
,
6
].
each
{
|
i
|
expect
(
described_class
.
new
(
model
).
estimate_distinct_count
(
batch_size:
i
)).
to
be_within
(
error_rate
).
percent_of
(
5
)
}
end
end
it
'counts with a start and finish'
do
it
'counts with a start and finish'
do
expect
(
described_class
.
batch_distinct_count
(
model
,
column
,
start:
model
.
minimum
(
:id
),
finish:
model
.
maximum
(
:id
))).
to
be_within
(
error_rate
).
percent_of
(
2
)
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
start:
model
.
minimum
(
:id
),
finish:
model
.
maximum
(
:id
))).
to
be_within
(
error_rate
).
percent_of
(
2
)
end
end
it
"defaults the batch size to
#{
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
DEFAULT_BATCH_SIZE
}
"
do
it
"defaults the batch size to
#{
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
DEFAULT_BATCH_SIZE
}
"
do
...
@@ -135,27 +100,32 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
...
@@ -135,27 +100,32 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
expect
(
model
).
to
receive
(
:where
).
with
(
"id"
=>
min_id
..
batch_end_id
).
and_call_original
expect
(
model
).
to
receive
(
:where
).
with
(
"id"
=>
min_id
..
batch_end_id
).
and_call_original
described_class
.
batch_distinct_count
(
model
)
described_class
.
new
(
model
).
estimate_distinct_count
end
end
it_behaves_like
'when a transaction is open'
do
context
'when a transaction is open'
do
subject
{
described_class
.
batch_distinct_count
(
model
,
column
)
}
let
(
:in_transaction
)
{
true
}
it
'raises an error'
do
expect
{
described_class
.
new
(
model
,
column
).
estimate_distinct_count
}.
to
raise_error
(
'BatchCount can not be run inside a transaction'
)
end
end
end
context
'disallowed configurations'
do
context
'disallowed configurations'
do
include_examples
'disallowed configurations'
,
:batch_distinct_count
do
let
(
:default_batch_size
)
{
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
DEFAULT_BATCH_SIZE
}
let
(
:args
)
{
[
model
,
column
]
}
let
(
:default_batch_size
)
{
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
DEFAULT_BATCH_SIZE
}
it
'returns fallback if start is bigger than finish'
do
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
start:
1
,
finish:
0
)).
to
eq
(
fallback
)
end
end
end
it_behaves_like
'when batch fetch query is canceled'
do
it
'returns fallback if loops more than allowed'
do
let
(
:mode
)
{
:distinct
}
large_finish
=
Gitlab
::
Database
::
PostgresHllBatchDistinctCounter
::
MAX_ALLOWED_LOOPS
*
default_batch_size
+
1
let
(
:operation
)
{
:count
}
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
start:
1
,
finish:
large_finish
)).
to
eq
(
fallback
)
let
(
:operation_args
)
{
nil
}
end
let
(
:column
)
{
nil
}
subject
{
described_class
.
method
(
:batch_distinct_count
)
}
it
'returns fallback if batch size is less than min required'
do
expect
(
described_class
.
new
(
model
,
column
).
estimate_distinct_count
(
batch_size:
small_batch_size
)).
to
eq
(
fallback
)
end
end
end
end
end
end
end
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment