Commit 83f0798e authored by Stan Hu's avatar Stan Hu

Merge branch 'ab-approximate-counts' into 'master'

Approximate table counts based on TABLESAMPLE

See merge request gitlab-org/gitlab-ce!22650
parents eeb0e98d 9c059a02
---
title: Approximate counting strategy with TABLESAMPLE.
merge_request: 22650
author:
type: performance
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
# We can optimize this by using various strategies for approximate counting.
#
# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
#
# However, since statistics are not always up to date, we also implement a table sampling strategy
# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
module Gitlab
module Database
module Count
......@@ -20,68 +25,30 @@ module Gitlab
end
# Takes in an array of models and returns a Hash for the approximate
# counts for them. If the model's table has not been vacuumed or
# analyzed recently, simply run the Model.count to get the data.
# counts for them.
#
# Various count strategies can be specified that are executed in
# sequence until all tables have an approximate count attached
# or we run out of strategies.
#
# Note that not all strategies are available on all supported RDBMS.
#
# @param [Array]
# @return [Hash] of Model -> count mapping
def self.approximate_counts(models)
table_to_model_map = models.each_with_object({}) do |model, hash|
hash[model.table_name] = model
end
def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
strategies.each_with_object({}) do |strategy, counts_by_model|
if strategy.enabled?
models_with_missing_counts = models - counts_by_model.keys
table_names = table_to_model_map.keys
counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
break if models_with_missing_counts.empty?
# Convert table -> count to Model -> count
counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
model = table_to_model_map[pair.first]
hash[model] = pair.second
end
counts = strategy.new(models_with_missing_counts).count
missing_tables = table_names - counts_by_table_name.keys
missing_tables.each do |table|
model = table_to_model_map[table]
counts_by_model[model] = model.count
counts.each do |model, count|
counts_by_model[model] = count
end
counts_by_model
end
# Returns a hash of the table names that have recently updated tuples.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def self.reltuples_from_recently_updated(table_names)
query = postgresql_estimate_query(table_names)
rows = []
# Querying tuple stats only works on the primary. Due to load
# balancing, we need to ensure this query hits the load balancer. The
# easiest way to do this is to start a transaction.
ActiveRecord::Base.transaction do
rows = ActiveRecord::Base.connection.select_all(query)
end
rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
rescue *CONNECTION_ERRORS
{}
end
# Generates the PostgreSQL query to return the tuples for tables
# that have been vacuumed or analyzed in the last hour.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def self.postgresql_estimate_query(table_names)
time = "to_timestamp(#{1.hour.ago.to_i})"
<<~SQL
SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
SQL
end
end
end
......
# frozen_string_literal: true
module Gitlab
module Database
module Count
# This strategy performs an exact count on the model.
#
# This is guaranteed to be accurate, however it also scans the
# whole table. Hence, there are no guarantees with respect
# to runtime.
#
# Note that for very large tables, this may even timeout.
class ExactCountStrategy
attr_reader :models
def initialize(models)
@models = models
end
def count
models.each_with_object({}) do |model, data|
data[model] = model.count
end
end
def self.enabled?
true
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
module Count
class PgClass < ActiveRecord::Base
self.table_name = 'pg_class'
end
# This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
#
# Specifically, it relies on the column reltuples in said table. An additional
# check is performed to make sure statistics were updated within the last hour.
#
# Otherwise, this strategy skips tables with outdated statistics.
#
# There are no guarantees with respect to the accuracy of this strategy. Runtime
# however is guaranteed to be "fast", because it only looks up statistics.
class ReltuplesCountStrategy
attr_reader :models
def initialize(models)
@models = models
end
# Returns a hash of the table names that have recently updated tuples.
#
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def count
size_estimates
rescue *CONNECTION_ERRORS
{}
end
def self.enabled?
Gitlab::Database.postgresql?
end
private
def table_names
models.map(&:table_name)
end
def size_estimates(check_statistics: true)
table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
# Querying tuple stats only works on the primary. Due to load balancing, the
# easiest way to do this is to start a transaction.
ActiveRecord::Base.transaction do
get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
model = table_to_model[row.table_name]
data[model] = row.estimate
end
end
end
# Generates the PostgreSQL query to return the tuples for tables
# that have been vacuumed or analyzed in the last hour.
#
# @param [Array] table names
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
def get_statistics(table_names, check_statistics: true)
time = 1.hour.ago
query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
.where(relname: table_names)
.select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
if check_statistics
query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
time, time, time, time)
end
query
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Database
module Count
# A tablesample count executes in two phases:
# * Estimate table sizes based on reltuples.
# * Based on the estimate:
# * If the table is considered 'small', execute an exact relation count.
# * Otherwise, count on a sample of the table using TABLESAMPLE.
#
# The size of the sample is chosen in a way that we always roughly scan
# the same amount of rows (see TABLESAMPLE_ROW_TARGET).
#
# There are no guarantees with respect to the accuracy of the result or runtime.
class TablesampleCountStrategy < ReltuplesCountStrategy
EXACT_COUNT_THRESHOLD = 10_000
TABLESAMPLE_ROW_TARGET = 10_000
def count
estimates = size_estimates(check_statistics: false)
models.each_with_object({}) do |model, count_by_model|
count = perform_count(model, estimates[model])
count_by_model[model] = count if count
end
rescue *CONNECTION_ERRORS
{}
end
def self.enabled?
Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
end
private
def perform_count(model, estimate)
# If we estimate 0, we may not have statistics at all. Don't use them.
return nil unless estimate && estimate > 0
if estimate < EXACT_COUNT_THRESHOLD
# The table is considered small, the assumption here is that
# the exact count will be fast anyways.
model.count
else
# The table is considered large, let's only count on a sample.
tablesample_count(model, estimate)
end
end
def tablesample_count(model, estimate)
portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
inverse = 1 / portion
query = <<~SQL
SELECT (COUNT(*)*#{inverse})::integer AS count
FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
SQL
rows = ActiveRecord::Base.connection.select_all(query)
Integer(rows.first['count'])
end
end
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::ExactCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
subject { described_class.new(models).count }
describe '#count' do
it 'counts all models' do
expect(models).to all(receive(:count).and_call_original)
expect(subject).to eq({ Project => 3, Identity => 1 })
end
end
describe '.enabled?' do
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is enabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_truthy
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::ReltuplesCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
subject { described_class.new(models).count }
describe '#count', :postgresql do
context 'when reltuples is up to date' do
before do
ActiveRecord::Base.connection.execute('ANALYZE projects')
ActiveRecord::Base.connection.execute('ANALYZE identities')
end
it 'uses statistics to do the count' do
models.each { |model| expect(model).not_to receive(:count) }
expect(subject).to eq({ Project => 3, Identity => 1 })
end
end
context 'insufficient permissions' do
it 'returns an empty hash' do
allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
expect(subject).to eq({})
end
end
end
describe '.enabled?' do
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is disabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_falsey
end
end
end
require 'spec_helper'
describe Gitlab::Database::Count::TablesampleCountStrategy do
before do
create_list(:project, 3)
create(:identity)
end
let(:models) { [Project, Identity] }
let(:strategy) { described_class.new(models) }
subject { strategy.count }
describe '#count', :postgresql do
let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
before do
allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
end
context 'for tables with an estimated small size' do
it 'performs an exact count' do
expect(Identity).to receive(:count).and_call_original
expect(subject).to include({ Identity => 1 })
end
end
context 'for tables with an estimated large size' do
it 'performs a tablesample count' do
expect(Project).not_to receive(:count)
result = subject
expect(result[Project]).to eq(3)
end
end
context 'insufficient permissions' do
it 'returns an empty hash' do
allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
expect(subject).to eq({})
end
end
end
describe '.enabled?' do
before do
stub_feature_flags(tablesample_counts: true)
end
it 'is enabled for PostgreSQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
expect(described_class.enabled?).to be_truthy
end
it 'is disabled for MySQL' do
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
expect(described_class.enabled?).to be_falsey
end
end
end
......@@ -8,63 +8,51 @@ describe Gitlab::Database::Count do
let(:models) { [Project, Identity] }
describe '.approximate_counts' do
context 'with MySQL' do
context 'when reltuples have not been updated' do
it 'counts all models the normal way' do
expect(Gitlab::Database).to receive(:postgresql?).and_return(false)
context '.approximate_counts' do
context 'selecting strategies' do
let(:strategies) { [double('s1', enabled?: true), double('s2', enabled?: false)] }
expect(Project).to receive(:count).and_call_original
expect(Identity).to receive(:count).and_call_original
it 'uses only enabled strategies' do
expect(strategies[0]).to receive(:new).and_return(double('strategy1', count: {}))
expect(strategies[1]).not_to receive(:new)
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
described_class.approximate_counts(models, strategies: strategies)
end
end
context 'with PostgreSQL', :postgresql do
describe 'when reltuples have not been updated' do
it 'counts all models the normal way' do
expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({})
context 'fallbacks' do
subject { described_class.approximate_counts(models, strategies: strategies) }
expect(Project).to receive(:count).and_call_original
expect(Identity).to receive(:count).and_call_original
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
let(:strategies) do
[
double('s1', enabled?: true, new: first_strategy),
double('s2', enabled?: true, new: second_strategy)
]
end
describe 'no permission' do
it 'falls back to standard query' do
allow(described_class).to receive(:postgresql_estimate_query).and_raise(PG::InsufficientPrivilege)
expect(Project).to receive(:count).and_call_original
expect(Identity).to receive(:count).and_call_original
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
end
let(:first_strategy) { double('first strategy', count: {}) }
let(:second_strategy) { double('second strategy', count: {}) }
describe 'when some reltuples have been updated' do
it 'counts projects in the fast way' do
expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({ 'projects' => 3 })
it 'gets results from first strategy' do
expect(strategies[0]).to receive(:new).with(models).and_return(first_strategy)
expect(first_strategy).to receive(:count)
expect(Project).not_to receive(:count).and_call_original
expect(Identity).to receive(:count).and_call_original
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
subject
end
describe 'when all reltuples have been updated' do
before do
ActiveRecord::Base.connection.execute('ANALYZE projects')
ActiveRecord::Base.connection.execute('ANALYZE identities')
it 'gets more results from second strategy if some counts are missing' do
expect(first_strategy).to receive(:count).and_return({ Project => 3 })
expect(strategies[1]).to receive(:new).with([Identity]).and_return(second_strategy)
expect(second_strategy).to receive(:count).and_return({ Identity => 1 })
expect(subject).to eq({ Project => 3, Identity => 1 })
end
it 'counts models with the standard way' do
expect(Project).not_to receive(:count)
expect(Identity).not_to receive(:count)
it 'does not get more results as soon as all counts are present' do
expect(first_strategy).to receive(:count).and_return({ Project => 3, Identity => 1 })
expect(strategies[1]).not_to receive(:new)
expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
end
subject
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment