Commit e105810c authored by Adam Hegyi's avatar Adam Hegyi

Remove LooseIndexScanDistinctCount class

This change removes the LooseIndexScanDistinctCount class and the
loose_index_scan_for_distinct_values feature flag.

Related issue:
https://gitlab.com/gitlab-org/quality/triage-reports/-/issues/5833

Changelog: removed
parent 55d94517
---
name: loose_index_scan_for_distinct_values
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/55985
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/324210
milestone: '13.10'
type: development
group: group::optimize
default_enabled: false
......@@ -52,12 +52,7 @@ module Gitlab
batch_end = [batch_start + batch_size, finish].min
batch_relation = build_relation_batch(batch_start, batch_end, mode)
op_args = @operation_args
if @operation == :count && @operation_args.blank? && use_loose_index_scan_for_distinct_values?(mode)
op_args = [Gitlab::Database::LooseIndexScanDistinctCount::COLUMN_ALIAS]
end
results = merge_results(results, batch_relation.send(@operation, *op_args)) # rubocop:disable GitlabSecurity/PublicSend
results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend
batch_start = batch_end
rescue ActiveRecord::QueryCanceled => error
# retry with a safe batch size & warmer cache
......@@ -67,18 +62,6 @@ module Gitlab
log_canceled_batch_fetch(batch_start, mode, batch_relation.to_sql, error)
return FALLBACK
end
rescue Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError => error
Gitlab::AppJsonLogger
.error(
event: 'batch_count',
relation: @relation.table_name,
operation: @operation,
operation_args: @operation_args,
mode: mode,
message: "LooseIndexScanDistinctCount column error: #{error.message}"
)
return FALLBACK
end
sleep(SLEEP_TIME_IN_SECONDS)
......@@ -104,11 +87,7 @@ module Gitlab
private
def build_relation_batch(start, finish, mode)
if use_loose_index_scan_for_distinct_values?(mode)
Gitlab::Database::LooseIndexScanDistinctCount.new(@relation, @column).build_query(from: start, to: finish)
else
@relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
end
@relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
end
def batch_size_for_mode_and_operation(mode, operation)
......@@ -151,10 +130,6 @@ module Gitlab
)
end
def use_loose_index_scan_for_distinct_values?(mode)
Feature.enabled?(:loose_index_scan_for_distinct_values) && not_group_by_query? && mode == :distinct
end
def not_group_by_query?
!@relation.is_a?(ActiveRecord::Relation) || @relation.group_values.blank?
end
......
# frozen_string_literal: true
module Gitlab
module Database
# This class builds efficient batched distinct query by using loose index scan.
# Consider the following example:
# > Issue.distinct(:project_id).where(project_id: (1...100)).count
#
# Note: there is an index on project_id
#
# This query will read each element in the index matching the project_id filter.
# If for a project_id has 100_000 issues, all 100_000 elements will be read.
#
# A loose index scan will only read one entry from the index for each project_id to reduce the number of disk reads.
#
# Usage:
#
# Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).count(from: 1, to: 100)
#
# The query will return the number of distinct projects_ids between 1 and 100
#
# Getting the Arel query:
#
# Gitlab::Database::LooseIndexScanDisctinctCount.new(Issue, :project_id).build_query(from: 1, to: 100)
class LooseIndexScanDistinctCount
COLUMN_ALIAS = 'distinct_count_column'
ColumnConfigurationError = Class.new(StandardError)
def initialize(scope, column)
if scope.is_a?(ActiveRecord::Relation)
@scope = scope
@model = scope.model
else
@scope = scope.where({})
@model = scope
end
@column = transform_column(column)
end
def count(from:, to:)
build_query(from: from, to: to).count(COLUMN_ALIAS)
end
def build_query(from:, to:) # rubocop:disable Metrics/AbcSize
cte = Gitlab::SQL::RecursiveCTE.new(:counter_cte, union_args: { remove_order: false })
table = model.arel_table
cte << @scope
.dup
.select(column.as(COLUMN_ALIAS))
.where(column.gteq(from))
.where(column.lt(to))
.order(column)
.limit(1)
inner_query = @scope
.dup
.where(column.gt(cte.table[COLUMN_ALIAS]))
.where(column.lt(to))
.select(column.as(COLUMN_ALIAS))
.order(column)
.limit(1)
cte << cte.table
.project(Arel::Nodes::Grouping.new(Arel.sql(inner_query.to_sql)).as(COLUMN_ALIAS))
.where(cte.table[COLUMN_ALIAS].lt(to))
model
.with
.recursive(cte.to_arel)
.from(cte.alias_to(table))
.unscope(where: :source_type)
.unscope(where: model.inheritance_column) # Remove STI query, not needed here
end
private
attr_reader :column, :model
# Transforms the column so it can be used in Arel expressions
#
# 'table.column' => 'table.column'
# 'column' => 'table_name.column'
# :column => 'table_name.column'
# Arel::Attributes::Attribute => name of the column
def transform_column(column)
if column.is_a?(String) || column.is_a?(Symbol)
column_as_string = column.to_s
column_as_string = "#{model.table_name}.#{column_as_string}" unless column_as_string.include?('.')
Arel.sql(column_as_string)
elsif column.is_a?(Arel::Attributes::Attribute)
column
else
raise ColumnConfigurationError, "Cannot transform the column: #{column.inspect}, please provide the column name as string"
end
end
end
end
end
......@@ -270,8 +270,6 @@ RSpec.describe Gitlab::Database::BatchCount do
end
it "defaults the batch size to #{Gitlab::Database::BatchCounter::DEFAULT_DISTINCT_BATCH_SIZE}" do
stub_feature_flags(loose_index_scan_for_distinct_values: false)
min_id = model.minimum(:id)
relation = instance_double(ActiveRecord::Relation)
allow(model).to receive_message_chain(:select, public_send: relation)
......@@ -317,85 +315,13 @@ RSpec.describe Gitlab::Database::BatchCount do
end
end
context 'when the loose_index_scan_for_distinct_values feature flag is off' do
it_behaves_like 'when batch fetch query is canceled' do
let(:mode) { :distinct }
let(:operation) { :count }
let(:operation_args) { nil }
let(:column) { nil }
subject { described_class.method(:batch_distinct_count) }
before do
stub_feature_flags(loose_index_scan_for_distinct_values: false)
end
end
end
context 'when the loose_index_scan_for_distinct_values feature flag is on' do
it_behaves_like 'when batch fetch query is canceled' do
let(:mode) { :distinct }
let(:operation) { :count }
let(:operation_args) { nil }
let(:column) { nil }
let(:batch_size) { 10_000 }
subject { described_class.method(:batch_distinct_count) }
before do
stub_feature_flags(loose_index_scan_for_distinct_values: true)
end
it 'reduces batch size by half and retry fetch' do
too_big_batch_relation_mock = instance_double(ActiveRecord::Relation)
count_method = double(send: 1)
allow(too_big_batch_relation_mock).to receive(:send).and_raise(ActiveRecord::QueryCanceled)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: 0, to: batch_size).and_return(too_big_batch_relation_mock)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: 0, to: batch_size / 2).and_return(count_method)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).with(from: batch_size / 2, to: batch_size).and_return(count_method)
subject.call(model, column, batch_size: batch_size, start: 0, finish: batch_size - 1)
end
context 'when all retries fail' do
let(:batch_count_query) { 'SELECT COUNT(id) FROM relation WHERE id BETWEEN 0 and 1' }
before do
relation = instance_double(ActiveRecord::Relation)
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive_message_chain(:new, :build_query).and_return(relation)
allow(relation).to receive(:send).and_raise(ActiveRecord::QueryCanceled.new('query timed out'))
allow(relation).to receive(:to_sql).and_return(batch_count_query)
end
it 'logs failing query' do
expect(Gitlab::AppJsonLogger).to receive(:error).with(
event: 'batch_count',
relation: model.table_name,
operation: operation,
operation_args: operation_args,
start: 0,
mode: mode,
query: batch_count_query,
message: 'Query has been canceled with message: query timed out'
)
expect(subject.call(model, column, batch_size: batch_size, start: 0)).to eq(-1)
end
end
context 'when LooseIndexScanDistinctCount raises error' do
let(:column) { :creator_id }
let(:error_class) { Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError }
it 'rescues ColumnConfigurationError' do
allow(Gitlab::Database::LooseIndexScanDistinctCount).to receive(:new).and_raise(error_class.new('error message'))
expect(Gitlab::AppJsonLogger).to receive(:error).with(a_hash_including(message: 'LooseIndexScanDistinctCount column error: error message'))
expect(subject.call(Project, column, batch_size: 10_000, start: 0)).to eq(-1)
end
end
end
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::LooseIndexScanDistinctCount do
context 'counting distinct users' do
let_it_be(:user) { create(:user) }
let_it_be(:other_user) { create(:user) }
let(:column) { :creator_id }
before_all do
create_list(:project, 3, creator: user)
create_list(:project, 1, creator: other_user)
end
subject(:count) { described_class.new(Project, :creator_id).count(from: Project.minimum(:creator_id), to: Project.maximum(:creator_id) + 1) }
it { is_expected.to eq(2) }
context 'when STI model is queried' do
it 'does not raise error' do
expect { described_class.new(Group, :owner_id).count(from: 0, to: 1) }.not_to raise_error
end
end
context 'when model with default_scope is queried' do
it 'does not raise error' do
expect { described_class.new(GroupMember, :id).count(from: 0, to: 1) }.not_to raise_error
end
end
context 'when the fully qualified column is given' do
let(:column) { 'projects.creator_id' }
it { is_expected.to eq(2) }
end
context 'when AR attribute is given' do
let(:column) { Project.arel_table[:creator_id] }
it { is_expected.to eq(2) }
end
context 'when invalid value is given for the column' do
let(:column) { Class.new }
it { expect { described_class.new(Group, column) }.to raise_error(Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError) }
end
context 'when null values are present' do
before do
create_list(:project, 2).each { |p| p.update_column(:creator_id, nil) }
end
it { is_expected.to eq(2) }
end
end
context 'counting STI models' do
let!(:groups) { create_list(:group, 3) }
let!(:namespaces) { create_list(:namespace, 2) }
let(:max_id) { Namespace.maximum(:id) + 1 }
it 'counts groups' do
count = described_class.new(Group, :id).count(from: 0, to: max_id)
expect(count).to eq(3)
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment