Commit 15623c7f authored by Oleg Smirnov's avatar Oleg Smirnov

MDEV-30660 Aggregation functions fail to leverage uniqueness property

When executing a statement of the form
  SELECT AGGR_FN(DISTINCT c1, c2,..,cn) FROM t1,
where AGGR_FN is an aggregate function such as COUNT(), AVG() or SUM(),
and a unique index exists on table t1 covering some or all of the
columns (c1, c2,..,cn), the retrieved values are inherently unique.
Consequently, the need for de-duplication imposed by the DISTINCT
clause can be eliminated, leading to optimization of aggregation
operations.
This optimization applies under the following conditions:
  - only one table involved in the join (not counting const tables)
  - some arguments of the aggregate function are fields
        (not functions/subqueries)

This optimization extends to queries of the form
  SELECT AGGR_FN(c1, c2,..,cn) GROUP BY cx,..cy
when a unique index covers some or all of the columns
(c1, c2,..cn, cx,..cy)
parent 0381921e
#
# MDEV-30660 COUNT DISTINCT seems unnecessarily slow when run on a PK
#
set @save_optimizer_trace = @@optimizer_trace;
SET optimizer_trace='enabled=on';
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY, b INT NOT NULL);
INSERT INTO t1 VALUES (1,1), (2,1), (3,1);
# Optimization is applied (aggregator=simple):
SELECT COUNT(DISTINCT a) FROM t1;
COUNT(DISTINCT a)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a)",
"aggregator_type": "simple"
}
]
SELECT AVG(DISTINCT a), SUM(DISTINCT b) FROM t1;
AVG(DISTINCT a) SUM(DISTINCT b)
2.0000 1
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "avg(distinct t1.a)",
"aggregator_type": "simple"
},
{
"function": "sum(distinct t1.b)",
"aggregator_type": "distinct"
}
]
# Only `a` is unique but it's enough to eliminate DISTINCT:
SELECT COUNT(DISTINCT b, a) FROM t1;
COUNT(DISTINCT b, a)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.b,t1.a)",
"aggregator_type": "simple"
}
]
SELECT COUNT(DISTINCT a, a + b) FROM t1;
COUNT(DISTINCT a, a + b)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a,t1.a + t1.b)",
"aggregator_type": "simple"
}
]
SELECT SUM(DISTINCT a), AVG(DISTINCT a), COUNT(DISTINCT a) FROM t1 WHERE a > 1;
SUM(DISTINCT a) AVG(DISTINCT a) COUNT(DISTINCT a)
5 2.5000 2
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "sum(distinct t1.a)",
"aggregator_type": "simple"
},
{
"function": "avg(distinct t1.a)",
"aggregator_type": "simple"
},
{
"function": "count(distinct t1.a)",
"aggregator_type": "simple"
}
]
# Optimization is not applied 'cause function argument is not a field
# (aggregator=distinct):
SELECT SUM(DISTINCT a + b) FROM t1;
SUM(DISTINCT a + b)
9
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "sum(distinct t1.a + t1.b)",
"aggregator_type": "distinct"
}
]
SELECT COUNT(DISTINCT b) FROM t1;
COUNT(DISTINCT b)
1
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.b)",
"aggregator_type": "distinct"
}
]
SELECT AVG(DISTINCT b / a) FROM t1;
AVG(DISTINCT b / a)
0.61110000
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "avg(distinct t1.b / t1.a)",
"aggregator_type": "distinct"
}
]
EXPLAIN SELECT COUNT(DISTINCT (SELECT a)) FROM t1;
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 index NULL PRIMARY 4 NULL 3 Using index
2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL No tables used
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct (/* select#2 */ select t1.a))",
"aggregator_type": "distinct"
}
]
CREATE TABLE t2 (a INT);
INSERT INTO t2 VALUES (1), (2);
# Optimization is not applied 'cause there is more than one table
SELECT COUNT(DISTINCT t1.a) FROM t1, t2;
COUNT(DISTINCT t1.a)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a)",
"aggregator_type": "distinct"
}
]
SELECT AVG(DISTINCT t1.a) FROM t1, t2;
AVG(DISTINCT t1.a)
2.0000
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "avg(distinct t1.a)",
"aggregator_type": "distinct"
}
]
# Const tables, optimization is applied
SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1) AS t2;
COUNT(DISTINCT a)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a)",
"aggregator_type": "simple"
}
]
SELECT AVG(DISTINCT t1.a) FROM (SELECT 1 AS a) AS t2, t1, (SELECT 2 AS a) AS t3;
AVG(DISTINCT t1.a)
2.0000
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "avg(distinct t1.a)",
"aggregator_type": "simple"
}
]
SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1 UNION SELECT 2) AS t2;
COUNT(DISTINCT a)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a)",
"aggregator_type": "distinct"
}
]
# Unique index on two columns
CREATE TABLE t3 (a INT NOT NULL, b INT NOT NULL);
INSERT INTO t3 VALUES (1,1), (1,2), (1,3), (2,1), (2,2), (3,1), (3,2);
CREATE UNIQUE INDEX t3_a_b ON t3 (a, b);
# Optimization is applied:
SELECT COUNT(DISTINCT a, b) FROM t3;
COUNT(DISTINCT a, b)
7
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t3.a,t3.b)",
"aggregator_type": "simple"
}
]
SELECT COUNT(DISTINCT b, a) FROM t3;
COUNT(DISTINCT b, a)
7
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t3.b,t3.a)",
"aggregator_type": "simple"
}
]
SELECT COUNT(DISTINCT b, a) FROM t3 WHERE a < 3;
COUNT(DISTINCT b, a)
5
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t3.b,t3.a)",
"aggregator_type": "simple"
}
]
# Optimization is applied to one of the functions:
SELECT COUNT(DISTINCT b), SUM(DISTINCT a), SUM(DISTINCT a + b) FROM t3 GROUP BY a;
COUNT(DISTINCT b) SUM(DISTINCT a) SUM(DISTINCT a + b)
3 1 9
2 2 7
2 3 9
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t3.b)",
"aggregator_type": "simple"
},
{
"function": "sum(distinct t3.a)",
"aggregator_type": "distinct"
},
{
"function": "sum(distinct t3.a + t3.b)",
"aggregator_type": "distinct"
}
]
# Can't apply optimization 'cause GROUP BY argument is not a field:
SELECT COUNT(DISTINCT b) FROM t3 GROUP BY a+b;
COUNT(DISTINCT b)
1
2
3
1
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t3.b)",
"aggregator_type": "distinct"
}
]
# Test merged view
CREATE VIEW v1 AS SELECT * FROM t1;
# Optimization is applied
SELECT COUNT(DISTINCT a, b) FROM v1;
COUNT(DISTINCT a, b)
3
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "count(distinct t1.a,t1.b)",
"aggregator_type": "simple"
}
]
# GROUP_CONCAT implements non-standard distinct aggregator
SELECT GROUP_CONCAT(b) FROM t1;
GROUP_CONCAT(b)
1,1,1
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "group_concat(t1.b separator ',')",
"aggregator_type": "simple"
}
]
SELECT GROUP_CONCAT(DISTINCT b) FROM t1;
GROUP_CONCAT(DISTINCT b)
1
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
JS
[
{
"function": "group_concat(distinct t1.b separator ',')",
"aggregator_type": "distinct"
}
]
DROP TABLE t1, t2, t3;
DROP VIEW v1;
SET optimizer_trace = @save_optimizer_trace;
#
# end of 10.5 tests
#
# Embedded doesn't have optimizer trace:
--source include/not_embedded.inc
--source include/have_sequence.inc
--echo #
--echo # MDEV-30660 COUNT DISTINCT seems unnecessarily slow when run on a PK
--echo #
set @save_optimizer_trace = @@optimizer_trace;
SET optimizer_trace='enabled=on';
let $trace=
SELECT JSON_DETAILED(JSON_EXTRACT(trace, '\$**.prepare_sum_aggregators')) AS JS
FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY, b INT NOT NULL);
INSERT INTO t1 VALUES (1,1), (2,1), (3,1);
--echo # Optimization is applied (aggregator=simple):
SELECT COUNT(DISTINCT a) FROM t1;
eval $trace;
SELECT AVG(DISTINCT a), SUM(DISTINCT b) FROM t1;
eval $trace;
--echo # Only `a` is unique but it's enough to eliminate DISTINCT:
SELECT COUNT(DISTINCT b, a) FROM t1;
eval $trace;
SELECT COUNT(DISTINCT a, a + b) FROM t1;
eval $trace;
SELECT SUM(DISTINCT a), AVG(DISTINCT a), COUNT(DISTINCT a) FROM t1 WHERE a > 1;
eval $trace;
--echo # Optimization is not applied 'cause function argument is not a field
--echo # (aggregator=distinct):
SELECT SUM(DISTINCT a + b) FROM t1;
eval $trace;
SELECT COUNT(DISTINCT b) FROM t1;
eval $trace;
SELECT AVG(DISTINCT b / a) FROM t1;
eval $trace;
EXPLAIN SELECT COUNT(DISTINCT (SELECT a)) FROM t1;
eval $trace;
CREATE TABLE t2 (a INT);
INSERT INTO t2 VALUES (1), (2);
--echo # Optimization is not applied 'cause there is more than one table
SELECT COUNT(DISTINCT t1.a) FROM t1, t2;
eval $trace;
SELECT AVG(DISTINCT t1.a) FROM t1, t2;
eval $trace;
--echo # Const tables, optimization is applied
SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1) AS t2;
eval $trace;
SELECT AVG(DISTINCT t1.a) FROM (SELECT 1 AS a) AS t2, t1, (SELECT 2 AS a) AS t3;
eval $trace;
SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1 UNION SELECT 2) AS t2;
eval $trace;
--echo # Unique index on two columns
CREATE TABLE t3 (a INT NOT NULL, b INT NOT NULL);
INSERT INTO t3 VALUES (1,1), (1,2), (1,3), (2,1), (2,2), (3,1), (3,2);
CREATE UNIQUE INDEX t3_a_b ON t3 (a, b);
--echo # Optimization is applied:
SELECT COUNT(DISTINCT a, b) FROM t3;
eval $trace;
SELECT COUNT(DISTINCT b, a) FROM t3;
eval $trace;
SELECT COUNT(DISTINCT b, a) FROM t3 WHERE a < 3;
eval $trace;
--echo # Optimization is applied to one of the functions:
SELECT COUNT(DISTINCT b), SUM(DISTINCT a), SUM(DISTINCT a + b) FROM t3 GROUP BY a;
eval $trace;
--echo # Can't apply optimization 'cause GROUP BY argument is not a field:
SELECT COUNT(DISTINCT b) FROM t3 GROUP BY a+b;
eval $trace;
--echo # Test merged view
CREATE VIEW v1 AS SELECT * FROM t1;
--echo # Optimization is applied
SELECT COUNT(DISTINCT a, b) FROM v1;
eval $trace;
--echo # GROUP_CONCAT implements non-standard distinct aggregator
SELECT GROUP_CONCAT(b) FROM t1;
eval $trace;
SELECT GROUP_CONCAT(DISTINCT b) FROM t1;
eval $trace;
DROP TABLE t1, t2, t3;
DROP VIEW v1;
SET optimizer_trace = @save_optimizer_trace;
--echo #
--echo # end of 10.5 tests
--echo #
......@@ -1496,6 +1496,12 @@ EXPLAIN SELECT MIN(d) FROM t1 where b=2 and c=3 group by a {
},
{
"test_if_skip_sort_order": []
},
{
"prepare_sum_aggregators": {
"function": "min(t1.d)",
"aggregator_type": "simple"
}
}
]
}
......@@ -1693,6 +1699,18 @@ EXPLAIN SELECT id,MIN(a),MAX(a) FROM t1 WHERE a>=20010104e0 GROUP BY id {
},
{
"test_if_skip_sort_order": []
},
{
"prepare_sum_aggregators": {
"function": "min(t1.a)",
"aggregator_type": "simple"
}
},
{
"prepare_sum_aggregators": {
"function": "max(t1.a)",
"aggregator_type": "simple"
}
}
]
}
......
......@@ -597,6 +597,17 @@ class Item_sum :public Item_func_or_sum
bool with_sum_func() const { return true; }
virtual void set_partition_row_count(ulonglong count) { DBUG_ASSERT(0); }
/*
While most Item_sum descendants employ standard aggregators configured
through Item_sum::set_aggregator() call, there are exceptions like
Item_func_group_concat, which implements its own custom aggregators for
deduplication values.
This function distinguishes between the use of standard and custom
aggregators by the object
*/
virtual bool uses_non_standard_aggregator_for_distinct() const
{ return false; }
};
......@@ -1952,6 +1963,9 @@ class Item_func_group_concat : public Item_sum
{ return f->val_str(tmp, key + offset); }
virtual void cut_max_length(String *result,
uint old_length, uint max_length) const;
bool uses_non_standard_aggregator_for_distinct() const override
{ return distinct; }
public:
// Methods used by ColumnStore
bool get_distinct() const { return distinct; }
......
......@@ -278,7 +278,6 @@ static void update_tmptable_sum_func(Item_sum **func,TABLE *tmp_table);
static void copy_sum_funcs(Item_sum **func_ptr, Item_sum **end);
static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab);
static bool setup_sum_funcs(THD *thd, Item_sum **func_ptr);
static bool prepare_sum_aggregators(Item_sum **func_ptr, bool need_distinct);
static bool init_sum_functions(Item_sum **func, Item_sum **end);
static bool update_sum_func(Item_sum **func);
static void select_describe(JOIN *join, bool need_tmp_table,bool need_order,
......@@ -3656,7 +3655,7 @@ bool JOIN::make_aggr_tables_info()
{
if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
DBUG_RETURN(true);
if (prepare_sum_aggregators(sum_funcs,
if (prepare_sum_aggregators(thd, sum_funcs,
!join_tab->is_using_agg_loose_index_scan()))
DBUG_RETURN(true);
group_list= NULL;
......@@ -3766,7 +3765,7 @@ bool JOIN::make_aggr_tables_info()
}
if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
DBUG_RETURN(true);
if (prepare_sum_aggregators(sum_funcs,
if (prepare_sum_aggregators(thd, sum_funcs,
!join_tab ||
!join_tab-> is_using_agg_loose_index_scan()))
DBUG_RETURN(true);
......@@ -3947,7 +3946,7 @@ JOIN::create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *table_fields,
goto err;
if (make_sum_func_list(all_fields, fields_list, true))
goto err;
if (prepare_sum_aggregators(sum_funcs,
if (prepare_sum_aggregators(thd, sum_funcs,
!(tables_list &&
join_tab->is_using_agg_loose_index_scan())))
goto err;
......@@ -3957,7 +3956,7 @@ JOIN::create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *table_fields,
}
else
{
if (prepare_sum_aggregators(sum_funcs,
if (prepare_sum_aggregators(thd, sum_funcs,
!join_tab->is_using_agg_loose_index_scan()))
goto err;
if (setup_sum_funcs(thd, sum_funcs))
......@@ -26406,13 +26405,86 @@ static bool setup_sum_funcs(THD *thd, Item_sum **func_ptr)
}
static bool prepare_sum_aggregators(Item_sum **func_ptr, bool need_distinct)
/*
@brief
Setup aggregate functions.
@param thd Thread descriptor
@param func_ptr Array of pointers to aggregate functions
@param need_distinct FALSE means that the table access method already
guarantees that arguments of all aggregate functions
will be unique. (This is the case for Loose Scan)
TRUE - Otherwise.
@return
false Ok
true Error
*/
bool JOIN::prepare_sum_aggregators(THD *thd, Item_sum **func_ptr,
bool need_distinct)
{
Item_sum *func;
DBUG_ENTER("prepare_sum_aggregators");
while ((func= *(func_ptr++)))
{
if (func->set_aggregator(need_distinct && func->has_with_distinct() ?
bool need_distinct_aggregator= need_distinct && func->has_with_distinct();
if (need_distinct_aggregator && table_count - const_tables == 1)
{
/*
We are doing setup for an aggregate with DISTINCT, like
SELECT agg_func(DISTINCT col1, col2 ...) FROM ...
In general case, agg_func will need to use Aggregator_distinct to
remove duplicates from its arguments.
We won't have to remove duplicates if we know the arguments are already
unique. This is true when
1. the join operation has only one non-const table (checked above)
2. the argument list covers a PRIMARY or a UNIQUE index.
Example: here the values of t1.pk are unique:
SELECT agg_func(DISTINCT t1.pk, ...) FROM t1
and so the whole argument of agg_func is unique.
*/
List<Item> arg_fields;
for (uint i= 0; i < func->argument_count(); i++)
{
if (func->arguments()[i]->real_item()->type() == Item::FIELD_ITEM)
arg_fields.push_back(func->arguments()[i]);
}
/*
If the query has a GROUP BY, then it's sufficient that a unique
key is covered by a concatenation of {argument_list, group_by_list}.
Example: Suppose t1 has PRIMARY KEY(pk1, pk2). Then:
SELECT agg_func(DISTINCT t1.pk1, ...) FROM t1 GROUP BY t1.pk2
Each GROUP BY group will have t1.pk2 fixed. Then, the values of t1.pk1
will be unique, and no de-duplication will be needed.
*/
for (ORDER *group= group_list; group ; group= group->next)
{
if ((*group->item)->real_item()->type() == Item::FIELD_ITEM)
arg_fields.push_back(*group->item);
}
if (list_contains_unique_index(join_tab[const_tables].table,
find_field_in_item_list,
(void *) &arg_fields))
need_distinct_aggregator= false;
}
Json_writer_object trace_wrapper(thd);
Json_writer_object trace_aggr(thd, "prepare_sum_aggregators");
trace_aggr.add("function", func);
trace_aggr.add("aggregator_type",
(need_distinct_aggregator ||
func->uses_non_standard_aggregator_for_distinct()) ?
"distinct" : "simple");
if (func->set_aggregator(need_distinct_aggregator ?
Aggregator::DISTINCT_AGGREGATOR :
Aggregator::SIMPLE_AGGREGATOR))
DBUG_RETURN(TRUE);
......
......@@ -1905,6 +1905,8 @@ class JOIN :public Sql_alloc
bool add_fields_for_current_rowid(JOIN_TAB *cur, List<Item> *fields);
void free_pushdown_handlers(List<TABLE_LIST>& join_list);
void init_join_cache_and_keyread();
bool prepare_sum_aggregators(THD *thd,Item_sum **func_ptr,
bool need_distinct);
};
enum enum_with_bush_roots { WITH_BUSH_ROOTS, WITHOUT_BUSH_ROOTS};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment