MDEV-30660 Aggregation functions fail to leverage uniqueness property

When executing a statement of the form SELECT AGGR_FN(DISTINCT c1, c2,..,cn) FROM t1, where AGGR_FN is an aggregate function such as COUNT(), AVG() or SUM(), and a unique index exists on table t1 covering some or all of the columns (c1, c2,..,cn), the retrieved values are inherently unique. Consequently, the need for de-duplication imposed by the DISTINCT clause can be eliminated, leading to optimization of aggregation operations. This optimization applies under the following conditions: - only one table involved in the join (not counting const tables) - some arguments of the aggregate function are fields (not functions/subqueries) This optimization extends to queries of the form SELECT AGGR_FN(c1, c2,..,cn) GROUP BY cx,..cy when a unique index covers some or all of the columns (c1, c2,..cn, cx,..cy)

MDEV-30660 Aggregation functions fail to leverage uniqueness property
When executing a statement of the form SELECT AGGR_FN(DISTINCT c1, c2,..,cn) FROM t1, where AGGR_FN is an aggregate function such as COUNT(), AVG() or SUM(), and a unique index exists on table t1 covering some or all of the columns (c1, c2,..,cn), the retrieved values are inherently unique. Consequently, the need for de-duplication imposed by the DISTINCT clause can be eliminated, leading to optimization of aggregation operations. This optimization applies under the following conditions: - only one table involved in the join (not counting const tables) - some arguments of the aggregate function are fields (not functions/subqueries) This optimization extends to queries of the form SELECT AGGR_FN(c1, c2,..,cn) GROUP BY cx,..cy when a unique index covers some or all of the columns (c1, c2,..cn, cx,..cy)
15623c7f · Oleg Smirnov · 0381921e · 15623c7f · 15623c7f · 15623c7f
Commit 15623c7f authored Jan 18, 2024 by Oleg Smirnov
6 changed files
--- a/mysql-test/main/distinct_notembedded.result
+++ b/mysql-test/main/distinct_notembedded.result
+#
+# MDEV-30660 COUNT DISTINCT seems unnecessarily slow when run on a PK
+#
+set @save_optimizer_trace = @@optimizer_trace;
+SET optimizer_trace='enabled=on';
+CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY, b INT NOT NULL);
+INSERT INTO t1 VALUES (1,1), (2,1), (3,1);
+# Optimization is applied (aggregator=simple):
+SELECT COUNT(DISTINCT a) FROM t1;
+COUNT(DISTINCT a)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT AVG(DISTINCT a), SUM(DISTINCT b) FROM t1;
+AVG(DISTINCT a)	SUM(DISTINCT b)
+2.0000	1
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "avg(distinct t1.a)",
+        "aggregator_type": "simple"
+    },
+    {
+        "function": "sum(distinct t1.b)",
+        "aggregator_type": "distinct"
+    }
+]
+# Only `a` is unique but it's enough to eliminate DISTINCT:
+SELECT COUNT(DISTINCT b, a) FROM t1;
+COUNT(DISTINCT b, a)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.b,t1.a)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT COUNT(DISTINCT a, a + b) FROM t1;
+COUNT(DISTINCT a, a + b)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a,t1.a + t1.b)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT SUM(DISTINCT a), AVG(DISTINCT a), COUNT(DISTINCT a) FROM t1 WHERE a > 1;
+SUM(DISTINCT a)	AVG(DISTINCT a)	COUNT(DISTINCT a)
+5	2.5000	2
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "sum(distinct t1.a)",
+        "aggregator_type": "simple"
+    },
+    {
+        "function": "avg(distinct t1.a)",
+        "aggregator_type": "simple"
+    },
+    {
+        "function": "count(distinct t1.a)",
+        "aggregator_type": "simple"
+    }
+]
+# Optimization is not applied 'cause function argument is not a field
+# (aggregator=distinct):
+SELECT SUM(DISTINCT a + b) FROM t1;
+SUM(DISTINCT a + b)
+9
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "sum(distinct t1.a + t1.b)",
+        "aggregator_type": "distinct"
+    }
+]
+SELECT COUNT(DISTINCT b) FROM t1;
+COUNT(DISTINCT b)
+1
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.b)",
+        "aggregator_type": "distinct"
+    }
+]
+SELECT AVG(DISTINCT b / a) FROM t1;
+AVG(DISTINCT b / a)
+0.61110000
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "avg(distinct t1.b / t1.a)",
+        "aggregator_type": "distinct"
+    }
+]
+EXPLAIN SELECT COUNT(DISTINCT (SELECT a)) FROM t1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	PRIMARY	t1	index	NULL	PRIMARY	4	NULL	3	Using index
+2	DEPENDENT SUBQUERY	NULL	NULL	NULL	NULL	NULL	NULL	NULL	No tables used
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct (/* select#2 */ select t1.a))",
+        "aggregator_type": "distinct"
+    }
+]
+CREATE TABLE t2 (a INT);
+INSERT INTO t2 VALUES (1), (2);
+# Optimization is not applied 'cause there is more than one table
+SELECT COUNT(DISTINCT t1.a) FROM t1, t2;
+COUNT(DISTINCT t1.a)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a)",
+        "aggregator_type": "distinct"
+    }
+]
+SELECT AVG(DISTINCT t1.a) FROM t1, t2;
+AVG(DISTINCT t1.a)
+2.0000
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "avg(distinct t1.a)",
+        "aggregator_type": "distinct"
+    }
+]
+# Const tables, optimization is applied
+SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1) AS t2;
+COUNT(DISTINCT a)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT AVG(DISTINCT t1.a) FROM (SELECT 1 AS a) AS t2, t1, (SELECT 2 AS a) AS t3;
+AVG(DISTINCT t1.a)
+2.0000
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "avg(distinct t1.a)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1 UNION SELECT 2) AS t2;
+COUNT(DISTINCT a)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a)",
+        "aggregator_type": "distinct"
+    }
+]
+# Unique index on two columns
+CREATE TABLE t3 (a INT NOT NULL, b INT NOT NULL);
+INSERT INTO t3 VALUES (1,1), (1,2), (1,3), (2,1), (2,2), (3,1), (3,2);
+CREATE UNIQUE INDEX t3_a_b ON t3 (a, b);
+# Optimization is applied:
+SELECT COUNT(DISTINCT a, b) FROM t3;
+COUNT(DISTINCT a, b)
+7
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t3.a,t3.b)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT COUNT(DISTINCT b, a) FROM t3;
+COUNT(DISTINCT b, a)
+7
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t3.b,t3.a)",
+        "aggregator_type": "simple"
+    }
+]
+SELECT COUNT(DISTINCT b, a) FROM t3 WHERE a < 3;
+COUNT(DISTINCT b, a)
+5
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t3.b,t3.a)",
+        "aggregator_type": "simple"
+    }
+]
+# Optimization is applied to one of the functions:
+SELECT COUNT(DISTINCT b), SUM(DISTINCT a), SUM(DISTINCT a + b) FROM t3 GROUP BY a;
+COUNT(DISTINCT b)	SUM(DISTINCT a)	SUM(DISTINCT a + b)
+3	1	9
+2	2	7
+2	3	9
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t3.b)",
+        "aggregator_type": "simple"
+    },
+    {
+        "function": "sum(distinct t3.a)",
+        "aggregator_type": "distinct"
+    },
+    {
+        "function": "sum(distinct t3.a + t3.b)",
+        "aggregator_type": "distinct"
+    }
+]
+# Can't apply optimization 'cause GROUP BY argument is not a field:
+SELECT COUNT(DISTINCT b) FROM t3 GROUP BY a+b;
+COUNT(DISTINCT b)
+1
+2
+3
+1
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t3.b)",
+        "aggregator_type": "distinct"
+    }
+]
+# Test merged view
+CREATE VIEW v1 AS SELECT * FROM t1;
+# Optimization is applied
+SELECT COUNT(DISTINCT a, b) FROM v1;
+COUNT(DISTINCT a, b)
+3
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "count(distinct t1.a,t1.b)",
+        "aggregator_type": "simple"
+    }
+]
+# GROUP_CONCAT implements non-standard distinct aggregator
+SELECT GROUP_CONCAT(b) FROM t1;
+GROUP_CONCAT(b)
+1,1,1
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "group_concat(t1.b separator ',')",
+        "aggregator_type": "simple"
+    }
+]
+SELECT GROUP_CONCAT(DISTINCT b) FROM t1;
+GROUP_CONCAT(DISTINCT b)
+1
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '$**.prepare_sum_aggregators')) AS JS
+FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JS
+[
+    {
+        "function": "group_concat(distinct t1.b separator ',')",
+        "aggregator_type": "distinct"
+    }
+]
+DROP TABLE t1, t2, t3;
+DROP VIEW v1;
+SET optimizer_trace = @save_optimizer_trace;
+#
+# end of 10.5 tests
+#
--- a/mysql-test/main/distinct_notembedded.test
+++ b/mysql-test/main/distinct_notembedded.test
+# Embedded doesn't have optimizer trace:
+--source include/not_embedded.inc
+--source include/have_sequence.inc
+
+--echo #
+--echo # MDEV-30660 COUNT DISTINCT seems unnecessarily slow when run on a PK
+--echo #
+
+set @save_optimizer_trace = @@optimizer_trace;
+SET optimizer_trace='enabled=on';
+let $trace=
+SELECT JSON_DETAILED(JSON_EXTRACT(trace, '\$**.prepare_sum_aggregators')) AS JS
+  FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+
+CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY, b INT NOT NULL);
+INSERT INTO t1 VALUES (1,1), (2,1), (3,1);
+
+--echo # Optimization is applied (aggregator=simple):
+SELECT COUNT(DISTINCT a) FROM t1;
+eval $trace;
+
+SELECT AVG(DISTINCT a), SUM(DISTINCT b) FROM t1;
+eval $trace;
+
+--echo # Only `a` is unique but it's enough to eliminate DISTINCT:
+SELECT COUNT(DISTINCT b, a) FROM t1;
+eval $trace;
+
+SELECT COUNT(DISTINCT a, a + b) FROM t1;
+eval $trace;
+
+SELECT SUM(DISTINCT a), AVG(DISTINCT a), COUNT(DISTINCT a) FROM t1 WHERE a > 1;
+eval $trace;
+
+--echo # Optimization is not applied 'cause function argument is not a field
+--echo # (aggregator=distinct):
+SELECT SUM(DISTINCT a + b) FROM t1;
+eval $trace;
+
+SELECT COUNT(DISTINCT b) FROM t1;
+eval $trace;
+
+SELECT AVG(DISTINCT b / a) FROM t1;
+eval $trace;
+
+EXPLAIN SELECT COUNT(DISTINCT (SELECT a)) FROM t1;
+eval $trace;
+
+CREATE TABLE t2 (a INT);
+INSERT INTO t2 VALUES (1), (2);
+
+--echo # Optimization is not applied 'cause there is more than one table
+SELECT COUNT(DISTINCT t1.a) FROM t1, t2;
+eval $trace;
+
+SELECT AVG(DISTINCT t1.a) FROM t1, t2;
+eval $trace;
+
+--echo # Const tables, optimization is applied
+SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1) AS t2;
+eval $trace;
+
+SELECT AVG(DISTINCT t1.a) FROM (SELECT 1 AS a) AS t2, t1, (SELECT 2 AS a) AS t3;
+eval $trace;
+
+SELECT COUNT(DISTINCT a) FROM t1, (SELECT 1 UNION SELECT 2) AS t2;
+eval $trace;
+
+--echo # Unique index on two columns
+CREATE TABLE t3 (a INT NOT NULL, b INT NOT NULL);
+INSERT INTO t3 VALUES (1,1), (1,2), (1,3), (2,1), (2,2), (3,1), (3,2);
+CREATE UNIQUE INDEX t3_a_b ON t3 (a, b);
+--echo # Optimization is applied:
+SELECT COUNT(DISTINCT a, b) FROM t3;
+eval $trace;
+
+SELECT COUNT(DISTINCT b, a) FROM t3;
+eval $trace;
+
+SELECT COUNT(DISTINCT b, a) FROM t3 WHERE a < 3;
+eval $trace;
+
+--echo # Optimization is applied to one of the functions:
+SELECT COUNT(DISTINCT b), SUM(DISTINCT a), SUM(DISTINCT a + b) FROM t3 GROUP BY a;
+eval $trace;
+
+--echo # Can't apply optimization 'cause GROUP BY argument is not a field:
+SELECT COUNT(DISTINCT b) FROM t3 GROUP BY a+b;
+eval $trace;
+
+--echo # Test merged view
+CREATE VIEW v1 AS SELECT * FROM t1;
+--echo # Optimization is applied
+SELECT COUNT(DISTINCT a, b) FROM v1;
+eval $trace;
+
+--echo # GROUP_CONCAT implements non-standard distinct aggregator
+SELECT GROUP_CONCAT(b) FROM t1;
+eval $trace;
+
+SELECT GROUP_CONCAT(DISTINCT b) FROM t1;
+eval $trace;
+
+DROP TABLE t1, t2, t3;
+DROP VIEW v1;
+SET optimizer_trace = @save_optimizer_trace;
+--echo #
+--echo # end of 10.5 tests
+--echo #
--- a/mysql-test/main/opt_trace.result
+++ b/mysql-test/main/opt_trace.result
@@ -1496,6 +1496,12 @@ EXPLAIN SELECT MIN(d) FROM t1 where b=2 and c=3  group by a	{
          },
          {
            "test_if_skip_sort_order": []
+          },
+          {
+            "prepare_sum_aggregators": {
+              "function": "min(t1.d)",
+              "aggregator_type": "simple"
+            }
          }
        ]
      }
@@ -1693,6 +1699,18 @@ EXPLAIN SELECT id,MIN(a),MAX(a) FROM t1 WHERE a>=20010104e0 GROUP BY id	{
          },
          {
            "test_if_skip_sort_order": []
+          },
+          {
+            "prepare_sum_aggregators": {
+              "function": "min(t1.a)",
+              "aggregator_type": "simple"
+            }
+          },
+          {
+            "prepare_sum_aggregators": {
+              "function": "max(t1.a)",
+              "aggregator_type": "simple"
+            }
          }
        ]
      }

--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@@ -597,6 +597,17 @@ class Item_sum :public Item_func_or_sum

  bool with_sum_func() const { return true; }
  virtual void set_partition_row_count(ulonglong count) { DBUG_ASSERT(0); }
+
+  /*
+    While most Item_sum descendants employ standard aggregators configured
+    through Item_sum::set_aggregator() call, there are exceptions like
+    Item_func_group_concat, which implements its own custom aggregators for
+    deduplication values.
+    This function distinguishes between the use of standard and custom
+    aggregators by the object
+  */
+  virtual bool uses_non_standard_aggregator_for_distinct() const
+  { return false; }
 };


@@ -1952,6 +1963,9 @@ class Item_func_group_concat : public Item_sum
    { return f->val_str(tmp, key + offset); }
  virtual void cut_max_length(String *result,
                              uint old_length, uint max_length) const;
+  bool uses_non_standard_aggregator_for_distinct() const override
+    { return distinct; }
+
 public:
  // Methods used by ColumnStore
  bool get_distinct() const { return distinct; }

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -278,7 +278,6 @@ static void update_tmptable_sum_func(Item_sum **func,TABLE *tmp_table);
 static void copy_sum_funcs(Item_sum **func_ptr, Item_sum **end);
 static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab);
 static bool setup_sum_funcs(THD *thd, Item_sum **func_ptr);
-static bool prepare_sum_aggregators(Item_sum **func_ptr, bool need_distinct);
 static bool init_sum_functions(Item_sum **func, Item_sum **end);
 static bool update_sum_func(Item_sum **func);
 static void select_describe(JOIN *join, bool need_tmp_table,bool need_order,
@@ -3656,7 +3655,7 @@ bool JOIN::make_aggr_tables_info()
      {
        if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
          DBUG_RETURN(true);
-        if (prepare_sum_aggregators(sum_funcs,
+        if (prepare_sum_aggregators(thd, sum_funcs,
                                    !join_tab->is_using_agg_loose_index_scan()))
          DBUG_RETURN(true);
        group_list= NULL;
@@ -3766,7 +3765,7 @@ bool JOIN::make_aggr_tables_info()
    }
    if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
      DBUG_RETURN(true);
-    if (prepare_sum_aggregators(sum_funcs,
+    if (prepare_sum_aggregators(thd, sum_funcs,
                                !join_tab ||
                                !join_tab-> is_using_agg_loose_index_scan()))
      DBUG_RETURN(true);
@@ -3947,8 +3946,8 @@ JOIN::create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *table_fields,
      goto err;
    if (make_sum_func_list(all_fields, fields_list, true))
      goto err;
-    if (prepare_sum_aggregators(sum_funcs,
-                                !(tables_list && 
+    if (prepare_sum_aggregators(thd, sum_funcs,
+                                !(tables_list &&
                                  join_tab->is_using_agg_loose_index_scan())))
      goto err;
    if (setup_sum_funcs(thd, sum_funcs))
@@ -3957,7 +3956,7 @@ JOIN::create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *table_fields,
  }
  else
  {
-    if (prepare_sum_aggregators(sum_funcs,
+    if (prepare_sum_aggregators(thd, sum_funcs,
                                !join_tab->is_using_agg_loose_index_scan()))
      goto err;
    if (setup_sum_funcs(thd, sum_funcs))
@@ -26406,13 +26405,86 @@ static bool setup_sum_funcs(THD *thd, Item_sum **func_ptr)
 }


-static bool prepare_sum_aggregators(Item_sum **func_ptr, bool need_distinct)
+/*
+  @brief
+    Setup aggregate functions.
+
+  @param thd            Thread descriptor
+  @param func_ptr       Array of pointers to aggregate functions
+  @param need_distinct  FALSE means that the table access method already
+                        guarantees that arguments of all aggregate functions
+                        will be unique. (This is the case for Loose Scan)
+                        TRUE - Otherwise.
+  @return
+     false  Ok
+     true   Error
+*/
+
+bool JOIN::prepare_sum_aggregators(THD *thd, Item_sum **func_ptr,
+                                   bool need_distinct)
 {
  Item_sum *func;
  DBUG_ENTER("prepare_sum_aggregators");
  while ((func= *(func_ptr++)))
  {
-    if (func->set_aggregator(need_distinct && func->has_with_distinct() ?
+    bool need_distinct_aggregator= need_distinct && func->has_with_distinct();
+    if (need_distinct_aggregator && table_count - const_tables == 1)
+    {
+      /*
+        We are doing setup for an aggregate with DISTINCT, like
+
+          SELECT agg_func(DISTINCT col1, col2 ...) FROM ...
+
+        In general case, agg_func will need to use Aggregator_distinct to
+        remove duplicates from its arguments.
+        We won't have to remove duplicates if we know the arguments are already
+        unique. This is true when
+        1. the join operation has only one non-const table (checked above)
+        2. the argument list covers a PRIMARY or a UNIQUE index.
+
+        Example: here the values of t1.pk are unique:
+
+          SELECT agg_func(DISTINCT t1.pk, ...) FROM t1
+
+        and so the whole argument of agg_func is unique.
+      */
+      List<Item> arg_fields;
+      for (uint i= 0; i < func->argument_count(); i++)
+      {
+        if (func->arguments()[i]->real_item()->type() == Item::FIELD_ITEM)
+          arg_fields.push_back(func->arguments()[i]);
+      }
+
+      /*
+        If the query has a GROUP BY, then it's sufficient that a unique
+        key is covered by a concatenation of {argument_list, group_by_list}.
+
+        Example: Suppose t1 has PRIMARY KEY(pk1, pk2). Then:
+
+          SELECT agg_func(DISTINCT t1.pk1, ...) FROM t1 GROUP BY t1.pk2
+
+        Each GROUP BY group will have t1.pk2 fixed. Then, the values of t1.pk1
+        will be unique, and no de-duplication will be needed.
+      */
+      for (ORDER *group= group_list; group ; group= group->next)
+      {
+        if ((*group->item)->real_item()->type() == Item::FIELD_ITEM)
+          arg_fields.push_back(*group->item);
+      }
+
+      if (list_contains_unique_index(join_tab[const_tables].table,
+                                     find_field_in_item_list,
+                                     (void *) &arg_fields))
+        need_distinct_aggregator= false;
+    }
+    Json_writer_object trace_wrapper(thd);
+    Json_writer_object trace_aggr(thd, "prepare_sum_aggregators");
+    trace_aggr.add("function", func);
+    trace_aggr.add("aggregator_type",
+                   (need_distinct_aggregator ||
+                    func->uses_non_standard_aggregator_for_distinct()) ?
+                      "distinct" : "simple");
+    if (func->set_aggregator(need_distinct_aggregator ?
                             Aggregator::DISTINCT_AGGREGATOR :
                             Aggregator::SIMPLE_AGGREGATOR))
      DBUG_RETURN(TRUE);

--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -1905,6 +1905,8 @@ class JOIN :public Sql_alloc
  bool add_fields_for_current_rowid(JOIN_TAB *cur, List<Item> *fields);
  void free_pushdown_handlers(List<TABLE_LIST>& join_list);
  void init_join_cache_and_keyread();
+  bool prepare_sum_aggregators(THD *thd,Item_sum **func_ptr,
+                               bool need_distinct);
 };

 enum enum_with_bush_roots { WITH_BUSH_ROOTS, WITHOUT_BUSH_ROOTS};