Fix cost calculation for get_best_group_min_max()

If the final range restrictions (SEL_ARG tree) over GROUP BY columns are single-point, we can compute the number of GROUP BY groups. Example: in the query: SELECT ... FROM tbl WHERE keypart1 IN (1,2,3) and keypart2 IN ('foo','bar') Other things: - Fixed cost calculation to more correctly count the number of blocks that may be read. The old code could use the total blocks in the file even if a range was available.

Fix cost calculation for get_best_group_min_max()
If the final range restrictions (SEL_ARG tree) over GROUP BY columns are single-point, we can compute the number of GROUP BY groups. Example: in the query: SELECT ... FROM tbl WHERE keypart1 IN (1,2,3) and keypart2 IN ('foo','bar') Other things: - Fixed cost calculation to more correctly count the number of blocks that may be read. The old code could use the total blocks in the file even if a range was available.
7afa819f · Monty · Sergei Petrunia · 009db228 · 7afa819f · 7afa819f
Commit 7afa819f authored Sep 25, 2022 by Monty Committed by Sergei Petrunia Feb 02, 2023
4 changed files
--- a/mysql-test/main/group_by.result
+++ b/mysql-test/main/group_by.result
@@ -552,12 +552,12 @@ a	b
 3	1
 explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using temporary; Using filesort
-1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using temporary; Using filesort
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using where; Using join buffer (flat, BNL join)
 explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b ORDER BY NULL;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using temporary
-1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using temporary
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using where; Using join buffer (flat, BNL join)
 drop table t1,t2;
 SET @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity,@@optimizer_switch=@save_optimizer_switch;
 create table t1 (a int, b int);
@@ -890,7 +890,8 @@ Level	Code	Message
 drop table t1, t2;
 CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY  (a));
 INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), 
-(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
+(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
+(100,"No land"), (101,"No land");
 CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY  (a), KEY b (b));
 INSERT INTO t2 VALUES (1,1), (2,1), (6,6), (18,17), (15,25), (16,25),
 (17,25), (10,54), (5,62),(3,68);
@@ -1352,7 +1353,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	PRIMARY,i2	PRIMARY	4	NULL	1	Using where; Using index
 EXPLAIN SELECT a FROM t1 WHERE a < 2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY,i2	PRIMARY	4	NULL	1	Using where; Using index
+1	SIMPLE	t1	range	PRIMARY,i2	i2	4	NULL	1	Using where; Using index for group-by
 EXPLAIN SELECT a FROM t1 IGNORE INDEX (PRIMARY,i2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	144	
@@ -1701,7 +1702,7 @@ NULL	1
 1	2
 EXPLAIN SELECT a from t2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	index	NULL	a	10	NULL	6	Using index
+1	SIMPLE	t2	range	NULL	a	5	NULL	6	Using index for group-by
 SELECT a from t2 GROUP BY a;
 a
 NULL
@@ -1715,10 +1716,17 @@ NULL
 1
 2
 insert into t2 SELECT NULL, NULL from seq_1_to_10;
+EXPLAIN SELECT b from t2 GROUP BY a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	index	NULL	a	10	NULL	16	Using index
 # Expect: Using index for group-by
+analyze table t2;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	OK
 EXPLAIN SELECT b from t2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	a	5	NULL	9	Using index for group-by
+1	SIMPLE	t2	range	NULL	a	5	NULL	6	Using index for group-by
 DROP TABLE t1;
 DROP TABLE t2;
 CREATE TABLE t1 ( a INT, b INT );
@@ -2255,11 +2263,11 @@ INSERT INTO t2(col1, col2) VALUES
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index
 explain
 select SQL_BIG_RESULT col1 f1, col2 f2, col1 f3 from t2 group by f1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index; Using filesort
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -2267,7 +2275,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 explain
 select col1 f1, col1 f2 from t2 group by f1, 1+1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2, f3+0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -2457,7 +2465,7 @@ test.t1	analyze	status	OK

 EXPLAIN SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	b	b	9	NULL	2	Using where; Using index for group-by; Using temporary
+1	SIMPLE	t1	range	b	b	9	NULL	1	Using where; Using index for group-by; Using temporary

 SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 MIN(a)	b
@@ -2465,7 +2473,7 @@ MIN(a)	b

 EXPLAIN SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	b	b	9	NULL	2	Using where; Using index for group-by
+1	SIMPLE	t1	range	b	b	9	NULL	1	Using where; Using index for group-by

 SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 MIN(a)	b
@@ -2994,3 +3002,34 @@ drop table t20, t21, t22;
 #
 # End of 10.3 tests
 #
+#
+# Test new group_min_max optimization
+#
+create table t1 (a int, b int, c int, key(a,b,c));
+insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
+explain select a from t1 where a in (1,2,3) group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	3	Using where; Using index for group-by
+explain select a from t1 where a in (1,2,3) or a = 22 group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	4	Using where; Using index for group-by
+explain select a from t1 where a in (1,2,3) and a < 3 group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	2	Using where; Using index for group-by
+explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	10	NULL	9	Using where; Using index for group-by
+explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	10	NULL	3	Using where; Using index for group-by
+explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	15	NULL	3	Using where; Using index for group-by
+# Will not use index for group-by
+explain select a from t1 where a in (1,2,3) and b>1 group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	10	NULL	983	Using where; Using index
+explain select a from t1 where a in (1,2,3) and c=1 group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	1161	Using where; Using index
+drop table t1;
--- a/mysql-test/main/group_by.test
+++ b/mysql-test/main/group_by.test
@@ -718,7 +718,8 @@ drop table t1, t2;
 CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY  (a));

 INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), 
-    (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
+    (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
+    (100,"No land"), (101,"No land");

 CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY  (a), KEY b (b));

@@ -1168,7 +1169,9 @@ SELECT b from t2 GROUP BY b;

 # Show that we are using 'range' when there is more NULL rows in the table
 insert into t2 SELECT NULL, NULL from seq_1_to_10;
+EXPLAIN SELECT b from t2 GROUP BY a;
 --echo # Expect: Using index for group-by
+analyze table t2;
 EXPLAIN SELECT b from t2 GROUP BY a;

 DROP TABLE t1;
@@ -2131,3 +2134,22 @@ drop table t20, t21, t22;
 --echo #
 --echo # End of 10.3 tests
 --echo #
+
+--echo #
+--echo # Test new group_min_max optimization
+--echo #
+
+create table t1 (a int, b int, c int, key(a,b,c));
+insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
+
+explain select a from t1 where a in (1,2,3) group by a;
+explain select a from t1 where a in (1,2,3) or a = 22 group by a;
+explain select a from t1 where a in (1,2,3) and a < 3 group by a;
+explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
+explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
+explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
+
+--echo # Will not use index for group-by
+explain select a from t1 where a in (1,2,3) and b>1 group by a;
+explain select a from t1 where a in (1,2,3) and c=1 group by a;
+drop table t1;
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -2174,6 +2174,27 @@ int SEL_ARG::sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
 }


+/*
+  Check if min and values are equal
+
+  @return 1 if equal
+*/
+
+bool SEL_ARG::min_max_are_equal() const
+{
+  uint offset= 0;
+  if (field->real_maybe_null())			// If null is part of key
+  {
+    if (*min_value != *max_value)
+      return 0;
+    if (*min_value)
+      return 1;					// NULL where equal
+    offset= 1;                                  // Skip NULL marker
+  }
+  return field->key_cmp(min_value+offset, max_value+offset) == 0;
+}
+
+
 SEL_ARG *SEL_ARG::clone_tree(RANGE_OPT_PARAM *param)
 {
  SEL_ARG tmp_link,*next_arg,*root;
@@ -11040,6 +11061,50 @@ uint SEL_ARG::get_max_key_part() const
 }


+/**
+  Compute the number of eq_ranges top elements in the tree
+
+  This is used by the cost_group_min_max() to calculate the number of
+  groups in SEL_TREE
+
+  @param group_key_parts number of key parts that must be equal
+
+  @return < 0  Not known
+  @return >= 0  Number of groups
+*/
+
+int SEL_ARG::number_of_eq_groups(uint group_key_parts) const
+{
+  int elements= 0;
+  SEL_ARG const *cur;
+
+  if (part > group_key_parts-1 || type != KEY_RANGE)
+    return -1;
+
+  cur= first();
+  do
+  {
+    if ((cur->min_flag | cur->min_flag) &
+        (NO_MIN_RANGE | NO_MAX_RANGE | NEAR_MIN | NEAR_MAX | GEOM_FLAG))
+      return -1;
+    if (min_value != max_value && !min_max_are_equal())
+      return -1;
+    if (part != group_key_parts -1)
+    {
+      int tmp;
+      if (!next_key_part)
+        return -1;
+      if ((tmp= next_key_part->number_of_eq_groups(group_key_parts)) < 0)
+        return -1;
+      elements+= tmp;
+    }
+    else
+      elements++;
+  } while ((cur= cur->next));
+  return elements;
+}
+
+
 /*
  Remove the SEL_ARG graph elements which have part > max_part.

@@ -11092,8 +11157,8 @@ void prune_sel_arg_graph(SEL_ARG *sel_arg, uint max_part)

  @return
    tree pointer  The tree after processing,
-    NULL          If it was not possible to reduce the weight of the tree below the
-                  limit.
+    NULL          If it was not possible to reduce the weight of the tree below
+                  the limit.
 */

 SEL_ARG *enforce_sel_arg_weight_limit(RANGE_OPT_PARAM *param, uint keyno,
@@ -14950,7 +15015,7 @@ get_field_keypart(KEY *index, Field *field)
    have_min             [in] True if there is a MIN function
    have_max             [in] True if there is a MAX function
    read_cost           [out] The cost to retrieve rows via this quick select
-    records             [out] The number of rows retrieved
+    out_records         [out] The number of rows retrieved

  DESCRIPTION
    This method computes the access cost of a TRP_GROUP_MIN_MAX instance and
@@ -14968,6 +15033,20 @@ get_field_keypart(KEY *index, Field *field)
        either scan the index for the next value or do a new index dive
        with 'find next bigger key'.

+    When using MIN() and MAX() in the query, the calls to the storage engine
+    are as follows for each group:
+      Assuming kp1 in ('abc','def','ghi)' and kp2 between 1000 and 2000
+
+      read_key('abc', HA_READ_KEY_OR_NEXT)
+      In case of MIN() we do:
+        read_key('abc,:'1000', HA_READ_KEY_OR_NEXT)
+      In case of MAX() we do
+        read_key('abc,:'2000', HA_READ_PREFIX_LAST_OR_PREV)
+      In the following code we will assume that the MIN key will be in
+      the same block as the first key read.
+      (We should try to optimize away the extra call for MAX() at some
+       point).
+
  NOTES
    See get_best_group_min_max() for which kind of queries this function
    will be called.
@@ -15013,36 +15092,27 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
                        uint group_key_parts, SEL_TREE *range_tree,
                        SEL_ARG *index_tree, ha_rows quick_prefix_records,
                        bool have_min, bool have_max,
-                        double *read_cost, ha_rows *records)
+                        double *read_cost, ha_rows *out_records)
 {
-  uint    keys_per_block, key_length;
-  ha_rows table_records;
+  uint    key_length;
+  ha_rows records;
  ha_rows num_groups;
  ha_rows num_blocks;
  ha_rows keys_per_group;
-  ha_rows keys_per_subgroup; /* Average number of keys in sub-groups */
-                          /* formed by a key infix. */
-  double p_overlap; /* Probability that a sub-group overlaps two blocks. */
  double quick_prefix_selectivity;
-  double io_cost;
+  ulonglong io_cost;
  handler *file= table->file;
  DBUG_ENTER("cost_group_min_max");

  /* Same code as in handler::key_read_time() */
-  table_records= table->stat_records();
+  records= table->stat_records();
  key_length= (index_info->key_length + file->ref_length);
-  num_blocks= (table_records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
-               INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
-  keys_per_block= (file->stats.block_size /
-                   (key_length * INDEX_BLOCK_FILL_FACTOR_MUL /
-                    INDEX_BLOCK_FILL_FACTOR_DIV) +
-                   1);

  /* Compute the number of keys in a group. */
  if (!group_key_parts)
  {
    /* Summary over the whole table */
-    keys_per_group= MY_MAX(table_records,1);
+    keys_per_group= MY_MAX(records,1);
  }
  else
  {
@@ -15050,25 +15120,39 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
                                                             1);
    if (keys_per_group == 0) /* If there is no statistics try to guess */
    {
-      /* each group contains 10% of all records */
-      keys_per_group= (table_records / 10) + 1;
+      /* each group contains 1% of all records */
+      keys_per_group= (records / 100) + 1;
    }
  }
  if (keys_per_group > 1)
-    num_groups= (table_records / keys_per_group) + 1;
+    num_groups= (records / keys_per_group) + 1;
  else
-    num_groups= table_records;
+    num_groups= records;

  /* Apply the selectivity of the quick select for group prefixes. */
  if (range_tree && (quick_prefix_records != HA_POS_ERROR))
  {
+    int groups;
    quick_prefix_selectivity= (double) quick_prefix_records /
-                              (double) table_records;
+                              (double) records;
    num_groups= (ha_rows) rint(num_groups * quick_prefix_selectivity);
+    records= quick_prefix_records;
+
    /*
-      Expect at least as many groups as there is ranges in the index
+      Try to handle cases like
      WHERE a in (1,2,3) GROUP BY a

+      If all ranges are eq_ranges for the group_key_parts we can use
+      this as the number of groups.
+    */
+    groups= index_tree->number_of_eq_groups(group_key_parts);
+    if (groups > 0)
+      num_groups= groups;
+    else
+    {
+      /*
+        Expect at least as many groups as there is ranges in the index
+
        This is mostly relevant for queries with few records, which is
        something we have a lot of in our test suites.
        In theory it is possible to scan the index_tree and for cases
@@ -15078,34 +15162,19 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
        tree.
      */
      set_if_bigger(num_groups, MY_MAX(index_tree->elements, 1));
+    }
    /* There cannot be more groups than matched records */
    set_if_smaller(num_groups, quick_prefix_records);
  }
-  DBUG_ASSERT(num_groups <= table_records);
+  DBUG_ASSERT(num_groups <= records);

-  if (used_key_parts > group_key_parts)
-  {
-    /*
-      Compute the probability that two ends of a subgroup are inside
-      different blocks.
-    */
-    keys_per_subgroup= (ha_rows) index_info->actual_rec_per_key(used_key_parts -
-                                                                1);
-    if (keys_per_subgroup >= keys_per_block) /* If a subgroup is bigger than */
-      p_overlap= 1.0;       /* a block, it will overlap at least two blocks. */
-    else
-    {
-      double blocks_per_group= (double) num_blocks / (double) num_groups;
-      p_overlap= (blocks_per_group * (keys_per_subgroup - 1)) / keys_per_group;
-      p_overlap= MY_MIN(p_overlap, 1.0);
-    }
-    io_cost= (double) MY_MIN(num_groups * (1 + p_overlap), num_blocks);
-  }
-  else
-    io_cost= ((keys_per_group > keys_per_block) ?
-              (have_min && have_max) ? (double) (num_groups + 1) :
-              (double) num_groups :
-              (double) num_blocks);
+
+  /* Calculate the number of blocks we will touch for the table or range scan */
+  num_blocks= (records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
+               INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
+
+  io_cost= (have_max) ? num_groups*2 : num_groups;
+  set_if_smaller(io_cost, num_blocks);

  /*
    CPU cost must be comparable to that of an index scan as computed
@@ -15118,13 +15187,13 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
  *read_cost= file->ha_keyread_and_compare_time(keyno, (ulong) num_groups,
                                                num_groups,
                                                io_cost);
-  *records= num_groups;
+  *out_records= num_groups;

  DBUG_PRINT("info",
-             ("table rows: %lu  keys/block: %u  keys/group: %lu  "
+             ("rows: %lu  keys/group: %lu  "
              "result rows: %lu  blocks: %lu",
-              (ulong) table_records, keys_per_block, (ulong) keys_per_group, 
-              (ulong) *records, (ulong) num_blocks));
+              (ulong) records, (ulong) keys_per_group,
+              (ulong) *out_records, (ulong) num_blocks));
  DBUG_VOID_RETURN;
 }


--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -302,6 +302,7 @@ class SEL_ARG :public Sql_alloc
 {
  static int sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
                     uint8 b_flag);
+  bool min_max_are_equal() const;
 public:
  uint8 min_flag,max_flag,maybe_flag;
  uint8 part;					// Which key part
@@ -401,6 +402,7 @@ class SEL_ARG :public Sql_alloc
      return false;
    return true;
  }
+  int number_of_eq_groups(uint group_key_parts) const;
  inline void merge_flags(SEL_ARG *arg) { maybe_flag|=arg->maybe_flag; }
  inline void maybe_smaller() { maybe_flag=1; }
  /* Return true iff it's a single-point null interval */