Commit 7afa819f authored by Monty's avatar Monty Committed by Sergei Petrunia

Fix cost calculation for get_best_group_min_max()

If the final range restrictions (SEL_ARG tree) over GROUP BY
columns are single-point, we can compute the number of GROUP BY groups.

Example: in the query:
SELECT ... FROM tbl
WHERE keypart1 IN (1,2,3) and keypart2 IN ('foo','bar')

Other things:
- Fixed cost calculation to more correctly count the number of blocks
  that may be read. The old code could use the total blocks in the file
  even if a range was available.
parent 009db228
......@@ -552,12 +552,12 @@ a b
3 1
explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using temporary; Using filesort
1 SIMPLE t2 ALL a NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join)
1 SIMPLE t2 ALL a NULL NULL NULL 4 Using temporary; Using filesort
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where; Using join buffer (flat, BNL join)
explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b ORDER BY NULL;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using temporary
1 SIMPLE t2 ALL a NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join)
1 SIMPLE t2 ALL a NULL NULL NULL 4 Using temporary
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where; Using join buffer (flat, BNL join)
drop table t1,t2;
SET @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity,@@optimizer_switch=@save_optimizer_switch;
create table t1 (a int, b int);
......@@ -890,7 +890,8 @@ Level Code Message
drop table t1, t2;
CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY (a));
INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'),
(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
(100,"No land"), (101,"No land");
CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY (a), KEY b (b));
INSERT INTO t2 VALUES (1,1), (2,1), (6,6), (18,17), (15,25), (16,25),
(17,25), (10,54), (5,62),(3,68);
......@@ -1352,7 +1353,7 @@ id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range PRIMARY,i2 PRIMARY 4 NULL 1 Using where; Using index
EXPLAIN SELECT a FROM t1 WHERE a < 2 GROUP BY a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range PRIMARY,i2 PRIMARY 4 NULL 1 Using where; Using index
1 SIMPLE t1 range PRIMARY,i2 i2 4 NULL 1 Using where; Using index for group-by
EXPLAIN SELECT a FROM t1 IGNORE INDEX (PRIMARY,i2);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 144
......@@ -1701,7 +1702,7 @@ NULL 1
1 2
EXPLAIN SELECT a from t2 GROUP BY a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 index NULL a 10 NULL 6 Using index
1 SIMPLE t2 range NULL a 5 NULL 6 Using index for group-by
SELECT a from t2 GROUP BY a;
a
NULL
......@@ -1715,10 +1716,17 @@ NULL
1
2
insert into t2 SELECT NULL, NULL from seq_1_to_10;
EXPLAIN SELECT b from t2 GROUP BY a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 index NULL a 10 NULL 16 Using index
# Expect: Using index for group-by
analyze table t2;
Table Op Msg_type Msg_text
test.t2 analyze status Engine-independent statistics collected
test.t2 analyze status OK
EXPLAIN SELECT b from t2 GROUP BY a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 range NULL a 5 NULL 9 Using index for group-by
1 SIMPLE t2 range NULL a 5 NULL 6 Using index for group-by
DROP TABLE t1;
DROP TABLE t2;
CREATE TABLE t1 ( a INT, b INT );
......@@ -2255,11 +2263,11 @@ INSERT INTO t2(col1, col2) VALUES
explain
select col1 f1, col2 f2, col1 f3 from t2 group by f1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by
1 SIMPLE t2 index NULL idx 10 NULL 20 Using index
explain
select SQL_BIG_RESULT col1 f1, col2 f2, col1 f3 from t2 group by f1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by
1 SIMPLE t2 index NULL idx 10 NULL 20 Using index; Using filesort
explain
select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2;
id select_type table type possible_keys key key_len ref rows Extra
......@@ -2267,7 +2275,7 @@ id select_type table type possible_keys key key_len ref rows Extra
explain
select col1 f1, col1 f2 from t2 group by f1, 1+1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by
1 SIMPLE t2 index NULL idx 10 NULL 20 Using index
explain
select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2, f3+0;
id select_type table type possible_keys key key_len ref rows Extra
......@@ -2457,7 +2465,7 @@ test.t1 analyze status OK
EXPLAIN SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range b b 9 NULL 2 Using where; Using index for group-by; Using temporary
1 SIMPLE t1 range b b 9 NULL 1 Using where; Using index for group-by; Using temporary
SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
MIN(a) b
......@@ -2465,7 +2473,7 @@ MIN(a) b
EXPLAIN SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range b b 9 NULL 2 Using where; Using index for group-by
1 SIMPLE t1 range b b 9 NULL 1 Using where; Using index for group-by
SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
MIN(a) b
......@@ -2994,3 +3002,34 @@ drop table t20, t21, t22;
#
# End of 10.3 tests
#
#
# Test new group_min_max optimization
#
create table t1 (a int, b int, c int, key(a,b,c));
insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
explain select a from t1 where a in (1,2,3) group by a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 5 NULL 3 Using where; Using index for group-by
explain select a from t1 where a in (1,2,3) or a = 22 group by a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 5 NULL 4 Using where; Using index for group-by
explain select a from t1 where a in (1,2,3) and a < 3 group by a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 5 NULL 2 Using where; Using index for group-by
explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 10 NULL 9 Using where; Using index for group-by
explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 10 NULL 3 Using where; Using index for group-by
explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 15 NULL 3 Using where; Using index for group-by
# Will not use index for group-by
explain select a from t1 where a in (1,2,3) and b>1 group by a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 10 NULL 983 Using where; Using index
explain select a from t1 where a in (1,2,3) and c=1 group by a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 range a a 5 NULL 1161 Using where; Using index
drop table t1;
......@@ -718,7 +718,8 @@ drop table t1, t2;
CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY (a));
INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'),
(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
(100,"No land"), (101,"No land");
CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY (a), KEY b (b));
......@@ -1168,7 +1169,9 @@ SELECT b from t2 GROUP BY b;
# Show that we are using 'range' when there is more NULL rows in the table
insert into t2 SELECT NULL, NULL from seq_1_to_10;
EXPLAIN SELECT b from t2 GROUP BY a;
--echo # Expect: Using index for group-by
analyze table t2;
EXPLAIN SELECT b from t2 GROUP BY a;
DROP TABLE t1;
......@@ -2131,3 +2134,22 @@ drop table t20, t21, t22;
--echo #
--echo # End of 10.3 tests
--echo #
--echo #
--echo # Test new group_min_max optimization
--echo #
create table t1 (a int, b int, c int, key(a,b,c));
insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
explain select a from t1 where a in (1,2,3) group by a;
explain select a from t1 where a in (1,2,3) or a = 22 group by a;
explain select a from t1 where a in (1,2,3) and a < 3 group by a;
explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
--echo # Will not use index for group-by
explain select a from t1 where a in (1,2,3) and b>1 group by a;
explain select a from t1 where a in (1,2,3) and c=1 group by a;
drop table t1;
......@@ -2174,6 +2174,27 @@ int SEL_ARG::sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
}
/*
Check if min and values are equal
@return 1 if equal
*/
bool SEL_ARG::min_max_are_equal() const
{
uint offset= 0;
if (field->real_maybe_null()) // If null is part of key
{
if (*min_value != *max_value)
return 0;
if (*min_value)
return 1; // NULL where equal
offset= 1; // Skip NULL marker
}
return field->key_cmp(min_value+offset, max_value+offset) == 0;
}
SEL_ARG *SEL_ARG::clone_tree(RANGE_OPT_PARAM *param)
{
SEL_ARG tmp_link,*next_arg,*root;
......@@ -11040,6 +11061,50 @@ uint SEL_ARG::get_max_key_part() const
}
/**
Compute the number of eq_ranges top elements in the tree
This is used by the cost_group_min_max() to calculate the number of
groups in SEL_TREE
@param group_key_parts number of key parts that must be equal
@return < 0 Not known
@return >= 0 Number of groups
*/
int SEL_ARG::number_of_eq_groups(uint group_key_parts) const
{
int elements= 0;
SEL_ARG const *cur;
if (part > group_key_parts-1 || type != KEY_RANGE)
return -1;
cur= first();
do
{
if ((cur->min_flag | cur->min_flag) &
(NO_MIN_RANGE | NO_MAX_RANGE | NEAR_MIN | NEAR_MAX | GEOM_FLAG))
return -1;
if (min_value != max_value && !min_max_are_equal())
return -1;
if (part != group_key_parts -1)
{
int tmp;
if (!next_key_part)
return -1;
if ((tmp= next_key_part->number_of_eq_groups(group_key_parts)) < 0)
return -1;
elements+= tmp;
}
else
elements++;
} while ((cur= cur->next));
return elements;
}
/*
Remove the SEL_ARG graph elements which have part > max_part.
......@@ -11092,8 +11157,8 @@ void prune_sel_arg_graph(SEL_ARG *sel_arg, uint max_part)
@return
tree pointer The tree after processing,
NULL If it was not possible to reduce the weight of the tree below the
limit.
NULL If it was not possible to reduce the weight of the tree below
the limit.
*/
SEL_ARG *enforce_sel_arg_weight_limit(RANGE_OPT_PARAM *param, uint keyno,
......@@ -14950,7 +15015,7 @@ get_field_keypart(KEY *index, Field *field)
have_min [in] True if there is a MIN function
have_max [in] True if there is a MAX function
read_cost [out] The cost to retrieve rows via this quick select
records [out] The number of rows retrieved
out_records [out] The number of rows retrieved
DESCRIPTION
This method computes the access cost of a TRP_GROUP_MIN_MAX instance and
......@@ -14968,6 +15033,20 @@ get_field_keypart(KEY *index, Field *field)
either scan the index for the next value or do a new index dive
with 'find next bigger key'.
When using MIN() and MAX() in the query, the calls to the storage engine
are as follows for each group:
Assuming kp1 in ('abc','def','ghi)' and kp2 between 1000 and 2000
read_key('abc', HA_READ_KEY_OR_NEXT)
In case of MIN() we do:
read_key('abc,:'1000', HA_READ_KEY_OR_NEXT)
In case of MAX() we do
read_key('abc,:'2000', HA_READ_PREFIX_LAST_OR_PREV)
In the following code we will assume that the MIN key will be in
the same block as the first key read.
(We should try to optimize away the extra call for MAX() at some
point).
NOTES
See get_best_group_min_max() for which kind of queries this function
will be called.
......@@ -15013,36 +15092,27 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
uint group_key_parts, SEL_TREE *range_tree,
SEL_ARG *index_tree, ha_rows quick_prefix_records,
bool have_min, bool have_max,
double *read_cost, ha_rows *records)
double *read_cost, ha_rows *out_records)
{
uint keys_per_block, key_length;
ha_rows table_records;
uint key_length;
ha_rows records;
ha_rows num_groups;
ha_rows num_blocks;
ha_rows keys_per_group;
ha_rows keys_per_subgroup; /* Average number of keys in sub-groups */
/* formed by a key infix. */
double p_overlap; /* Probability that a sub-group overlaps two blocks. */
double quick_prefix_selectivity;
double io_cost;
ulonglong io_cost;
handler *file= table->file;
DBUG_ENTER("cost_group_min_max");
/* Same code as in handler::key_read_time() */
table_records= table->stat_records();
records= table->stat_records();
key_length= (index_info->key_length + file->ref_length);
num_blocks= (table_records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
keys_per_block= (file->stats.block_size /
(key_length * INDEX_BLOCK_FILL_FACTOR_MUL /
INDEX_BLOCK_FILL_FACTOR_DIV) +
1);
/* Compute the number of keys in a group. */
if (!group_key_parts)
{
/* Summary over the whole table */
keys_per_group= MY_MAX(table_records,1);
keys_per_group= MY_MAX(records,1);
}
else
{
......@@ -15050,25 +15120,39 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
1);
if (keys_per_group == 0) /* If there is no statistics try to guess */
{
/* each group contains 10% of all records */
keys_per_group= (table_records / 10) + 1;
/* each group contains 1% of all records */
keys_per_group= (records / 100) + 1;
}
}
if (keys_per_group > 1)
num_groups= (table_records / keys_per_group) + 1;
num_groups= (records / keys_per_group) + 1;
else
num_groups= table_records;
num_groups= records;
/* Apply the selectivity of the quick select for group prefixes. */
if (range_tree && (quick_prefix_records != HA_POS_ERROR))
{
int groups;
quick_prefix_selectivity= (double) quick_prefix_records /
(double) table_records;
(double) records;
num_groups= (ha_rows) rint(num_groups * quick_prefix_selectivity);
records= quick_prefix_records;
/*
Expect at least as many groups as there is ranges in the index
Try to handle cases like
WHERE a in (1,2,3) GROUP BY a
If all ranges are eq_ranges for the group_key_parts we can use
this as the number of groups.
*/
groups= index_tree->number_of_eq_groups(group_key_parts);
if (groups > 0)
num_groups= groups;
else
{
/*
Expect at least as many groups as there is ranges in the index
This is mostly relevant for queries with few records, which is
something we have a lot of in our test suites.
In theory it is possible to scan the index_tree and for cases
......@@ -15078,34 +15162,19 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
tree.
*/
set_if_bigger(num_groups, MY_MAX(index_tree->elements, 1));
}
/* There cannot be more groups than matched records */
set_if_smaller(num_groups, quick_prefix_records);
}
DBUG_ASSERT(num_groups <= table_records);
DBUG_ASSERT(num_groups <= records);
if (used_key_parts > group_key_parts)
{
/*
Compute the probability that two ends of a subgroup are inside
different blocks.
*/
keys_per_subgroup= (ha_rows) index_info->actual_rec_per_key(used_key_parts -
1);
if (keys_per_subgroup >= keys_per_block) /* If a subgroup is bigger than */
p_overlap= 1.0; /* a block, it will overlap at least two blocks. */
else
{
double blocks_per_group= (double) num_blocks / (double) num_groups;
p_overlap= (blocks_per_group * (keys_per_subgroup - 1)) / keys_per_group;
p_overlap= MY_MIN(p_overlap, 1.0);
}
io_cost= (double) MY_MIN(num_groups * (1 + p_overlap), num_blocks);
}
else
io_cost= ((keys_per_group > keys_per_block) ?
(have_min && have_max) ? (double) (num_groups + 1) :
(double) num_groups :
(double) num_blocks);
/* Calculate the number of blocks we will touch for the table or range scan */
num_blocks= (records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
io_cost= (have_max) ? num_groups*2 : num_groups;
set_if_smaller(io_cost, num_blocks);
/*
CPU cost must be comparable to that of an index scan as computed
......@@ -15118,13 +15187,13 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
*read_cost= file->ha_keyread_and_compare_time(keyno, (ulong) num_groups,
num_groups,
io_cost);
*records= num_groups;
*out_records= num_groups;
DBUG_PRINT("info",
("table rows: %lu keys/block: %u keys/group: %lu "
("rows: %lu keys/group: %lu "
"result rows: %lu blocks: %lu",
(ulong) table_records, keys_per_block, (ulong) keys_per_group,
(ulong) *records, (ulong) num_blocks));
(ulong) records, (ulong) keys_per_group,
(ulong) *out_records, (ulong) num_blocks));
DBUG_VOID_RETURN;
}
......
......@@ -302,6 +302,7 @@ class SEL_ARG :public Sql_alloc
{
static int sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
uint8 b_flag);
bool min_max_are_equal() const;
public:
uint8 min_flag,max_flag,maybe_flag;
uint8 part; // Which key part
......@@ -401,6 +402,7 @@ class SEL_ARG :public Sql_alloc
return false;
return true;
}
int number_of_eq_groups(uint group_key_parts) const;
inline void merge_flags(SEL_ARG *arg) { maybe_flag|=arg->maybe_flag; }
inline void maybe_smaller() { maybe_flag=1; }
/* Return true iff it's a single-point null interval */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment