Commit 5e5988db authored by Monty's avatar Monty

Selectivity: apply found_constraint heuristic only to post-join #rows.

matching_candidates_in_table() computes the number of rows one
gets from the current table after applying the WHERE clause on
just this table

The function had a "found_counstraint heuristic" which reduced the
number of rows after WHERE check by 25% if there were comparisons
between key parts in table T and previous tables, like WHERE
T.keyXpartY= func(prev_table.cols)

Note that such comparisons can only be checked when the row of
table T is joined with rows of the previous tables. It is wrong
to apply the selectivity before the join operation.

Fixed by moving the 'found_constraint' code to a separate function
and only reducing the #rows in 'records_out'.

Renamed matching_candidates_in_table() to apply_selectivity_for_table() as
the function now either applies selectivity on the rows (depending
on the value of thd->variables.optimizer_use_condition_selectivity)
or uses the selectivity from the available range conditions.
parent 33af691f
...@@ -12537,7 +12537,7 @@ EXPLAIN ...@@ -12537,7 +12537,7 @@ EXPLAIN
"loops": 3, "loops": 3,
"rows": 10000, "rows": 10000,
"cost": 1.701731924, "cost": 1.701731924,
"filtered": 4.307688236, "filtered": 3.230766058,
"attached_condition": "t1.b < 5000 and t1.c < 1000" "attached_condition": "t1.b < 5000 and t1.c < 1000"
}, },
"buffer_type": "flat", "buffer_type": "flat",
...@@ -12800,7 +12800,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and ...@@ -12800,7 +12800,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
"access_type": "scan_with_join_cache", "access_type": "scan_with_join_cache",
"rows": 10000, "rows": 10000,
"rows_after_filter": 430.7688, "rows_after_filter": 430.7688,
"rows_out": 430.7688, "rows_out": 323.0766,
"cost": 1.701731924, "cost": 1.701731924,
"index_only": false, "index_only": false,
"chosen": true "chosen": true
...@@ -12809,7 +12809,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and ...@@ -12809,7 +12809,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
"chosen_access_method": { "chosen_access_method": {
"type": "scan", "type": "scan",
"rows_read": 430.7688, "rows_read": 430.7688,
"rows_out": 430.7688, "rows_out": 323.0766,
"cost": 1.701731924, "cost": 1.701731924,
"uses_join_buffering": true "uses_join_buffering": true
} }
...@@ -12820,8 +12820,11 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and ...@@ -12820,8 +12820,11 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
{ {
"plan_prefix": "three", "plan_prefix": "three",
"table": "t1", "table": "t1",
"rows_for_plan": 1292.3064, "rows_for_plan": 969.2298,
"cost_for_plan": 1.712236739 "cost_for_plan": 1.712236739,
"pushdown_cond_selectivity": 0.75,
"filtered": 3.230766,
"rows_out": 323.0766
} }
] ]
}, },
...@@ -12836,7 +12839,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and ...@@ -12836,7 +12839,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
}, },
{ {
"best_join_order": ["three", "t1"], "best_join_order": ["three", "t1"],
"rows": 1292.3064, "rows": 969.2298,
"cost": 1.712236739 "cost": 1.712236739
}, },
{ {
......
...@@ -2436,9 +2436,9 @@ SET SESSION optimizer_switch='loosescan=off'; ...@@ -2436,9 +2436,9 @@ SET SESSION optimizer_switch='loosescan=off';
EXPLAIN EXPLAIN
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 4 Using where 1 PRIMARY t1 ALL NULL NULL NULL NULL 4
1 PRIMARY t3 range idx idx 4 NULL 2 Using where; Using index 1 PRIMARY t2 range idx idx 4 NULL 2 Using where; Using index
1 PRIMARY t2 ref idx idx 9 test.t3.b,test.t1.a 1 Using index; FirstMatch(t1) 1 PRIMARY t3 ref idx idx 4 test.t2.b 1 Using index; FirstMatch(t1)
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
a a
5 5
...@@ -2446,9 +2446,9 @@ SET SESSION optimizer_switch='loosescan=on'; ...@@ -2446,9 +2446,9 @@ SET SESSION optimizer_switch='loosescan=on';
EXPLAIN EXPLAIN
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 4 Using where 1 PRIMARY t1 ALL NULL NULL NULL NULL 4
1 PRIMARY t3 range idx idx 4 NULL 2 Using where; Using index 1 PRIMARY t2 range idx idx 4 NULL 2 Using where; Using index
1 PRIMARY t2 ref idx idx 9 test.t3.b,test.t1.a 1 Using index; FirstMatch(t1) 1 PRIMARY t3 ref idx idx 4 test.t2.b 1 Using index; FirstMatch(t1)
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
a a
5 5
......
...@@ -2442,9 +2442,9 @@ SET SESSION optimizer_switch='loosescan=off'; ...@@ -2442,9 +2442,9 @@ SET SESSION optimizer_switch='loosescan=off';
EXPLAIN EXPLAIN
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 4 Using where 1 PRIMARY t1 ALL NULL NULL NULL NULL 4
1 PRIMARY t3 range idx idx 4 NULL 2 Using where; Using index 1 PRIMARY t2 range idx idx 4 NULL 2 Using where; Using index
1 PRIMARY t2 ref idx idx 9 test.t3.b,test.t1.a 1 Using index; FirstMatch(t1) 1 PRIMARY t3 ref idx idx 4 test.t2.b 1 Using index; FirstMatch(t1)
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
a a
5 5
...@@ -2452,9 +2452,9 @@ SET SESSION optimizer_switch='loosescan=on'; ...@@ -2452,9 +2452,9 @@ SET SESSION optimizer_switch='loosescan=on';
EXPLAIN EXPLAIN
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 4 Using where 1 PRIMARY t1 ALL NULL NULL NULL NULL 4
1 PRIMARY t3 range idx idx 4 NULL 2 Using where; Using index 1 PRIMARY t2 range idx idx 4 NULL 2 Using where; Using index
1 PRIMARY t2 ref idx idx 9 test.t3.b,test.t1.a 1 Using index; FirstMatch(t1) 1 PRIMARY t3 ref idx idx 4 test.t2.b 1 Using index; FirstMatch(t1)
SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b); SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
a a
5 5
......
...@@ -7756,14 +7756,12 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key) ...@@ -7756,14 +7756,12 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
/* /*
Estimate how many records we will get if we read just this table and apply Estimate how many records we will get if we read just this table and apply
a part of WHERE that can be checked for it. a part of WHERE that can be checked using only the current table and
const tables.
@param s Current JOIN_TAB @param s Current JOIN_TAB
@param with_found_constraint There is a filtering condition on the
current table. See more below.
@param use_cond_selectivity Value of optimizer_use_condition_selectivity. @param use_cond_selectivity Value of optimizer_use_condition_selectivity.
If > 1 then use table->cond_selecitivity. If > 1 then use table->cond_selecitivity.
@return 0.0 No matching rows @return 0.0 No matching rows
@return >= 1.0 Number of expected matching rows @return >= 1.0 Number of expected matching rows
...@@ -7771,28 +7769,15 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key) ...@@ -7771,28 +7769,15 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
Estimate how many records we will get if we Estimate how many records we will get if we
- read the given table with its "independent" access method (either quick - read the given table with its "independent" access method (either quick
select or full table/index scan), select or full table/index scan),
- apply the part of WHERE that refers only to this table. - apply the part of WHERE that refers only to this table and const tables.
- The result cannot be bigger than table records - The result cannot be bigger than table records
'with_found_constraint' is true if the WHERE has a top level expression of
type:
current_table.key_part=expression_that_contains_fields_from_previous_tables
Examples (assuming join order t1,t2):
WHERE t2.keypart1=t1.some_field
WHERE t2.keypart1=t1.some_field+t1.other_field
TODO:
Extend with_found_constraint' to be set for a top level expression of type
X=Y where X and Y has fields from current table and at least one field from
one or more previous tables.
@see also @see also
table_after_join_selectivity() produces selectivity of condition that is table_after_join_selectivity() produces selectivity of condition that is
checked after joining rows from this table to rows from preceding tables. checked after joining rows from this table to rows from preceding tables.
*/ */
static double matching_candidates_in_table(JOIN_TAB *s, static double apply_selectivity_for_table(JOIN_TAB *s,
bool with_found_constraint,
uint use_cond_selectivity) uint use_cond_selectivity)
{ {
double dbl_records; double dbl_records;
...@@ -7826,19 +7811,6 @@ static double matching_candidates_in_table(JOIN_TAB *s, ...@@ -7826,19 +7811,6 @@ static double matching_candidates_in_table(JOIN_TAB *s,
dbl_records= rows2double(s->table->opt_range_condition_rows); dbl_records= rows2double(s->table->opt_range_condition_rows);
} }
/*
If there is a filtering condition on the table (i.e. ref analyzer found
at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
preceding this table in the join order we're now considering), then
assume that 25% of the rows will be filtered out by this condition.
This heuristic is supposed to force tables used in exprZ to be before
this table in join order.
*/
if (with_found_constraint)
dbl_records-= dbl_records/4;
DBUG_ASSERT(dbl_records <= s->records); DBUG_ASSERT(dbl_records <= s->records);
/* /*
Ensure we return at least one row if there is any possibility to have Ensure we return at least one row if there is any possibility to have
...@@ -7853,6 +7825,28 @@ static double matching_candidates_in_table(JOIN_TAB *s, ...@@ -7853,6 +7825,28 @@ static double matching_candidates_in_table(JOIN_TAB *s,
} }
/*
Take into account that the table's WHERE clause has conditions on earlier
tables that can reduce the number of accepted rows.
@param records Number of original rows (after selectivity)
If there is a filtering condition on the table (i.e. ref analyzer found
at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
preceding this table in the join order we're now considering), then
assume that 25% of the rows will be filtered out by this condition.
This heuristic is supposed to force tables used in exprZ to be before
this table in join order.
*/
static double use_found_constraint(double records)
{
records-= records/4;
return records ? MY_MAX(records, MIN_ROWS_AFTER_FILTERING) : 0.0;
}
/* /*
Calculate the cost of reading a set of rows trough an index Calculate the cost of reading a set of rows trough an index
...@@ -8240,9 +8234,6 @@ best_access_path(JOIN *join, ...@@ -8240,9 +8234,6 @@ best_access_path(JOIN *join,
{ {
found_part|= keyuse->keypart_map; found_part|= keyuse->keypart_map;
key_parts_dependent= 0; key_parts_dependent= 0;
found_constraint|= (keyuse->used_tables &
~(remaining_tables |
join->const_table_map));
if (!(keyuse->used_tables & ~join->const_table_map)) if (!(keyuse->used_tables & ~join->const_table_map))
const_part|= keyuse->keypart_map; const_part|= keyuse->keypart_map;
...@@ -8265,6 +8256,16 @@ best_access_path(JOIN *join, ...@@ -8265,6 +8256,16 @@ best_access_path(JOIN *join,
*/ */
if (keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL) if (keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL)
ref_or_null_part |= keyuse->keypart_map; ref_or_null_part |= keyuse->keypart_map;
/*
Remember if there is a WHERE condition that contains
'key_part=expression_with_only_accessible_tables'
We ignore const tables as these are handled by selectivity
code (const table fields are treated as constants).
*/
found_constraint|= (keyuse->used_tables &
~(remaining_tables |
join->const_table_map));
} }
else if (!(found_part & keyuse->keypart_map)) else if (!(found_part & keyuse->keypart_map))
key_parts_dependent|= keyuse->used_tables; key_parts_dependent|= keyuse->used_tables;
...@@ -8838,12 +8839,15 @@ best_access_path(JOIN *join, ...@@ -8838,12 +8839,15 @@ best_access_path(JOIN *join,
(!(table->map & join->outer_join) || (!(table->map & join->outer_join) ||
join->allowed_outer_join_with_cache)) // (2) join->allowed_outer_join_with_cache)) // (2)
{ {
double refills, row_copy_cost, cmp_time, cur_cost; double refills, row_copy_cost, cmp_time, cur_cost, records_table_filter;
/* Estimate the cost of the hash join access to the table */ /* Estimate the cost of the hash join access to the table */
double rnd_records= matching_candidates_in_table(s, 0, double rnd_records= apply_selectivity_for_table(s, use_cond_selectivity);
use_cond_selectivity); records_table_filter= ((found_constraint) ?
use_found_constraint(rnd_records) :
rnd_records);
DBUG_ASSERT(rnd_records <= s->found_records); DBUG_ASSERT(rnd_records <= s->found_records);
set_if_smaller(best.records_out, rnd_records); set_if_smaller(best.records_out, records_table_filter);
/* /*
The following cost calculation is identical to the cost calculation for The following cost calculation is identical to the cost calculation for
...@@ -9053,13 +9057,19 @@ best_access_path(JOIN *join, ...@@ -9053,13 +9057,19 @@ best_access_path(JOIN *join,
} }
else else
{ {
double records_table_filter;
/* We will now calculate cost of scan, with or without join buffer */ /* We will now calculate cost of scan, with or without join buffer */
records_best_filter= records_after_filter= records_best_filter= records_after_filter=
matching_candidates_in_table(s, 0, use_cond_selectivity); apply_selectivity_for_table(s, use_cond_selectivity);
records_table_filter= ((found_constraint) ?
use_found_constraint(records_after_filter) :
records_after_filter);
DBUG_ASSERT(records_after_filter <= s->records); DBUG_ASSERT(records_after_filter <= s->records);
DBUG_ASSERT(records_after_filter <= s->found_records); DBUG_ASSERT(records_after_filter <= s->found_records);
set_if_smaller(best.records_out, records_after_filter); set_if_smaller(best.records_out, records_table_filter);
org_records= rows2double(s->records); org_records= rows2double(s->records);
...@@ -10380,7 +10390,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, ...@@ -10380,7 +10390,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
with previous tables. with previous tables.
For quick selects and full table scans, selectivity of COND(this_table) For quick selects and full table scans, selectivity of COND(this_table)
is accounted for in matching_candidates_in_table(). Here, we only count is accounted for in apply_selectivity_for_table(). Here, we only count
selectivity of COND(this_table, previous_tables). selectivity of COND(this_table, previous_tables).
For other access methods, we need to calculate selectivity of the whole For other access methods, we need to calculate selectivity of the whole
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment