Selectivity: apply found_constraint heuristic only to post-join #rows.

matching_candidates_in_table() computes the number of rows one gets from the current table after applying the WHERE clause on just this table The function had a "found_counstraint heuristic" which reduced the number of rows after WHERE check by 25% if there were comparisons between key parts in table T and previous tables, like WHERE T.keyXpartY= func(prev_table.cols) Note that such comparisons can only be checked when the row of table T is joined with rows of the previous tables. It is wrong to apply the selectivity before the join operation. Fixed by moving the 'found_constraint' code to a separate function and only reducing the #rows in 'records_out'. Renamed matching_candidates_in_table() to apply_selectivity_for_table() as the function now either applies selectivity on the rows (depending on the value of thd->variables.optimizer_use_condition_selectivity) or uses the selectivity from the available range conditions.

Selectivity: apply found_constraint heuristic only to post-join #rows.
matching_candidates_in_table() computes the number of rows one gets from the current table after applying the WHERE clause on just this table The function had a "found_counstraint heuristic" which reduced the number of rows after WHERE check by 25% if there were comparisons between key parts in table T and previous tables, like WHERE T.keyXpartY= func(prev_table.cols) Note that such comparisons can only be checked when the row of table T is joined with rows of the previous tables. It is wrong to apply the selectivity before the join operation. Fixed by moving the 'found_constraint' code to a separate function and only reducing the #rows in 'records_out'. Renamed matching_candidates_in_table() to apply_selectivity_for_table() as the function now either applies selectivity on the rows (depending on the value of thd->variables.optimizer_use_condition_selectivity) or uses the selectivity from the available range conditions.
5e5988db · Monty · 33af691f · 5e5988db · 5e5988db · 5e5988db
Commit 5e5988db authored Jan 24, 2023 by Monty
4 changed files
--- a/mysql-test/main/opt_trace.result
+++ b/mysql-test/main/opt_trace.result
@@ -12537,7 +12537,7 @@ EXPLAIN
            "loops": 3,
            "rows": 10000,
            "cost": 1.701731924,
-            "filtered": 4.307688236,
+            "filtered": 3.230766058,
            "attached_condition": "t1.b < 5000 and t1.c < 1000"
          },
          "buffer_type": "flat",
@@ -12800,7 +12800,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
                              "access_type": "scan_with_join_cache",
                              "rows": 10000,
                              "rows_after_filter": 430.7688,
-                              "rows_out": 430.7688,
+                              "rows_out": 323.0766,
                              "cost": 1.701731924,
                              "index_only": false,
                              "chosen": true
@@ -12809,7 +12809,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
                          "chosen_access_method": {
                            "type": "scan",
                            "rows_read": 430.7688,
-                            "rows_out": 430.7688,
+                            "rows_out": 323.0766,
                            "cost": 1.701731924,
                            "uses_join_buffering": true
                          }
@@ -12820,8 +12820,11 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
                  {
                    "plan_prefix": "three",
                    "table": "t1",
-                    "rows_for_plan": 1292.3064,
-                    "cost_for_plan": 1.712236739
+                    "rows_for_plan": 969.2298,
+                    "cost_for_plan": 1.712236739,
+                    "pushdown_cond_selectivity": 0.75,
+                    "filtered": 3.230766,
+                    "rows_out": 323.0766
                  }
                ]
              },
@@ -12836,7 +12839,7 @@ explain format=json select * from three, t1 where t1.a=three.a and t1.b<5000 and
          },
          {
            "best_join_order": ["three", "t1"],
-            "rows": 1292.3064,
+            "rows": 969.2298,
            "cost": 1.712236739
          },
          {

--- a/mysql-test/main/subselect_sj.result
+++ b/mysql-test/main/subselect_sj.result
@@ -2436,9 +2436,9 @@ SET SESSION optimizer_switch='loosescan=off';
 EXPLAIN
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	Using where
-1	PRIMARY	t3	range	idx	idx	4	NULL	2	Using where; Using index
-1	PRIMARY	t2	ref	idx	idx	9	test.t3.b,test.t1.a	1	Using index; FirstMatch(t1)
+1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	
+1	PRIMARY	t2	range	idx	idx	4	NULL	2	Using where; Using index
+1	PRIMARY	t3	ref	idx	idx	4	test.t2.b	1	Using index; FirstMatch(t1)
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 a
 5
@@ -2446,9 +2446,9 @@ SET SESSION optimizer_switch='loosescan=on';
 EXPLAIN
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	Using where
-1	PRIMARY	t3	range	idx	idx	4	NULL	2	Using where; Using index
-1	PRIMARY	t2	ref	idx	idx	9	test.t3.b,test.t1.a	1	Using index; FirstMatch(t1)
+1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	
+1	PRIMARY	t2	range	idx	idx	4	NULL	2	Using where; Using index
+1	PRIMARY	t3	ref	idx	idx	4	test.t2.b	1	Using index; FirstMatch(t1)
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 a
 5

--- a/mysql-test/main/subselect_sj_jcl6.result
+++ b/mysql-test/main/subselect_sj_jcl6.result
@@ -2442,9 +2442,9 @@ SET SESSION optimizer_switch='loosescan=off';
 EXPLAIN
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	Using where
-1	PRIMARY	t3	range	idx	idx	4	NULL	2	Using where; Using index
-1	PRIMARY	t2	ref	idx	idx	9	test.t3.b,test.t1.a	1	Using index; FirstMatch(t1)
+1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	
+1	PRIMARY	t2	range	idx	idx	4	NULL	2	Using where; Using index
+1	PRIMARY	t3	ref	idx	idx	4	test.t2.b	1	Using index; FirstMatch(t1)
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 a
 5
@@ -2452,9 +2452,9 @@ SET SESSION optimizer_switch='loosescan=on';
 EXPLAIN
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	Using where
-1	PRIMARY	t3	range	idx	idx	4	NULL	2	Using where; Using index
-1	PRIMARY	t2	ref	idx	idx	9	test.t3.b,test.t1.a	1	Using index; FirstMatch(t1)
+1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	4	
+1	PRIMARY	t2	range	idx	idx	4	NULL	2	Using where; Using index
+1	PRIMARY	t3	ref	idx	idx	4	test.t2.b	1	Using index; FirstMatch(t1)
 SELECT * FROM t1 WHERE a IN (SELECT t2.a FROM t2,t3 WHERE t2.b = t3.b);
 a
 5

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7756,14 +7756,12 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)

 /*
  Estimate how many records we will get if we read just this table and apply
-  a part of WHERE that can be checked for it.
+  a part of WHERE that can be checked using only the current table and
+  const tables.

  @param s                      Current JOIN_TAB
-  @param with_found_constraint  There is a filtering condition on the
-                                current table. See more below.
  @param use_cond_selectivity   Value of optimizer_use_condition_selectivity.
                                If > 1 then use table->cond_selecitivity.
-
  @return 0.0                   No matching rows
  @return >= 1.0                Number of expected matching rows

@@ -7771,28 +7769,15 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
  Estimate how many records we will get if we
   - read the given table with its "independent" access method (either quick 
     select or full table/index scan),
-   - apply the part of WHERE that refers only to this table.
+   - apply the part of WHERE that refers only to this table and const tables.
   - The result cannot be bigger than table records

-   'with_found_constraint' is true if the WHERE has a top level expression of
-   type:
-   current_table.key_part=expression_that_contains_fields_from_previous_tables
-   Examples (assuming join order t1,t2):
-   WHERE t2.keypart1=t1.some_field
-   WHERE t2.keypart1=t1.some_field+t1.other_field
-
-   TODO:
-   Extend with_found_constraint' to be set for a top level expression of type
-   X=Y where X and Y has fields from current table and at least one field from
-   one or more previous tables.
-
  @see also
    table_after_join_selectivity() produces selectivity of condition that is
    checked after joining rows from this table to rows from preceding tables.
 */

-static double matching_candidates_in_table(JOIN_TAB *s,
-                                           bool with_found_constraint,
+static double apply_selectivity_for_table(JOIN_TAB *s,
                                           uint use_cond_selectivity)
 {
  double dbl_records;
@@ -7826,19 +7811,6 @@ static double matching_candidates_in_table(JOIN_TAB *s,
    dbl_records= rows2double(s->table->opt_range_condition_rows);
  }

-  /*
-    If there is a filtering condition on the table (i.e. ref analyzer found
-    at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
-    preceding this table in the join order we're now considering), then 
-    assume that 25% of the rows will be filtered out by this condition.
-
-    This heuristic is supposed to force tables used in exprZ to be before
-    this table in join order.
-  */
-
-  if (with_found_constraint)
-    dbl_records-= dbl_records/4;
-
  DBUG_ASSERT(dbl_records <= s->records);
  /*
    Ensure we return at least one row if there is any possibility to have
@@ -7853,6 +7825,28 @@ static double matching_candidates_in_table(JOIN_TAB *s,
 }


+/*
+  Take into account that the table's WHERE clause has conditions on earlier
+  tables that can reduce the number of accepted rows.
+
+  @param records  Number of original rows (after selectivity)
+
+  If there is a filtering condition on the table (i.e. ref analyzer found
+  at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
+  preceding this table in the join order we're now considering), then
+  assume that 25% of the rows will be filtered out by this condition.
+
+  This heuristic is supposed to force tables used in exprZ to be before
+  this table in join order.
+*/
+
+static double use_found_constraint(double records)
+{
+  records-= records/4;
+  return records ? MY_MAX(records, MIN_ROWS_AFTER_FILTERING) : 0.0;
+}
+
+
 /*
  Calculate the cost of reading a set of rows trough an index

@@ -8240,9 +8234,6 @@ best_access_path(JOIN      *join,
          {
            found_part|= keyuse->keypart_map;
            key_parts_dependent= 0;
-            found_constraint|= (keyuse->used_tables &
-                                ~(remaining_tables |
-                                  join->const_table_map));
            if (!(keyuse->used_tables & ~join->const_table_map))
              const_part|= keyuse->keypart_map;

@@ -8265,6 +8256,16 @@ best_access_path(JOIN      *join,
 	    */
            if (keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL)
              ref_or_null_part |= keyuse->keypart_map;
+
+            /*
+              Remember if there is a WHERE condition that contains
+              'key_part=expression_with_only_accessible_tables'
+              We ignore const tables as these are handled by selectivity
+              code (const table fields are treated as constants).
+            */
+            found_constraint|= (keyuse->used_tables &
+                                ~(remaining_tables |
+                                  join->const_table_map));
          }
          else if (!(found_part & keyuse->keypart_map))
            key_parts_dependent|= keyuse->used_tables;
@@ -8838,12 +8839,15 @@ best_access_path(JOIN      *join,
      (!(table->map & join->outer_join) ||
       join->allowed_outer_join_with_cache))    // (2)
  {
-    double refills, row_copy_cost, cmp_time, cur_cost;
+    double refills, row_copy_cost, cmp_time, cur_cost, records_table_filter;
    /* Estimate the cost of the hash join access to the table */
-    double rnd_records= matching_candidates_in_table(s, 0,
-                                                     use_cond_selectivity);
+    double rnd_records= apply_selectivity_for_table(s, use_cond_selectivity);
+    records_table_filter= ((found_constraint) ?
+                           use_found_constraint(rnd_records) :
+                           rnd_records);
+
    DBUG_ASSERT(rnd_records <= s->found_records);
-    set_if_smaller(best.records_out, rnd_records);
+    set_if_smaller(best.records_out, records_table_filter);

    /*
      The following cost calculation is identical to the cost calculation for
@@ -9053,13 +9057,19 @@ best_access_path(JOIN      *join,
    }
    else
    {
+      double records_table_filter;
+
      /* We will now calculate cost of scan, with or without join buffer */
      records_best_filter= records_after_filter=
-        matching_candidates_in_table(s, 0, use_cond_selectivity);
+        apply_selectivity_for_table(s, use_cond_selectivity);
+      records_table_filter= ((found_constraint) ?
+                             use_found_constraint(records_after_filter) :
+                             records_after_filter);
+
      DBUG_ASSERT(records_after_filter <= s->records);
      DBUG_ASSERT(records_after_filter <= s->found_records);

-      set_if_smaller(best.records_out, records_after_filter);
+      set_if_smaller(best.records_out, records_table_filter);

      org_records= rows2double(s->records);

@@ -10380,7 +10390,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
    with previous tables.

    For quick selects and full table scans, selectivity of COND(this_table)
-    is accounted for in matching_candidates_in_table(). Here, we only count
+    is accounted for in apply_selectivity_for_table(). Here, we only count
    selectivity of COND(this_table, previous_tables). 

    For other access methods, we need to calculate selectivity of the whole