MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value

- Fix table_cond_selectivity() to work correctly for ref access and "keypart2=const" case.

MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
- Fix table_cond_selectivity() to work correctly for ref access and "keypart2=const" case.
2acd81af · Sergey Petrunya · 182f7d76 · 2acd81af · 2acd81af · 2acd81af
Commit 2acd81af authored Apr 25, 2014 by Sergey Petrunya
3 changed files
--- a/mysql-test/r/selectivity_no_engine.result
+++ b/mysql-test/r/selectivity_no_engine.result
@@ -139,6 +139,46 @@ Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
 drop table t2, t1;
 # 
+# MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
+# 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int, kp2 int, 
+filler1 char(100),
+filler2 char(100),
+key(kp1, kp2)
+);
+insert into t1 
+select 
+A.a,
+B.a,
+'filler-data-1',
+'filler-data-2'
+from t0 A, t0 B, t0 C;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	Table is already up to date
+# NOTE: 10*100%, 10*100% rows is ok
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
+1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,func	10	100.00	Using index condition
+Warnings:
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1)))
+# NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
+1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,const	10	100.00	
+Warnings:
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4))
+drop table t0, t1;
+# 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;

--- a/mysql-test/t/selectivity_no_engine.test
+++ b/mysql-test/t/selectivity_no_engine.test
@@ -101,6 +101,36 @@ analyze table t2 persistent for all;
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 drop table t2, t1;
+--echo # 
+--echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
+--echo # 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+  kp1 int, kp2 int, 
+  filler1 char(100),
+  filler2 char(100),
+  key(kp1, kp2)
+);
+insert into t1 
+select 
+  A.a,
+  B.a,
+  'filler-data-1',
+  'filler-data-2'
+from t0 A, t0 B, t0 C;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+--echo # NOTE: 10*100%, 10*100% rows is ok
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
+--echo # NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
+drop table t0, t1;
 --echo # 
 --echo # End of the test file
 --echo # 

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7287,18 +7287,74 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
  if (pos->key != 0)
  {
    /* 
-      A ref access or hash join is used for this table.
+      A ref access or hash join is used for this table. ref access is created
+      from
-      It could have some parts with "t.key_part=const". Using ref access
+        tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ...
-      means that we will only get records where the condition holds, so we
-      should remove its selectivity from the condition selectivity.
+      and it will only return rows for which this condition is satisified.
+      Suppose, certain expr{i} is a constant. Since ref access only returns
+      rows that satisfy
+         tbl.keypart{i}=const       (*)
+      then selectivity of this equality should not be counted in return value 
+      of this function. This function uses the value of 
+         table->cond_selectivity=selectivity(COND(tbl)) (**)
+      as a starting point. This value includes selectivity of equality (*). We
+      should somehow discount it. 
+      Looking at calculate_cond_selectivity_for_table(), one can see that that
+      the value is not necessarily a direct multiplicand in 
+      table->cond_selectivity
+      There are three possible ways to discount
+      1. There is a potential range access on t.keypart{i}=const. 
+         (an important special case: the used ref access has a const prefix for
+          which a range estimate is available)
+      2. The field has a histogram. field[x]->cond_selectivity has the data.
+      3. Use index stats on this index:
+         rec_per_key[key_part+1]/rec_per_key[key_part]
      (TODO: more details about the "t.key=othertable.col" case)
    */
    KEYUSE *keyuse= pos->key;
    KEYUSE *prev_ref_keyuse= keyuse;
    uint key= keyuse->key;
-    do
+    /*
+      Check if we have a prefix of key=const that matches a quick select.
+    */
+    if (!is_hash_join_key_no(key))
+    {
+      table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1;
+      if (table->quick_rows[key] && 
+          !(quick_key_map & ~table->const_key_parts[key]))
+      {
+        /* 
+          Ok, there is an equality for each of the key parts used by the
+          quick select. This means, quick select's estimate can be reused to
+          discount the selectivity of a prefix of a ref access.
+        */
+        for (; quick_key_map & 1 ; quick_key_map>>= 1)
+        {
+          while (keyuse->keypart == keyparts)
+            keyuse++;
+          keyparts++;
+        }
+        sel /= table->quick_rows[key] / table->stat_records();
+      }
+    }
+    /*
+      Go through the "keypart{N}=..." equalities and find those that were
+      already taken into account in table->cond_selectivity.
+    */
+    while (keyuse->table == table && keyuse->key == key)
    {
      if (!(keyuse->used_tables & (rem_tables | table->map)))
      {
@@ -7312,22 +7368,35 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
          else
 	  {
            if (keyparts == keyuse->keypart &&
-                !(~(keyuse->val->used_tables()) & pos->ref_depend_map) &&
+                !((keyuse->val->used_tables()) & ~pos->ref_depend_map) &&
                !(found_part_ref_or_null & keyuse->optimize))
 	    {
+              /* Found a KEYUSE object that will be used by ref access */
              keyparts++;
              found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
            }
          }
          if (keyparts > keyuse->keypart)
 	  {
+            /* Ok this is the keyuse that will be used for ref access */
            uint fldno;
            if (is_hash_join_key_no(key))
 	      fldno= keyuse->keypart;
            else
              fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1;
            if (keyuse->val->const_item())
-              sel*= table->field[fldno]->cond_selectivity; 
+            {              
+              sel /= table->field[fldno]->cond_selectivity;
+              /* 
+               TODO: we could do better here:
+                 1. cond_selectivity might be =1 (the default) because quick 
+                    select on some index prevented us from analyzing 
+                    histogram for this column.
+                 2. we could get an estimate through this?
+                     rec_per_key[key_part-1] / rec_per_key[key_part]
+              */
+            }
            if (keyparts > 1)
 	    {
              ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse;
@@ -7337,7 +7406,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
 	}
      }
      keyuse++;
-    } while (keyuse->table == table && keyuse->key == key);
+    }
  }
  else
  {