MDEV-30812: Improve output cardinality estimates for hash join

Introduce @@optimizer_switch flag: hash_join_cardinality When it is on, use EITS statistics to produce tighter bounds for hash join output cardinality. Reviewed by: Monty <monty@mariadb.org>

MDEV-30812: Improve output cardinality estimates for hash join
Introduce @@optimizer_switch flag: hash_join_cardinality When it is on, use EITS statistics to produce tighter bounds for hash join output cardinality. Reviewed by: Monty <monty@mariadb.org>
2cc16653 · Sergei Petrunia · cc348115 · 2cc16653 · 2cc16653 · 2cc16653
Commit 2cc16653 authored Mar 09, 2023 by Sergei Petrunia
9 changed files
--- a/mysql-test/main/join_cache_cardinality.result
+++ b/mysql-test/main/join_cache_cardinality.result
+create table t1 (a int, b int, c int);
+insert into t1 select seq,seq/2, seq/4 from seq_1_to_100;
+create table t2 (a int, b int, c int);
+insert into t2 select seq, seq/2, seq/4 from seq_1_to_200;
+analyze table t1,t2 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	OK
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	OK
+set optimizer_trace=1;
+set join_cache_level=6;
+set optimizer_switch='hash_join_cardinality=on';
+explain select *
+from t1, t2
+where t1.a=t2.a and t1.a=t2.b and t1.c=t2.c;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+1	SIMPLE	t2	hash_ALL	NULL	#hash#$hj	15	test.t1.a,test.t1.a,test.t1.c	200	Using where; Using join buffer (flat, BNLH join)
+set @json= (select trace from information_schema.optimizer_trace);
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+JS
+[
+    
+    {
+        "hash_join_columns": 
+        [
+            
+            {
+                "field": "a",
+                "avg_frequency": 1
+            },
+            
+            {
+                "field": "b",
+                "avg_frequency": 2
+            },
+            
+            {
+                "field": "c",
+                "avg_frequency": 3.9216
+            }
+        ],
+        "rows": 1
+    }
+]
+select json_detailed(json_extract(@json, '$**.rest_of_plan[*].rows_for_plan'))
+as ROWS_FOR_PLAN;
+ROWS_FOR_PLAN
+[
+    100
+]
+explain select *
+from t1, t2 where t1.c=t2.c;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+1	SIMPLE	t2	hash_ALL	NULL	#hash#$hj	5	test.t1.c	200	Using where; Using join buffer (flat, BNLH join)
+set @json= (select trace from information_schema.optimizer_trace);
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+JS
+[
+    
+    {
+        "hash_join_columns": 
+        [
+            
+            {
+                "field": "c",
+                "avg_frequency": 3.9216
+            }
+        ],
+        "rows": 3.9216
+    }
+]
+select json_detailed(json_extract(@json, '$**.rest_of_plan[*].rows_for_plan'))
+as ROWS_FOR_PLAN;
+ROWS_FOR_PLAN
+[
+    392.16
+]
+explain select *
+from t1 straight_join t2 where t1.c=t2.c and t2.a<30;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+1	SIMPLE	t2	hash_ALL	NULL	#hash#$hj	5	test.t1.c	200	Using where; Using join buffer (flat, BNLH join)
+set @json= (select trace from information_schema.optimizer_trace);
+# Note that rows is the same:
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+JS
+[
+    
+    {
+        "hash_join_columns": 
+        [
+            
+            {
+                "field": "c",
+                "avg_frequency": 3.9216
+            }
+        ],
+        "rows": 3.9216
+    }
+]
+# Despite available selectivity:
+select json_detailed(json_extract(@json, '$**.selectivity_for_columns')) as JS;
+JS
+[
+    
+    [
+        
+        {
+            "column_name": "a",
+            "ranges": 
+            [
+                "NULL < a < 30"
+            ],
+            "selectivity_from_histogram": 0.1484375
+        }
+    ]
+]
+drop table t1,t2;
--- a/mysql-test/main/join_cache_cardinality.test
+++ b/mysql-test/main/join_cache_cardinality.test
+--source include/have_sequence.inc
+
+create table t1 (a int, b int, c int);
+insert into t1 select seq,seq/2, seq/4 from seq_1_to_100;
+
+create table t2 (a int, b int, c int);
+insert into t2 select seq, seq/2, seq/4 from seq_1_to_200;
+
+analyze table t1,t2 persistent for all;
+
+set optimizer_trace=1;
+set join_cache_level=6;
+set optimizer_switch='hash_join_cardinality=on';
+explain select *
+from t1, t2
+where t1.a=t2.a and t1.a=t2.b and t1.c=t2.c;
+
+set @json= (select trace from information_schema.optimizer_trace);
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+select json_detailed(json_extract(@json, '$**.rest_of_plan[*].rows_for_plan'))
+as ROWS_FOR_PLAN;
+
+explain select *
+from t1, t2 where t1.c=t2.c;
+set @json= (select trace from information_schema.optimizer_trace);
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+select json_detailed(json_extract(@json, '$**.rest_of_plan[*].rows_for_plan'))
+as ROWS_FOR_PLAN;
+
+explain select *
+from t1 straight_join t2 where t1.c=t2.c and t2.a<30;
+set @json= (select trace from information_schema.optimizer_trace);
+--echo # Note that rows is the same:
+select json_detailed(json_extract(@json, '$**.hash_join_cardinality')) as JS;
+
+--echo # Despite available selectivity:
+select json_detailed(json_extract(@json, '$**.selectivity_for_columns')) as JS;
+drop table t1,t2;
--- a/mysql-test/main/mysqld--help.result
+++ b/mysql-test/main/mysqld--help.result
@@ -734,7 +734,8 @@ The following specify which files/extra groups are read (specified before remain
 extended_keys, exists_to_in, orderby_uses_equalities, 
 condition_pushdown_for_derived, split_materialized, 
 condition_pushdown_for_subquery, rowid_filter, 
- condition_pushdown_from_having, not_null_range_scan
+ condition_pushdown_from_having, not_null_range_scan, 
+ hash_join_cardinality
 --optimizer-trace=name 
 Controls tracing of the Optimizer:
 optimizer_trace=option=val[,option=val...], where option

--- a/mysql-test/main/mysqltest_tracking_info.result
+++ b/mysql-test/main/mysqltest_tracking_info.result
@@ -38,7 +38,7 @@ SET @@session.session_track_system_variables='optimizer_switch';
 set optimizer_switch='index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off';
 -- Tracker : SESSION_TRACK_SYSTEM_VARIABLES
 -- optimizer_switch
-- index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off
+-- index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off,hash_join_cardinality=off

 Warnings:
 Warning	1681	'engine_condition_pushdown=on' is deprecated and will be removed in a future release

--- a/mysql-test/suite/sys_vars/r/optimizer_switch_basic.result
+++ b/mysql-test/suite/sys_vars/r/optimizer_switch_basic.result
--- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
@@ -2449,7 +2449,7 @@ VARIABLE_COMMENT	Fine-tune the optimizer behavior
 NUMERIC_MIN_VALUE	NULL
 NUMERIC_MAX_VALUE	NULL
 NUMERIC_BLOCK_SIZE	NULL
-ENUM_VALUE_LIST	index_merge,index_merge_union,index_merge_sort_union,index_merge_intersection,index_merge_sort_intersection,engine_condition_pushdown,index_condition_pushdown,derived_merge,derived_with_keys,firstmatch,loosescan,materialization,in_to_exists,semijoin,partial_match_rowid_merge,partial_match_table_scan,subquery_cache,mrr,mrr_cost_based,mrr_sort_keys,outer_join_with_cache,semijoin_with_cache,join_cache_incremental,join_cache_hashed,join_cache_bka,optimize_join_buffer_size,table_elimination,extended_keys,exists_to_in,orderby_uses_equalities,condition_pushdown_for_derived,split_materialized,condition_pushdown_for_subquery,rowid_filter,condition_pushdown_from_having,not_null_range_scan,default
+ENUM_VALUE_LIST	index_merge,index_merge_union,index_merge_sort_union,index_merge_intersection,index_merge_sort_intersection,engine_condition_pushdown,index_condition_pushdown,derived_merge,derived_with_keys,firstmatch,loosescan,materialization,in_to_exists,semijoin,partial_match_rowid_merge,partial_match_table_scan,subquery_cache,mrr,mrr_cost_based,mrr_sort_keys,outer_join_with_cache,semijoin_with_cache,join_cache_incremental,join_cache_hashed,join_cache_bka,optimize_join_buffer_size,table_elimination,extended_keys,exists_to_in,orderby_uses_equalities,condition_pushdown_for_derived,split_materialized,condition_pushdown_for_subquery,rowid_filter,condition_pushdown_from_having,not_null_range_scan,hash_join_cardinality,default
 READ_ONLY	NO
 COMMAND_LINE_ARGUMENT	REQUIRED
 VARIABLE_NAME	OPTIMIZER_TRACE

--- a/sql/sql_priv.h
+++ b/sql/sql_priv.h
@@ -234,6 +234,7 @@
 #define OPTIMIZER_SWITCH_USE_ROWID_FILTER          (1ULL << 33)
 #define OPTIMIZER_SWITCH_COND_PUSHDOWN_FROM_HAVING (1ULL << 34)
 #define OPTIMIZER_SWITCH_NOT_NULL_RANGE_SCAN       (1ULL << 35)
+#define OPTIMIZER_SWITCH_HASH_JOIN_CARDINALITY     (1ULL << 36)

 #define OPTIMIZER_SWITCH_DEFAULT   (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                    OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \

--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7677,6 +7677,90 @@ double adjust_quick_cost(double quick_cost, ha_rows records)
 }


+/*
+  @brief
+    Compute the fanout of hash join operation using EITS data
+*/
+
+double hash_join_fanout(JOIN *join, JOIN_TAB *s, table_map remaining_tables,
+                        double rnd_records, KEYUSE *hj_start_key)
+{
+  THD *thd= join->thd;
+  /*
+    Before doing the hash join, we will scan the table and apply the local part
+    of the WHERE condition. This will produce rnd_records.
+
+    The EITS statistics describes the entire table. Calling
+
+      table->field[N]->get_avg_frequency()
+
+    produces average #rows in the table with some value.
+
+    What happens if we filter out rows so that rnd_records rows are left?
+    Something between the two outcomes:
+    A. filtering removes a fraction of rows for each value:
+      avg_frequency=avg_frequency * condition_selectivity
+
+    B. filtering removes entire groups of rows with the same value, but
+       the remaining groups remain of the same size.
+
+    We make pessimistic assumption and assume B.
+    We also handle an edge case: if rnd_records is less than avg_frequency,
+    assume we'll get rnd_records rows with the same value, and return
+    rnd_records as the fanout estimate.
+  */
+  double min_freq= rnd_records;
+
+  Json_writer_object trace_obj(thd, "hash_join_cardinality");
+  /*
+    There can be multiple KEYUSE referring to same or different columns
+
+       KEYUSE(tbl.col1 = ...)
+       KEYUSE(tbl.col1 = ...)
+       KEYUSE(tbl.col2 = ...)
+
+    Hash join code can use multiple columns: (col1, col2) for joining.
+    We need n_distinct({col1, col2}).
+
+    EITS only has statistics on individual columns: n_distinct(col1),
+    n_distinct(col2).
+
+    Our current solution is to be very conservative and use selectivity
+    of one column with the lowest avg_frequency.
+
+    In the future, we should an approach that cautiosly takes into account
+    multiple KEYUSEs either multiply by number of equalities or by sqrt
+    of the second most selective equality.
+  */
+  Json_writer_array trace_arr(thd, "hash_join_columns");
+  for (KEYUSE *keyuse= hj_start_key;
+       keyuse->table == s->table && is_hash_join_key_no(keyuse->key);
+       keyuse++)
+  {
+    if (!(remaining_tables & keyuse->used_tables) &&
+        (!keyuse->validity_ref || *keyuse->validity_ref) &&
+        s->access_from_tables_is_allowed(keyuse->used_tables,
+                                         join->sjm_lookup_tables))
+    {
+      Field *field= s->table->field[keyuse->keypart];
+      if (is_eits_usable(field))
+      {
+        double freq= field->read_stats->get_avg_frequency();
+
+        Json_writer_object trace_field(thd);
+        trace_field.add("field",field->field_name.str).
+          add("avg_frequency", freq);
+        if (freq < min_freq)
+          min_freq= freq;
+      }
+    }
+  }
+  trace_arr.end();
+  trace_obj.add("rows", min_freq);
+  return min_freq;
+}
+
+
 /**
  Find the best access path for an extension of a partial execution
  plan and add this path to the plan.
@@ -8292,7 +8376,10 @@ best_access_path(JOIN      *join,
      (!(s->table->map & join->outer_join) ||
       join->allowed_outer_join_with_cache))    // (2)
  {
+    double fanout;
    Json_writer_object trace_access_hash(thd);
+    trace_access_hash.add("type", "hash");
+    trace_access_hash.add("index", "hj-key");
    double join_sel= 0.1;
    /* Estimate the cost of  the hash join access to the table */
    double rnd_records= matching_candidates_in_table(s, found_constraint,
@@ -8302,25 +8389,55 @@ best_access_path(JOIN      *join,
    double cmp_time= (s->records - rnd_records)/TIME_FOR_COMPARE;
    tmp= COST_ADD(tmp, cmp_time);

+    DBUG_ASSERT(hj_start_key);
+    fanout= rnd_records;
+    if (optimizer_flag(thd, OPTIMIZER_SWITCH_HASH_JOIN_CARDINALITY))
+    {
+      /*
+        Starting from this point, rnd_records should not be used anymore.
+        Use "fanout" for an estimate of # matching records.
+      */
+      fanout= hash_join_fanout(join, s, remaining_tables, rnd_records,
+                               hj_start_key);
+      join_sel= 1.0; // Don't do the "10% heuristic"
+    }
+
    /* We read the table as many times as join buffer becomes full. */

    double refills= (1.0 + floor((double) cache_record_length(join,idx) *
                           record_count /
 			   (double) thd->variables.join_buff_size));
    tmp= COST_MULT(tmp, refills);
-    best_time= COST_ADD(tmp,
-                        COST_MULT((record_count*join_sel) / TIME_FOR_COMPARE,
-                                  rnd_records));
+
+    // Add cost of reading/writing the join buffer
+    if (optimizer_flag(thd, OPTIMIZER_SWITCH_HASH_JOIN_CARDINALITY))
+    {
+      /* Set it to be 1/10th of TIME_FOR_COMPARE */
+      double row_copy_cost= 1.0 / (10*TIME_FOR_COMPARE);
+      double join_buffer_operations=
+        COST_ADD(
+          COST_MULT(record_count, row_copy_cost),
+          COST_MULT(record_count, fanout * (idx - join->const_tables))
+          );
+      double jbuf_use_cost= row_copy_cost * join_buffer_operations;
+      trace_access_hash.add("jbuf_use_cost", jbuf_use_cost);
+      tmp= COST_ADD(tmp, jbuf_use_cost);
+    }
+
+    double where_cost= COST_MULT((fanout*join_sel) / TIME_FOR_COMPARE,
+                                  record_count);
+    trace_access_hash.add("extra_cond_check_cost", where_cost);
+
+    best_time= COST_ADD(tmp, where_cost);
+
    best= tmp;
-    records= rnd_records;
+    records= fanout;
    best_key= hj_start_key;
    best_ref_depends_map= 0;
    best_uses_jbuf= TRUE;
    best_filter= 0;
    best_type= JT_HASH;
-    trace_access_hash.add("type", "hash");
-    trace_access_hash.add("index", "hj-key");
-    trace_access_hash.add("cost", rnd_records);
+    trace_access_hash.add("records", records);
    trace_access_hash.add("cost", best);
    trace_access_hash.add("chosen", true);
  }

--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -2754,6 +2754,7 @@ export const char *optimizer_switch_names[]=
  "rowid_filter",
  "condition_pushdown_from_having",
  "not_null_range_scan",
+  "hash_join_cardinality",
  "default", 
  NullS
 };