MDEV-34665 Simplify IN predicate processing for NULL-aware materialization...

MDEV-34665 Simplify IN predicate processing for NULL-aware materialization involving only one column It was found that unnecessary work of building Ordered_key structures is being done when processing NULL-aware materialization for IN predicates having only one column. In fact, the logic for that simplified case can be expressed as follows. Say we have predicate left_expr IN (SELECT <subq1>), where left_expr is scalar(not a tuple). Then if (left_expr is NULL) { if (subq1 produced any rows) { // note that we don't care if subq1 has produced // NULLs or not. NULL IN (<some values>) -> UNKNOWN, i.e. NULL. } else { NULL IN ({empty-set}) -> FALSE. } } else { // left_expr is a non-NULL value if (subq1 output has a match for left_expr) { left_expr IN (..., left_expr ...) -> TRUE } else { // no "known" matches. if (subq1 output has a NULL) { left_expr IN ( ... NULL ...) -> (NULL could have been a match or not) -> NULL. } else { // subq1 didn't produce any "UNKNOWNs" so // we're positive there weren't any matches -> FALSE. } } } This commit introduces subselect_single_column_partial_engine class implementing the logic described.

MDEV-34665 Simplify IN predicate processing for NULL-aware materialization...
MDEV-34665 Simplify IN predicate processing for NULL-aware materialization involving only one column It was found that unnecessary work of building Ordered_key structures is being done when processing NULL-aware materialization for IN predicates having only one column. In fact, the logic for that simplified case can be expressed as follows. Say we have predicate left_expr IN (SELECT <subq1>), where left_expr is scalar(not a tuple). Then if (left_expr is NULL) { if (subq1 produced any rows) { // note that we don't care if subq1 has produced // NULLs or not. NULL IN (<some values>) -> UNKNOWN, i.e. NULL. } else { NULL IN ({empty-set}) -> FALSE. } } else { // left_expr is a non-NULL value if (subq1 output has a match for left_expr) { left_expr IN (..., left_expr ...) -> TRUE } else { // no "known" matches. if (subq1 output has a NULL) { left_expr IN ( ... NULL ...) -> (NULL could have been a match or not) -> NULL. } else { // subq1 didn't produce any "UNKNOWNs" so // we're positive there weren't any matches -> FALSE. } } } This commit introduces subselect_single_column_partial_engine class implementing the logic described.
702b6073 · Oleg Smirnov · 48b256a7 · 702b6073 · 702b6073 · 702b6073
Commit 702b6073 authored Aug 01, 2024 by Oleg Smirnov
6 changed files
--- a/mysql-test/main/subselect_mat.result
+++ b/mysql-test/main/subselect_mat.result
@@ -3013,3 +3013,154 @@ UNION ALL
 1
 1
 drop table t1, t2;
+#
+# MDEV-34665: Simplify IN predicate processing for NULL-aware
+#             materialization involving only one column
+set @save_optimizer_switch=@@optimizer_switch;
+set @@optimizer_switch = "materialization=on,in_to_exists=off,semijoin=off";
+create table t1 (a int);
+create table t2 (b int);
+insert into t1 values (null), (1), (2), (3);
+# Query against empty t2
+select * from t1 where a not in (select b from t2);
+a
+1
+2
+3
+NULL
+select a from t1 join t2 on a not in (select b from t2);
+a
+select a from t1 left join t2 on a not in (select b from t2);
+a
+1
+2
+3
+NULL
+# Insert some not-NULL values
+insert into t2 values (3), (4);
+select * from t1 where a not in (select b from t2);
+a
+1
+2
+select a from t1 join t2 on a not in (select b from t2);
+a
+1
+1
+2
+2
+select a from t1 left join t2 on a not in (select b from t2);
+a
+1
+1
+2
+2
+3
+NULL
+# Ensure the correct strategy is tested
+analyze format=json  select * from t1 where a not in (select b from t2);
+ANALYZE
+{
+  "query_block": {
+    "select_id": 1,
+    "r_loops": 1,
+    "r_total_time_ms": "REPLACED",
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "r_loops": 1,
+      "rows": 4,
+      "r_rows": 4,
+      "r_table_time_ms": "REPLACED",
+      "r_other_time_ms": "REPLACED",
+      "filtered": 100,
+      "r_filtered": 50,
+      "attached_condition": "!<in_optimizer>(t1.a,t1.a in (subquery#2))"
+    },
+    "subqueries": [
+      {
+        "materialization": {
+          "r_strategy": "index_lookup;constant for partial match",
+          "r_loops": 4,
+          "r_index_lookups": 3,
+          "r_partial_matches": 1,
+          "query_block": {
+            "select_id": 2,
+            "r_loops": 1,
+            "r_total_time_ms": "REPLACED",
+            "table": {
+              "table_name": "t2",
+              "access_type": "ALL",
+              "r_loops": 1,
+              "rows": 2,
+              "r_rows": 2,
+              "r_table_time_ms": "REPLACED",
+              "r_other_time_ms": "REPLACED",
+              "filtered": 100,
+              "r_filtered": 100
+            }
+          }
+        }
+      }
+    ]
+  }
+}
+# Insert NULL value
+insert into t2 values (null);
+select * from t1 where a not in (select b from t2);
+a
+select a from t1 join t2 on a not in (select b from t2);
+a
+select a from t1 left join t2 on a not in (select b from t2);
+a
+1
+2
+3
+NULL
+analyze format=json  select * from t1 where a not in (select b from t2);
+ANALYZE
+{
+  "query_block": {
+    "select_id": 1,
+    "r_loops": 1,
+    "r_total_time_ms": "REPLACED",
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "r_loops": 1,
+      "rows": 4,
+      "r_rows": 4,
+      "r_table_time_ms": "REPLACED",
+      "r_other_time_ms": "REPLACED",
+      "filtered": 100,
+      "r_filtered": 0,
+      "attached_condition": "!<in_optimizer>(t1.a,t1.a in (subquery#2))"
+    },
+    "subqueries": [
+      {
+        "materialization": {
+          "r_strategy": "index_lookup;constant for partial match",
+          "r_loops": 4,
+          "r_index_lookups": 3,
+          "query_block": {
+            "select_id": 2,
+            "r_loops": 1,
+            "r_total_time_ms": "REPLACED",
+            "table": {
+              "table_name": "t2",
+              "access_type": "ALL",
+              "r_loops": 1,
+              "rows": 3,
+              "r_rows": 3,
+              "r_table_time_ms": "REPLACED",
+              "r_other_time_ms": "REPLACED",
+              "filtered": 100,
+              "r_filtered": 100
+            }
+          }
+        }
+      }
+    ]
+  }
+}
+drop table t1, t2;
+set @@optimizer_switch=@save_optimizer_switch;
--- a/mysql-test/main/subselect_mat.test
+++ b/mysql-test/main/subselect_mat.test
@@ -296,3 +296,50 @@ UNION ALL
 ;

 drop table t1, t2;
+
+--echo #
+--echo # MDEV-34665: Simplify IN predicate processing for NULL-aware
+--echo #             materialization involving only one column
+
+set @save_optimizer_switch=@@optimizer_switch;
+set @@optimizer_switch = "materialization=on,in_to_exists=off,semijoin=off";
+
+create table t1 (a int);
+create table t2 (b int);
+insert into t1 values (null), (1), (2), (3);
+
+--echo # Query against empty t2
+--sorted_result
+select * from t1 where a not in (select b from t2);
+--sorted_result
+select a from t1 join t2 on a not in (select b from t2);
+--sorted_result
+select a from t1 left join t2 on a not in (select b from t2);
+
+--echo # Insert some not-NULL values
+insert into t2 values (3), (4);
+--sorted_result
+select * from t1 where a not in (select b from t2);
+--sorted_result
+select a from t1 join t2 on a not in (select b from t2);
+--sorted_result
+select a from t1 left join t2 on a not in (select b from t2);
+--echo # Ensure the correct strategy is tested
+--source include/analyze-format.inc
+analyze format=json  select * from t1 where a not in (select b from t2);
+
+--echo # Insert NULL value
+insert into t2 values (null);
+--sorted_result
+select * from t1 where a not in (select b from t2);
+--sorted_result
+select a from t1 join t2 on a not in (select b from t2);
+--sorted_result
+select a from t1 left join t2 on a not in (select b from t2);
+--source include/analyze-format.inc
+analyze format=json  select * from t1 where a not in (select b from t2);
+
+drop table t1, t2;
+
+set @@optimizer_switch=@save_optimizer_switch;
+
--- a/mysql-test/main/subselect_mat_analyze_json.result
+++ b/mysql-test/main/subselect_mat_analyze_json.result
@@ -80,8 +80,6 @@ ANALYZE
  }
 }
 # "Partial match" is used due to NOT IN
-# Force rowid-merge partial partial matching
-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
 analyze format=json select * from t1 where a not in (select b from t2);
 ANALYZE
 {
@@ -104,58 +102,7 @@ ANALYZE
    "subqueries": [
      {
        "materialization": {
-          "r_strategy": "index_lookup;array merge for partial match",
-          "r_loops": 4,
-          "r_index_lookups": 3,
-          "r_partial_matches": 1,
-          "r_partial_match_buffer_size": "REPLACED",
-          "r_partial_match_array_sizes": ["2"],
-          "query_block": {
-            "select_id": 2,
-            "r_loops": 1,
-            "r_total_time_ms": "REPLACED",
-            "table": {
-              "table_name": "t2",
-              "access_type": "ALL",
-              "r_loops": 1,
-              "rows": 2,
-              "r_rows": 2,
-              "r_table_time_ms": "REPLACED",
-              "r_other_time_ms": "REPLACED",
-              "filtered": 100,
-              "r_filtered": 100
-            }
-          }
-        }
-      }
-    ]
-  }
-}
-# Force table scan partial matching
-set @@optimizer_switch="partial_match_rowid_merge=off,partial_match_table_scan=on";
-analyze format=json select * from t1 where a not in (select b from t2);
-ANALYZE
-{
-  "query_block": {
-    "select_id": 1,
-    "r_loops": 1,
-    "r_total_time_ms": "REPLACED",
-    "table": {
-      "table_name": "t1",
-      "access_type": "ALL",
-      "r_loops": 1,
-      "rows": 4,
-      "r_rows": 4,
-      "r_table_time_ms": "REPLACED",
-      "r_other_time_ms": "REPLACED",
-      "filtered": 100,
-      "r_filtered": 50,
-      "attached_condition": "!<in_optimizer>(t1.a,t1.a in (subquery#2))"
-    },
-    "subqueries": [
-      {
-        "materialization": {
-          "r_strategy": "index_lookup;full scan for partial match",
+          "r_strategy": "index_lookup;constant for partial match",
          "r_loops": 4,
          "r_index_lookups": 3,
          "r_partial_matches": 1,
@@ -211,7 +158,7 @@ ANALYZE
        "subqueries": [
          {
            "materialization": {
-              "r_strategy": "index_lookup;full scan for partial match",
+              "r_strategy": "index_lookup;constant for partial match",
              "r_loops": 4,
              "r_index_lookups": 3,
              "r_partial_matches": 1,
@@ -238,7 +185,6 @@ ANALYZE
    }
  }
 }
-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
 analyze format=json select a from t1 group by a not in (select b from t2);
 ANALYZE
 {
@@ -269,12 +215,10 @@ ANALYZE
        "subqueries": [
          {
            "materialization": {
-              "r_strategy": "index_lookup;array merge for partial match",
+              "r_strategy": "index_lookup;constant for partial match",
              "r_loops": 4,
              "r_index_lookups": 3,
              "r_partial_matches": 1,
-              "r_partial_match_buffer_size": "REPLACED",
-              "r_partial_match_array_sizes": ["2"],
              "query_block": {
                "select_id": 2,
                "r_loops": 1,
@@ -298,7 +242,6 @@ ANALYZE
    }
  }
 }
-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=on";
 # Subselect in ORDER BY
 analyze format=json select a from t1 order by a in (select b from t2);
 ANALYZE
@@ -333,7 +276,7 @@ ANALYZE
    "subqueries": [
      {
        "materialization": {
-          "r_strategy": "index_lookup;full scan for partial match",
+          "r_strategy": "index_lookup;constant for partial match",
          "r_loops": 4,
          "r_index_lookups": 3,
          "r_partial_matches": 1,
@@ -381,7 +324,7 @@ ANALYZE
    "subqueries": [
      {
        "materialization": {
-          "r_strategy": "index_lookup;full scan for partial match",
+          "r_strategy": "index_lookup;constant for partial match",
          "r_loops": 4,
          "r_index_lookups": 3,
          "r_partial_matches": 1,
@@ -521,7 +464,7 @@ ANALYZE
    "subqueries": [
      {
        "materialization": {
-          "r_strategy": "index_lookup;full scan for partial match",
+          "r_strategy": "index_lookup;constant for partial match",
          "r_loops": 4,
          "r_index_lookups": 3,
          "query_block": {
@@ -614,6 +557,7 @@ create table t1 (a1 char(1), a2 char(1));
 insert into t1 values (null, 'b');
 create table t2 (b1 char(1), b2 char(2));
 insert into t2 values ('a','b'), ('c', 'd'), (null, 'e'), ('f', 'g');
+# Force rowid-merge partial matching
 set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
 explain format=json select * from t1 where (a1, a2) not in (select b1, b2 from t2);
 EXPLAIN
@@ -690,6 +634,7 @@ ANALYZE
    ]
  }
 }
+# Force table scan partial matching
 set @@optimizer_switch="partial_match_rowid_merge=off,partial_match_table_scan=on";
 analyze format=json select * from t1 where (a1, a2) not in (select b1, b2 from t2);
 ANALYZE

--- a/mysql-test/main/subselect_mat_analyze_json.test
+++ b/mysql-test/main/subselect_mat_analyze_json.test
@@ -13,13 +13,6 @@ explain format=json select * from t1 where a in (select b from t2);
 analyze format=json select * from t1 where a in (select b from t2);

 --echo # "Partial match" is used due to NOT IN
--echo # Force rowid-merge partial partial matching
-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
--source include/analyze-format.inc
-analyze format=json select * from t1 where a not in (select b from t2);
-
--echo # Force table scan partial matching
-set @@optimizer_switch="partial_match_rowid_merge=off,partial_match_table_scan=on";
 --source include/analyze-format.inc
 analyze format=json select * from t1 where a not in (select b from t2);

@@ -27,11 +20,9 @@ analyze format=json select * from t1 where a not in (select b from t2);
 --source include/analyze-format.inc
 analyze format=json select a from t1 group by a in (select b from t2);

-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
 --source include/analyze-format.inc
 analyze format=json select a from t1 group by a not in (select b from t2);

-set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=on";
 --echo # Subselect in ORDER BY
 --source include/analyze-format.inc
 analyze format=json select a from t1 order by a in (select b from t2);
@@ -68,11 +59,13 @@ insert into t1 values (null, 'b');
 create table t2 (b1 char(1), b2 char(2));
 insert into t2 values ('a','b'), ('c', 'd'), (null, 'e'), ('f', 'g');

+--echo # Force rowid-merge partial matching
 set @@optimizer_switch="partial_match_rowid_merge=on,partial_match_table_scan=off";
 explain format=json select * from t1 where (a1, a2) not in (select b1, b2 from t2);
 --source include/analyze-format.inc
 analyze format=json select * from t1 where (a1, a2) not in (select b1, b2 from t2);

+--echo # Force table scan partial matching
 set @@optimizer_switch="partial_match_rowid_merge=off,partial_match_table_scan=on";
 --source include/analyze-format.inc
 analyze format=json select * from t1 where (a1, a2) not in (select b1, b2 from t2);

--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -4973,12 +4973,18 @@ subselect_hash_sj_engine::get_strategy_using_data()

 void
 subselect_hash_sj_engine::choose_partial_match_strategy(
-  bool has_non_null_key, bool has_covering_null_row,
+  uint field_count, bool has_non_null_key, bool has_covering_null_row,
  MY_BITMAP *partial_match_key_parts_arg)
 {
  ulonglong pm_buff_size;

  DBUG_ASSERT(strategy == PARTIAL_MATCH);
+  if (field_count == 1)
+  {
+    strategy= PARTIAL_MATCH_SINGLE_COLUMN;
+    return;
+  }
+
  /*
    Choose according to global optimizer switch. If only one of the switches is
    'ON', then the remaining strategy is the only possible one. The only cases
@@ -5414,7 +5420,8 @@ void subselect_hash_sj_engine::cleanup()
    related engines are created and chosen for each execution.
  */
  item->get_IN_subquery()->engine= materialize_engine;
-  if (lookup_engine_type == TABLE_SCAN_ENGINE ||
+  if (lookup_engine_type == SINGLE_COLUMN_ENGINE ||
+      lookup_engine_type == TABLE_SCAN_ENGINE ||
      lookup_engine_type == ROWID_MERGE_ENGINE)
  {
    subselect_engine *inner_lookup_engine;
@@ -5755,11 +5762,26 @@ int subselect_hash_sj_engine::exec()
      count_pm_keys= count_partial_match_columns - count_null_only_columns +
                     (nn_key_parts ? 1 : 0);

-    choose_partial_match_strategy(MY_TEST(nn_key_parts),
+    choose_partial_match_strategy(field_count, MY_TEST(nn_key_parts),
                                  has_covering_null_row,
                                  &partial_match_key_parts);
-    DBUG_ASSERT(strategy == PARTIAL_MATCH_MERGE ||
+    DBUG_ASSERT(strategy == PARTIAL_MATCH_SINGLE_COLUMN ||
+                strategy == PARTIAL_MATCH_MERGE ||
                strategy == PARTIAL_MATCH_SCAN);
+    if (strategy == PARTIAL_MATCH_SINGLE_COLUMN)
+    {
+      if (!(pm_engine= new subselect_single_column_partial_engine(
+              (subselect_uniquesubquery_engine*) lookup_engine, tmp_table,
+              item, result, semi_join_conds->argument_list(),
+              has_covering_null_row, has_covering_null_columns,
+              count_columns_with_nulls)) ||
+          pm_engine->prepare(thd))
+      {
+        /* This is an irrecoverable error. */
+        res= 1;
+        goto err;
+      }
+    }
    if (strategy == PARTIAL_MATCH_MERGE)
    {
      pm_engine=
@@ -5786,7 +5808,6 @@ int subselect_hash_sj_engine::exec()
        strategy= PARTIAL_MATCH_SCAN;
      }
    }
-
    if (strategy == PARTIAL_MATCH_SCAN)
    {
      if (!(pm_engine=
@@ -6978,6 +6999,44 @@ void subselect_table_scan_engine::cleanup()
 }


+subselect_single_column_partial_engine::subselect_single_column_partial_engine(
+  subselect_uniquesubquery_engine *engine_arg,
+  TABLE *tmp_table_arg,
+  Item_subselect *item_arg,
+  select_result_interceptor *result_arg,
+  List<Item> *equi_join_conds_arg,
+  bool has_covering_null_row_arg,
+  bool has_covering_null_columns_arg,
+  uint count_columns_with_nulls_arg)
+  :subselect_partial_match_engine(engine_arg, tmp_table_arg, item_arg,
+                                  result_arg, equi_join_conds_arg,
+                                  has_covering_null_row_arg,
+                                  has_covering_null_columns_arg,
+                                  count_columns_with_nulls_arg)
+{}
+
+
+bool subselect_single_column_partial_engine::partial_match()
+{
+  /*
+    We get here if:
+    - there is only one column in the materialized table;
+    - its current value of left_expr is NULL (otherwise we would have hit
+         the earlier "index lookup" branch at subselect_partial_match::exec());
+    - the materialized table does not have NULL values (for a similar reason);
+    - the materialized table is not empty.
+    The case when materialization produced no rows (empty table) is handled at
+    subselect_hash_sj_engine::exec(), the result of IN predicate is always
+    FALSE in that case.
+    After all those preconditions met, the result of the partial match is TRUE.
+  */
+  DBUG_ASSERT(item->get_IN_subquery()->left_expr_has_null() &&
+              !has_covering_null_row &&
+              tmp_table->file->stats.records > 0);
+  return true;
+}
+
+
 void Item_subselect::register_as_with_rec_ref(With_element *with_elem)
 {
  with_elem->sq_with_rec_ref.link_in_list(this, &this->next_with_rec_ref);

--- a/sql/item_subselect.h
+++ b/sql/item_subselect.h
@@ -835,7 +835,8 @@ class subselect_engine: public Sql_alloc,
  enum enum_engine_type {ABSTRACT_ENGINE, SINGLE_SELECT_ENGINE,
                         UNION_ENGINE, UNIQUESUBQUERY_ENGINE,
                         INDEXSUBQUERY_ENGINE, HASH_SJ_ENGINE,
-                         ROWID_MERGE_ENGINE, TABLE_SCAN_ENGINE};
+                         ROWID_MERGE_ENGINE, TABLE_SCAN_ENGINE,
+                         SINGLE_COLUMN_ENGINE};

  subselect_engine(Item_subselect *si,
                   select_result_interceptor *res):
@@ -1175,6 +1176,8 @@ class subselect_hash_sj_engine : public subselect_engine
    PARTIAL_MATCH,  /* Use some partial matching strategy. */
    PARTIAL_MATCH_MERGE, /* Use partial matching through index merging. */
    PARTIAL_MATCH_SCAN,  /* Use partial matching through table scan. */
+    PARTIAL_MATCH_SINGLE_COLUMN, /* Use simplifeid partial matching when
+                                    there is only one field involved. */
    IMPOSSIBLE      /* Subquery materialization is not applicable. */
  };

@@ -1195,7 +1198,8 @@ class subselect_hash_sj_engine : public subselect_engine
  ulonglong rowid_merge_buff_size(bool has_non_null_key,
                                  bool has_covering_null_row,
                                  MY_BITMAP *partial_match_key_parts);
-  void choose_partial_match_strategy(bool has_non_null_key,
+  void choose_partial_match_strategy(uint field_count,
+                                     bool has_non_null_key,
                                     bool has_covering_null_row,
                                     MY_BITMAP *partial_match_key_parts);
  bool make_semi_join_conds();
@@ -1559,6 +1563,25 @@ class subselect_table_scan_engine: public subselect_partial_match_engine
  enum_engine_type engine_type() override { return TABLE_SCAN_ENGINE; }
 };

+
+class subselect_single_column_partial_engine:
+    public subselect_partial_match_engine
+{
+protected:
+  bool partial_match() override;
+public:
+  subselect_single_column_partial_engine(
+                              subselect_uniquesubquery_engine *engine_arg,
+                              TABLE *tmp_table_arg, Item_subselect *item_arg,
+                              select_result_interceptor *result_arg,
+                              List<Item> *equi_join_conds_arg,
+                              bool has_covering_null_row_arg,
+                              bool has_covering_null_columns_arg,
+                              uint count_columns_with_nulls_arg);
+  void cleanup() override {}
+  enum_engine_type engine_type() override { return SINGLE_COLUMN_ENGINE; }
+};
+
 /**
  @brief Subquery materialization tracker

@@ -1641,6 +1664,8 @@ class Subq_materialization_tracker
        return "index_lookup;array merge for partial match";
      case Strategy::PARTIAL_MATCH_SCAN:
        return "index_lookup;full scan for partial match";
+      case Strategy::PARTIAL_MATCH_SINGLE_COLUMN:
+        return "index_lookup;constant for partial match";
      default:
        return "unsupported";
    }