MDEV-26886: Estimation for filtered rows less precise with JSON histogram

- Make Histogram_json_hb::range_selectivity handle singleton buckets specially when computing selectivity of the max. endpoint bound. (for min. endpoint, we already do that). - Also, fixed comments for Histogram_json_hb::find_bucket

MDEV-26886: Estimation for filtered rows less precise with JSON histogram
- Make Histogram_json_hb::range_selectivity handle singleton buckets specially when computing selectivity of the max. endpoint bound. (for min. endpoint, we already do that). - Also, fixed comments for Histogram_json_hb::find_bucket
eb6a9ad7 · Sergei Petrunia · 106c785e · eb6a9ad7 · eb6a9ad7 · eb6a9ad7
Commit eb6a9ad7 authored Nov 26, 2021 by Sergei Petrunia
4 changed files
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@@ -7658,3 +7658,28 @@ test.t1	analyze	status	OK
 ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
 INSERT INTO t1 (f) VALUES ('bar');
 DROP TABLE t1;
+#
+# MDEV-26886: Estimation for filtered rows less precise with JSON histogram
+#
+create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
+select count(*) from t1 where a <= 0;
+count(*)
+33
+set histogram_type = JSON_HB, histogram_size=default;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	OK
+analyze select * from t1 where a <= 0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	33.00	33.00	Using where
+analyze select * from t1 where a < 0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	1.47	0.00	Using where
+analyze select * from t1 where a > 0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	67.00	67.00	Using where
+analyze select * from t1 where a >= 0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	100.00	100.00	Using where
+drop table t1;
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
 ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
 INSERT INTO t1 (f) VALUES ('bar');
 DROP TABLE t1;
+
+--echo #
+--echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
+--echo #
+create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
+select count(*) from t1 where a <= 0;
+
+set histogram_type = JSON_HB, histogram_size=default;
+analyze table t1 persistent for all;
+analyze select * from t1 where a <= 0;
+analyze select * from t1 where a < 0;
+analyze select * from t1 where a > 0;
+analyze select * from t1 where a >= 0;
+drop table t1;
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
      idx--;
    }
    double left_fract= get_left_fract(idx);
-    double sel= position_in_interval(field, max_key, max_key_len,
-                                     buckets[idx].start_value,
-                                     get_end_value(idx));
+
+    double sel;
+    /* Special handling for singleton buckets */
+    if (buckets[idx].ndv == 1 && equal)
+    {
+      if (inclusive_endp)
+        sel= 1.0;
+      else
+        sel= 0.0;
+    }
+    else
+    {
+      sel= position_in_interval(field, max_key, max_key_len,
+                                buckets[idx].start_value,
+                                get_end_value(idx));
+    }
    max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
@@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)


 /*
-  Find the rightmost histogram bucket such that "lookup_val $GT start_value".
-
-  $GT is either '>' or '>=' depending on equal_is_less parameter.
-
-  @param equal_is_less Controls what to do if a histogram bound is equal to the
-                       lookup_val.
-
-  @detail
-    Possible cases:
-    1. The regular case: the value falls into some bucket.
+  @brief
+   Find the leftmost histogram bucket such that "lookup_val >= start_value".

-    2. The value is less than the minimum of the first bucket
-    3. The value is greater than the maximum of the last bucket
-      In these cases we "clip" to the first/last bucket.
+  @param field        Field object (used to do value comparisons)
+  @param lookup_val   The lookup value in KeyTupleFormat.
+  @param equal  OUT   TRUE<=> the found bucket has left_bound=lookup_val

-    4. The value hits the bucket boundary. Then, we need to know whether the
-       point of interest is to the left the constant, or to the right of it.
+  @return
+     The bucket index
 */

-int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
+int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
                                   bool *equal)
 {
  int res;
@@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    if (!res)
    {
      *equal= true;
-      return middle;
+      low= middle;
+      goto end;
    }
    else if (res < 0)
      low= middle;
@@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
  }

  /*
-    If low and high were assigned a value in the above loop, then they are not
-    equal to the lookup value:
+    If low and high were assigned a value in the above loop and we got here,
+    then the following holds:

-      bucket[low] < lookup_val < bucket[high]
+      bucket[low].start_value < lookup_val < bucket[high].start_value

-    But there are two special cases: low=0 and high=last_bucket. Handle them
-    below.
+    Besides that, there are two special cases: low=0 and high=last_bucket.
+    Handle them below.
  */
  if (low == 0)
  {
    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    else if (res < 0)
+    else if (res < 0) //  buckets[0] < lookup_val
    {
      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
      if (!res)
        *equal= true;
-      if (res >= 0)
+      if (res <= 0) // buckets[high] <= lookup_val
        low= high;
    }
  }
@@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    if (res >= 0)
+    if (res <= 0)
      low= high;
  }

+end:
+  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
+  DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
+                                         lookup_val)));
+  // buckets[low] <= lookup_val, with one exception of the first bucket.
+  DBUG_ASSERT(low == 0 ||
+              field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
+  // buckets[low+1] > lookup_val, with one exception of the last bucket
+  DBUG_ASSERT(low == (int)buckets.size()-1 ||
+              field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
  return low;
 }
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@@ -124,6 +124,6 @@ class Histogram_json_hb : public Histogram_base
 private:
  double get_left_fract(int idx);
  std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
+  int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
 };