Commit eb6a9ad7 authored by Sergei Petrunia

MDEV-26886: Estimation for filtered rows less precise with JSON histogram

- Make Histogram_json_hb::range_selectivity handle singleton buckets
  specially when computing the selectivity of the max endpoint bound
  (this is already done for the min endpoint).

- Also, fixed comments for Histogram_json_hb::find_bucket
parent 106c785e
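As context for the diff below: for the max endpoint, range_selectivity() turns the endpoint into a fraction of rows as left_fract + sel * (cum_fract - left_fract), where sel is the endpoint's position inside the bucket it falls into. The patch forces sel to 1.0 or 0.0 when that bucket is a singleton (ndv == 1) and the endpoint equals the bucket's value. Below is a minimal standalone sketch of that rule with made-up bucket values; it is not the server code, and the names in it are illustrative only.

```cpp
// Standalone sketch of the max-endpoint estimation rule this patch changes.
// Bucket, max_endpoint_fract() and the numbers in main() are illustrative,
// not MariaDB's data structures.
#include <cstdio>

struct Bucket
{
  double cum_fract;  // fraction of rows in this and all preceding buckets
  long   ndv;        // number of distinct values in the bucket
};

// left_fract: cumulative fraction of rows up to the bucket's left edge.
// pos:        interpolated position of the endpoint inside the bucket, in [0,1]
//             (what position_in_interval() would estimate).
// equal:      the endpoint is equal to the bucket's start value.
static double max_endpoint_fract(const Bucket &b, double left_fract,
                                 double pos, bool equal, bool inclusive_endp)
{
  double sel;
  if (b.ndv == 1 && equal)
  {
    // Singleton bucket holding exactly the endpoint value:
    // "col <= v" covers the whole bucket, "col < v" covers none of it.
    sel= inclusive_endp ? 1.0 : 0.0;
  }
  else
    sel= pos;  // regular case: interpolate inside the bucket
  return left_fract + sel * (b.cum_fract - left_fract);
}

int main()
{
  // A made-up singleton bucket [0..0] holding 33% of the rows.
  Bucket b= {0.33, 1};
  printf("a <= 0 -> %.2f\n", max_endpoint_fract(b, 0.0, 0.5, true, true));   // 0.33
  printf("a <  0 -> %.2f\n", max_endpoint_fract(b, 0.0, 0.5, true, false));  // 0.00
  return 0;
}
```

With a singleton bucket holding a third of the rows at value 0, the inclusive bound `a <= 0` keeps the whole bucket while the exclusive bound `a < 0` keeps none of it, which is the behaviour the new test case below exercises.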
@@ -7658,3 +7658,28 @@ test.t1 analyze status OK
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
INSERT INTO t1 (f) VALUES ('bar');
DROP TABLE t1;
#
# MDEV-26886: Estimation for filtered rows less precise with JSON histogram
#
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
select count(*) from t1 where a <= 0;
count(*)
33
set histogram_type = JSON_HB, histogram_size=default;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
analyze select * from t1 where a <= 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 33.00 33.00 Using where
analyze select * from t1 where a < 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 1.47 0.00 Using where
analyze select * from t1 where a > 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 67.00 67.00 Using where
analyze select * from t1 where a >= 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
drop table t1;
@@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
INSERT INTO t1 (f) VALUES ('bar');
DROP TABLE t1;
--echo #
--echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
--echo #
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
select count(*) from t1 where a <= 0;
set histogram_type = JSON_HB, histogram_size=default;
analyze table t1 persistent for all;
analyze select * from t1 where a <= 0;
analyze select * from t1 where a < 0;
analyze select * from t1 where a > 0;
analyze select * from t1 where a >= 0;
drop table t1;
@@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
      idx--;
    }
    double left_fract= get_left_fract(idx);
-    double sel= position_in_interval(field, max_key, max_key_len,
-                                     buckets[idx].start_value,
-                                     get_end_value(idx));
+    double sel;
+    /* Special handling for singleton buckets */
+    if (buckets[idx].ndv == 1 && equal)
+    {
+      if (inclusive_endp)
+        sel= 1.0;
+      else
+        sel= 0.0;
+    }
+    else
+    {
+      sel= position_in_interval(field, max_key, max_key_len,
+                                buckets[idx].start_value,
+                                get_end_value(idx));
+    }
    max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
@@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)
/*
-  Find the rightmost histogram bucket such that "lookup_val $GT start_value".
-  $GT is either '>' or '>=' depending on equal_is_less parameter.
-
-  @param equal_is_less Controls what to do if a histogram bound is equal to the
-                       lookup_val.
-
-  @detail
-  Possible cases:
-  1. The regular case: the value falls into some bucket.
-  2. The value is less than the minimum of the first bucket
-  3. The value is greater than the maximum of the last bucket
-     In these cases we "clip" to the first/last bucket.
-  4. The value hits the bucket boundary. Then, we need to know whether the
-     point of interest is to the left the constant, or to the right of it.
+  @brief
+  Find the leftmost histogram bucket such that "lookup_val >= start_value".
+
+  @param field        Field object (used to do value comparisons)
+  @param lookup_val   The lookup value in KeyTupleFormat.
+  @param equal        OUT TRUE<=> the found bucket has left_bound=lookup_val
+
+  @return
+     The bucket index
*/

-int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
+int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
                                    bool *equal)
{
  int res;
@@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    if (!res)
    {
      *equal= true;
-      return middle;
+      low= middle;
+      goto end;
    }
    else if (res < 0)
      low= middle;
@@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
  }

  /*
-    If low and high were assigned a value in the above loop, then they are not
-    equal to the lookup value:
-      bucket[low] < lookup_val < bucket[high]
-    But there are two special cases: low=0 and high=last_bucket. Handle them
-    below.
+    If low and high were assigned a value in the above loop and we got here,
+    then the following holds:
+      bucket[low].start_value < lookup_val < bucket[high].start_value
+    Besides that, there are two special cases: low=0 and high=last_bucket.
+    Handle them below.
  */
  if (low == 0)
  {
    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    else if (res < 0)
+    else if (res < 0) // buckets[0] < lookup_val
    {
      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
      if (!res)
        *equal= true;
-      if (res >= 0)
+      if (res <= 0) // buckets[high] <= lookup_val
        low= high;
    }
  }
@@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    if (res >= 0)
+    if (res <= 0)
      low= high;
  }
+end:
+  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
+  DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
+                                          lookup_val)));
+  // buckets[low] <= lookup_val, with one exception of the first bucket.
+  DBUG_ASSERT(low == 0 ||
+              field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
+  // buckets[low+1] > lookup_val, with one exception of the last bucket
+  DBUG_ASSERT(low == (int)buckets.size()-1 ||
+              field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
  return low;
}
@@ -124,6 +124,6 @@ class Histogram_json_hb : public Histogram_base
private:
  double get_left_fract(int idx);
  std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
+  int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
};
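As a reading aid for the new find_bucket() contract and the DBUG_ASSERTs at its end: label, here is a deliberately simplified standalone model. It uses integer start values and a linear scan instead of the server's binary search over KeyTupleFormat values, and all names and numbers in it are made up.

```cpp
// Simplified standalone model of the find_bucket() contract documented above:
// return the last bucket whose start_value <= lookup_val (clipped to bucket 0
// when the value sorts before every bucket), and set *equal when that bucket's
// start_value equals the lookup value.
#include <cassert>
#include <cstdio>
#include <initializer_list>
#include <vector>

static int find_bucket_model(const std::vector<int> &starts, int lookup_val,
                             bool *equal)
{
  int low= 0;
  // Linear scan for clarity; the server code does a binary search.
  for (size_t i= 1; i < starts.size(); i++)
  {
    if (starts[i] <= lookup_val)
      low= (int)i;
    else
      break;
  }
  *equal= (starts[low] == lookup_val);
  // The same post-conditions the patch asserts at the 'end:' label.
  assert(low == 0 || starts[low] <= lookup_val);
  assert(low == (int)starts.size() - 1 || starts[low + 1] > lookup_val);
  return low;
}

int main()
{
  std::vector<int> starts= {0, 10, 20, 30};  // made-up bucket left bounds
  for (int v : {10, 15, -5})
  {
    bool equal;
    int idx= find_bucket_model(starts, v, &equal);
    printf("lookup %3d -> bucket %d, equal=%d\n", v, idx, (int)equal);
    // expected: 10 -> bucket 1, equal=1; 15 -> bucket 1, equal=0; -5 -> bucket 0, equal=0
  }
  return 0;
}
```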