Commit 531dd708 authored by Sergei Petrunia, committed by Sergei Petrunia

MDEV-27229: Estimation for filtered rows less precise ... #5

Fix special handling for values that are right next to buckets with ndv=1.
parent 67d4d042
...@@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB { ...@@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB {
} }
explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where
Warnings: Warnings:
Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz' Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where
explain extended select * from t1_json where a < 'b-1a'; explain extended select * from t1_json where a < 'b-1a';
id select_type table type possible_keys key key_len ref rows filtered Extra id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
...@@ -8014,7 +8014,7 @@ test.t1 analyze status OK ...@@ -8014,7 +8014,7 @@ test.t1 analyze status OK
analyze analyze
select c from t1 where c > '1'; select c from t1 where c > '1';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where 1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
drop table t1; drop table t1;
# #
# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
...@@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a'; ...@@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where
drop table t1; drop table t1;
#
# MDEV-27229: Estimation for filtered rows less precise ... #5
#
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;
set histogram_type=JSON_HB;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
analyze select COUNT(*) FROM t1 WHERE a > 'foo';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a <='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
drop table t1;
...@@ -390,3 +390,29 @@ analyze table t1 persistent for all; ...@@ -390,3 +390,29 @@ analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a <> 'a'; analyze select COUNT(*) FROM t1 WHERE a <> 'a';
analyze select COUNT(*) FROM t1 WHERE a < 'a'; analyze select COUNT(*) FROM t1 WHERE a < 'a';
drop table t1; drop table t1;
--echo #
--echo # MDEV-27229: Estimation for filtered rows less precise ... #5
--echo #
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;
set histogram_type=JSON_HB;
analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a > 'foo';
analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
analyze select COUNT(*) FROM t1 WHERE a >='aaa';
analyze select COUNT(*) FROM t1 WHERE a > 'bar';
analyze select COUNT(*) FROM t1 WHERE a >='bar';
# Can enable these after get_avg_frequency issue is resolved:
# analyze select COUNT(*) FROM t1 WHERE a < 'aaa';
# analyze select COUNT(*) FROM t1 WHERE a <='aaa';
# analyze select COUNT(*) FROM t1 WHERE a < 'bar';
analyze select COUNT(*) FROM t1 WHERE a <='bar';
drop table t1;
...@@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, ...@@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
// If the value is outside of the histogram's range, this will "clip" it to // If the value is outside of the histogram's range, this will "clip" it to
// first or last bucket. // first or last bucket.
bool equal; int endp_cmp;
int idx= find_bucket(field, key, &equal); int idx= find_bucket(field, key, &endp_cmp);
double sel; double sel;
if (buckets[idx].ndv == 1 && !equal) if (buckets[idx].ndv == 1 && (endp_cmp!=0))
{ {
/* /*
The bucket has a single value and it doesn't match! Return a very The bucket has a single value and it doesn't match! Return a very
...@@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, ...@@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
// Find the leftmost bucket that contains the lookup value. // Find the leftmost bucket that contains the lookup value.
// (If the lookup value is to the left of all buckets, find bucket #0) // (If the lookup value is to the left of all buckets, find bucket #0)
bool equal; int endp_cmp;
int idx= find_bucket(field, min_key, &equal); int idx= find_bucket(field, min_key, &endp_cmp);
if (equal && exclusive_endp && buckets[idx].ndv==1 &&
idx < (int)buckets.size()-1) double sel;
// Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{ {
/* if (endp_cmp < 0)
The range is "col > $CONST" and we've found a bucket that contains sel= 0.0;
only the value $CONST. Move to the next bucket. else if (endp_cmp > 0)
*/ sel= 1.0;
idx++; else // endp_cmp == 0
sel= (exclusive_endp)? 1.0 : 0.0;
}
else
{
sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));
} }
double left_fract= get_left_fract(idx); double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));
min= left_fract + sel * (buckets[idx].cum_fract - left_fract); min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
} }
else else
...@@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, ...@@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
max_key++; max_key++;
max_key_len--; max_key_len--;
} }
bool equal; int endp_cmp;
int idx= find_bucket(field, max_key, &equal); int idx= find_bucket(field, max_key, &endp_cmp);
if (equal && !inclusive_endp && idx > 0) if ((endp_cmp == 0) && !inclusive_endp)
{ {
/* /*
The range is "col < $CONST" and we've found a bucket starting with The range is "col < $CONST" and we've found a bucket starting with
$CONST. Move to the previous bucket. $CONST.
*/ */
idx--; if (idx > 0)
equal= false; {
// Move to the previous bucket
endp_cmp= 1;
idx--;
}
else
endp_cmp= -1;
} }
double left_fract= get_left_fract(idx);
double sel; double sel;
/* Special handling for singleton buckets */
if (buckets[idx].ndv == 1 && equal) // Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{ {
if (inclusive_endp) if (endp_cmp < 0)
sel= 1.0;
else
sel= 0.0; sel= 0.0;
else if (endp_cmp > 0)
sel= 1.0;
else // endp_cmp == 0.0
sel= inclusive_endp? 1.0 : 0.0;
} }
else else
{ {
...@@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, ...@@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
buckets[idx].start_value, buckets[idx].start_value,
get_end_value(idx)); get_end_value(idx));
} }
double left_fract= get_left_fract(idx);
max= left_fract + sel * (buckets[idx].cum_fract - left_fract); max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
} }
else else
max= 1.0; max= 1.0;
double sel = max - min; return max - min;
return sel;
} }
...@@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field) ...@@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field)
} }
static int SGN(int x)
{
if (!x)
return 0;
return (x < 0)? -1 : 1;
}
/* /*
@brief @brief
Find the leftmost histogram bucket such that "lookup_val >= start_value". Find the leftmost histogram bucket such that "lookup_val >= start_value".
@param field Field object (used to do value comparisons) @param field Field object (used to do value comparisons)
@param lookup_val The lookup value in KeyTupleFormat. @param lookup_val The lookup value in KeyTupleFormat.
@param equal OUT TRUE<=> the found bucket has left_bound=lookup_val @param cmp OUT How the lookup_val compares to found_bucket.left_bound:
0 - lookup_val == bucket.left_bound
>0 - lookup_val > bucket.left_bound (the most typical)
<0 - lookup_val < bucket.left_bound. This can only happen
for the first bucket, for all other buckets we would just
pick the previous bucket and have cmp>=0.
@return @return
The bucket index The bucket index
*/ */
int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
bool *equal) int *cmp)
{ {
int res; int res;
int low= 0; int low= 0;
int high= (int)buckets.size() - 1; int high= (int)buckets.size() - 1;
*equal= false; *cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)
while (low + 1 < high) while (low + 1 < high)
{ {
...@@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, ...@@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val); res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
if (!res) if (!res)
{ {
*equal= true; *cmp= res;
low= middle; low= middle;
goto end; goto end;
} }
...@@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, ...@@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
*/ */
if (low == 0) if (low == 0)
{ {
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val); res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
if (!res) if (res <= 0)
*equal= true; *cmp= res;
else if (res < 0) // buckets[0] < lookup_val else // res>0, lookup_val > buckets[0].start_value
{ {
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val); res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (!res) if (res >= 0) // lookup_val >= buckets[high].start_value
*equal= true; {
if (res <= 0) // buckets[high] <= lookup_val // Move to that bucket
low= high; low= high;
*cmp= res;
}
else
*cmp= 1;
} }
} }
else if (high == (int)buckets.size() - 1) else if (high == (int)buckets.size() - 1)
{ {
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val); res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (!res) if (res >= 0)
*equal= true; {
if (res <= 0) // Ok the value is in the last bucket.
*cmp= res;
low= high; low= high;
}
else
{
// The value is in the 'low' bucket.
res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
*cmp= res;
}
} }
end: end:
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket. // Verification: *cmp has correct value
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(), DBUG_ASSERT(SGN(*cmp) ==
lookup_val))); SGN(field->key_cmp(lookup_val,
(uchar*)buckets[low].start_value.data())));
// buckets[low] <= lookup_val, with one exception of the first bucket. // buckets[low] <= lookup_val, with one exception of the first bucket.
DBUG_ASSERT(low == 0 || DBUG_ASSERT(low == 0 ||
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0); field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
......
...@@ -144,6 +144,6 @@ class Histogram_json_hb : public Histogram_base ...@@ -144,6 +144,6 @@ class Histogram_json_hb : public Histogram_base
double get_left_fract(int idx); double get_left_fract(int idx);
std::string& get_end_value(int idx); std::string& get_end_value(int idx);
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal); int find_bucket(const Field *field, const uchar *lookup_val, int *cmp);
}; };
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment