Commit 05877df4 authored by Sergei Petrunia's avatar Sergei Petrunia

MDEV-26849: JSON Histograms: point selectivity estimates are off

.. for non-existent values.

Handle this special case.
parent f3f78bed
......@@ -1885,6 +1885,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15
explain select * from t1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
drop table t0;
drop table t1;
set analyze_sample_percentage=@save_analyze_sample_percentage;
set histogram_size=@save_histogram_size;
......
......@@ -1019,7 +1019,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
from mysql.column_stats;
explain select * from t1;
drop table t0;
drop table t1;
set analyze_sample_percentage=@save_analyze_sample_percentage;
set histogram_size=@save_histogram_size;
......
......@@ -4148,6 +4148,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 {
explain select * from t1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
drop table t0;
drop table t1;
set analyze_sample_percentage=@save_analyze_sample_percentage;
set histogram_size=@save_histogram_size;
......@@ -7530,3 +7531,104 @@ select c from t1 where c > '1';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
drop table t1;
#
# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
#
#
create table t0(a int);
insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(a int);
insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
select a, count(*) from t1 group by a order by a;
a count(*)
0 100
100 100
200 100
300 100
400 100
500 100
600 100
700 100
800 100
900 100
set histogram_type=json_hb, histogram_size=default;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select * from mysql.column_stats where table_name='t1';
db_name table_name column_name min_value max_value nulls_ratio avg_length avg_frequency hist_size hist_type histogram
test t1 a 0 900 0.0000 4.0000 100.0000 10 JSON_HB {
"histogram_hb_v2": [
{
"start": "0",
"size": 0.1,
"ndv": 1
},
{
"start": "100",
"size": 0.1,
"ndv": 1
},
{
"start": "200",
"size": 0.1,
"ndv": 1
},
{
"start": "300",
"size": 0.1,
"ndv": 1
},
{
"start": "400",
"size": 0.1,
"ndv": 1
},
{
"start": "500",
"size": 0.1,
"ndv": 1
},
{
"start": "600",
"size": 0.1,
"ndv": 1
},
{
"start": "700",
"size": 0.1,
"ndv": 1
},
{
"start": "800",
"size": 0.1,
"ndv": 1
},
{
"start": "900",
"end": "900",
"size": 0.1,
"ndv": 1
}
]
}
analyze select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where
analyze select * from t1 where a=50;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where
analyze select * from t1 where a=70;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where
analyze select * from t1 where a=100;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where
analyze select * from t1 where a=150;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where
analyze select * from t1 where a=200;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where
drop table t0,t1;
......@@ -295,3 +295,25 @@ select c from t1 where c > '1';
drop table t1;
--echo #
--echo # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
--echo #
--echo #
create table t0(a int);
insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1(a int);
insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
select a, count(*) from t1 group by a order by a;
set histogram_type=json_hb, histogram_size=default;
analyze table t1 persistent for all;
select * from mysql.column_stats where table_name='t1';
analyze select * from t1 where a=0;
analyze select * from t1 where a=50;
analyze select * from t1 where a=70;
analyze select * from t1 where a=100;
analyze select * from t1 where a=150;
analyze select * from t1 where a=200;
drop table t0,t1;
......@@ -616,7 +616,7 @@ double position_in_interval(Field *field, const uchar *key, uint key_len,
double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
double avg_sel)
double avg_sel, double total_rows)
{
const uchar *key = endpoint->key;
if (field->real_maybe_null())
......@@ -631,9 +631,11 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
if (buckets[idx].ndv == 1 && !equal)
{
// The bucket has a single value and it doesn't match! Use the global
// average.
sel= avg_sel;
/*
The bucket has a single value and it doesn't match! Return a very
small value.
*/
sel= 1.0 / total_rows;
}
else
{
......
......@@ -109,7 +109,8 @@ class Histogram_json_hb : public Histogram_base
ulonglong size) override;
double point_selectivity(Field *field, key_range *endpoint,
double avg_selection) override;
double avg_selection,
double total_rows) override;
double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp) override;
......
......@@ -3882,7 +3882,8 @@ double get_column_range_cardinality(Field *field,
{
res= col_non_nulls *
hist->point_selectivity(field, min_endp,
avg_frequency / col_non_nulls);
avg_frequency / col_non_nulls,
tab_records);
}
}
else if (avg_frequency == 0.0)
......@@ -3973,7 +3974,8 @@ double get_column_range_cardinality(Field *field,
*/
double Histogram_binary::point_selectivity(Field *field, key_range *endpoint,
double avg_sel)
double avg_sel,
double total_records)
{
double sel;
Column_statistics *col_stats= field->read_stats;
......
......@@ -186,7 +186,8 @@ class Histogram_base
virtual double point_selectivity(Field *field, key_range *endpoint,
double avg_selection)=0;
double avg_selectivity,
double total_rows)=0;
virtual double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp)=0;
......@@ -355,7 +356,8 @@ class Histogram_binary : public Histogram_base
Estimate selectivity of "col=const" using a histogram
*/
double point_selectivity(Field *field, key_range *endpoint,
double avg_sel) override;
double avg_sel,
double total_rows) override;
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment