Commit c23f2842 authored by Rex's avatar Rex

MDEV-31327 Range Histogram selectivity estimates added, not merged

When the optimizer evaluates a join order and estimates the number of rows that join order produces, it loops through the column constraints, adding their selectivity estimates rather than merging them.
parent 6966d7fe
......@@ -119,6 +119,7 @@
#include "sql_select.h"
#include "sql_statistics.h"
#include "uniques.h"
#include "my_bitmap.h"
#include "my_json_writer.h"
#ifndef EXTRA_DEBUG
......@@ -3182,19 +3183,25 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
*/
static
double records_in_column_ranges(PARAM *param, uint idx,
SEL_ARG *tree)
double records_in_column_ranges(PARAM *param, uint idx, SEL_ARG *tree,
Json_writer_object *wo )
{
THD *thd= param->thd;
SEL_ARG_RANGE_SEQ seq;
KEY_MULTI_RANGE range;
range_seq_t seq_it;
double rows;
Field *field;
TABLE *table;
uint flags= 0;
double total_rows= 0;
double total_rows;
RANGE_SEQ_IF seq_if = {NULL, sel_arg_range_seq_init,
sel_arg_range_seq_next, 0, 0};
MY_BITMAP bucket_map, new_bucket_map;
my_bitmap_map *tmp1= nullptr, *tmp2= nullptr;
double table_records;
double column_nulls;
double column_non_nulls;
bool nulls_included;
/* Handle cases when we don't have a valid non-empty list of range */
if (!tree)
......@@ -3203,6 +3210,14 @@ double records_in_column_ranges(PARAM *param, uint idx,
return (0L);
field= tree->field;
DBUG_ASSERT(field);
table= field->table;
DBUG_ASSERT(table);
table_records= (double)table->stat_records();
column_nulls= table_records * field->read_stats->get_nulls_ratio();
column_non_nulls= table_records - column_nulls;
nulls_included= field->null_ptr;
seq.keyno= idx;
seq.real_keyno= MAX_KEY;
......@@ -3212,11 +3227,31 @@ double records_in_column_ranges(PARAM *param, uint idx,
seq_it= seq_if.init((void *) &seq, 0, flags);
if((total_rows= histogram_test( field, thd, wo )) > 0)
{
if (unlikely(thd->trace_started()))
wo->add("avg_selectivity", total_rows/table_records);
return total_rows;
}
total_rows= 0;
if(my_bitmap_init(&bucket_map, tmp1,
field->read_stats->histogram.get_size(), FALSE))
return DBL_MAX;
if(my_bitmap_init(&new_bucket_map, tmp2,
field->read_stats->histogram.get_size(), FALSE))
{
bitmap_free( &bucket_map );
return DBL_MAX;
}
{
Json_writer_array range_trace(thd, "ranges");
key_range *min_endp, *max_endp;
while (!seq_if.next(seq_it, &range))
{
key_range *min_endp, *max_endp;
min_endp= range.start_key.length? &range.start_key : NULL;
max_endp= range.end_key.length? &range.end_key : NULL;
int range_flag= range.range_flag;
......@@ -3237,14 +3272,36 @@ double records_in_column_ranges(PARAM *param, uint idx,
range_trace.add(range_info.c_ptr_safe(), range_info.length());
}
rows= get_column_range_cardinality(field, min_endp, max_endp, range_flag);
if (DBL_MAX == rows)
nulls_included= nulls_included && min_endp && min_endp->key[0] &&
!(range_flag & NEAR_MIN);
// use histogram
get_column_range_bitmap(&new_bucket_map, field,
min_endp, max_endp, range_flag);
bitmap_union(&bucket_map, &new_bucket_map);
}
}
if (nulls_included)
{
total_rows= DBL_MAX;
break;
if (column_non_nulls < 1)
{
bitmap_free( &bucket_map );
bitmap_free( &new_bucket_map );
return column_nulls;
}
total_rows += rows;
else
total_rows+= column_nulls;
}
total_rows+=
column_non_nulls*get_selectivity_from_bitmap(field, &bucket_map);
bitmap_free( &bucket_map );
bitmap_free( &new_bucket_map );
if (unlikely(thd->trace_started()))
wo->add("selectivity_from_histogram", total_rows/table_records);
return total_rows;
}
......@@ -3520,14 +3577,11 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item **cond)
{
enum_check_fields save_count_cuted_fields= thd->count_cuted_fields;
thd->count_cuted_fields= CHECK_FIELD_IGNORE;
rows= records_in_column_ranges(&param, idx, key);
rows= records_in_column_ranges(&param, idx, key,
&selectivity_for_column);
thd->count_cuted_fields= save_count_cuted_fields;
if (rows != DBL_MAX)
{
key->field->cond_selectivity= rows/table_records;
selectivity_for_column.add("selectivity_from_histogram",
key->field->cond_selectivity);
}
}
}
}
......
......@@ -1579,7 +1579,7 @@ class Histogram_builder
column->pos_in_interval(min_value, max_value));
curr_bucket++;
while (curr_bucket != hist_width &&
count > bucket_capacity * (curr_bucket + 1))
count > (double)records * ((double)curr_bucket/(double)hist_width))
{
histogram->set_prev_value(curr_bucket);
curr_bucket++;
......@@ -3698,10 +3698,54 @@ double get_column_avg_frequency(Field * field)
}
/**
  @brief
    Handle the common cases in which the histogram need not be consulted
  @param
    field   The column whose statistics are examined.
  @param
    thd     The current thread; handed to Histogram::is_usable().
  @param
    wo      Optimizer trace object. Not used by this function at present.
  @details
    Intended to handle conditions like
    1) no histogram
    2) histogram has no records
    3) histogram has only unique values.
    4) histogram has been disabled
    NOTE(review): which of these Histogram::is_usable() actually detects is
    not visible from here - confirm against its definition.
  @retval
    - table->stat_records() when the field has no column statistics or the
      average frequency is below DOUBLE_TRUNCATION_OFFSET
    - the average frequency of the column when the histogram is not usable
    - -1 when the caller has to look into the histogram itself
*/
double histogram_test( Field *field, THD *thd, Json_writer_object *wo )
{
  DBUG_ASSERT(field);
  TABLE *table= field->table;
  DBUG_ASSERT(table);
  Column_statistics *col_stats= field->read_stats;
  DBUG_ASSERT(col_stats);

  /*
    The assertion above is expected to vanish in non-debug builds, so test
    the pointer for real before the first dereference.  Without statistics
    nothing can be filtered out: report every row of the table.
  */
  if (!col_stats)
    return (double)table->stat_records();

  double avg_frequency= col_stats->get_avg_frequency();
  if (avg_frequency < DOUBLE_TRUNCATION_OFFSET)
    return (double)table->stat_records();

  /* No usable histogram: fall back to the average frequency of a value */
  if (!col_stats->histogram.is_usable(thd))
    return avg_frequency;

  return -1; // look into the histogram
}
/**
@brief
Estimate the number of rows in a column range using data from stat tables
@param
dest A preallocated bitmap to fill in representing bucket numbers in
the histogram
@param
field The column whose range cardinality is to be estimated
@param
......@@ -3712,28 +3756,27 @@ double get_column_avg_frequency(Field * field)
range_flag The range flags
@details
The function gets an estimate of the number of rows in a column range
using the statistical data from the table column_stats.
@retval
- The required estimate of the rows in the column range
- If there is some kind of error, this function should return DBL_MAX (and
not HA_POS_ERROR as that is an integer constant).
The function fills in a bitmap of the buckets for the field using the
statistical data from the table column_stats. This should be a contiguous
range of bits, depending on the range supplied.
*/
double get_column_range_cardinality(Field *field,
void get_column_range_bitmap(MY_BITMAP *dest,
Field *field,
key_range *min_endp,
key_range *max_endp,
uint range_flag)
{
double res;
DBUG_ASSERT(dest);
DBUG_ASSERT(field);
TABLE *table= field->table;
DBUG_ASSERT(table);
Column_statistics *col_stats= field->read_stats;
double tab_records= (double)table->stat_records();
DBUG_ASSERT(col_stats);
Histogram *hist= &col_stats->histogram;
if (!col_stats)
return tab_records;
/*
Use statistics for a table only when we have actually read
the statistics from the stat tables. For example due to
......@@ -3741,73 +3784,28 @@ double get_column_range_cardinality(Field *field,
a table.
*/
if (!table->stats_is_read)
return tab_records;
THD *thd= table->in_use;
double col_nulls= tab_records * col_stats->get_nulls_ratio();
double col_non_nulls= tab_records - col_nulls;
bool nulls_incl= field->null_ptr && min_endp && min_endp->key[0] &&
!(range_flag & NEAR_MIN);
if (col_non_nulls < 1)
{
if (nulls_incl)
res= col_nulls;
else
res= 0;
}
else if (min_endp && max_endp && min_endp->length == max_endp->length &&
!memcmp(min_endp->key, max_endp->key, min_endp->length))
{
if (nulls_incl)
bitmap_set_above (dest, 0, 0);
if (!col_stats || !table->stats_is_read || !hist->is_usable(table->in_use))
{
/* This is null single point range */
res= col_nulls;
bitmap_invert(dest);
return;
}
else
{
double avg_frequency= col_stats->get_avg_frequency();
res= avg_frequency;
if (avg_frequency > 1.0 + 0.000001 &&
col_stats->min_max_values_are_provided())
double min_mp_pos= 0.0, max_mp_pos= 1.0;
if (col_stats->min_value_provided())
{
Histogram *hist= &col_stats->histogram;
if (hist->is_usable(thd))
if (min_endp && !(field->null_ptr && min_endp->key[0]))
{
store_key_image_to_rec(field, (uchar *) min_endp->key,
field->key_length());
double pos= field->pos_in_interval(col_stats->min_value,
min_mp_pos= field->pos_in_interval(col_stats->min_value,
col_stats->max_value);
res= col_non_nulls *
hist->point_selectivity(pos,
avg_frequency / col_non_nulls);
}
}
else if (avg_frequency == 0.0)
{
/* This actually means there is no statistics data */
res= tab_records;
}
}
}
else
{
if (col_stats->min_max_values_are_provided())
{
double sel, min_mp_pos, max_mp_pos;
if (min_endp && !(field->null_ptr && min_endp->key[0]))
if (col_stats->max_value_provided())
{
store_key_image_to_rec(field, (uchar *) min_endp->key,
field->key_length());
min_mp_pos= field->pos_in_interval(col_stats->min_value,
col_stats->max_value);
}
else
min_mp_pos= 0.0;
if (max_endp)
{
store_key_image_to_rec(field, (uchar *) max_endp->key,
......@@ -3815,26 +3813,43 @@ double get_column_range_cardinality(Field *field,
max_mp_pos= field->pos_in_interval(col_stats->min_value,
col_stats->max_value);
}
else
max_mp_pos= 1.0;
Histogram *hist= &col_stats->histogram;
if (hist->is_usable(thd))
sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
else
sel= (max_mp_pos - min_mp_pos);
res= col_non_nulls * sel;
set_if_bigger(res, col_stats->get_avg_frequency());
}
else
res= col_non_nulls;
if (nulls_incl)
res+= col_nulls;
}
return res;
hist->selectivity_fill_bucketmap(dest, min_mp_pos, max_mp_pos);
}
/**
  @brief
    Estimate the selectivity of a column range from a bitmap of buckets.
  @param
    field       The column whose histogram the bitmap refers to
  @param
    bucket_map  A bitmap of the used buckets.
  @details
    Every bucket of the histogram is given the same weight,
    1 / histogram.get_width().  The function counts the buckets marked in
    bucket_map and returns their combined weight.  It returns a fraction,
    not a row count; the caller scales it by the number of rows it applies
    to.
  @retval
    - The fraction of the histogram covered by the marked buckets, in the
      range [0, 1]
    - 1.0 if the histogram has no buckets at all, i.e. nothing is filtered
      out when there is nothing to estimate from
*/
double get_selectivity_from_bitmap( Field *field, MY_BITMAP *bucket_map )
{
  uint width= field->read_stats->histogram.get_width();
  uint size= field->read_stats->histogram.get_size();

  /* Avoid 1.0/0: without buckets there is no basis for an estimate */
  if (!width)
    return 1.0;

  double bucket_sel= 1.0/width;

  /*
    Only 'width' buckets exist, so counting more bits than that would push
    the result above 1.0.  Stay within 'size' bits as well, which is what
    the original loop examined.
  */
  uint limit= size < width ? size : width;
  uint count= 0;
  for (uint i= 0; i < limit; i++)
  {
    if (bitmap_is_set(bucket_map, i))
      count++;
  }

  return bucket_sel * count;
}
/*
Estimate selectivity of "col=const" using a histogram
......
......@@ -16,6 +16,11 @@
#ifndef SQL_STATISTICS_H
#define SQL_STATISTICS_H
#include "my_json_writer.h"
// tolerances to use in floating point comparisons
#define FLOAT_TRUNCATION_OFFSET 0.00001
#define DOUBLE_TRUNCATION_OFFSET 0.000001
/*
For COMPLEMENTARY_FOR_QUERIES and PREFERABLY_FOR_QUERIES they are
similar to the COMPLEMENTARY and PREFERABLY respectively except that
......@@ -133,10 +138,13 @@ void set_statistics_for_table(THD *thd, TABLE *table);
double get_column_avg_frequency(Field * field);
double get_column_range_cardinality(Field *field,
double histogram_test( Field *field, THD *thd, Json_writer_object *wo );
void get_column_range_bitmap(MY_BITMAP *dest,
Field *field,
key_range *min_endp,
key_range *max_endp,
uint range_flag);
double get_selectivity_from_bitmap( Field *field, MY_BITMAP *bucket_map );
bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table);
bool is_eits_usable(Field* field);
......@@ -184,29 +192,31 @@ class Histogram
return 0;
}
/* Find the bucket which value 'pos' falls into. */
/*
* Find the bucket which value 'pos' falls into.
* return the bucket number [0, getsize())
*/
uint find_bucket(double pos, bool first)
{
uint val= (uint) (pos * prec_factor());
int lp= 0;
int rp= get_width() - 1;
int d= get_width() / 2;
uint i= lp + d;
for ( ; d; d= (rp - lp) / 2, i= lp + d)
uint val= (uint) (pos * prec_factor()); // scale to 2^8||2^16
int lp= 0; // left bracket
int rp= get_width() - 1; // right backet
int d= get_width() / 2; // spacing
uint i= lp + d; // middle
// bisect search
for ( ; d; d= (rp - lp) / 2, i= lp + d) // while left<right
{
if (val == get_value(i))
if (val == get_value(i)) // found it
break;
if (val < get_value(i))
if (val < get_value(i)) // its to the left
rp= i;
else if (val > get_value(i + 1))
else if (val > get_value(i + 1)) // its to the right
lp= i + 1;
else
break;
}
if (val > get_value(i) && i < (get_width() - 1))
i++;
if (val == get_value(i))
{
if (first)
......@@ -274,6 +284,16 @@ class Histogram
}
}
/*
  Mark in 'dest' every bucket from the one find_bucket() reports for
  'min_pos' up to and including the one it reports for 'max_pos'.
  Only bits inside that span are set; no bit is cleared.  Nothing is
  marked when the first bucket lies beyond the last one.
*/
void selectivity_fill_bucketmap(MY_BITMAP *dest,
                                double min_pos, double max_pos)
{
  const uint first_bucket= find_bucket(min_pos, TRUE);
  const uint last_bucket= find_bucket(max_pos, FALSE);

  uint bucket= first_bucket;
  while (bucket <= last_bucket)
  {
    bitmap_set_bit(dest, bucket);
    bucket++;
  }
}
double range_selectivity(double min_pos, double max_pos)
{
double sel;
......@@ -425,10 +445,14 @@ class Column_statistics
avg_frequency= (ulonglong) (val * Scale_factor_avg_frequency);
}
bool min_max_values_are_provided()
bool min_value_provided()
{
return !is_null(COLUMN_STAT_MIN_VALUE);
}
bool max_value_provided()
{
return !is_null(COLUMN_STAT_MIN_VALUE) &&
!is_null(COLUMN_STAT_MAX_VALUE);
return !is_null(COLUMN_STAT_MAX_VALUE);
}
/*
This function checks whether the values for the fields of the statistical
......
......@@ -6150,6 +6150,7 @@ static Sys_var_ulong Sys_histogram_size(
"If set to 0, no histograms are created by ANALYZE.",
SESSION_VAR(histogram_size), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, 255), DEFAULT(254), BLOCK_SIZE(1));
// VALID_RANGE(0, 254), DEFAULT(252), BLOCK_SIZE(1));
extern const char *histogram_types[];
static Sys_var_enum Sys_histogram_type(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment