Commit 1e9c922f authored by Marko Mäkelä

MDEV-26623 Possible race condition between statistics and bulk insert

When computing statistics, let us play it safe and check whether
an insert into an empty table is in progress, once we have acquired
the root page latch. If yes, let us pretend that the table is empty,
just like MVCC reads would do.

It is unlikely that this race condition could lead to any crashes
before MDEV-24621, because the index tree structure should be protected
by the page latches. But at least we can avoid some busy work and
return earlier.

As part of this, some code that is only used for statistics calculation
is moved into static functions in dict0stats.cc, the compilation unit
that uses it.
parent 1a4a7ddd
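
In outline: once the root page latch has been acquired, the statistics
code now checks whether an insert into an empty table is still in
progress (table->bulk_trx_id refers to an active transaction), and if
so reports the table as empty. A condensed sketch of the guard, with
names taken from the diff below (not a drop-in replacement for either
hunk; error paths trimmed):

	/* Once the root page is latched, check whether an insert into
	an empty table is still in progress; if so, pretend the table
	is empty, just like an MVCC read would. */
	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
	const trx_id_t bulk_trx_id = index->table->bulk_trx_id;
	if (!root
	    || (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false))) {
		mtr.commit();
		goto dummy_empty; /* fall back to "empty table" statistics */
	}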
@@ -273,38 +273,6 @@ btr_root_get(
 	return(root ? buf_block_get_frame(root) : NULL);
 }
/**************************************************************//**
Gets the height of the B-tree (the level of the root, when the leaf
level is assumed to be 0). The caller must hold an S or X latch on
the index.
@return tree height (level of the root) */
ulint
btr_height_get(
/*===========*/
const dict_index_t* index, /*!< in: index tree */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
ulint height=0;
buf_block_t* root_block;
ut_ad(srv_read_only_mode
|| mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
| MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
/* S latches the page */
root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
if (root_block) {
height = btr_page_get_level(buf_block_get_frame(root_block));
/* Release the S latch on the root page. */
mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);
}
return(height);
}
 /**************************************************************//**
 Checks a file segment header within a B-tree root page and updates
 the segment header space id.
@@ -562,48 +530,6 @@ btr_page_alloc(
 		index, hint_page_no, file_direction, level, mtr, init_mtr);
 }
/**************************************************************//**
Gets the number of pages in a B-tree.
@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
ulint
btr_get_size(
/*=========*/
const dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
ulint n=0;
ut_ad(srv_read_only_mode
|| mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
if (dict_index_is_online_ddl(index)
|| !index->is_committed()) {
return(ULINT_UNDEFINED);
}
buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
if (!root) {
return ULINT_UNDEFINED;
}
mtr->x_lock_space(index->table->space);
if (flag == BTR_N_LEAF_PAGES) {
fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ root->frame, &n, mtr);
} else {
ulint dummy;
n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
+ root->frame, &dummy, mtr);
n += fseg_n_reserved_pages(*root,
PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ root->frame, &dummy, mtr);
}
return(n);
}
 /**************************************************************//**
 Frees a page used in an ibuf tree. Puts the page to the free list of the
 ibuf tree. */
......
@@ -137,17 +137,6 @@ can be released by page reorganize, then it is reorganized */
 #define BTR_BLOB_HDR_SIZE	8	/*!< Size of a BLOB
 					part header, in bytes */
/** Estimated table level stats from sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size external stored data size
@param not_empty table not empty
@return estimated table wide stats from sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
 /* @} */
 /*******************************************************************//**
@@ -6552,339 +6541,6 @@ btr_estimate_n_rows_in_range(
 		index, tuple1, tuple2, 1);
 }
/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
ulint n_unique, /*!< in: dict_index_get_n_unique(index),
number of columns uniquely determine
an index entry */
const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
its size could be for all fields or
that of "n_unique" */
ib_uint64_t* n_not_null) /*!< in/out: array to record number of
not null rows for n-column prefix */
{
ulint i;
ut_ad(rec_offs_n_fields(offsets) >= n_unique);
if (n_not_null == NULL) {
return;
}
for (i = 0; i < n_unique; i++) {
if (rec_offs_nth_sql_null(offsets, i)) {
break;
}
n_not_null[i]++;
}
}
/** Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
result.n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and stored the estimates in
array result.n_non_null_key_vals.
@param[in] index index
@return vector with statistics information
empty vector if the index is unavailable. */
std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index)
{
btr_cur_t cursor;
page_t* page;
rec_t* rec;
ulint n_cols;
ib_uint64_t* n_diff;
ib_uint64_t* n_not_null;
ibool stats_null_not_equal;
uintmax_t n_sample_pages=1; /* number of pages to sample */
ulint not_empty_flag = 0;
ulint total_external_size = 0;
ulint i;
ulint j;
uintmax_t add_on;
mtr_t mtr;
mem_heap_t* heap = NULL;
rec_offs* offsets_rec = NULL;
rec_offs* offsets_next_rec = NULL;
std::vector<index_field_stats_t> result;
/* For spatial index, there is no such stats can be
fetched. */
ut_ad(!dict_index_is_spatial(index));
n_cols = dict_index_get_n_unique(index);
heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
* n_cols
+ dict_index_get_n_fields(index)
* (sizeof *offsets_rec
+ sizeof *offsets_next_rec));
n_diff = (ib_uint64_t*) mem_heap_zalloc(
heap, n_cols * sizeof(n_diff[0]));
n_not_null = NULL;
/* Check srv_innodb_stats_method setting, and decide whether we
need to record non-null value and also decide if NULL is
considered equal (by setting stats_null_not_equal value) */
switch (srv_innodb_stats_method) {
case SRV_STATS_NULLS_IGNORED:
n_not_null = (ib_uint64_t*) mem_heap_zalloc(
heap, n_cols * sizeof *n_not_null);
/* fall through */
case SRV_STATS_NULLS_UNEQUAL:
/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
case, we will treat NULLs as unequal value */
stats_null_not_equal = TRUE;
break;
case SRV_STATS_NULLS_EQUAL:
stats_null_not_equal = FALSE;
break;
default:
ut_error;
}
if (srv_stats_sample_traditional) {
/* It makes no sense to test more pages than are contained
in the index, thus we lower the number if it is too high */
if (srv_stats_transient_sample_pages > index->stat_index_size) {
if (index->stat_index_size > 0) {
n_sample_pages = index->stat_index_size;
}
} else {
n_sample_pages = srv_stats_transient_sample_pages;
}
} else {
/* New logaritmic number of pages that are estimated.
Number of pages estimated should be between 1 and
index->stat_index_size.
If we have only 0 or 1 index pages then we can only take 1
sample. We have already initialized n_sample_pages to 1.
So taking index size as I and sample as S and log(I)*S as L
requirement 1) we want the out limit of the expression to not exceed I;
requirement 2) we want the ideal pages to be at least S;
so the current expression is min(I, max( min(S,I), L)
looking for simplifications:
case 1: assume S < I
min(I, max( min(S,I), L) -> min(I , max( S, L))
but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
so we have: min(I , L)
case 2: assume I < S
min(I, max( min(S,I), L) -> min(I, max( I, L))
case 2a: L > I
min(I, max( I, L)) -> min(I, L) -> I
case 2b: when L < I
min(I, max( I, L)) -> min(I, I ) -> I
so taking all case2 paths is I, our expression is:
n_pages = S < I? min(I,L) : I
*/
if (index->stat_index_size > 1) {
n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
? ut_min(index->stat_index_size,
static_cast<ulint>(
log2(double(index->stat_index_size))
* double(srv_stats_transient_sample_pages)))
: index->stat_index_size;
}
}
/* Sanity check */
ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
/* We sample some pages in the index to get an estimate */
for (i = 0; i < n_sample_pages; i++) {
mtr_start(&mtr);
bool available;
available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
&cursor, &mtr);
if (!available) {
mtr_commit(&mtr);
mem_heap_free(heap);
return result;
}
/* Count the number of different key values for each prefix of
the key on this index page. If the prefix does not determine
the index record uniquely in the B-tree, then we subtract one
because otherwise our algorithm would give a wrong estimate
for an index where there is just one key value. */
if (!index->is_readable()) {
mtr_commit(&mtr);
goto exit_loop;
}
page = btr_cur_get_page(&cursor);
rec = page_rec_get_next(page_get_infimum_rec(page));
const ulint n_core = page_is_leaf(page)
? index->n_core_fields : 0;
if (!page_rec_is_supremum(rec)) {
not_empty_flag = 1;
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
n_core,
ULINT_UNDEFINED, &heap);
if (n_not_null != NULL) {
btr_record_not_null_field_in_rec(
n_cols, offsets_rec, n_not_null);
}
}
while (!page_rec_is_supremum(rec)) {
ulint matched_fields;
rec_t* next_rec = page_rec_get_next(rec);
if (page_rec_is_supremum(next_rec)) {
total_external_size +=
btr_rec_get_externally_stored_len(
rec, offsets_rec);
break;
}
offsets_next_rec = rec_get_offsets(next_rec, index,
offsets_next_rec,
n_core,
ULINT_UNDEFINED,
&heap);
cmp_rec_rec(rec, next_rec,
offsets_rec, offsets_next_rec,
index, stats_null_not_equal,
&matched_fields);
for (j = matched_fields; j < n_cols; j++) {
/* We add one if this index record has
a different prefix from the previous */
n_diff[j]++;
}
if (n_not_null != NULL) {
btr_record_not_null_field_in_rec(
n_cols, offsets_next_rec, n_not_null);
}
total_external_size
+= btr_rec_get_externally_stored_len(
rec, offsets_rec);
rec = next_rec;
/* Initialize offsets_rec for the next round
and assign the old offsets_rec buffer to
offsets_next_rec. */
{
rec_offs* offsets_tmp = offsets_rec;
offsets_rec = offsets_next_rec;
offsets_next_rec = offsets_tmp;
}
}
if (n_cols == dict_index_get_n_unique_in_tree(index)
&& page_has_siblings(page)) {
/* If there is more than one leaf page in the tree,
we add one because we know that the first record
on the page certainly had a different prefix than the
last record on the previous index page in the
alphabetical order. Before this fix, if there was
just one big record on each clustered index page, the
algorithm grossly underestimated the number of rows
in the table. */
n_diff[n_cols - 1]++;
}
mtr_commit(&mtr);
}
exit_loop:
/* If we saw k borders between different key values on
n_sample_pages leaf pages, we can estimate how many
there will be in index->stat_n_leaf_pages */
/* We must take into account that our sample actually represents
also the pages used for external storage of fields (those pages are
included in index->stat_n_leaf_pages) */
result.reserve(n_cols);
for (j = 0; j < n_cols; j++) {
index_field_stats_t stat;
stat.n_diff_key_vals
= BTR_TABLE_STATS_FROM_SAMPLE(
n_diff[j], index, n_sample_pages,
total_external_size, not_empty_flag);
/* If the tree is small, smaller than
10 * n_sample_pages + total_external_size, then
the above estimate is ok. For bigger trees it is common that we
do not see any borders between key values in the few pages
we pick. But still there may be n_sample_pages
different key values, or even more. Let us try to approximate
that: */
add_on = index->stat_n_leaf_pages
/ (10 * (n_sample_pages
+ total_external_size));
if (add_on > n_sample_pages) {
add_on = n_sample_pages;
}
stat.n_diff_key_vals += add_on;
stat.n_sample_sizes = n_sample_pages;
if (n_not_null != NULL) {
stat.n_non_null_key_vals =
BTR_TABLE_STATS_FROM_SAMPLE(
n_not_null[j], index, n_sample_pages,
total_external_size, not_empty_flag);
}
result.push_back(stat);
}
mem_heap_free(heap);
return result;
}
 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
 /***********************************************************//**
......
@@ -1024,6 +1024,367 @@ dict_stats_snapshot_free(
 	dict_stats_table_clone_free(t);
 }
/** Statistics for one field of an index. */
struct index_field_stats_t
{
ib_uint64_t n_diff_key_vals;
ib_uint64_t n_sample_sizes;
ib_uint64_t n_non_null_key_vals;
index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
ib_uint64_t n_sample_sizes= 0,
ib_uint64_t n_non_null_key_vals= 0)
: n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
n_non_null_key_vals(n_non_null_key_vals)
{
}
};
/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
ulint n_unique, /*!< in: dict_index_get_n_unique(index),
number of columns uniquely determine
an index entry */
const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
its size could be for all fields or
that of "n_unique" */
ib_uint64_t* n_not_null) /*!< in/out: array to record number of
not null rows for n-column prefix */
{
ulint i;
ut_ad(rec_offs_n_fields(offsets) >= n_unique);
if (n_not_null == NULL) {
return;
}
for (i = 0; i < n_unique; i++) {
if (rec_offs_nth_sql_null(offsets, i)) {
break;
}
n_not_null[i]++;
}
}
/** Estimated table level stats from sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size external stored data size
@param not_empty table not empty
@return estimated table wide stats from sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
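
Worked example of the scaling (illustrative numbers): if sampling
sample = 8 leaf pages of an index with stat_n_leaf_pages = 100 produced
value = 40 distinct-key borders, with ext_size = 0 and not_empty = 1,
the macro yields (40 * 100 + 8 - 1 + 0 + 1) / (8 + 0) = 4008 / 8 = 501,
i.e. the sampled count scaled up by leaf pages per sampled page; the
"+ (sample) - 1" term rounds upwards and "+ (not_empty)" keeps the
estimate nonzero for a non-empty table.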
/** Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
result.n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and stored the estimates in
array result.n_non_null_key_vals.
@param index B-tree index
@param bulk_trx_id the value of index->table->bulk_trx_id at the start
@return vector with statistics information
empty vector if the index is unavailable. */
static
std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index,
trx_id_t bulk_trx_id)
{
btr_cur_t cursor;
page_t* page;
rec_t* rec;
ulint n_cols;
ib_uint64_t* n_diff;
ib_uint64_t* n_not_null;
ibool stats_null_not_equal;
uintmax_t n_sample_pages=1; /* number of pages to sample */
ulint not_empty_flag = 0;
ulint total_external_size = 0;
ulint i;
ulint j;
uintmax_t add_on;
mtr_t mtr;
mem_heap_t* heap = NULL;
rec_offs* offsets_rec = NULL;
rec_offs* offsets_next_rec = NULL;
std::vector<index_field_stats_t> result;
ut_ad(!index->is_spatial());
n_cols = dict_index_get_n_unique(index);
heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
* n_cols
+ dict_index_get_n_fields(index)
* (sizeof *offsets_rec
+ sizeof *offsets_next_rec));
n_diff = (ib_uint64_t*) mem_heap_zalloc(
heap, n_cols * sizeof(n_diff[0]));
n_not_null = NULL;
/* Check srv_innodb_stats_method setting, and decide whether we
need to record non-null value and also decide if NULL is
considered equal (by setting stats_null_not_equal value) */
switch (srv_innodb_stats_method) {
case SRV_STATS_NULLS_IGNORED:
n_not_null = (ib_uint64_t*) mem_heap_zalloc(
heap, n_cols * sizeof *n_not_null);
/* fall through */
case SRV_STATS_NULLS_UNEQUAL:
/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
case, we will treat NULLs as unequal value */
stats_null_not_equal = TRUE;
break;
case SRV_STATS_NULLS_EQUAL:
stats_null_not_equal = FALSE;
break;
default:
ut_error;
}
if (srv_stats_sample_traditional) {
/* It makes no sense to test more pages than are contained
in the index, thus we lower the number if it is too high */
if (srv_stats_transient_sample_pages > index->stat_index_size) {
if (index->stat_index_size > 0) {
n_sample_pages = index->stat_index_size;
}
} else {
n_sample_pages = srv_stats_transient_sample_pages;
}
} else {
/* New logaritmic number of pages that are estimated.
Number of pages estimated should be between 1 and
index->stat_index_size.
If we have only 0 or 1 index pages then we can only take 1
sample. We have already initialized n_sample_pages to 1.
So taking index size as I and sample as S and log(I)*S as L
requirement 1) we want the out limit of the expression to not exceed I;
requirement 2) we want the ideal pages to be at least S;
so the current expression is min(I, max( min(S,I), L)
looking for simplifications:
case 1: assume S < I
min(I, max( min(S,I), L) -> min(I , max( S, L))
but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
so we have: min(I , L)
case 2: assume I < S
min(I, max( min(S,I), L) -> min(I, max( I, L))
case 2a: L > I
min(I, max( I, L)) -> min(I, L) -> I
case 2b: when L < I
min(I, max( I, L)) -> min(I, I ) -> I
so taking all case2 paths is I, our expression is:
n_pages = S < I? min(I,L) : I
*/
if (index->stat_index_size > 1) {
n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
? ut_min(index->stat_index_size,
static_cast<ulint>(
log2(double(index->stat_index_size))
* double(srv_stats_transient_sample_pages)))
: index->stat_index_size;
}
}
/* Sanity check */
ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
/* We sample some pages in the index to get an estimate */
for (i = 0; i < n_sample_pages; i++) {
mtr.start();
bool available;
available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
&cursor, &mtr);
if (!available || index->table->bulk_trx_id != bulk_trx_id) {
mtr.commit();
mem_heap_free(heap);
return result;
}
/* Count the number of different key values for each prefix of
the key on this index page. If the prefix does not determine
the index record uniquely in the B-tree, then we subtract one
because otherwise our algorithm would give a wrong estimate
for an index where there is just one key value. */
if (!index->is_readable()) {
mtr.commit();
goto exit_loop;
}
page = btr_cur_get_page(&cursor);
rec = page_rec_get_next(page_get_infimum_rec(page));
const ulint n_core = page_is_leaf(page)
? index->n_core_fields : 0;
if (!page_rec_is_supremum(rec)) {
not_empty_flag = 1;
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
n_core,
ULINT_UNDEFINED, &heap);
if (n_not_null != NULL) {
btr_record_not_null_field_in_rec(
n_cols, offsets_rec, n_not_null);
}
}
while (!page_rec_is_supremum(rec)) {
ulint matched_fields;
rec_t* next_rec = page_rec_get_next(rec);
if (page_rec_is_supremum(next_rec)) {
total_external_size +=
btr_rec_get_externally_stored_len(
rec, offsets_rec);
break;
}
offsets_next_rec = rec_get_offsets(next_rec, index,
offsets_next_rec,
n_core,
ULINT_UNDEFINED,
&heap);
cmp_rec_rec(rec, next_rec,
offsets_rec, offsets_next_rec,
index, stats_null_not_equal,
&matched_fields);
for (j = matched_fields; j < n_cols; j++) {
/* We add one if this index record has
a different prefix from the previous */
n_diff[j]++;
}
if (n_not_null != NULL) {
btr_record_not_null_field_in_rec(
n_cols, offsets_next_rec, n_not_null);
}
total_external_size
+= btr_rec_get_externally_stored_len(
rec, offsets_rec);
rec = next_rec;
/* Initialize offsets_rec for the next round
and assign the old offsets_rec buffer to
offsets_next_rec. */
{
rec_offs* offsets_tmp = offsets_rec;
offsets_rec = offsets_next_rec;
offsets_next_rec = offsets_tmp;
}
}
if (n_cols == dict_index_get_n_unique_in_tree(index)
&& page_has_siblings(page)) {
/* If there is more than one leaf page in the tree,
we add one because we know that the first record
on the page certainly had a different prefix than the
last record on the previous index page in the
alphabetical order. Before this fix, if there was
just one big record on each clustered index page, the
algorithm grossly underestimated the number of rows
in the table. */
n_diff[n_cols - 1]++;
}
mtr.commit();
}
exit_loop:
/* If we saw k borders between different key values on
n_sample_pages leaf pages, we can estimate how many
there will be in index->stat_n_leaf_pages */
/* We must take into account that our sample actually represents
also the pages used for external storage of fields (those pages are
included in index->stat_n_leaf_pages) */
result.reserve(n_cols);
for (j = 0; j < n_cols; j++) {
index_field_stats_t stat;
stat.n_diff_key_vals
= BTR_TABLE_STATS_FROM_SAMPLE(
n_diff[j], index, n_sample_pages,
total_external_size, not_empty_flag);
/* If the tree is small, smaller than
10 * n_sample_pages + total_external_size, then
the above estimate is ok. For bigger trees it is common that we
do not see any borders between key values in the few pages
we pick. But still there may be n_sample_pages
different key values, or even more. Let us try to approximate
that: */
add_on = index->stat_n_leaf_pages
/ (10 * (n_sample_pages
+ total_external_size));
if (add_on > n_sample_pages) {
add_on = n_sample_pages;
}
stat.n_diff_key_vals += add_on;
stat.n_sample_sizes = n_sample_pages;
if (n_not_null != NULL) {
stat.n_non_null_key_vals =
BTR_TABLE_STATS_FROM_SAMPLE(
n_not_null[j], index, n_sample_pages,
total_external_size, not_empty_flag);
}
result.push_back(stat);
}
mem_heap_free(heap);
return result;
}
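
The long sampling-size comment inside the function above reduces to a
single expression. A standalone sketch of that choice, under the stated
assumptions (hypothetical helper name; I = index->stat_index_size,
S = srv_stats_transient_sample_pages):

	#include <algorithm>
	#include <cmath>
	#include <cstdint>

	/* Sketch of the non-traditional sample-size choice derived in
	the comment: n_pages = (S < I) ? min(I, log2(I) * S) : I, and
	indexes with 0 or 1 pages are always sampled once. */
	static uint64_t stats_sample_pages(uint64_t I, uint64_t S)
	{
		if (I <= 1) {
			return 1;	/* at most one page to look at */
		}
		if (S >= I) {
			return I;	/* cannot sample more than I pages */
		}
		double L = std::log2(double(I)) * double(S);
		return std::min(I, uint64_t(L));
	}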
 /*********************************************************************//**
 Calculates new estimates for index statistics. This function is
 relatively quick and is used to calculate transient statistics that
@@ -1054,39 +1415,48 @@ dict_stats_update_transient_for_index(
 	} else if (ibuf_debug && !dict_index_is_clust(index)) {
 		goto dummy_empty;
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+	} else if (dict_index_is_online_ddl(index) || !index->is_committed()
+		   || !index->table->space) {
+		goto dummy_empty;
 	} else {
 		mtr_t	mtr;
-		ulint	size;
 		mtr.start();
 		mtr_s_lock_index(index, &mtr);
-		size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-		if (size != ULINT_UNDEFINED) {
-			index->stat_index_size = size;
-
-			size = btr_get_size(
-				index, BTR_N_LEAF_PAGES, &mtr);
-		}
-
-		mtr.commit();
-
-		switch (size) {
-		case ULINT_UNDEFINED:
-			goto dummy_empty;
-		case 0:
-			/* The root node of the tree is a leaf */
-			size = 1;
-		}
-
-		index->stat_n_leaf_pages = size;
+		buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+						       &mtr);
+		if (!root) {
+invalid:
+			mtr.commit();
+			goto dummy_empty;
+		}
+
+		const auto bulk_trx_id = index->table->bulk_trx_id;
+		if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+			goto invalid;
+		}
+
+		mtr.x_lock_space(index->table->space);
+
+		ulint dummy, size;
+		index->stat_index_size
+			= fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_LEAF
+						+ root->frame, &size, &mtr)
+			+ fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_TOP
+						+ root->frame, &dummy, &mtr);
+
+		mtr.commit();
+
+		index->stat_n_leaf_pages = size ? size : 1;
 		/* Do not continue if table decryption has failed or
 		table is already marked as corrupted. */
 		if (index->is_readable()) {
 			std::vector<index_field_stats_t> stats
 				= btr_estimate_number_of_different_key_vals(
-					index);
+					index, bulk_trx_id);
 			if (!stats.empty()) {
 				index->table->stats_mutex_lock();
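
For context, the vector returned by the now-static estimator is
consumed under the table's statistics mutex; a simplified sketch of the
loop that follows stats_mutex_lock() (assuming the stat_* arrays are
sized for the index's n_uniq first fields):

	/* Sketch, simplified from dict_stats_update_transient_for_index():
	copy the per-field estimates into the index while holding the
	table's statistics mutex. */
	for (size_t i = 0; i < stats.size(); ++i) {
		index->stat_n_diff_key_vals[i] = stats[i].n_diff_key_vals;
		index->stat_n_sample_sizes[i] = stats[i].n_sample_sizes;
		index->stat_n_non_null_key_vals[i]
			= stats[i].n_non_null_key_vals;
	}
	index->table->stats_mutex_unlock();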
@@ -2037,7 +2407,7 @@ struct index_stats_t
 	{
 		stats.reserve(n_uniq);
 		for (ulint i= 0; i < n_uniq; ++i)
-			stats.push_back(index_field_stats_t(0, 1, 0));
+			stats.push_back(index_field_stats_t{0, 1, 0});
 	}
 };
@@ -2123,15 +2493,12 @@ stat_n_leaf_pages. This function can be slow.
 @return index stats */
 static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 {
-	ulint		root_level;
-	ulint		level;
 	bool		level_is_analyzed;
 	ulint		n_uniq;
 	ulint		n_prefix;
 	ib_uint64_t	total_recs;
 	ib_uint64_t	total_pages;
 	mtr_t		mtr;
-	ulint		size;
 	index_stats_t	result(index->n_uniq);
 	DBUG_ENTER("dict_stats_analyze_index");
@@ -2150,30 +2517,43 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 	mtr.start();
 	mtr_s_lock_index(index, &mtr);
-	size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-	if (size != ULINT_UNDEFINED) {
-		result.index_size = size;
-		size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
-	}
-
-	/* Release the X locks on the root page taken by btr_get_size() */
-	mtr.commit();
-
-	switch (size) {
-	case ULINT_UNDEFINED:
-		dict_stats_assert_initialized_index(index);
-		DBUG_RETURN(result);
-	case 0:
-		/* The root node of the tree is a leaf */
-		size = 1;
-	}
-
-	result.n_leaf_pages = size;
+	uint16_t root_level;
+
+	{
+		buf_block_t* root;
+		root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+		if (!root) {
+empty_index:
+			mtr.commit();
+			dict_stats_assert_initialized_index(index);
+			DBUG_RETURN(result);
+		}
+
+		root_level = btr_page_get_level(root->frame);
+
+		mtr.x_lock_space(index->table->space);
+		ulint dummy, size;
+		result.index_size
+			= fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_LEAF
+						+ root->frame, &size, &mtr)
+			+ fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_TOP
+						+ root->frame, &dummy, &mtr);
+		result.n_leaf_pages = size ? size : 1;
+	}
+
+	const auto bulk_trx_id = index->table->bulk_trx_id;
+	if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+		result.index_size = 1;
+		result.n_leaf_pages = 1;
+		goto empty_index;
+	}
+
+	mtr.commit();
 
 	mtr.start();
 	mtr_sx_lock_index(index, &mtr);
-	root_level = btr_height_get(index, &mtr);
 
 	n_uniq = dict_index_get_n_unique(index);
@@ -2251,7 +2631,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 	So if we find that the first level containing D distinct
 	keys (on n_prefix columns) is L, we continue from L when
 	searching for D distinct keys on n_prefix-1 columns. */
-	level = root_level;
+	auto level = root_level;
 	level_is_analyzed = false;
 
 	for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
@@ -2265,7 +2645,10 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 		mtr.commit();
 		mtr.start();
 		mtr_sx_lock_index(index, &mtr);
-		if (root_level != btr_height_get(index, &mtr)) {
+		buf_block_t *root = btr_root_block_get(index, RW_S_LATCH,
+						       &mtr);
+		if (!root || root_level != btr_page_get_level(root->frame)
+		    || index->table->bulk_trx_id != bulk_trx_id) {
 			/* Just quit if the tree has changed beyond
 			recognition here. The old stats from previous
 			runs will remain in the values that we have
@@ -2277,6 +2660,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 			break;
 		}
 
+		mtr.memo_release(root, MTR_MEMO_PAGE_S_FIX);
+
 		/* check whether we should pick the current level;
 		we pick level 1 even if it does not have enough
 		distinct records because we do not want to scan the
......
@@ -196,18 +196,6 @@ btr_root_adjust_on_import(
 	const dict_index_t*	index)	/*!< in: index tree */
 	MY_ATTRIBUTE((warn_unused_result));
/**************************************************************//**
Gets the height of the B-tree (the level of the root, when the leaf
level is assumed to be 0). The caller must hold an S or X latch on
the index.
@return tree height (level of the root) */
ulint
btr_height_get(
/*===========*/
const dict_index_t* index, /*!< in: index tree */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((warn_unused_result));
 /** Get an index page and declare its latching order level.
 @param[in]	index	index tree
 @param[in]	page	page number
@@ -537,17 +525,6 @@ btr_discard_page(
 	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
 				the root page */
 	mtr_t*		mtr);	/*!< in: mtr */
/**************************************************************//**
Gets the number of pages in a B-tree.
@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
ulint
btr_get_size(
/*=========*/
const dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
MY_ATTRIBUTE((warn_unused_result));
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
@@ -614,7 +591,6 @@ btr_root_block_get(
 	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
 					or RW_X_LATCH */
 	mtr_t*			mtr);	/*!< in: mtr */
 /*************************************************************//**
 Reorganizes an index page.
......
@@ -575,37 +575,6 @@ btr_estimate_n_rows_in_range(
 	btr_pos_t*	range_start,
 	btr_pos_t*	range_end);
/** Statistics for one field of an index. */
struct index_field_stats_t
{
ib_uint64_t n_diff_key_vals;
ib_uint64_t n_sample_sizes;
ib_uint64_t n_non_null_key_vals;
index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
ib_uint64_t n_sample_sizes= 0,
ib_uint64_t n_non_null_key_vals= 0)
: n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
n_non_null_key_vals(n_non_null_key_vals)
{
}
};
/** Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
index->stat_n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and stored the estimates in
array index->stat_n_non_null_key_vals.
@param[in] index index
@return stat vector if the index is available and we get the estimated numbers,
empty vector if the index is unavailable. */
std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index);
 /** Gets the externally stored size of a record, in units of a database page.
 @param[in]	rec	record
 @param[in]	offsets	array returned by rec_get_offsets()
......