Commit ffead38d authored by Marko Mäkelä

Improve the bulk_trx_id checks

row_merge_read_clustered_index(): Check bulk_trx_id at the correct spot.

row_sel(), row_sel_try_search_shortcut(),
row_sel_try_search_shortcut_for_mysql(): Check bulk_trx_id.

ReadView::changes_visible(trx_id_t) const: New accessor for the case
where the trx_id_t is not read from a potentially corrupted index page
but directly from memory. In this case, we can skip a sanity check.
parent 1080fde1
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2018, 2020, MariaDB Corporation.
Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -140,6 +140,20 @@ class ReadViewBase
*/
static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);
/**
Determine whether this view sees the changes made by a transaction.
Unlike the overload that also takes a table name, this one needs only
the transaction id.
@param[in] id transaction id to check against the view
@return whether the view sees the modifications of id.
*/
bool changes_visible(trx_id_t id) const
MY_ATTRIBUTE((warn_unused_result))
{
  /* Nothing at or above m_low_limit_id is visible. */
  if (id >= m_low_limit_id)
    return false;
  /* Everything below m_up_limit_id is visible, and so is anything
  in between when m_ids holds no entries. */
  if (id < m_up_limit_id || m_ids.empty())
    return true;
  /* Otherwise id is visible exactly when it is absent from m_ids
  (std::binary_search presumes that m_ids is sorted). */
  return !std::binary_search(m_ids.begin(), m_ids.end(), id);
}
/**
Check whether the changes by id are visible.
......@@ -266,7 +280,8 @@ class ReadView: public ReadViewBase
*/
bool changes_visible(trx_id_t id, const table_name_t &name) const
{
  /* An id equal to m_creator_trx_id is always reported as visible;
  every other id is decided by the base class. */
  if (id == m_creator_trx_id)
    return true;
  return ReadViewBase::changes_visible(id, name);
}
/**
Check whether the changes by id are visible, without a table name.
@param[in] id transaction id to check against the view
@return true if id equals m_creator_trx_id; otherwise the result of
ReadViewBase::changes_visible(id)
*/
bool changes_visible(trx_id_t id) const
{
  if (id == m_creator_trx_id)
    return true;
  return ReadViewBase::changes_visible(id);
}
/**
A wrapper around ReadViewBase::append().
......
......@@ -1733,30 +1733,13 @@ row_merge_read_clustered_index(
DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
#endif
/* Check early (without accessing index pages) if the table is empty.
If we read bulk_trx_id as an older transaction ID,
it is not incorrect to check here whether that transaction should
be visible to us. If not, the table must have been empty.
We would only update bulk_trx_id in row_ins_clust_index_entry_low()
if the table really was empty (everything had been purged).
So, this shortcut is safe. */
if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
if (trx->read_view.is_open()
&& !trx->read_view.changes_visible(
bulk_trx_id, old_table->name)) {
trx->op_info="";
DBUG_RETURN(DB_SUCCESS);
}
}
/* Create and initialize memory for record buffers */
merge_buf = static_cast<row_merge_buf_t**>(
ut_malloc_nokey(n_index * sizeof *merge_buf));
row_merge_dup_t clust_dup = {index[0], table, col_map, 0};
dfield_t* prev_fields;
dfield_t* prev_fields = nullptr;
const ulint n_uniq = dict_index_get_n_unique(index[0]);
ut_ad(trx->mysql_thd != NULL);
......@@ -1767,10 +1750,6 @@ row_merge_read_clustered_index(
/* There is no previous tuple yet. */
prev_mtuple.fields = NULL;
/* Note: we must recheck old_table->bulk_trx_id after we have
acquired the page latch on the clustered index root page or
the leftmost leaf page. */
for (ulint i = 0; i < n_index; i++) {
if (index[i]->type & DICT_FTS) {
......@@ -1859,6 +1838,34 @@ row_merge_read_clustered_index(
btr_pcur_move_to_prev_on_page(&pcur);
}
uint64_t n_rows = 0;
/* Check if the table is supposed to be empty for our read view.
If we read bulk_trx_id as an older transaction ID, it is not
incorrect to check here whether that transaction should be
visible to us. If bulk_trx_id is not visible to us, the table
must have been empty at an earlier point in time, also in our
read view.
An INSERT would only update bulk_trx_id in
row_ins_clust_index_entry_low() if the table really was empty
(everything had been purged), when holding a leaf page latch
in the clustered index (actually, the root page is the only
leaf page in that case).
We are holding a clustered index leaf page latch here.
That will obviously prevent any concurrent INSERT from
updating bulk_trx_id while we read it. */
if (!online) {
} else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
ut_ad(trx->read_view.is_open());
ut_ad(bulk_trx_id != trx->id);
if (!trx->read_view.changes_visible(bulk_trx_id)) {
goto func_exit;
}
}
if (old_table != new_table) {
/* The table is being rebuilt. Identify the columns
that were flagged NOT NULL in the new table, so that
......@@ -1905,13 +1912,10 @@ row_merge_read_clustered_index(
prev_fields = static_cast<dfield_t*>(
ut_malloc_nokey(n_uniq * sizeof *prev_fields));
mtuple_heap = mem_heap_create(sizeof(mrec_buf_t));
} else {
prev_fields = NULL;
}
mach_write_to_8(new_sys_trx_start, trx->id);
mach_write_to_8(new_sys_trx_end, TRX_ID_MAX);
uint64_t n_rows = 0;
/* Scan the clustered index. */
for (;;) {
......@@ -2741,7 +2745,7 @@ row_merge_read_clustered_index(
UT_DELETE(clust_btr_bulk);
}
if (prev_fields != NULL) {
if (prev_fields) {
ut_free(prev_fields);
mem_heap_free(mtuple_heap);
}
......
......@@ -1450,7 +1450,9 @@ row_sel_try_search_shortcut(
{
dict_index_t* index = plan->index;
ut_ad(!index->table->is_temporary());
ut_ad(node->read_view);
ut_ad(node->read_view->is_open());
ut_ad(plan->unique_search);
ut_ad(!plan->must_get_clust);
......@@ -1474,6 +1476,13 @@ row_sel_try_search_shortcut(
return(SEL_EXHAUSTED);
}
if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
/* See row_search_mvcc() for a comment on bulk_trx_id */
if (!node->read_view->changes_visible(bulk_trx_id)) {
goto exhausted;
}
}
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
......@@ -1541,7 +1550,6 @@ row_sel(
rec_t* rec;
rec_t* old_vers;
rec_t* clust_rec;
ibool consistent_read;
/* The following flag becomes TRUE when we are doing a
consistent read from a non-clustered index and we must look
......@@ -1564,21 +1572,11 @@ row_sel(
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
const trx_t* trx = thr_get_trx(thr);
ut_ad(thr->run_node == node);
if (node->read_view) {
/* In consistent reads, we try to do with the hash index and
not to use the buffer page get. This is to reduce memory bus
load resulting from semaphore operations. The search latch
will be s-locked when we access an index with a unique search
condition, but not locked when we access an index with a
less selective search condition. */
consistent_read = TRUE;
} else {
consistent_read = FALSE;
}
ut_ad(!node->read_view || node->read_view == &trx->read_view);
ut_ad(!node->read_view || node->read_view->is_open());
table_loop:
/* TABLE LOOP
......@@ -1613,7 +1611,7 @@ row_sel(
mtr.start();
#ifdef BTR_CUR_HASH_ADAPT
if (consistent_read && plan->unique_search && !plan->pcur_is_open
if (node->read_view && plan->unique_search && !plan->pcur_is_open
&& !plan->must_get_clust) {
switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
case SEL_FOUND:
......@@ -1658,6 +1656,15 @@ row_sel(
}
}
if (!node->read_view
|| trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
/* See row_search_mvcc() for a comment on bulk_trx_id */
if (!trx->read_view.changes_visible(bulk_trx_id)) {
goto table_exhausted;
}
}
rec_loop:
/* RECORD LOOP
-----------
......@@ -1689,12 +1696,9 @@ row_sel(
and it might be that these new records should appear in the
search result set, resulting in the phantom problem. */
if (!consistent_read) {
if (!node->read_view) {
rec_t* next_rec = page_rec_get_next(rec);
unsigned lock_type;
trx_t* trx;
trx = thr_get_trx(thr);
offsets = rec_get_offsets(next_rec, index, offsets,
true,
......@@ -1752,16 +1756,13 @@ row_sel(
goto next_rec;
}
if (!consistent_read) {
if (!node->read_view) {
/* Try to place a lock on the index record */
unsigned lock_type;
trx_t* trx;
offsets = rec_get_offsets(rec, index, offsets, true,
ULINT_UNDEFINED, &heap);
trx = thr_get_trx(thr);
/* At READ UNCOMMITTED or READ COMMITTED isolation level,
we lock only the record, i.e., next-key locking is
not used. */
......@@ -1845,7 +1846,7 @@ row_sel(
offsets = rec_get_offsets(rec, index, offsets, true,
ULINT_UNDEFINED, &heap);
if (consistent_read) {
if (node->read_view) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
......@@ -1970,7 +1971,7 @@ row_sel(
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(consistent_read);
ut_ad(node->read_view);
goto next_rec;
}
......@@ -3847,8 +3848,10 @@ row_sel_try_search_shortcut_for_mysql(
trx_t* trx = prebuilt->trx;
const rec_t* rec;
ut_ad(dict_index_is_clust(index));
ut_ad(index->is_primary());
ut_ad(!index->table->is_temporary());
ut_ad(!prebuilt->templ_contains_blob);
ut_ad(trx->read_view.is_open());
srw_lock* ahi_latch = btr_search_sys.get_latch(*index);
ahi_latch->rd_lock(SRW_LOCK_CALL);
......@@ -3872,7 +3875,13 @@ row_sel_try_search_shortcut_for_mysql(
return(SEL_EXHAUSTED);
}
/* FIXME: check index->table->bulk_trx_id! */
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
/* See row_search_mvcc() for a comment on bulk_trx_id */
if (!trx->read_view.changes_visible(bulk_trx_id)) {
goto exhausted;
}
}
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
......@@ -4415,6 +4424,7 @@ row_search_mvcc(
&& unique_search
&& btr_search_enabled
&& dict_index_is_clust(index)
&& !index->table->is_temporary()
&& !prebuilt->templ_contains_blob
&& !prebuilt->used_in_HANDLER
&& (prebuilt->mysql_row_len < srv_page_size / 8)) {
......@@ -4711,35 +4721,50 @@ row_search_mvcc(
}
}
/* Check early (without accessing index pages) if the table is empty.
If we read bulk_trx_id as an older transaction ID,
it is not incorrect to check here whether that transaction should
be visible to us. If not, the table must have been empty.
We would only update bulk_trx_id in row_ins_clust_index_entry_low()
if the table really was empty (everything had been purged).
So, this shortcut is safe.
Note: because we are not holding the clustered index root page latch
here, and likely not holding a table lock either, this is a dirty
read. It is possible that the table has been emptied again and
bulk_trx_id is being updated concurrently by an active insert
transaction. But, that must be an even later transaction than the
one that we might have checked here. */
if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
if (trx->isolation_level != TRX_ISO_READ_UNCOMMITTED
&& trx->read_view.is_open()
&& !trx->read_view.changes_visible(
bulk_trx_id, index->table->name)) {
/* Check if the table is supposed to be empty for our read view.
If we read bulk_trx_id as an older transaction ID, it is not
incorrect to check here whether that transaction should be
visible to us. If bulk_trx_id is not visible to us, the table
must have been empty at an earlier point in time, also in our
read view.
An INSERT would only update bulk_trx_id in
row_ins_clust_index_entry_low() if the table really was empty
(everything had been purged), when holding a leaf page latch
in the clustered index (actually, the root page is the only
leaf page in that case).
We are already holding a leaf page latch here, either
in a secondary index or in a clustered index.
If we are holding a clustered index page latch, there clearly
is no potential for a race condition with a concurrent INSERT:
such an INSERT would be blocked by us.
If we are holding a secondary index page latch, then we are
not directly blocking a concurrent INSERT that might update
bulk_trx_id to something that does not exist in our read view.
But, in that case, the entire table (all indexes) must have
been empty. So, even if our read below missed the update of
index->table->bulk_trx_id, we can safely proceed to reading
the empty secondary index page. Our latch will prevent the
INSERT from proceeding to that page. It will first modify
the clustered index. Also, we may only look up something in
the clustered index if the secondary index page is not empty
to begin with. So, only if the table is corrupted
(the clustered index is empty but the secondary index is not)
we could return corrupted results. */
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
|| !trx->read_view.is_open()) {
} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
if (!trx->read_view.changes_visible(bulk_trx_id)) {
trx->op_info = "";
err = DB_END_OF_INDEX;
goto normal_return;
}
}
/* Note: we must recheck index->table->bulk_trx_id while
we are holding the clustered index root page latch. */
rec_loop:
DEBUG_SYNC_C("row_search_rec_loop");
if (trx_is_interrupted(trx)) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment