Improve the bulk_trx_id checks

row_merge_read_clustered_index(): Check bulk_trx_id at the correct spot. row_sel(), row_sel_try_search_shortcut(), row_sel_try_search_shortcut_for_mysql(): Check bulk_trx_id. ReadView::changes_visible(trx_id_t) const: New accessor for the case where the trx_id_t is not read from a potentially corrupted index page but directly from memory. In that case, the sanity check can be skipped.

Improve the bulk_trx_id checks
row_merge_read_clustered_index(): Check bulk_trx_id at the correct spot. row_sel(), row_sel_try_search_shortcut(), row_sel_try_search_shortcut_for_mysql(): Check bulk_trx_id. ReadView::changes_visible(trx_id_t) const: New accessor for the case where the trx_id_t is not read from a potentially corrupted index page but directly from memory. In that case, the sanity check can be skipped.
ffead38d · Marko Mäkelä · 1080fde1 · ffead38d · ffead38d · ffead38d
Commit ffead38d authored Jan 20, 2021 by Marko Mäkelä
3 changed files
--- a/storage/innobase/include/read0types.h
+++ b/storage/innobase/include/read0types.h
 /*****************************************************************************

 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -140,6 +140,20 @@ class ReadViewBase
  */
  static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);

+  /**
+    Check whether the changes by id are visible.
+
+    This overload takes no table name and performs no sanity check on id.
+    It is meant for a transaction id that was read directly from memory
+    rather than from a potentially corrupted index page.
+
+    The result is false when id >= m_low_limit_id. Otherwise it is true
+    when id < m_up_limit_id, when m_ids is empty, or when id is not found
+    in m_ids (std::binary_search is used, so m_ids is assumed to be sorted).
+    @param[in] id transaction id to check against the view
+    @return whether the view sees the modifications of id.
+  */
+  bool changes_visible(trx_id_t id) const
+  MY_ATTRIBUTE((warn_unused_result))
+  {
+    if (id >= m_low_limit_id)
+      return false;
+    return id < m_up_limit_id ||
+           m_ids.empty() ||
+           !std::binary_search(m_ids.begin(), m_ids.end(), id);
+  }

  /**
    Check whether the changes by id are visible.
@@ -266,7 +280,8 @@ class ReadView: public ReadViewBase
  */
  bool changes_visible(trx_id_t id, const table_name_t &name) const
  { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); }
-
+  bool changes_visible(trx_id_t id) const
+  { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); }

  /**
    A wrapper around ReadViewBase::append().

--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -1733,30 +1733,13 @@ row_merge_read_clustered_index(
 	DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
 #endif

-	/* Check early (without accessing index pages) if the table is empty.
-
-	If we read bulk_trx_id as an older transaction ID,
-	it is not incorrect to check here whether that transaction should
-	be visible to us. If not, the table must have been empty.
-	We would only update bulk_trx_id in row_ins_clust_index_entry_low()
-	if the table really was empty (everything had been purged).
-	So, this shortcut is safe. */
-	if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
-		if (trx->read_view.is_open()
-		    && !trx->read_view.changes_visible(
-				bulk_trx_id, old_table->name)) {
-			trx->op_info="";
-			DBUG_RETURN(DB_SUCCESS);
-		}
-	}
-
 	/* Create and initialize memory for record buffers */

 	merge_buf = static_cast<row_merge_buf_t**>(
 		ut_malloc_nokey(n_index * sizeof *merge_buf));

 	row_merge_dup_t	clust_dup = {index[0], table, col_map, 0};
-	dfield_t*	prev_fields;
+	dfield_t*	prev_fields = nullptr;
 	const ulint	n_uniq = dict_index_get_n_unique(index[0]);

 	ut_ad(trx->mysql_thd != NULL);
@@ -1767,10 +1750,6 @@ row_merge_read_clustered_index(
 	/* There is no previous tuple yet. */
 	prev_mtuple.fields = NULL;

-	/* Note: we must recheck old_table->bulk_trx_id after we have
-	acquired the page latch on the clustered index root page or
-	the leftmost leaf page. */
-
 	for (ulint i = 0; i < n_index; i++) {
 		if (index[i]->type & DICT_FTS) {

@@ -1859,6 +1838,34 @@ row_merge_read_clustered_index(
 		btr_pcur_move_to_prev_on_page(&pcur);
 	}

+	uint64_t n_rows = 0;
+
+	/* Check if the table is supposed to be empty for our read view.
+
+	If we read bulk_trx_id as an older transaction ID, it is not
+	incorrect to check here whether that transaction should be
+	visible to us. If bulk_trx_id is not visible to us, the table
+	must have been empty at an earlier point of time, also in our
+	read view.
+
+	An INSERT would only update bulk_trx_id in
+	row_ins_clust_index_entry_low() if the table really was empty
+	(everything had been purged), when holding a leaf page latch
+	in the clustered index (actually, the root page is the only
+	leaf page in that case).
+
+	We are holding a clustered index leaf page latch here.
+	That will obviously prevent any concurrent INSERT from
+	updating bulk_trx_id while we read it. */
+	if (!online) {
+	} else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
+		ut_ad(trx->read_view.is_open());
+		ut_ad(bulk_trx_id != trx->id);
+		if (!trx->read_view.changes_visible(bulk_trx_id)) {
+			goto func_exit;
+		}
+	}
+
 	if (old_table != new_table) {
 		/* The table is being rebuilt.  Identify the columns
 		that were flagged NOT NULL in the new table, so that
@@ -1905,13 +1912,10 @@ row_merge_read_clustered_index(
 		prev_fields = static_cast<dfield_t*>(
 			ut_malloc_nokey(n_uniq * sizeof *prev_fields));
 		mtuple_heap = mem_heap_create(sizeof(mrec_buf_t));
-	} else {
-		prev_fields = NULL;
 	}

 	mach_write_to_8(new_sys_trx_start, trx->id);
 	mach_write_to_8(new_sys_trx_end, TRX_ID_MAX);
-	uint64_t	n_rows = 0;

 	/* Scan the clustered index. */
 	for (;;) {
@@ -2741,7 +2745,7 @@ row_merge_read_clustered_index(
 		UT_DELETE(clust_btr_bulk);
 	}

-	if (prev_fields != NULL) {
+	if (prev_fields) {
 		ut_free(prev_fields);
 		mem_heap_free(mtuple_heap);
 	}

--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -1450,7 +1450,9 @@ row_sel_try_search_shortcut(
 {
 	dict_index_t*	index = plan->index;

+	ut_ad(!index->table->is_temporary());
 	ut_ad(node->read_view);
+	ut_ad(node->read_view->is_open());
 	ut_ad(plan->unique_search);
 	ut_ad(!plan->must_get_clust);

@@ -1474,6 +1476,13 @@ row_sel_try_search_shortcut(
 		return(SEL_EXHAUSTED);
 	}

+	if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+		/* See row_search_mvcc() for a comment on bulk_trx_id */
+		if (!node->read_view->changes_visible(bulk_trx_id)) {
+			goto exhausted;
+		}
+	}
+
 	/* This is a non-locking consistent read: if necessary, fetch
 	a previous version of the record */

@@ -1541,7 +1550,6 @@ row_sel(
 	rec_t*		rec;
 	rec_t*		old_vers;
 	rec_t*		clust_rec;
-	ibool		consistent_read;

 	/* The following flag becomes TRUE when we are doing a
 	consistent read from a non-clustered index and we must look
@@ -1564,21 +1572,11 @@ row_sel(
 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
 	rec_offs*	offsets				= offsets_;
 	rec_offs_init(offsets_);
+	const trx_t*	trx = thr_get_trx(thr);

 	ut_ad(thr->run_node == node);
-
-	if (node->read_view) {
-		/* In consistent reads, we try to do with the hash index and
-		not to use the buffer page get. This is to reduce memory bus
-		load resulting from semaphore operations. The search latch
-		will be s-locked when we access an index with a unique search
-		condition, but not locked when we access an index with a
-		less selective search condition. */
-
-		consistent_read = TRUE;
-	} else {
-		consistent_read = FALSE;
-	}
+	ut_ad(!node->read_view || node->read_view == &trx->read_view);
+	ut_ad(!node->read_view || node->read_view->is_open());

 table_loop:
 	/* TABLE LOOP
@@ -1613,7 +1611,7 @@ row_sel(
 	mtr.start();

 #ifdef BTR_CUR_HASH_ADAPT
-	if (consistent_read && plan->unique_search && !plan->pcur_is_open
+	if (node->read_view && plan->unique_search && !plan->pcur_is_open
 	    && !plan->must_get_clust) {
 		switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
 		case SEL_FOUND:
@@ -1658,6 +1656,15 @@ row_sel(
 		}
 	}

+	if (!node->read_view
+	    || trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+	} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+		/* See row_search_mvcc() for a comment on bulk_trx_id */
+		if (!trx->read_view.changes_visible(bulk_trx_id)) {
+			goto table_exhausted;
+		}
+	}
+
 rec_loop:
 	/* RECORD LOOP
 	-----------
@@ -1689,12 +1696,9 @@ row_sel(
 		and it might be that these new records should appear in the
 		search result set, resulting in the phantom problem. */

-		if (!consistent_read) {
+		if (!node->read_view) {
 			rec_t*	next_rec = page_rec_get_next(rec);
 			unsigned lock_type;
-			trx_t*	trx;
-
-			trx = thr_get_trx(thr);

 			offsets = rec_get_offsets(next_rec, index, offsets,
 						  true,
@@ -1752,16 +1756,13 @@ row_sel(
 		goto next_rec;
 	}

-	if (!consistent_read) {
+	if (!node->read_view) {
 		/* Try to place a lock on the index record */
 		unsigned lock_type;
-		trx_t*	trx;

 		offsets = rec_get_offsets(rec, index, offsets, true,
 					  ULINT_UNDEFINED, &heap);

-		trx = thr_get_trx(thr);
-
 		/* At READ UNCOMMITTED or READ COMMITTED isolation level,
 		we lock only the record, i.e., next-key locking is
 		not used. */
@@ -1845,7 +1846,7 @@ row_sel(
 	offsets = rec_get_offsets(rec, index, offsets, true,
 				  ULINT_UNDEFINED, &heap);

-	if (consistent_read) {
+	if (node->read_view) {
 		/* This is a non-locking consistent read: if necessary, fetch
 		a previous version of the record */

@@ -1970,7 +1971,7 @@ row_sel(

 		if (clust_rec == NULL) {
 			/* The record did not exist in the read view */
-			ut_ad(consistent_read);
+			ut_ad(node->read_view);

 			goto next_rec;
 		}
@@ -3847,8 +3848,10 @@ row_sel_try_search_shortcut_for_mysql(
 	trx_t*		trx		= prebuilt->trx;
 	const rec_t*	rec;

-	ut_ad(dict_index_is_clust(index));
+	ut_ad(index->is_primary());
+	ut_ad(!index->table->is_temporary());
 	ut_ad(!prebuilt->templ_contains_blob);
+	ut_ad(trx->read_view.is_open());

 	srw_lock* ahi_latch = btr_search_sys.get_latch(*index);
 	ahi_latch->rd_lock(SRW_LOCK_CALL);
@@ -3872,7 +3875,13 @@ row_sel_try_search_shortcut_for_mysql(
 		return(SEL_EXHAUSTED);
 	}

-	/* FIXME: check index->table->bulk_trx_id! */
+	if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+	} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+		/* See row_search_mvcc() for a comment on bulk_trx_id */
+		if (!trx->read_view.changes_visible(bulk_trx_id)) {
+			goto exhausted;
+		}
+	}

 	/* This is a non-locking consistent read: if necessary, fetch
 	a previous version of the record */
@@ -4415,6 +4424,7 @@ row_search_mvcc(
 	    && unique_search
 	    && btr_search_enabled
 	    && dict_index_is_clust(index)
+	    && !index->table->is_temporary()
 	    && !prebuilt->templ_contains_blob
 	    && !prebuilt->used_in_HANDLER
 	    && (prebuilt->mysql_row_len < srv_page_size / 8)) {
@@ -4711,35 +4721,50 @@ row_search_mvcc(
 		}
 	}

-	/* Check early (without accessing index pages) if the table is empty.
-
-	If we read bulk_trx_id as an older transaction ID,
-	it is not incorrect to check here whether that transaction should
-	be visible to us. If not, the table must have been empty.
-	We would only update bulk_trx_id in row_ins_clust_index_entry_low()
-	if the table really was empty (everything had been purged).
-	So, this shortcut is safe.
-
-	Note: because we are not holding the clustered index root page latch
-	here, and likely not holding a table lock either, this is a dirty
-	read. It is possible that the table has been emptied again and
-	bulk_trx_id is being updated concurrently by an active insert
-	transaction. But, that must be an even later transaction than the
-	one that we might have checked here. */
-	if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
-		if (trx->isolation_level != TRX_ISO_READ_UNCOMMITTED
-		    && trx->read_view.is_open()
-		    && !trx->read_view.changes_visible(
-				bulk_trx_id, index->table->name)) {
+	/* Check if the table is supposed to be empty for our read view.
+
+	If we read bulk_trx_id as an older transaction ID, it is not
+	incorrect to check here whether that transaction should be
+	visible to us. If bulk_trx_id is not visible to us, the table
+	must have been empty at an earlier point of time, also in our
+	read view.
+
+	An INSERT would only update bulk_trx_id in
+	row_ins_clust_index_entry_low() if the table really was empty
+	(everything had been purged), when holding a leaf page latch
+	in the clustered index (actually, the root page is the only
+	leaf page in that case).
+
+	We are already holding a leaf page latch here, either
+	in a secondary index or in a clustered index.
+
+	If we are holding a clustered index page latch, there clearly
+	is no potential for race condition with a concurrent INSERT:
+	such INSERT would be blocked by us.
+
+	If we are holding a secondary index page latch, then we are
+	not directly blocking a concurrent INSERT that might update
+	bulk_trx_id to something that does not exist in our read view.
+	But, in that case, the entire table (all indexes) must have
+	been empty. So, even if our read below missed the update of
+	index->table->bulk_trx_id, we can safely proceed to reading
+	the empty secondary index page. Our latch will prevent the
+	INSERT from proceeding to that page. It will first modify
+	the clustered index. Also, we may only look up something in
+	the clustered index if the secondary index page is not empty
+	to begin with. So, only if the table is corrupted
+	(the clustered index is empty but the secondary index is not)
+	we could return corrupted results. */
+	if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+	    || !trx->read_view.is_open()) {
+	} else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+		if (!trx->read_view.changes_visible(bulk_trx_id)) {
 			trx->op_info = "";
 			err = DB_END_OF_INDEX;
 			goto normal_return;
 		}
 	}

-	/* Note: we must recheck index->table->bulk_trx_id while
-	we are holding the clustered index root page latch. */
-
 rec_loop:
 	DEBUG_SYNC_C("row_search_rec_loop");
 	if (trx_is_interrupted(trx)) {