MDEV-29250 InnoDB: Failing assertion: table->get_ref_count() == 0

Reason:
======
This issue is caused by a race condition between fulltext DDL and
the purge thread. DDL sets a signal that stops the purge thread from
processing new undo log records, and then waits for the FTS table
undo log records that are already being processed to finish.

But in dict_acquire_mdl_shared(), InnoDB releases all InnoDB
table-related locks before acquiring the MDL. At the same time,
DDL assumes that no purge thread is working on the FTS table.

There is a possibility that the purge thread can skip processing
valid undo log records if it checks purge_sys.must_wait_FTS() twice
in different places.

Solution:
==========

Add the purge_sys.must_wait_FTS() check to dict_acquire_mdl_shared()
to prevent the purge thread from processing undo log records.

dict_table_open_on_id(): Return -1 (cast to dict_table_t*) if the
purge thread has to wait.

dict_acquire_mdl_shared(): Added a new template parameter, purge_thd,
to indicate that the purge thread is invoking the function; return -1
(cast to dict_table_t*) if the purge thread has to wait.
parent 75c416d3
......@@ -662,7 +662,7 @@ dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval nullptr if the table is not readable, or if trylock && MDL blocked */
template<bool trylock>
template<bool trylock, bool purge_thd>
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
THD *thd,
......@@ -674,9 +674,11 @@ dict_acquire_mdl_shared(dict_table_t *table,
MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
size_t db_len;
dict_table_t *not_found= nullptr;
if (trylock)
{
static_assert(!trylock || !purge_thd, "usage");
dict_sys.freeze(SRW_LOCK_CALL);
db_len= dict_get_db_name_len(table->name.m_name);
dict_sys.unfreeze();
......@@ -748,7 +750,13 @@ dict_acquire_mdl_shared(dict_table_t *table,
}
}
retry_table_open:
dict_sys.freeze(SRW_LOCK_CALL);
if (purge_thd && purge_sys.must_wait_FTS())
{
not_found= reinterpret_cast<dict_table_t*>(-1);
goto return_without_mdl;
}
table= dict_sys.find_table(table_id);
if (table)
table->acquire();
......@@ -756,6 +764,11 @@ dict_acquire_mdl_shared(dict_table_t *table,
{
dict_sys.unfreeze();
dict_sys.lock(SRW_LOCK_CALL);
if (purge_thd && purge_sys.must_wait_FTS())
{
dict_sys.unlock();
goto retry_table_open;
}
table= dict_load_table_on_id(table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
......@@ -777,7 +790,7 @@ dict_acquire_mdl_shared(dict_table_t *table,
mdl_context->release_lock(*mdl);
*mdl= nullptr;
}
return nullptr;
return not_found;
}
size_t db1_len, tbl1_len;
......@@ -814,9 +827,9 @@ dict_acquire_mdl_shared(dict_table_t *table,
goto retry;
}
template dict_table_t* dict_acquire_mdl_shared<false>
template dict_table_t* dict_acquire_mdl_shared<false, false>
(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
template dict_table_t* dict_acquire_mdl_shared<true>
template dict_table_t* dict_acquire_mdl_shared<true, false>
(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
/** Look up a table by numeric identifier.
......@@ -842,13 +855,14 @@ dict_table_open_on_id(table_id_t table_id, bool dict_locked,
{
if (purge_thd && purge_sys.must_wait_FTS())
{
table= nullptr;
table= reinterpret_cast<dict_table_t*>(-1);
goto func_exit;
}
table->acquire();
if (thd && !dict_locked)
table= dict_acquire_mdl_shared<false>(table, thd, mdl, table_op);
table= dict_acquire_mdl_shared<false, purge_thd>(
table, thd, mdl, table_op);
}
else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
{
......@@ -866,7 +880,7 @@ dict_table_open_on_id(table_id_t table_id, bool dict_locked,
if (purge_thd && purge_sys.must_wait_FTS())
{
dict_sys.unlock();
return nullptr;
return reinterpret_cast<dict_table_t*>(-1);
}
table->acquire();
}
......@@ -876,7 +890,8 @@ dict_table_open_on_id(table_id_t table_id, bool dict_locked,
if (table && thd)
{
dict_sys.freeze(SRW_LOCK_CALL);
table= dict_acquire_mdl_shared<false>(table, thd, mdl, table_op);
table= dict_acquire_mdl_shared<false, purge_thd>(
table, thd, mdl, table_op);
dict_sys.unfreeze();
}
return table;
......
......@@ -132,7 +132,7 @@ enum dict_table_op_t {
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval NULL if the table is not readable, or if trylock && MDL blocked */
template<bool trylock>
template<bool trylock, bool purge_thd= false>
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
THD *thd,
......
......@@ -941,7 +941,8 @@ row_purge_parse_undo_rec(
table_id, false, DICT_TABLE_OP_NORMAL, node->purge_thd,
&node->mdl_ticket);
if (!node->table && purge_sys.must_wait_FTS()) {
if (node->table == reinterpret_cast<dict_table_t*>(-1)) {
/* purge stop signal */
goto try_again;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment