Commit 5918b170 authored by seppo's avatar seppo Committed by GitHub

MDEV-21473 conflicts with async slave BF aborting (#1475)

If the async slave thread (slave SQL handler) becomes a BF victim, the rollbacker thread may occasionally be used to carry out the rollback instead of the async slave thread itself.
This can happen if the async slave thread has flagged the "idle" state at the moment the BF thread tries to figure out how to kill the victim.
The issue could be reproduced by using a Galera cluster as a slave for an external master and issuing a high load of conflicting writes both through async replication and directly against the Galera cluster nodes.
However, a deterministic mtr test for the "conflict window" has not yet been written.

The fix in this patch makes sure that the async slave thread's query state is never set to IDLE. This prevents the rollbacker thread from intervening.
The wsrep_query_state change was refactored to go through a dedicated function, so that the idle state change is controlled in one place.
parent a7cbce06
......@@ -5050,6 +5050,7 @@ pthread_handler_t handle_slave_sql(void *arg)
#ifdef WITH_WSREP
thd->wsrep_exec_mode= LOCAL_STATE;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
/* synchronize with wsrep replication */
if (WSREP_ON)
wsrep_ready_wait();
......
......@@ -1207,7 +1207,7 @@ void THD::init(void)
#ifdef WITH_WSREP
wsrep_exec_mode= wsrep_applier ? REPL_RECV : LOCAL_STATE;
wsrep_conflict_state= NO_CONFLICT;
wsrep_query_state= QUERY_IDLE;
wsrep_thd_set_query_state(this, QUERY_IDLE);
wsrep_last_query_id= 0;
wsrep_trx_meta.gtid= WSREP_GTID_UNDEFINED;
wsrep_trx_meta.depends_on= WSREP_SEQNO_UNDEFINED;
......
......@@ -1342,7 +1342,7 @@ void do_handle_one_connection(CONNECT *connect)
if (WSREP(thd))
{
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->wsrep_query_state= QUERY_EXITING;
wsrep_thd_set_query_state(thd, QUERY_EXITING);
mysql_mutex_unlock(&thd->LOCK_thd_data);
}
#endif
......
......@@ -1210,7 +1210,7 @@ bool do_command(THD *thd)
if (WSREP(thd))
{
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->wsrep_query_state= QUERY_IDLE;
wsrep_thd_set_query_state(thd, QUERY_IDLE);
if (thd->wsrep_conflict_state==MUST_ABORT)
{
wsrep_client_rollback(thd);
......@@ -1278,7 +1278,7 @@ bool do_command(THD *thd)
thd->store_globals();
}
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
mysql_mutex_unlock(&thd->LOCK_thd_data);
}
#endif /* WITH_WSREP */
......@@ -1575,7 +1575,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
}
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
if (thd->wsrep_conflict_state== RETRY_AUTOCOMMIT)
{
thd->wsrep_conflict_state= NO_CONFLICT;
......
......@@ -98,7 +98,7 @@ static wsrep_cb_status_t wsrep_apply_events(THD* thd,
}
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
if (thd->wsrep_conflict_state!= REPLAYING)
thd->wsrep_conflict_state= NO_CONFLICT;
mysql_mutex_unlock(&thd->LOCK_thd_data);
......@@ -197,7 +197,7 @@ static wsrep_cb_status_t wsrep_apply_events(THD* thd,
error:
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->wsrep_query_state= QUERY_IDLE;
wsrep_thd_set_query_state(thd, QUERY_IDLE);
mysql_mutex_unlock(&thd->LOCK_thd_data);
assert(thd->wsrep_exec_mode== REPL_RECV);
......
......@@ -439,7 +439,7 @@ wsrep_run_wsrep_commit(THD *thd, bool all)
DBUG_RETURN(WSREP_TRX_CERT_FAIL);
}
thd->wsrep_query_state = QUERY_COMMITTING;
wsrep_thd_set_query_state(thd, QUERY_COMMITTING);
mysql_mutex_unlock(&thd->LOCK_thd_data);
cache = get_trans_log(thd);
......@@ -473,7 +473,7 @@ wsrep_run_wsrep_commit(THD *thd, bool all)
{
WSREP_DEBUG("empty rbr buffer, query: %s", thd->query());
}
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
DBUG_RETURN(WSREP_TRX_OK);
}
......@@ -581,7 +581,7 @@ wsrep_run_wsrep_commit(THD *thd, bool all)
WSREP_DEBUG("commit failed for reason: %d", rcode);
DBUG_PRINT("wsrep", ("replicating commit fail"));
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
if (thd->wsrep_conflict_state == MUST_ABORT) {
thd->wsrep_conflict_state= ABORTED;
......@@ -613,7 +613,7 @@ wsrep_run_wsrep_commit(THD *thd, bool all)
DBUG_RETURN(WSREP_TRX_ERROR);
}
thd->wsrep_query_state= QUERY_EXEC;
wsrep_thd_set_query_state(thd, QUERY_EXEC);
mysql_mutex_unlock(&thd->LOCK_thd_data);
DBUG_RETURN(WSREP_TRX_OK);
......
......@@ -2586,6 +2586,17 @@ extern "C" void wsrep_thd_set_exec_mode(THD *thd, enum wsrep_exec_mode mode)
/**
  Update the wsrep query state stored on a THD.

  All query-state transitions are funnelled through this function so the
  special case below is enforced in a single place.

  A request to put an async slave thread (thd->slave_thread) into
  QUERY_IDLE is ignored: the stored state is left as it was and a debug
  message is logged. Rationale, as given with the original change: a slave
  SQL thread flagged IDLE could give the rollbacker thread a chance to roll
  back the slave's transaction, whereas the slave thread applies complete
  transactions from the relay log and carries out BF abort itself.

  @param thd    thread whose wsrep_query_state is to be changed
  @param state  requested new query state
*/
extern "C" void wsrep_thd_set_query_state(
  THD *thd, enum wsrep_query_state state)
{
  /* Every request except "slave thread -> IDLE" is applied as-is. */
  if (state != QUERY_IDLE || !thd->slave_thread)
  {
    thd->wsrep_query_state= state;
    return;
  }

  /* Slave SQL thread asked to go IDLE: keep the current state. */
  WSREP_DEBUG("Skipping IDLE state change for slave SQL");
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment