MDEV-27701 Race on trx->lock.wait_lock between lock_rec_move() and lock_sys_t::cancel()

The initial issue was in assertion failure, which checked the equality of lock to cancel with trx->lock.wait_lock in lock_sys_t::cancel(). If we analyze lock_sys_t::cancel() code from the perspective of trx->lock.wait_lock racing, we won't find the error there, except the cases when we need to reload it after the corresponding latches acquiring. So the fix is just to remove the assertion and reload trx->lock.wait_lock after acquiring necessary latches. Reviewed by: Marko Mäkelä <marko.makela@mariadb.com>

MDEV-27701 Race on trx->lock.wait_lock between lock_rec_move() and lock_sys_t::cancel()
The initial issue was in assertion failure, which checked the equality of lock to cancel with trx->lock.wait_lock in lock_sys_t::cancel(). If we analyze lock_sys_t::cancel() code from the perspective of trx->lock.wait_lock racing, we won't find the error there, except the cases when we need to reload it after the corresponding latches acquiring. So the fix is just to remove the assertion and reload trx->lock.wait_lock after acquiring necessary latches. Reviewed by: Marko Mäkelä <marko.makela@mariadb.com>
a474e327 · Vlad Lesin · 67a6ad0a · a474e327 · a474e327 · a474e327
Commit a474e327 authored Feb 09, 2023 by Vlad Lesin
4 changed files
--- a/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result
+++ b/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result
+CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB;
+INSERT INTO t VALUES (10, "0123456789");
+connection default;
+BEGIN;
+SELECT * FROM t WHERE c = 10 FOR UPDATE;
+pk	c
+connect trx2, localhost,root,,;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting";
+SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd";
+SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont";
+UPDATE t SET c = NULL WHERE pk = 10;
+connect trx3, localhost,root,,;
+SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting";
+SET innodb_lock_wait_timeout=1;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting";
+SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting";
+UPDATE t SET c = "abcdefghij" WHERE pk = 10;
+connection default;
+SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting";
+COMMIT;
+SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting";
+SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter";
+SET DEBUG_SYNC="now SIGNAL trx2_cont_upd";
+SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting";
+SET DEBUG_SYNC="now SIGNAL trx2_cont";
+disconnect trx2;
+disconnect trx3;
+connection default;
+SET DEBUG_SYNC="RESET";
+DROP TABLE t;
--- a/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test
+++ b/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test
+--source include/have_innodb.inc
+--source include/count_sessions.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB;
+INSERT INTO t VALUES (10, "0123456789");
+
+--connection default
+BEGIN;
+SELECT * FROM t WHERE c = 10 FOR UPDATE;
+
+--connect(trx2, localhost,root,,)
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting";
+SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd";
+SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont";
+#################
+# We need to update clustered record without changing ordering fields and
+# changing the size of non-ordering fields to cause locks moving from deleted
+# record to infimum.
+###
+--send UPDATE t SET c = NULL WHERE pk = 10
+
+
+--connect(trx3, localhost,root,,)
+SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting";
+#################
+# The condition wariable waiting in lock_wait() must be finished by timeout
+###
+SET innodb_lock_wait_timeout=1;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting";
+SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting";
+--send UPDATE t SET c = "abcdefghij" WHERE pk = 10
+
+--connection default
+SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting";
+COMMIT;
+SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting";
+SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter";
+SET DEBUG_SYNC="now SIGNAL trx2_cont_upd";
+SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks";
+#################
+# If the bug is not fixed, there will be assertion failure here, because trx2
+# moved trx3 lock from deleted record to infimum when trx3 tried to cancel the
+# lock.
+###
+SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting";
+SET DEBUG_SYNC="now SIGNAL trx2_cont";
+
+--disconnect trx2
+--disconnect trx3
+--connection default
+SET DEBUG_SYNC="RESET";
+DROP TABLE t;
+--source include/wait_until_count_sessions.inc
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -891,8 +891,8 @@ class lock_sys_t

  /** Cancel a waiting lock request.
  @tparam check_victim  whether to check for DB_DEADLOCK
-  @param lock           waiting lock request
  @param trx            active transaction
+  @param lock           waiting lock request
  @retval DB_SUCCESS    if no lock existed
  @retval DB_DEADLOCK   if trx->lock.was_chosen_as_deadlock_victim was set
  @retval DB_LOCK_WAIT  if the lock was canceled */

--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -46,12 +46,12 @@ Created 5/7/1996 Heikki Tuuri
 #include "srv0mon.h"
 #include "que0que.h"
 #include "scope.h"
+#include <debug_sync.h>

 #include <set>

 #ifdef WITH_WSREP
 #include <mysql/service_wsrep.h>
-#include <debug_sync.h>
 #endif /* WITH_WSREP */

 /** The value of innodb_deadlock_detect */
@@ -1882,6 +1882,7 @@ dberr_t lock_wait(que_thr_t *thr)
  if (row_lock_wait)
    lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse());

+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
  if (lock_t *lock= trx->lock.wait_lock)
  {
    lock_sys_t::cancel<false>(trx, lock);
@@ -1905,6 +1906,12 @@ void lock_wait_end(trx_t *trx)
  ut_d(const auto state= trx->state);
  ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE ||
        state == TRX_STATE_PREPARED);
+  /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before
+  requesting lock_sys.wait_mutex, and if the flag is set, it returns error,
+  what causes transaction rollback, which can reset trx->lock.wait_thr before
+  deadlock resolution starts cancelling victim's waiting lock. That's why we
+  don't check trx->lock.wait_thr here if the function was called from deadlock
+  resolution function. */
  ut_ad(from_deadlock || trx->lock.wait_thr);

  if (trx->lock.was_chosen_as_deadlock_victim)
@@ -3193,6 +3200,8 @@ lock_rec_store_on_page_infimum(

  ut_ad(block->page.frame == page_align(rec));
  const page_id_t id{block->page.id()};
+  ut_d(SCOPE_EXIT(
+      []() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }));

  LockGuard g{lock_sys.rec_hash, id};
  lock_rec_move(g.cell(), *block, id, g.cell(), id,
@@ -5770,17 +5779,30 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)

 /** Cancel a waiting lock request.
 @tparam check_victim  whether to check for DB_DEADLOCK
-@param lock           waiting lock request
 @param trx            active transaction
+@param lock           waiting lock request
 @retval DB_SUCCESS    if no lock existed
 @retval DB_DEADLOCK   if trx->lock.was_chosen_as_deadlock_victim was set
 @retval DB_LOCK_WAIT  if the lock was canceled */
 template<bool check_victim>
 dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
 {
+  DEBUG_SYNC_C("lock_sys_t_cancel_enter");
  mysql_mutex_assert_owner(&lock_sys.wait_mutex);
-  ut_ad(trx->lock.wait_lock == lock);
  ut_ad(trx->state == TRX_STATE_ACTIVE);
+  /* trx->lock.wait_lock may be changed by other threads as long as
+  we are not holding lock_sys.latch.
+
+  So, trx->lock.wait_lock==lock does not necessarily hold, but both
+  pointers should be valid, because other threads cannot assign
+  trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+  holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+  (record or table lock) cannot be changed by other threads. So, it is
+  safe to call lock->is_table() while not holding lock_sys.latch. If
+  we have to release and reacquire lock_sys.wait_mutex, we must reread
+  trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+  lock_sys.latch acquiring, as it can be changed to not-null in lock moving
+  functions even if we hold lock_sys.wait_mutex. */
  dberr_t err= DB_SUCCESS;
  /* This would be too large for a memory transaction, except in the
  DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
@@ -5802,6 +5824,15 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
    }
    else
    {
+      /* This function is invoked from the thread which executes the
+      transaction. Table locks are requested before record locks. Some other
+      transaction can't change trx->lock.wait_lock from table to record for the
+      current transaction at this point, because the current transaction has not
+      requested record locks yet. There is no need to move any table locks by
+      other threads. And trx->lock.wait_lock can't be set to null while we are
+      holding lock_sys.wait_mutex. That's why there is no need to reload
+      trx->lock.wait_lock here. */
+      ut_ad(lock == trx->lock.wait_lock);
 resolve_table_lock:
      dict_table_t *table= lock->un_member.tab_lock.table;
      if (!table->lock_mutex_trylock())
@@ -5812,6 +5843,7 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
        mysql_mutex_unlock(&lock_sys.wait_mutex);
        table->lock_mutex_lock();
        mysql_mutex_lock(&lock_sys.wait_mutex);
+        /* Cache trx->lock.wait_lock under the corresponding latches. */
        lock= trx->lock.wait_lock;
        if (!lock)
          goto retreat;
@@ -5821,6 +5853,10 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
          goto retreat;
        }
      }
+      else
+        /* Cache trx->lock.wait_lock under the corresponding latches if
+        it was not cached yet */
+        lock= trx->lock.wait_lock;
      if (lock->is_waiting())
        lock_cancel_waiting_and_release(lock);
      /* Even if lock->is_waiting() did not hold above, we must return
@@ -5844,6 +5880,7 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
      mysql_mutex_unlock(&lock_sys.wait_mutex);
      lock_sys.wr_lock(SRW_LOCK_CALL);
      mysql_mutex_lock(&lock_sys.wait_mutex);
+      /* Cache trx->lock.wait_lock under the corresponding latches. */
      lock= trx->lock.wait_lock;
      /* Even if waiting lock was cancelled while lock_sys.wait_mutex was
      unlocked, we need to return deadlock error if transaction was chosen
@@ -5855,6 +5892,9 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
    }
    else
    {
+      /* Cache trx->lock.wait_lock under the corresponding latches if
+      it was not cached yet */
+      lock= trx->lock.wait_lock;
 resolve_record_lock:
      if (lock->is_waiting())
        lock_cancel_waiting_and_release(lock);
@@ -5876,6 +5916,7 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
 void lock_sys_t::cancel(trx_t *trx)
 {
  mysql_mutex_lock(&lock_sys.wait_mutex);
+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
  if (lock_t *lock= trx->lock.wait_lock)
  {
    /* Dictionary transactions must be immune to KILL, because they
@@ -5943,6 +5984,7 @@ dberr_t lock_trx_handle_wait(trx_t *trx)
  mysql_mutex_lock(&lock_sys.wait_mutex);
  if (trx->lock.was_chosen_as_deadlock_victim)
    err= DB_DEADLOCK;
+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
  else if (lock_t *wait_lock= trx->lock.wait_lock)
    err= lock_sys_t::cancel<true>(trx, wait_lock);
  lock_sys.deadlock_check();