MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock

When lock is checked for conflict, ignore other locks on the record if they wait for the requesting transaction. lock_rec_has_to_wait_in_queue() iterates not all locks for the page, but only the locks located before the waiting lock in the queue. So there is some invariant - any lock in the queue can wait only lock which is located before the waiting lock in the queue. In the case when conflicting lock waits for the transaction of requesting lock, we need to place the requesting lock before the waiting lock in the queue to preserve the invariant. That is why we are looking for the first waiting for requesting transation lock and place the new lock just after the last granted requesting transaction lock before the first waiting for requesting transaction lock. Example: trx1 waiting lock, trx1 granted lock, ..., trx2 lock - waiting for trx1 place new lock here -----------------^ There are also implicit locks which are lazily converted to explicit ones, and we need to place the newly created explicit lock to the correct place in a queue. All explicit locks converted from implicit ones are placed just after the last non-waiting lock of the same transaction before the first waiting for the transaction lock. Code review and cleanup was made by Marko Mäkelä.

MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock
When lock is checked for conflict, ignore other locks on the record if they wait for the requesting transaction. lock_rec_has_to_wait_in_queue() iterates not all locks for the page, but only the locks located before the waiting lock in the queue. So there is some invariant - any lock in the queue can wait only lock which is located before the waiting lock in the queue. In the case when conflicting lock waits for the transaction of requesting lock, we need to place the requesting lock before the waiting lock in the queue to preserve the invariant. That is why we are looking for the first waiting for requesting transation lock and place the new lock just after the last granted requesting transaction lock before the first waiting for requesting transaction lock. Example: trx1 waiting lock, trx1 granted lock, ..., trx2 lock - waiting for trx1 place new lock here -----------------^ There are also implicit locks which are lazily converted to explicit ones, and we need to place the newly created explicit lock to the correct place in a queue. All explicit locks converted from implicit ones are placed just after the last non-waiting lock of the same transaction before the first waiting for the transaction lock. Code review and cleanup was made by Marko Mäkelä.
bd03c0e5 · Vlad Lesin · 1abc476f · bd03c0e5 · bd03c0e5 · bd03c0e5
Commit bd03c0e5 authored Dec 02, 2021 by Vlad Lesin
7 changed files
--- a/mysql-test/suite/innodb/r/lock_wait_conflict.result
+++ b/mysql-test/suite/innodb/r/lock_wait_conflict.result
+#
+# MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock
+#
+CREATE TABLE t (a INT PRIMARY KEY, b INT NOT NULL UNIQUE) ENGINE=InnoDB;
+connect prevent_purge,localhost,root,,;
+start transaction with consistent snapshot;
+connection default;
+INSERT INTO t VALUES (20,20);
+DELETE FROM t WHERE b = 20;
+connect con_ins,localhost,root,,;
+SET DEBUG_SYNC = 'row_ins_sec_index_entry_dup_locks_created SIGNAL ins_set_locks WAIT_FOR ins_cont';
+INSERT INTO t VALUES(10, 20);
+connect con_del,localhost,root,,;
+SET DEBUG_SYNC = 'now WAIT_FOR ins_set_locks';
+SET DEBUG_SYNC = 'lock_wait_suspend_thread_enter SIGNAL del_locked';
+DELETE FROM t WHERE b = 20;
+connection default;
+SET DEBUG_SYNC = 'now WAIT_FOR del_locked';
+SET DEBUG_SYNC = 'now SIGNAL ins_cont';
+connection con_ins;
+disconnect con_ins;
+connection con_del;
+disconnect con_del;
+disconnect prevent_purge;
+connection default;
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t;
--- a/mysql-test/suite/innodb/t/lock_wait_conflict.test
+++ b/mysql-test/suite/innodb/t/lock_wait_conflict.test
+--source include/have_innodb.inc
+--source include/count_sessions.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+--echo #
+--echo # MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock
+--echo #
+
+# The test checks the ability to acquire exclusive record lock if the acquiring
+# transaction already holds a shared lock on the record and another transaction
+# is waiting for a lock.
+
+CREATE TABLE t (a INT PRIMARY KEY, b INT NOT NULL UNIQUE) ENGINE=InnoDB;
+
+--connect(prevent_purge,localhost,root,,)
+start transaction with consistent snapshot;
+
+--connection default
+INSERT INTO t VALUES (20,20);
+DELETE FROM t WHERE b = 20;
+
+--connect(con_ins,localhost,root,,)
+SET DEBUG_SYNC = 'row_ins_sec_index_entry_dup_locks_created SIGNAL ins_set_locks WAIT_FOR ins_cont';
+send
+INSERT INTO t VALUES(10, 20);
+
+--connect(con_del,localhost,root,,)
+SET DEBUG_SYNC = 'now WAIT_FOR ins_set_locks';
+SET DEBUG_SYNC = 'lock_wait_suspend_thread_enter SIGNAL del_locked';
+###############################################################################
+# This DELETE creates waiting ORDINARY X-lock for heap_no 2 as the record is
+# delete-marked, this lock conflicts with ORDINARY S-lock set by the the last
+# INSERT. After the last INSERT creates insert-intention lock on
+# heap_no 2, this lock will conflict with waiting ORDINARY X-lock of this
+# DELETE, what causes DEADLOCK error for this DELETE.
+###############################################################################
+send
+DELETE FROM t WHERE b = 20;
+
+--connection default
+SET DEBUG_SYNC = 'now WAIT_FOR del_locked';
+SET DEBUG_SYNC = 'now SIGNAL ins_cont';
+
+--connection con_ins
+--reap
+--disconnect con_ins
+
+--connection con_del
+# Without the fix, ER_LOCK_DEADLOCK would be reported here.
+--reap
+--disconnect con_del
+
+--disconnect prevent_purge
+
+--connection default
+
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t;
+--source include/wait_until_count_sessions.inc
--- a/mysql-test/suite/versioning/r/update.result
+++ b/mysql-test/suite/versioning/r/update.result
@@ -283,7 +283,6 @@ connection default;
 update t1 set b = 'foo';
 connection con1;
 update t1 set a = 'bar';
-ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
 disconnect con1;
 connection default;
 drop table t1;

--- a/mysql-test/suite/versioning/t/update.test
+++ b/mysql-test/suite/versioning/t/update.test
@@ -186,7 +186,9 @@ send update t1 set b = 'foo';
 connection con1;
 let $wait_condition= select count(*) from information_schema.innodb_lock_waits;
 source include/wait_condition.inc;
-error ER_LOCK_DEADLOCK;
+# There must no be DEADLOCK here as con1 transaction already holds locks, and
+# default's transaction lock is waiting, so the locks of the following "UPDATE"
+# must not conflict with waiting lock.
 update t1 set a = 'bar';
 disconnect con1;
 connection default;

--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
 /*****************************************************************************

 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2021, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +29,43 @@ Created 5/20/1997 Heikki Tuuri
 #include "ut0new.h"

 struct hash_table_t;
-struct hash_cell_t{
-	void*	node;	/*!< hash chain node, NULL if none */
+struct hash_cell_t
+{
+  /** singly-linked, nullptr terminated list of hash buckets */
+  void *node;
+
+  /** Append an element.
+  @tparam T      type of the element
+  @param insert  the being-inserted element
+  @param next    the next-element pointer in T */
+  template<typename T>
+  void append(T &insert, T *T::*next)
+  {
+    void **after;
+    for (after= &node; *after;
+         after= reinterpret_cast<void**>(&(static_cast<T*>(*after)->*next)));
+    insert.*next= nullptr;
+    *after= &insert;
+  }
+
+  /** Insert an element after another.
+  @tparam T  type of the element
+  @param after   the element after which to insert
+  @param insert  the being-inserted element
+  @param next    the next-element pointer in T */
+  template<typename T>
+  void insert_after(T &after, T &insert, T *T::*next)
+  {
+#ifdef UNIV_DEBUG
+    for (const T *c= static_cast<const T*>(node); c; c= c->*next)
+      if (c == &after)
+        goto found;
+    ut_error;
+  found:
+#endif
+    insert.*next= after.*next;
+    after.*next= &insert;
+  }
 };

 /*******************************************************************//**

--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
 /*****************************************************************************

 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -1175,6 +1175,10 @@ without checking for deadlocks or conflicts.
 @param[in]	index		the index tree
 @param[in,out]	trx		transaction
 @param[in]	holds_trx_mutex	whether the caller holds trx->mutex
+@param[in]	insert_before_waiting if true, inserts new B-tree record lock
+just after the last non-waiting lock of the current transaction which is
+located before the first waiting for the current transaction lock, otherwise
+the lock is inserted at the end of the queue
 @return created lock */
 lock_t*
 lock_rec_create_low(
@@ -1185,7 +1189,8 @@ lock_rec_create_low(
 	ulint		heap_no,
 	dict_index_t*	index,
 	trx_t*		trx,
-	bool		holds_trx_mutex);
+	bool		holds_trx_mutex,
+	bool		insert_before_waiting = false);

 /** Enqueue a waiting request for a lock which cannot be granted immediately.
 Check for deadlocks.

--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -1004,24 +1004,43 @@ static void lock_wait_wsrep(trx_t *trx)
 /*********************************************************************//**
 Checks if some other transaction has a conflicting explicit lock request
 in the queue, so that we have to wait.
-@return lock or NULL */
-static
-lock_t*
-lock_rec_other_has_conflicting(
-/*===========================*/
-	unsigned		mode,	/*!< in: LOCK_S or LOCK_X,
-					possibly ORed to LOCK_GAP or
-					LOC_REC_NOT_GAP,
-					LOCK_INSERT_INTENTION */
-	const hash_cell_t&	cell,	/*!< in: lock hash table cell */
-	const page_id_t		id,	/*!< in: page identifier */
-	ulint			heap_no,/*!< in: heap number of the record */
-	const trx_t*		trx)	/*!< in: our transaction */
+@param[in] mode LOCK_S or LOCK_X, possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP,
+LOCK_INSERT_INTENTION
+@param[in] cell lock hash table cell
+@param[in] id page identifier
+@param[in] heap_no heap number of the record
+@param[in] trx our transaction
+@param[out] was_ignored true if conflicting locks waiting for the current
+transaction were ignored
+@return conflicting lock and the flag which indicated if conflicting locks
+which wait for the current transaction were ignored */
+static lock_t *lock_rec_other_has_conflicting(unsigned mode,
+                                              const hash_cell_t &cell,
+                                              const page_id_t id,
+                                              ulint heap_no, const trx_t *trx,
+                                              bool *was_ignored= nullptr)
 {
 	bool	is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);

 	for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no);
 	     lock; lock = lock_rec_get_next(heap_no, lock)) {
+		/* There is no need to lock lock_sys.wait_mutex to check
+		trx->lock.wait_trx here because the current function is
+		executed under the cell latch, and trx->lock.wait_trx
+		transaction can change wait_trx field only under the cell
+		latch, wait_trx trx_t object can not be deinitialized before
+		releasing all its locks, and during releasing the locks the
+		cell latch will also be requested. So while the cell latch is
+		held, lock->trx->lock.wait_trx can't be changed. There also
+		can't be lock loops for one record, because all waiting locks
+		of the record  will always wait for the same lock of the record
+		in a cell array, and check for conflicting lock will always
+		start with the first lock for the heap_no, and go ahead with
+		the same order(the order of the locks in the cell array) */
+		if (lock->is_waiting() && lock->trx->lock.wait_trx == trx) {
+			if (was_ignored) *was_ignored= true;
+			continue;
+		}
 		if (lock_rec_has_to_wait(true, trx, mode, lock, is_supremum)) {
 			return(lock);
 		}
@@ -1147,6 +1166,10 @@ without checking for deadlocks or conflicts.
 @param[in]	index		the index tree
 @param[in,out]	trx		transaction
 @param[in]	holds_trx_mutex	whether the caller holds trx->mutex
+@param[in]	insert_before_waiting if true, inserts new B-tree record lock
+just after the last non-waiting lock of the current transaction which is
+located before the first waiting for the current transaction lock, otherwise
+the lock is inserted at the end of the queue
 @return created lock */
 lock_t*
 lock_rec_create_low(
@@ -1157,7 +1180,8 @@ lock_rec_create_low(
 	ulint		heap_no,
 	dict_index_t*	index,
 	trx_t*		trx,
-	bool		holds_trx_mutex)
+	bool		holds_trx_mutex,
+	bool		insert_before_waiting)
 {
 	lock_t*		lock;
 	ulint		n_bytes;
@@ -1232,7 +1256,36 @@ lock_rec_create_low(
 	ut_ad(index->table->get_ref_count() || !index->table->can_be_evicted);

 	const auto lock_hash = &lock_sys.hash_get(type_mode);
-	HASH_INSERT(lock_t, hash, lock_hash, page_id.fold(), lock);
+	hash_cell_t& cell = *lock_hash->cell_get(page_id.fold());
+
+	if (insert_before_waiting
+	    && !(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))) {
+		/* Try to insert the lock just after the last non-waiting
+		lock of the current transaction which immediately
+		precedes the first waiting lock request. */
+
+		lock_t* last_non_waiting = nullptr;
+
+		for (lock_t* l = lock_sys_t::get_first(cell, page_id, heap_no);
+		     l; l = lock_rec_get_next(heap_no, l)) {
+			if (l->is_waiting() && l->trx->lock.wait_trx == trx) {
+				break;
+			}
+			if (l->trx == trx) {
+				last_non_waiting = l;
+			}
+		}
+
+		if (!last_non_waiting) {
+			goto append_last;
+		}
+
+		cell.insert_after(*last_non_waiting, *lock, &lock_t::hash);
+	}
+	else {
+append_last:
+		cell.append(*lock, &lock_t::hash);
+	}

 	if (type_mode & LOCK_WAIT) {
 		if (trx->lock.wait_trx) {
@@ -1357,24 +1410,22 @@ on the record, and the request to be added is not a waiting request, we
 can reuse a suitable record lock object already existing on the same page,
 just setting the appropriate bit in its bitmap. This is a low-level function
 which does NOT check for deadlocks or lock compatibility!
-@return lock where the bit was set */
+@param[in] type_mode lock mode, wait, gap etc. flags
+@param[in,out] cell first hash table cell
+@param[in] id page identifier
+@param[in] page buffer block containing the record
+@param[in] heap_no heap number of the record
+@param[in] index index of record
+@param[in,out] trx transaction
+@param[in] caller_owns_trx_mutex TRUE if caller owns the transaction mutex
+@param[in] insert_before_waiting true=insert B-tree record lock right
+before a waiting lock request; false=insert the lock at the end of the queue */
 TRANSACTIONAL_TARGET
-static
-void
-lock_rec_add_to_queue(
-/*==================*/
-	unsigned		type_mode,/*!< in: lock mode, wait, gap
-					etc. flags */
-	hash_cell_t&		cell,	/*!< in,out: first hash table cell */
-	const page_id_t		id,	/*!< in: page identifier */
-	const page_t*		page,	/*!< in: buffer block containing
-					the record */
-	ulint			heap_no,/*!< in: heap number of the record */
-	dict_index_t*		index,	/*!< in: index of record */
-	trx_t*			trx,	/*!< in/out: transaction */
-	bool			caller_owns_trx_mutex)
-					/*!< in: TRUE if caller owns the
-					transaction mutex */
+static void lock_rec_add_to_queue(unsigned type_mode, hash_cell_t &cell,
+                                  const page_id_t id, const page_t *page,
+                                  ulint heap_no, dict_index_t *index,
+                                  trx_t *trx, bool caller_owns_trx_mutex,
+                                  bool insert_before_waiting= false)
 {
 	ut_d(lock_sys.hash_get(type_mode).assert_locked(id));
 	ut_ad(xtest() || caller_owns_trx_mutex == trx->mutex_is_owner());
@@ -1469,7 +1520,7 @@ lock_rec_add_to_queue(

 	lock_rec_create_low(nullptr,
 			    type_mode, id, page, heap_no, index, trx,
-			    caller_owns_trx_mutex);
+			    caller_owns_trx_mutex, insert_before_waiting);
 }

 /*********************************************************************//**
@@ -1536,8 +1587,10 @@ lock_rec_lock(
      /* Do nothing if the trx already has a strong enough lock on rec */
      if (!lock_rec_has_expl(mode, g.cell(), id, heap_no, trx))
      {
+        bool was_ignored = false;
        if (lock_t *c_lock= lock_rec_other_has_conflicting(mode, g.cell(), id,
-                                                           heap_no, trx))
+                                                           heap_no, trx,
+                                                           &was_ignored))
          /*
            If another transaction has a non-gap conflicting
            request in the queue, as this transaction does not
@@ -1550,7 +1603,7 @@ lock_rec_lock(
        {
          /* Set the requested lock on the record. */
          lock_rec_add_to_queue(mode, g.cell(), id, block->page.frame, heap_no,
-                                index, trx, true);
+                                index, trx, true, was_ignored);
          err= DB_SUCCESS_LOCKED_REC;
        }
      }
@@ -4654,7 +4707,7 @@ lock_rec_queue_validate(
 				wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id);
 				wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id);

-				if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+				if (!lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP,
 						       cell, id, heap_no,
 						       impl_trx)) {
 					ib::info() << "WSREP impl BF lock conflict";
@@ -4663,8 +4716,22 @@ lock_rec_queue_validate(
 #endif /* WITH_WSREP */
 			{
 				ut_ad(other_lock->is_waiting());
-				ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
-						        cell, id, heap_no,
+				/* After MDEV-27025 fix the following case is
+				possible:
+				1. trx 1 acquires S-lock;
+				2. trx 2 creates X-lock waiting for trx 1;
+				3. trx 1 creates implicit lock, as
+				lock_rec_other_has_conflicting() returns no
+				conflicting trx 2 X-lock, the explicit lock
+				will not be created;
+				4. trx 3 creates waiting X-lock,
+				it will wait for S-lock of trx 1.
+				That is why we relaxing the condition here and
+				check only for S-lock.
+				*/
+				ut_ad(lock_rec_has_expl(LOCK_S
+							| LOCK_REC_NOT_GAP,
+							cell, id, heap_no,
 							impl_trx));
 			}
 		}
@@ -5060,7 +5127,7 @@ lock_rec_convert_impl_to_expl_for_trx(
        !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no,
                           trx))
      lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id,
-                            page_align(rec), heap_no, index, trx, true);
+                            page_align(rec), heap_no, index, trx, true, true);
  }

  trx->mutex_unlock();