Commit 35f59bc4 authored by Marko Mäkelä

MDEV-26467: More cache friendliness

srw_mutex_impl<bool>::wait_and_lock(): In
commit a73eedbf we introduced
an std::atomic::fetch_or() in a loop. Alas, on IA-32 and AMD64,
that was being translated into a loop around LOCK CMPXCHG.
To avoid a nested loop, it is better to explicitly invoke
std::atomic::compare_exchange_weak() in the loop, but only if
the attempt has a chance to succeed (the HOLDER flag is not set).
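
(Illustrative sketch only, not part of the patch: the lock word, the HOLDER
layout and the helper name below are assumptions that merely mirror the idea
of attempting the read-modify-write only when it can succeed.)

  #include <atomic>
  #include <cstdint>

  static constexpr uint32_t HOLDER= 1U << 31;   // assumed flag layout
  static std::atomic<uint32_t> lock_word;       // waiter count in the low bits

  // Only issue the read-modify-write when the flag is currently clear, so a
  // failed attempt costs one LOCK CMPXCHG instead of a compiler-generated
  // CMPXCHG loop that emulates fetch_or().
  static bool try_acquire(uint32_t &lk)
  {
    if (lk & HOLDER)
    {
      lk= lock_word.load(std::memory_order_relaxed);  // refresh; retry later
      return false;
    }
    return lock_word.compare_exchange_weak(lk, lk | HOLDER,
                                           std::memory_order_acquire,
                                           std::memory_order_relaxed);
  }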

It is even more efficient to use LOCK BTS, but contemporary compilers
fail to translate std::atomic::fetch_or(x) & x into that when x is
a single-bit constant. On GCC-compatible compilers, we will use
inline assembler to achieve that.
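
(A stand-alone sketch of that technique; the real patch below operates on
*this, while the lock word and function name here are assumed for
illustration.)

  #include <atomic>
  #include <cstdint>

  static std::atomic<uint32_t> lock_word;

  // Atomically set bit 31 and branch on its previous value. With "asm goto",
  // the carry flag produced by LOCK BTS feeds the conditional jump directly;
  // no CMPXCHG retry loop and no flag-to-register transfer is needed.
  static bool bts_acquire()
  {
  #if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
    __asm__ goto("lock btsl $31, %0\n\t"
                 "jnc %l1"
                 : : "m" (lock_word) : "cc", "memory" : acquired);
    return false;                      // bit 31 was already set
  acquired:
    std::atomic_thread_fence(std::memory_order_acquire);
    return true;
  #else
    return !(lock_word.fetch_or(1U << 31, std::memory_order_acquire)
             & 1U << 31);
  #endif
  }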

On ISAs other than IA-32 and AMD64, we will continue to use
std::atomic::fetch_or().

ssux_lock_impl<spinloop>::rd_wait(): Use rd_lock_try().
A loop around std::atomic::compare_exchange_weak() should be
cheaper than fetch_add(), fetch_sub() and a wakeup system call.
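
(Roughly like the following sketch; the WRITER constant and the reader-counter
layout are assumptions for illustration, not the exact ssux_lock_impl
definitions.)

  #include <atomic>
  #include <cstdint>

  static constexpr uint32_t WRITER= 1U << 31;   // assumed writer flag
  static std::atomic<uint32_t> readers_word;    // reader count in the low bits

  // Register a reader only if no writer is present. A failed attempt changes
  // nothing, so there is no fetch_sub() to undo and no writer to wake up.
  static bool rd_lock_try()
  {
    uint32_t lk= readers_word.load(std::memory_order_relaxed);
    while (!(lk & WRITER))
      if (readers_word.compare_exchange_weak(lk, lk + 1,
                                             std::memory_order_acquire,
                                             std::memory_order_relaxed))
        return true;
    return false;
  }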

These deficiencies were pointed out and the use of LOCK BTS was
suggested by Thiago Macieira.
parent 0d68b0a2
@@ -294,57 +294,107 @@ void srw_mutex_impl<true>::wait_and_lock()
     DBUG_ASSERT(~HOLDER & lk);
     if (lk & HOLDER)
       lk= lock.load(std::memory_order_relaxed);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+#elif defined _M_IX86||defined _M_X64||defined __i386__||defined __x86_64__
+    else if (lock.compare_exchange_weak(lk, lk | HOLDER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return;
+#else
+    else if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) &
+               HOLDER))
+      goto acquired;
+#endif
     else
     {
-      lk= lock.fetch_or(HOLDER, std::memory_order_relaxed);
-      if (!(lk & HOLDER))
-        goto acquired;
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+      static_assert(HOLDER == (1U << 31), "compatibility");
+      __asm__ goto("lock btsl $31, %0\n\t"
+                   "jnc %l1" : : "m" (*this) : "cc", "memory" : acquired);
+      lk|= HOLDER;
+#endif
+      srw_pause(delay);
     }
-    srw_pause(delay);
     if (!--spin)
       break;
   }
-  for (;; wait(lk))
+  for (;;)
   {
     DBUG_ASSERT(~HOLDER & lk);
     if (lk & HOLDER)
     {
+      wait(lk);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+reload:
+#endif
       lk= lock.load(std::memory_order_relaxed);
-      if (lk & HOLDER)
-        continue;
     }
-    lk= lock.fetch_or(HOLDER, std::memory_order_relaxed);
-    if (!(lk & HOLDER))
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    else
+    {
+      static_assert(HOLDER == (1U << 31), "compatibility");
+      __asm__ goto("lock btsl $31, %0\n\t"
+                   "jc %l1" : : "m" (*this) : "cc", "memory" : reload);
+acquired:
+      std::atomic_thread_fence(std::memory_order_acquire);
+      return;
+    }
+#elif defined _M_IX86||defined _M_X64||defined __i386__||defined __x86_64__
+    else if (lock.compare_exchange_weak(lk, lk | HOLDER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return;
+#else
+    else if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) &
+               HOLDER))
     {
 acquired:
       DBUG_ASSERT(lk);
       std::atomic_thread_fence(std::memory_order_acquire);
       return;
     }
     DBUG_ASSERT(lk > HOLDER);
+#endif
   }
 }
 template<>
 void srw_mutex_impl<false>::wait_and_lock()
 {
-  uint32_t lk= 1 + lock.fetch_add(1, std::memory_order_relaxed);
-  for (;; wait(lk))
+  for (uint32_t lk= 1 + lock.fetch_add(1, std::memory_order_relaxed);;)
   {
     DBUG_ASSERT(~HOLDER & lk);
     if (lk & HOLDER)
     {
+      wait(lk);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+reload:
+#endif
       lk= lock.load(std::memory_order_relaxed);
-      if (lk & HOLDER)
-        continue;
     }
-    lk= lock.fetch_or(HOLDER, std::memory_order_relaxed);
-    if (!(lk & HOLDER))
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    else
+    {
+      static_assert(HOLDER == (1U << 31), "compatibility");
+      __asm__ goto("lock btsl $31, %0\n\t"
+                   "jc %l1" : : "m" (*this) : "cc", "memory" : reload);
+      std::atomic_thread_fence(std::memory_order_acquire);
+      return;
+    }
+#elif defined _M_IX86||defined _M_X64||defined __i386__||defined __x86_64__
+    else if (lock.compare_exchange_weak(lk, lk | HOLDER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return;
+#else
+    else if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) &
+               HOLDER))
     {
       DBUG_ASSERT(lk);
       std::atomic_thread_fence(std::memory_order_acquire);
       return;
     }
     DBUG_ASSERT(lk > HOLDER);
+#endif
   }
 }
@@ -373,19 +423,12 @@ void ssux_lock_impl<spinloop>::rd_wait()
   for (;;)
   {
     writer.wr_lock();
-    uint32_t lk= readers.fetch_add(1, std::memory_order_acquire);
-    if (UNIV_UNLIKELY(lk == WRITER))
-    {
-      readers.fetch_sub(1, std::memory_order_relaxed);
-      wake();
-      writer.wr_unlock();
-      pthread_yield();
-      continue;
-    }
-    DBUG_ASSERT(!(lk & WRITER));
-    break;
+    bool acquired= rd_lock_try();
+    writer.wr_unlock();
+    if (acquired)
+      break;
+    std::this_thread::yield();
   }
-  writer.wr_unlock();
 }
 template void ssux_lock_impl<true>::rd_wait();