Commit 638c62ac authored by Marko Mäkelä

MDEV-34983: Remove x86 asm from InnoDB

Starting with GCC 7 and clang 15, single-bit operations such as
fetch_or(1) & 1 are translated into 80386 instructions such as
LOCK BTS, instead of using the generic translation pattern
of emitting a loop around LOCK CMPXCHG.
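
As a rough illustration (not part of this patch; the function name is
made up), this is the kind of self-contained pattern the compilers now
recognize:

  #include <atomic>
  #include <cstdint>

  // With GCC 7+ or clang 15+ targeting IA-32/AMD64, this single-bit
  // test-and-set is expected to compile to LOCK BTS instead of a loop
  // around LOCK CMPXCHG.
  bool fetch_or_bit0(std::atomic<uint32_t> &m)
  {
    return m.fetch_or(1, std::memory_order_acquire) & 1;
  }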

Given that the oldest currently supported GNU/Linux distributions
ship GCC 7, and that older versions of GCC are out of support,
let us remove some work-arounds that are not strictly necessary.
If someone compiles the code using an older compiler, it will work
but possibly less efficiently.

srw_mutex_impl::HOLDER: Changed from 1U<<31 to 1 in order to
work around https://github.com/llvm/llvm-project/issues/37322,
which is specific to setting the most significant bit.

srw_mutex_impl::WAITER: A multiplier for the number of waiting requests.
This used to be 1, which would now collide with HOLDER.
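
A simplified sketch of the new lock word encoding (HOLDER and WAITER as
in the patch; the helper function is hypothetical, not the actual
srw_mutex_impl code):

  #include <atomic>
  #include <cstdint>

  // Lock word layout after this change: bit 0 is HOLDER; the remaining
  // bits hold WAITER times the number of registered requests (the
  // holder itself counts as one such request).
  static constexpr uint32_t HOLDER= 1;
  static constexpr uint32_t WAITER= 2;

  // On release, the holder gives up both its HOLDER flag and the WAITER
  // slot it registered when acquiring; any remainder means that other
  // waiters are still registered and must be woken up.
  inline bool release_and_need_wake(std::atomic<uint32_t> &lock)
  {
    const uint32_t lk= lock.fetch_sub(HOLDER + WAITER,
                                      std::memory_order_release);
    return lk != HOLDER + WAITER;
  }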

fil_space_t::set_stopping(): Remove this unused function.

With MSVC, we need the _interlockedbittestandset() intrinsic to emit LOCK BTS.
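
For reference, a minimal sketch of how that intrinsic maps to the same
single-bit operation (the helper name is hypothetical):

  #include <intrin.h>

  // On MSVC targeting x86/x64, _interlockedbittestandset() performs
  // LOCK BTS on the given bit and returns its previous value, which
  // matches fetch_or(HOLDER) & HOLDER for HOLDER == 1 (bit 0).
  inline bool try_set_holder(volatile long *lock_word)
  {
    return _interlockedbittestandset(lock_word, 0) == 0;
  }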
parent 71649b93
......@@ -528,9 +528,6 @@ struct fil_space_t final
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
/** Note that operations on the tablespace must stop. */
inline void set_stopping();
/** Drop the tablespace and wait for any pending operations to cease
@param id tablespace identifier
@param detached_handle pointer to file to be closed later, or nullptr
......@@ -589,32 +586,14 @@ struct fil_space_t final
/** Clear the NEEDS_FSYNC flag */
void clear_flush()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
__asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
_interlockedbittestandreset(reinterpret_cast<volatile long*>
(&n_pending), 28);
#else
n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
#endif
}
private:
/** Clear the CLOSING flag */
void clear_closing()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(CLOSING == 1U << 29, "compatibility");
__asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(CLOSING == 1U << 29, "compatibility");
_interlockedbittestandreset(reinterpret_cast<volatile long*>
(&n_pending), 29);
#else
n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
#endif
}
/** @return pending operations (and flags) */
......@@ -1605,21 +1584,6 @@ inline void fil_space_t::reacquire()
#endif /* SAFE_MUTEX */
}
/** Note that operations on the tablespace must stop. */
inline void fil_space_t::set_stopping()
{
mysql_mutex_assert_owner(&fil_system.mutex);
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
__asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
_interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
#else
n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
#endif
}
/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
......
......@@ -39,15 +39,7 @@ class rw_lock
/** Start waiting for an exclusive lock. */
void write_lock_wait_start()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(WRITER_WAITING == 1U << 30, "compatibility");
__asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(WRITER_WAITING == 1U << 30, "compatibility");
_interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
#else
lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
#endif
}
/** Start waiting for an exclusive lock.
@return current value of the lock word */
......
......@@ -92,11 +92,13 @@ template<bool spinloop>
class srw_mutex_impl final
{
friend ssux_lock_impl<spinloop>;
/** The lock word, containing HOLDER + 1 if the lock is being held,
plus the number of waiters */
/** The lock word, containing HOLDER + WAITER if the lock is being held,
plus WAITER times the number of waiters */
std::atomic<uint32_t> lock;
/** Identifies that the lock is being held */
static constexpr uint32_t HOLDER= 1U << 31;
static constexpr uint32_t HOLDER= 1;
/** Identifies a lock waiter */
static constexpr uint32_t WAITER= 2;
#ifdef SUX_LOCK_GENERIC
public:
......@@ -144,7 +146,7 @@ class srw_mutex_impl final
bool wr_lock_try()
{
uint32_t lk= 0;
return lock.compare_exchange_strong(lk, HOLDER + 1,
return lock.compare_exchange_strong(lk, HOLDER + WAITER,
std::memory_order_acquire,
std::memory_order_relaxed);
}
......@@ -152,8 +154,9 @@ class srw_mutex_impl final
void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
void wr_unlock()
{
const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
if (lk != HOLDER + 1)
const uint32_t lk=
lock.fetch_sub(HOLDER + WAITER, std::memory_order_release);
if (lk != HOLDER + WAITER)
{
DBUG_ASSERT(lk & HOLDER);
wake();
......@@ -269,10 +272,14 @@ class ssux_lock_impl
{
writer.wr_lock();
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
/* On IA-32 and AMD64, this type of fetch_or() can only be implemented
as a loop around LOCK CMPXCHG. In this particular case, setting the
most significant bit using fetch_add() is equivalent, and is
translated into a simple LOCK XADD. */
/* On IA-32 and AMD64, a fetch_XXX() that needs to return the
previous value of the word state can only be implemented
efficiently for fetch_add() or fetch_sub(), both of which
translate into a 80486 LOCK XADD instruction. Anything else would
translate into a loop around LOCK CMPXCHG. In this particular
case, we know that the bit was previously clear, and therefore
setting (actually toggling) the most significant bit using
fetch_add() or fetch_sub() is equivalent. */
static_assert(WRITER == 1U << 31, "compatibility");
if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
wr_wait(lk);
......
......@@ -85,26 +85,12 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
/** Set the SKIP bit */
void ref_set_skip()
{
static_assert(SKIP == 1U, "compatibility");
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
__asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
_interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
#else
ref.fetch_or(SKIP, std::memory_order_relaxed);
#endif
}
/** Clear a bit in ref */
void ref_reset_skip()
{
static_assert(SKIP == 1U, "compatibility");
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
__asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
_interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
#else
ref.fetch_and(~SKIP, std::memory_order_relaxed);
#endif
}
public:
......
......@@ -345,15 +345,7 @@ struct trx_lock_t
/** Flag the lock owner as a victim in Galera conflict resolution. */
void set_wsrep_victim()
{
# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
/* There is no 8-bit version of the 80386 BTS instruction.
Technically, this is the wrong addressing mode (16-bit), but
there are other data members stored after the byte. */
__asm__ __volatile__("lock btsw $1, %0"
: "+m" (was_chosen_as_deadlock_victim));
# else
was_chosen_as_deadlock_victim.fetch_or(2);
# endif
}
#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
......@@ -1038,15 +1030,7 @@ struct trx_t : ilist_node<>
void reset_skip_lock_inheritance()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
__asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
_interlockedbittestandreset(
reinterpret_cast<volatile long *>(&skip_lock_inheritance_and_n_ref),
31);
#else
skip_lock_inheritance_and_n_ref.fetch_and(~1U << 31);
#endif
}
/** @return whether the table has lock on
......
......@@ -269,44 +269,10 @@ template void ssux_lock_impl<false>::wake();
template void srw_mutex_impl<true>::wake();
template void ssux_lock_impl<true>::wake();
/*
Unfortunately, compilers targeting IA-32 or AMD64 currently cannot
translate the following single-bit operations into Intel 80386 instructions:
m.fetch_or(1<<b) & 1<<b LOCK BTS b, m
m.fetch_and(~(1<<b)) & 1<<b LOCK BTR b, m
m.fetch_xor(1<<b) & 1<<b LOCK BTC b, m
Hence, we will manually translate fetch_or() using GCC-style inline
assembler code or a Microsoft intrinsic function.
*/
#if defined __clang_major__ && __clang_major__ < 10
/* Only clang-10 introduced support for asm goto */
#elif defined __APPLE__
/* At least some versions of Apple Xcode do not support asm goto */
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
# define IF_FETCH_OR_GOTO(mem, bit, label) \
__asm__ goto("lock btsl $" #bit ", %0\n\t" \
"jc %l1" : : "m" (mem) : "cc", "memory" : label);
# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
__asm__ goto("lock btsl $" #bit ", %0\n\t" \
"jnc %l1" : : "m" (mem) : "cc", "memory" : label);
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
# define IF_FETCH_OR_GOTO(mem, bit, label) \
if (_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit)) \
goto label;
# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
if (!_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit))\
goto label;
#endif
template<bool spinloop>
void srw_mutex_impl<spinloop>::wait_and_lock()
{
uint32_t lk= 1 + lock.fetch_add(1, std::memory_order_relaxed);
uint32_t lk= WAITER + lock.fetch_add(WAITER, std::memory_order_relaxed);
if (spinloop)
{
......@@ -318,10 +284,16 @@ void srw_mutex_impl<spinloop>::wait_and_lock()
lk= lock.load(std::memory_order_relaxed);
if (!(lk & HOLDER))
{
#ifdef IF_NOT_FETCH_OR_GOTO
static_assert(HOLDER == (1U << 31), "compatibility");
IF_NOT_FETCH_OR_GOTO(*this, 31, acquired);
lk|= HOLDER;
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
lk |= HOLDER;
# ifdef _MSC_VER
static_assert(HOLDER == (1U << 0), "compatibility");
if (!_interlockedbittestandset
(reinterpret_cast<volatile long*>(&lock), 0))
# else
if (!(lock.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER))
# endif
goto acquired;
#else
if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER))
goto acquired;
......@@ -339,16 +311,22 @@ void srw_mutex_impl<spinloop>::wait_and_lock()
if (lk & HOLDER)
{
wait(lk);
#ifdef IF_FETCH_OR_GOTO
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
reload:
#endif
lk= lock.load(std::memory_order_relaxed);
}
else
{
#ifdef IF_FETCH_OR_GOTO
static_assert(HOLDER == (1U << 31), "compatibility");
IF_FETCH_OR_GOTO(*this, 31, reload);
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
# ifdef _MSC_VER
static_assert(HOLDER == (1U << 0), "compatibility");
if (_interlockedbittestandset
(reinterpret_cast<volatile long*>(&lock), 0))
# else
if (lock.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER)
# endif
goto reload;
#else
if ((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER)
continue;
......@@ -416,7 +394,8 @@ void ssux_lock_impl<spinloop>::rd_wait()
/* Subscribe to writer.wake() or write.wake_all() calls by
concurrently executing rd_wait() or writer.wr_unlock(). */
uint32_t wl= 1 + writer.lock.fetch_add(1, std::memory_order_acquire);
uint32_t wl= writer.WAITER +
writer.lock.fetch_add(writer.WAITER, std::memory_order_acquire);
for (;;)
{
......@@ -440,13 +419,13 @@ void ssux_lock_impl<spinloop>::rd_wait()
}
/* Unsubscribe writer.wake() and writer.wake_all(). */
wl= writer.lock.fetch_sub(1, std::memory_order_release);
wl= writer.lock.fetch_sub(writer.WAITER, std::memory_order_release);
ut_ad(wl);
/* Wake any other threads that may be blocked in writer.wait().
All other waiters than this rd_wait() would end up acquiring writer.lock
and waking up other threads on unlock(). */
if (wl > 1)
if (wl > writer.WAITER)
writer.wake_all();
}
......