Commit c2548988 authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-19845: Adaptive spin loops

Starting with the Intel Skylake microarchitecture, the PAUSE
instruction latency is about 140 clock cycles instead of earlier 10.
On AMD processors, the latency could be 10 or 50 clock cycles,
depending on microarchitecture.

Because of this big range of latency, let us scale the loops around
the PAUSE instruction based on timing results at server startup.

my_cpu_relax_multiplier: New variable: How many times to invoke PAUSE
in a loop. Only defined for IA-32 and AMD64.

my_cpu_init(): Determine with RDTSC the time to run 16 PAUSE instructions
in each of two unrolled loops, and based on the quicker of the two
runs, initialize my_cpu_relax_multiplier. This form of calibration was
suggested by Mikhail Sinyavin from Intel.

LF_BACKOFF(), ut_delay(): Use my_cpu_relax_multiplier when available.

ut_delay(): Define inline in my_cpu.h.

UT_COMPILER_BARRIER(): Remove. This does not seem to have any effect,
because in our ut_delay() implementation, no computations are being
performed inside the loop. The purpose of UT_COMPILER_BARRIER() was to
prohibit the compiler from reordering computations. It was not
emitting any code.
parent 620f4f8a
...@@ -187,8 +187,6 @@ ...@@ -187,8 +187,6 @@
#cmakedefine HAVE_LINUX_FALLOC_H 1 #cmakedefine HAVE_LINUX_FALLOC_H 1
#cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1 #cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1
#cmakedefine HAVE_PREAD 1 #cmakedefine HAVE_PREAD 1
#cmakedefine HAVE_PAUSE_INSTRUCTION 1
#cmakedefine HAVE_FAKE_PAUSE_INSTRUCTION 1
#cmakedefine HAVE_RDTSCLL 1 #cmakedefine HAVE_RDTSCLL 1
#cmakedefine HAVE_READ_REAL_TIME 1 #cmakedefine HAVE_READ_REAL_TIME 1
#cmakedefine HAVE_PTHREAD_ATTR_CREATE 1 #cmakedefine HAVE_PTHREAD_ATTR_CREATE 1
......
...@@ -758,32 +758,6 @@ IF(NOT C_HAS_inline) ...@@ -758,32 +758,6 @@ IF(NOT C_HAS_inline)
ENDIF() ENDIF()
ENDIF() ENDIF()
IF(NOT CMAKE_CROSSCOMPILING AND NOT MSVC)
STRING(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} processor)
IF(processor MATCHES "86" OR processor MATCHES "amd64" OR processor MATCHES "x64")
#Check for x86 PAUSE instruction
# We have to actually try running the test program, because of a bug
# in Solaris on x86_64, where it wrongly reports that PAUSE is not
# supported when trying to run an application. See
# http://bugs.opensolaris.org/bugdatabase/printableBug.do?bug_id=6478684
CHECK_C_SOURCE_RUNS("
int main()
{
__asm__ __volatile__ (\"pause\");
return 0;
}" HAVE_PAUSE_INSTRUCTION)
ENDIF()
IF (NOT HAVE_PAUSE_INSTRUCTION)
CHECK_C_SOURCE_COMPILES("
int main()
{
__asm__ __volatile__ (\"rep; nop\");
return 0;
}
" HAVE_FAKE_PAUSE_INSTRUCTION)
ENDIF()
ENDIF()
CHECK_SYMBOL_EXISTS(tcgetattr "termios.h" HAVE_TCGETATTR 1) CHECK_SYMBOL_EXISTS(tcgetattr "termios.h" HAVE_TCGETATTR 1)
# #
......
...@@ -46,10 +46,20 @@ ...@@ -46,10 +46,20 @@
#define HMT_high() #define HMT_high()
#endif #endif
#if defined __i386__ || defined __x86_64__ || defined _WIN32
# define HAVE_PAUSE_INSTRUCTION /* added in Intel Pentium 4 */
#endif
static inline void MY_RELAX_CPU(void) static inline void MY_RELAX_CPU(void)
{ {
#ifdef HAVE_PAUSE_INSTRUCTION #ifdef _WIN32
/*
In the Win32 API, the x86 PAUSE instruction is executed by calling
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
independent way by using YieldProcessor.
*/
YieldProcessor();
#elif defined HAVE_PAUSE_INSTRUCTION
/* /*
According to the gcc info page, asm volatile means that the According to the gcc info page, asm volatile means that the
instruction has important side-effects and must not be removed. instruction has important side-effects and must not be removed.
...@@ -61,16 +71,6 @@ static inline void MY_RELAX_CPU(void) ...@@ -61,16 +71,6 @@ static inline void MY_RELAX_CPU(void)
#else #else
__asm__ __volatile__ ("pause"); __asm__ __volatile__ ("pause");
#endif #endif
#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
__asm__ __volatile__ ("rep; nop");
#elif defined _WIN32
/*
In the Win32 API, the x86 PAUSE instruction is executed by calling
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
independent way by using YieldProcessor.
*/
YieldProcessor();
#elif defined(_ARCH_PWR8) #elif defined(_ARCH_PWR8)
__ppc_get_timebase(); __ppc_get_timebase();
#else #else
...@@ -81,6 +81,20 @@ static inline void MY_RELAX_CPU(void) ...@@ -81,6 +81,20 @@ static inline void MY_RELAX_CPU(void)
} }
#ifdef HAVE_PAUSE_INSTRUCTION
# ifdef __cplusplus
extern "C" {
# endif
extern unsigned my_cpu_relax_multiplier;
void my_cpu_init(void);
# ifdef __cplusplus
}
# endif
#else
# define my_cpu_relax_multiplier 200
# define my_cpu_init() /* nothing */
#endif
/* /*
LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel
recommends to use it in spin loops also on non-HT machines to reduce power recommends to use it in spin loops also on non-HT machines to reduce power
...@@ -94,9 +108,23 @@ static inline void MY_RELAX_CPU(void) ...@@ -94,9 +108,23 @@ static inline void MY_RELAX_CPU(void)
static inline int LF_BACKOFF(void) static inline int LF_BACKOFF(void)
{ {
int i; unsigned i= my_cpu_relax_multiplier;
for (i= 0; i < 200; i++) while (i--)
MY_RELAX_CPU(); MY_RELAX_CPU();
return 1; return 1;
} }
/**
  Spin on the CPU for a while, typically while waiting for a shared
  resource to be released.

  MY_RELAX_CPU() is invoked my_cpu_relax_multiplier / 4 * delay times.
  HMT_low() is issued before the spin loop and HMT_medium() after it.

  @param delay  scaling factor for the spin count; originally this was
                roughly microseconds on a 100 MHz Intel Pentium
*/
static inline void ut_delay(unsigned delay)
{
  /* The spin count is computed up front, before HMT_low(). */
  unsigned spins= my_cpu_relax_multiplier / 4 * delay;
  HMT_low();
  for (; spins != 0; spins--)
    MY_RELAX_CPU();
  HMT_medium();
}
#endif #endif
...@@ -44,7 +44,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c ...@@ -44,7 +44,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c
my_getncpus.c my_safehash.c my_chmod.c my_rnd.c my_getncpus.c my_safehash.c my_chmod.c my_rnd.c
my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c
my_rdtsc.c my_context.c psi_noop.c my_rdtsc.c my_context.c psi_noop.c
my_atomic_writes.c my_likely.c my_atomic_writes.c my_cpu.c my_likely.c
file_logger.c my_dlerror.c) file_logger.c my_dlerror.c)
IF (WIN32) IF (WIN32)
......
/* Copyright (c) 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#include <my_global.h>
#include <my_cpu.h>
#ifdef HAVE_PAUSE_INSTRUCTION
/** How many times to invoke PAUSE in a loop */
unsigned my_cpu_relax_multiplier = 200;
# include <stdint.h>
# ifdef _MSC_VER
# include <intrin.h>
# else
# include <x86intrin.h>
# endif
#define PAUSE4 MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU()
#define PAUSE16 PAUSE4; PAUSE4; PAUSE4; PAUSE4
/**
Initialize my_cpu_relax_multiplier.
Determine the duration of a PAUSE instruction by running an
unrolled loop of 16 PAUSE instructions twice, and taking the
faster of the two runs. In this way, even if the execution is
interrupted by the operating system, it should be extremely
unlikely that both loops get interrupted.
On the Intel Skylake microarchitecture, the PAUSE instruction takes
around 140 clock cycles, while on earlier microarchitectures it could
be 10 clock cycles or less. Scale the PAUSE loop counter accordingly.
On a pre-Skylake Intel Xeon CPU E5-2630 v4 @ 2.20GHz running an AMD64
executable, the numbers would be between 176 and 198 when all the code
is inlined as follows:
lfence,rdtsc,mov,shl,or, 16*pause,
lfence,rdtsc,mov,shl,or, 16*pause,
lfence,rdtsc.
That would yield 11 to 12 cycles per PAUSE instruction even if we
(wrongly) ignore the overhead of the other instructions.
On a Skylake mobile processor Intel Core i7-6500U CPU @ 2.50GHz, the
numbers would be somewhere around 6000 or 7000, yielding up to 430
cycles per instruction. This could be partly due to increased latency
for LFENCE and RDTSC, or simply dynamic clock scaling.
Let us define a threshold at roughly 30 cycles per PAUSE instruction,
and use a shorter delay if the PAUSE instruction takes longer than
that. In some AMD processors, the PAUSE instruction could take 40 or
50 cycles. Let us use a shorter delay multiplier for them as well.
The 1/10 scaling factor (200/20) was derived experimentally by
Mikhail Sinyavin from Intel.
*/
void my_cpu_init(void)
{
  uint64_t start, middle, end;
  uint64_t first_run, second_run, quicker;

  /* Two back-to-back timed runs of 16 unrolled PAUSE instructions.
  An LFENCE is issued before every time stamp counter read. */
  _mm_lfence();
  start= __rdtsc();
  PAUSE16;
  _mm_lfence();
  middle= __rdtsc();
  PAUSE16;
  _mm_lfence();
  end= __rdtsc();

  first_run= middle - start;
  second_run= end - middle;

  /* Judge by the quicker run, so that a single interrupted run
  cannot make PAUSE look slow. */
  quicker= first_run < second_run ? first_run : second_run;

  /* Threshold: roughly 30 cycles per PAUSE, times 16 instructions.
  Above it, switch to the shorter spin multiplier. */
  if (quicker > 30 * 16)
    my_cpu_relax_multiplier= 20;
}
#endif
...@@ -5113,6 +5113,7 @@ static int init_server_components() ...@@ -5113,6 +5113,7 @@ static int init_server_components()
We need to call each of these following functions to ensure that We need to call each of these following functions to ensure that
all things are initialized so that unireg_abort() doesn't fail all things are initialized so that unireg_abort() doesn't fail
*/ */
my_cpu_init();
mdl_init(); mdl_init();
if (tdc_init() || hostname_cache_init()) if (tdc_init() || hostname_cache_init())
unireg_abort(1); unireg_abort(1);
......
/***************************************************************************** /*****************************************************************************
Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software the terms of the GNU General Public License as published by the Free Software
...@@ -29,8 +29,7 @@ Created 2013-03-26 Sunny Bains. ...@@ -29,8 +29,7 @@ Created 2013-03-26 Sunny Bains.
#ifndef ib0mutex_h #ifndef ib0mutex_h
#define ib0mutex_h #define ib0mutex_h
#include "ut0ut.h" #include "my_cpu.h"
#include "ut0rnd.h"
#include "os0event.h" #include "os0event.h"
#include "sync0arr.h" #include "sync0arr.h"
......
...@@ -54,14 +54,6 @@ Created 1/20/1994 Heikki Tuuri ...@@ -54,14 +54,6 @@ Created 1/20/1994 Heikki Tuuri
/** Time stamp */ /** Time stamp */
typedef time_t ib_time_t; typedef time_t ib_time_t;
#if defined (__GNUC__)
# define UT_COMPILER_BARRIER() __asm__ __volatile__ ("":::"memory")
#elif defined (_MSC_VER)
# define UT_COMPILER_BARRIER() _ReadWriteBarrier()
#else
# define UT_COMPILER_BARRIER()
#endif
/*********************************************************************//** /*********************************************************************//**
Delays execution for at most max_wait_us microseconds or returns earlier Delays execution for at most max_wait_us microseconds or returns earlier
if cond becomes true. if cond becomes true.
...@@ -270,14 +262,7 @@ void ...@@ -270,14 +262,7 @@ void
ut_sprintf_timestamp( ut_sprintf_timestamp(
/*=================*/ /*=================*/
char* buf); /*!< in: buffer where to sprintf */ char* buf); /*!< in: buffer where to sprintf */
/*************************************************************//**
Runs an idle loop on CPU. The argument gives the desired delay
in microseconds on 100 MHz Pentium + Visual C++.
@return dummy value */
void
ut_delay(
/*=====*/
ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */
/*************************************************************//** /*************************************************************//**
Prints the contents of a memory buffer in hex and ascii. */ Prints the contents of a memory buffer in hex and ascii. */
void void
......
/***************************************************************************** /*****************************************************************************
Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, MariaDB Corporation. Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software the terms of the GNU General Public License as published by the Free Software
...@@ -283,27 +283,6 @@ ut_sprintf_timestamp( ...@@ -283,27 +283,6 @@ ut_sprintf_timestamp(
#endif #endif
} }
/*************************************************************//**
Runs an idle loop on CPU. The argument gives the desired delay
in microseconds on 100 MHz Pentium + Visual C++.
@return dummy value */
void
ut_delay(
/*=====*/
ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */
{
ulint i;
HMT_low();
for (i = 0; i < delay * 50; i++) {
MY_RELAX_CPU();
UT_COMPILER_BARRIER();
}
HMT_medium();
}
/*************************************************************//** /*************************************************************//**
Prints the contents of a memory buffer in hex and ascii. */ Prints the contents of a memory buffer in hex and ascii. */
void void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment