Commit afdd6b1d authored by Marko Mäkelä

MDEV-22669 InnoDB lacks CRC-32C acceleration on IA-32

In mysql/mysql-server@17e497bdb793bc6b8360aa1c626dcd8bb5cfad1b
MySQL 5.6.3 introduced innodb_checksum_algorithm=crc32 and
implemented it for AMD64 using the SSE 4.2 instructions
(incorrectly advertised as "SSE2" in a startup message).
It was not implemented on IA-32 or on Windows.

Since MariaDB 10.2.11 commit 2401d14e
we make use of the SSE4.2 CRC-32C instructions on Windows on both IA-32
and AMD64.

Let us be consistent and implement CRC-32C for IA-32 on all
available platforms. GCC 4.8.2 and GCC 4.8.5 complain
"error: PIC register clobbered by 'ebx' in 'asm'"
so we will only enable this code for IA-32 starting with GCC 5.

Also, we will clean up the implementation further after
commit 1312b4eb.

has_sse4_2(): Replaces ut_cpuid().

ut_crc32c_8(): Replaces ut_crc32_8_hw().

ut_crc32c_64(): Replaces ut_crc32_64_low_hw(), ut_crc32_64_hw().

ut_crc32_hw(): Rewrite.

ut_crc32c_8_sw(): Replaces ut_crc32_8_sw().

ut_crc32c_64_sw(): Replaces ut_crc32_64_low_sw(), ut_crc32_64_sw().

ut_crc32_sw(): Rewrite. Avoid code bloat and do not unroll the
ut_crc32c_64_sw() loop, because no benefit has been demonstrated.

ut_crc32_init(): Only invoke ut_crc32_slice8_table_init()
if no acceleration is available.
parent 14f1453b
...@@ -86,7 +86,7 @@ mysys/my_perf.c, contributed by Facebook under the following license. ...@@ -86,7 +86,7 @@ mysys/my_perf.c, contributed by Facebook under the following license.
#include "ut0crc32.h" #include "ut0crc32.h"
#ifdef _MSC_VER #ifdef _MSC_VER
#include <intrin.h> # include <intrin.h>
#endif #endif
/* CRC32 hardware implementation. */ /* CRC32 hardware implementation. */
...@@ -103,151 +103,75 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) ...@@ -103,151 +103,75 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
/* For runtime check */ /* For runtime check */
unsigned int crc32c_aarch64_available(void); unsigned int crc32c_aarch64_available(void);
}; };
# elif defined(_MSC_VER)
# define TRY_SSE4_2
# elif defined (__GNUC__)
# ifdef __x86_64__
# define TRY_SSE4_2
# elif defined(__i386__) && (__GNUC__ > 4 || defined __clang__)
# define TRY_SSE4_2
# endif
# endif # endif
# ifdef TRY_SSE4_2
# if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER) /** return whether SSE4.2 instructions are available */
/********************************************************************//** static inline bool has_sse4_2()
Fetches CPU info */
static
void
ut_cpuid(
/*=====*/
uint32_t vend[3], /*!< out: CPU vendor */
uint32_t* model, /*!< out: CPU model */
uint32_t* family, /*!< out: CPU family */
uint32_t* stepping, /*!< out: CPU stepping */
uint32_t* features_ecx, /*!< out: CPU features ecx */
uint32_t* features_edx) /*!< out: CPU features edx */
{ {
uint32_t sig; /* We assume that the CPUID instruction and its parameter 1 are available.
We do not support any precursors of the Intel 80486. */
# ifdef _MSC_VER # ifdef _MSC_VER
int data[4]; int data[4];
__cpuid(data, 0); __cpuid(data, 1);
/* ebx */ return !!(data[2] & 1 << 20);
vend[0] = data[1];
/* edx */
vend[1] = data[3];
/* ecx */
vend[2] = data[2];
__cpuid(data, 1);
/* eax */
sig = data[0];
/* ecx */
*features_ecx = data[2];
/* edx */
*features_edx = data[3];
# else # else
asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0)); uint32_t eax, ecx;
asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx) asm("cpuid" : "=a"(eax), "=c"(ecx) : "a"(1) : "ebx", "edx");
: "a" (1) return !!(ecx & 1 << 20);
: "ebx");
# endif # endif
*model = ((sig >> 4) & 0xF);
*family = ((sig >> 8) & 0xF);
*stepping = (sig & 0xF);
if (memcmp(vend, "GenuineIntel", 12) == 0
|| (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
*model += (((sig >> 16) & 0xF) << 4);
*family += ((sig >> 20) & 0xFF);
}
} }
/** Calculate CRC32 over 8-bit data using a hardware/CPU instruction. /** Append 8 bits (1 byte) to a CRC-32C checksum.
@param[in,out] crc crc32 checksum so far when this function is called, @param crc CRC-32C checksum so far
when the function ends it will contain the new checksum @param data data to be checksummed
@param[in,out] data data to be checksummed, the pointer will be advanced @return the updated CRC-32C */
with 1 byte static inline ulint ut_crc32c_8(ulint crc, byte data)
@param[in,out] len remaining bytes, it will be decremented with 1 */
inline
void
ut_crc32_8_hw(
uint32_t* crc,
const byte** data,
ulint* len)
{ {
# ifdef _MSC_VER # ifdef _MSC_VER
*crc = _mm_crc32_u8(*crc, (*data)[0]); return _mm_crc32_u8(static_cast<uint32_t>(crc), data);
# elif __has_feature(memory_sanitizer) # elif __has_feature(memory_sanitizer)
*crc = __builtin_ia32_crc32qi(*crc, (*data)[0]); return __builtin_ia32_crc32qi(crc, data);
# else # else
asm("crc32b %1, %0" asm("crc32b %1, %0" : "+r" (crc) : "rm" (data));
/* output operands */ return crc;
: "+r" (*crc)
/* input operands */
: "rm" ((*data)[0]));
# endif # endif
(*data)++;
(*len)--;
} }
/** Calculate CRC32 over a 64-bit integer using a hardware/CPU instruction. /** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc crc32 checksum so far @param[in] crc CRC-32C checksum so far
@param[in] data data to be checksummed @param[in] data 8 bytes of aligned data
@return resulting checksum of crc + crc(data) */ @return the updated CRC-32C */
inline static inline ulint ut_crc32c_64(ulint crc, uint64_t data)
uint32_t
ut_crc32_64_low_hw(
uint32_t crc,
uint64_t data)
{ {
uint64_t crc_64bit = crc;
# ifdef _MSC_VER # ifdef _MSC_VER
# ifdef _M_X64 # ifdef _M_X64
crc_64bit = _mm_crc32_u64(crc_64bit, data); return _mm_crc32_u64(crc, data);
# elif defined(_M_IX86) # elif defined(_M_IX86)
crc = _mm_crc32_u32(crc, static_cast<uint32_t>(data)); crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data));
crc_64bit = _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32)); crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
return crc;
# else # else
# error Not Supported processors type. # error Unsupported processor type
# endif # endif
# elif __has_feature(memory_sanitizer) # elif __has_feature(memory_sanitizer)
crc_64bit = __builtin_ia32_crc32di(crc_64bit, data); return __builtin_ia32_crc32di(crc, data);
# elif defined __x86_64__
asm("crc32q %1, %0" : "+r" (crc) : "rm" (data));
return crc;
# else # else
asm("crc32q %1, %0" asm("crc32l %1, %0" : "+r" (crc) : "rm" (static_cast<uint32_t>(data)));
/* output operands */ asm("crc32l %1, %0" : "+r" (crc) : "rm" (static_cast<uint32_t>(data >> 32)));
: "+r" (crc_64bit) return crc;
/* input operands */
: "rm" (data));
# endif # endif
return(static_cast<uint32_t>(crc_64bit));
}
/** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction.
@param[in,out] crc crc32 checksum so far when this function is called,
when the function ends it will contain the new checksum
@param[in,out] data data to be checksummed, the pointer will be advanced
with 8 bytes
@param[in,out] len remaining bytes, it will be decremented with 8 */
inline
void
ut_crc32_64_hw(
uint32_t* crc,
const byte** data,
ulint* len)
{
uint64_t data_int = *reinterpret_cast<const uint64_t*>(*data);
# ifdef WORDS_BIGENDIAN
/* Currently we only support x86_64 (little endian) CPUs. In case
some big endian CPU supports a CRC32 instruction, then maybe we will
need a byte order swap here. */
# error Dont know how to handle big endian CPUs
/*
data_int = ut_crc32_swap_byteorder(data_int);
*/
# endif /* WORDS_BIGENDIAN */
*crc = ut_crc32_64_low_hw(*crc, data_int);
*data += 8;
*len -= 8;
} }
/** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions /** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions
...@@ -257,84 +181,50 @@ ut_crc32_64_hw( ...@@ -257,84 +181,50 @@ ut_crc32_64_hw(
@return CRC-32C (polynomial 0x11EDC6F41) */ @return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len) uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len)
{ {
crc = ~crc; ulint c= static_cast<uint32_t>(~crc);
/* Calculate byte-by-byte up to an 8-byte aligned address. After /* Calculate byte-by-byte up to an 8-byte aligned address. After
this consume the input 8-bytes at a time. */ this consume the input 8-bytes at a time. */
while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) { while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
ut_crc32_8_hw(&crc, &buf, &len); {
} c= ut_crc32c_8(c, *buf++);
len--;
/* Perf testing }
./unittest/gunit/innodb/merge_innodb_tests-t --gtest_filter=ut0crc32.perf
on CPU "Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz" const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);
with different N in "while (len >= N) {" shows:
N=16 for (; len >= 128; len-= 128)
2.867254 sec {
2.866860 sec /* This call is repeated 16 times. 16 * 8 = 128. */
2.867973 sec c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
N=32 c= ut_crc32c_64(c, *b64++);
2.715725 sec c= ut_crc32c_64(c, *b64++);
2.713008 sec c= ut_crc32c_64(c, *b64++);
2.712520 sec c= ut_crc32c_64(c, *b64++);
(5.36% speedup over N=16) c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
N=64 c= ut_crc32c_64(c, *b64++);
2.634140 sec c= ut_crc32c_64(c, *b64++);
2.636558 sec c= ut_crc32c_64(c, *b64++);
2.636488 sec c= ut_crc32c_64(c, *b64++);
(2.88% speedup over N=32) c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
N=128 c= ut_crc32c_64(c, *b64++);
2.599534 sec c= ut_crc32c_64(c, *b64++);
2.599919 sec }
2.598035 sec
(1.39% speedup over N=64) for (; len >= 8; len-= 8)
c= ut_crc32c_64(c, *b64++);
N=256
2.576993 sec buf= reinterpret_cast<const byte*>(b64);
2.576748 sec
2.575700 sec while (len--)
(0.87% speedup over N=128) c= ut_crc32c_8(c, *buf++);
N=512 return ~static_cast<uint32_t>(c);
2.693928 sec
2.691663 sec
2.692142 sec
(4.51% slowdown over N=256)
*/
while (len >= 128) {
/* This call is repeated 16 times. 16 * 8 = 128. */
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
ut_crc32_64_hw(&crc, &buf, &len);
}
while (len >= 8) {
ut_crc32_64_hw(&crc, &buf, &len);
}
while (len > 0) {
ut_crc32_8_hw(&crc, &buf, &len);
}
return(~crc);
} }
# endif /* defined(__GNUC__) && defined(__x86_64__) || (_WIN64) */ # endif /* (defined(__GNUC__) && defined(__i386__)) || _MSC_VER */
/* CRC32 software implementation. */ /* CRC32 software implementation. */
...@@ -373,91 +263,44 @@ ut_crc32_slice8_table_init() ...@@ -373,91 +263,44 @@ ut_crc32_slice8_table_init()
} }
} }
/** Calculate CRC32 over 8-bit data using a software implementation. /** Append 8 bits (1 byte) to a CRC-32C checksum.
@param[in,out] crc crc32 checksum so far when this function is called, @param crc CRC-32C checksum so far
when the function ends it will contain the new checksum @param data data to be checksummed
@param[in,out] data data to be checksummed, the pointer will be advanced @return the updated CRC-32C */
with 1 byte static inline uint32_t ut_crc32c_8_sw(uint32_t crc, byte data)
@param[in,out] len remaining bytes, it will be decremented with 1 */
inline
void
ut_crc32_8_sw(
uint32_t* crc,
const byte** data,
ulint* len)
{ {
const uint8_t i = (*crc ^ (*data)[0]) & 0xFF; const uint8_t i= (crc ^ data) & 0xFF;
*crc = (*crc >> 8) ^ ut_crc32_slice8_table[0][i];
(*data)++; return (crc >> 8) ^ ut_crc32_slice8_table[0][i];
(*len)--;
} }
/** Swap the byte order of an 8 byte integer. /** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] i 8-byte integer @param[in] crc CRC-32C checksum so far
@return 8-byte integer */ @param[in] data 8 bytes of aligned data
# ifdef WORDS_BIGENDIAN @return the updated CRC-32C */
inline uint64_t ut_crc32_swap_byteorder(uint64_t i) static inline uint32_t ut_crc32c_64_sw(uint32_t crc, uint64_t data)
{ {
return i << 56 |
(i & 0x000000000000FF00ULL) << 40 |
(i & 0x0000000000FF0000ULL) << 24 |
(i & 0x00000000FF000000ULL) << 8 |
(i & 0x000000FF00000000ULL) >> 8 |
(i & 0x0000FF0000000000ULL) >> 24 |
(i & 0x00FF000000000000ULL) >> 40 |
i >> 56;
}
# endif /* WORDS_BIGENDIAN */
/** Calculate CRC32 over a 64-bit integer using a software implementation.
@param[in] crc crc32 checksum so far
@param[in] data data to be checksummed
@return resulting checksum of crc + crc(data) */
inline
uint32_t
ut_crc32_64_low_sw(
uint32_t crc,
uint64_t data)
{
const uint64_t i = crc ^ data;
return(
ut_crc32_slice8_table[7][(i ) & 0xFF] ^
ut_crc32_slice8_table[6][(i >> 8) & 0xFF] ^
ut_crc32_slice8_table[5][(i >> 16) & 0xFF] ^
ut_crc32_slice8_table[4][(i >> 24) & 0xFF] ^
ut_crc32_slice8_table[3][(i >> 32) & 0xFF] ^
ut_crc32_slice8_table[2][(i >> 40) & 0xFF] ^
ut_crc32_slice8_table[1][(i >> 48) & 0xFF] ^
ut_crc32_slice8_table[0][(i >> 56)]
);
}
/** Calculate CRC32 over 64-bit byte string using a software implementation.
@param[in,out] crc crc32 checksum so far when this function is called,
when the function ends it will contain the new checksum
@param[in,out] data data to be checksummed, the pointer will be advanced
with 8 bytes
@param[in,out] len remaining bytes, it will be decremented with 8 */
inline
void
ut_crc32_64_sw(
uint32_t* crc,
const byte** data,
ulint* len)
{
uint64_t data_int = *reinterpret_cast<const uint64_t*>(*data);
# ifdef WORDS_BIGENDIAN # ifdef WORDS_BIGENDIAN
data_int = ut_crc32_swap_byteorder(data_int); data= data << 56 |
(data & 0x000000000000FF00ULL) << 40 |
(data & 0x0000000000FF0000ULL) << 24 |
(data & 0x00000000FF000000ULL) << 8 |
(data & 0x000000FF00000000ULL) >> 8 |
(data & 0x0000FF0000000000ULL) >> 24 |
(data & 0x00FF000000000000ULL) >> 40 |
data >> 56;
# endif /* WORDS_BIGENDIAN */ # endif /* WORDS_BIGENDIAN */
*crc = ut_crc32_64_low_sw(*crc, data_int); data^= crc;
return
*data += 8; ut_crc32_slice8_table[7][(data ) & 0xFF] ^
*len -= 8; ut_crc32_slice8_table[6][(data >> 8) & 0xFF] ^
ut_crc32_slice8_table[5][(data >> 16) & 0xFF] ^
ut_crc32_slice8_table[4][(data >> 24) & 0xFF] ^
ut_crc32_slice8_table[3][(data >> 32) & 0xFF] ^
ut_crc32_slice8_table[2][(data >> 40) & 0xFF] ^
ut_crc32_slice8_table[1][(data >> 48) & 0xFF] ^
ut_crc32_slice8_table[0][(data >> 56)];
} }
/** Calculate CRC-32C using a look-up table. /** Calculate CRC-32C using a look-up table.
...@@ -467,43 +310,27 @@ ut_crc32_64_sw( ...@@ -467,43 +310,27 @@ ut_crc32_64_sw(
@return CRC-32C (polynomial 0x11EDC6F41) */ @return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len) uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len)
{ {
crc = ~crc; crc= ~crc;
/* Calculate byte-by-byte up to an 8-byte aligned address. After /* Calculate byte-by-byte up to an 8-byte aligned address. After
this consume the input 8-bytes at a time. */ this consume the input 8-bytes at a time. */
while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) { while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
ut_crc32_8_sw(&crc, &buf, &len); {
} crc= ut_crc32c_8_sw(crc, *buf++);
len--;
}
while (len >= 128) { const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);
/* This call is repeated 16 times. 16 * 8 = 128. */
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
ut_crc32_64_sw(&crc, &buf, &len);
}
while (len >= 8) { for (; len >= 8; len-= 8)
ut_crc32_64_sw(&crc, &buf, &len); crc= ut_crc32c_64_sw(crc, *b64++);
}
while (len > 0) { buf= reinterpret_cast<const byte*>(b64);
ut_crc32_8_sw(&crc, &buf, &len);
}
return(~crc); while (len--)
crc= ut_crc32c_8_sw(crc, *buf++);
return ~crc;
} }
ut_crc32_func_t ut_crc32_low= ut_crc32_sw; ut_crc32_func_t ut_crc32_low= ut_crc32_sw;
...@@ -513,37 +340,24 @@ const char *ut_crc32_implementation= "Using generic crc32 instructions"; ...@@ -513,37 +340,24 @@ const char *ut_crc32_implementation= "Using generic crc32 instructions";
/********************************************************************//** /********************************************************************//**
Initializes the data structures used by ut_crc32*(). Does not do any Initializes the data structures used by ut_crc32*(). Does not do any
allocations, would not hurt if called twice, but would be pointless. */ allocations, would not hurt if called twice, but would be pointless. */
void void ut_crc32_init()
ut_crc32_init()
{ {
#ifdef HAVE_CRC32_VPMSUM #ifndef HAVE_CRC32_VPMSUM
#else
ut_crc32_slice8_table_init();
# if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
uint32_t vend[3];
uint32_t model;
uint32_t family;
uint32_t stepping;
uint32_t features_ecx;
uint32_t features_edx;
ut_cpuid(vend, &model, &family, &stepping,
&features_ecx, &features_edx);
if (features_ecx & 1 << 20) {
ut_crc32_low = ut_crc32_hw;
ut_crc32_implementation = "Using SSE2 crc32 instructions";
}
# endif
# if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC) # if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
if (crc32c_aarch64_available()) { if (crc32c_aarch64_available())
ut_crc32_low = crc32c_aarch64; {
ut_crc32_implementation = "Using Armv8 crc32 instructions"; ut_crc32_low= crc32c_aarch64;
ut_crc32_implementation= "Using ARMv8 crc32 instructions";
} return;
}
# elif defined(TRY_SSE4_2)
if (has_sse4_2())
{
ut_crc32_low= ut_crc32_hw;
ut_crc32_implementation= "Using SSE4.2 crc32 instructions";
return;
}
# endif # endif
#endif ut_crc32_slice8_table_init();
#endif /* !HAVE_CRC32_VPMSUM */
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment