MDEV-22669 InnoDB lacks CRC-32C acceleration on IA-32

In mysql/mysql-server@17e497bdb793bc6b8360aa1c626dcd8bb5cfad1b MySQL 5.6.3 introduced innodb_checksum_algorithm=crc32 and implemented it for AMD64 using the SSE 4.2 instructions (incorrectly advertised as "SSE2" in a startup message). It was not implemented on IA-32 or on Windows. Since MariaDB 10.2.11 commit 2401d14e we make use of the SSE4.2 CRC-32C instructions on Windows on both IA-32 and AMD64. Let us be consistent and implement CRC-32C for IA-32 on all available platforms. GCC 4.8.2 and GCC 4.8.5 complain "error: PIC register clobbered by 'ebx' in 'asm'" so we will only enable this code for IA-32 starting with GCC 5. Also, we will clean up the implementation further after commit 1312b4eb. has_sse4_2(): Replaces ut_cpuid(). ut_crc32c_8(): Replaces ut_crc32_8_hw(). ut_crc32c_64(): Replaces ut_crc32_64_low_hw(), ut_crc32_64_hw(). ut_crc32_hw(): Rewrite. ut_crc32c_8_sw(): Replaces ut_crc32_8_sw(). ut_crc32c_64_sw(): Replaces ut_crc32_64_low_sw(), ut_crc32_64_sw(). ut_crc32_sw(): Rewrite. Avoid code bloat and do not unroll the ut_crc32c_64_sw() loop, because no benefit has been demonstrated. ut_crc32_init(): Only invoke ut_crc32_slice8_table_init() if no acceleration is available.

MDEV-22669 InnoDB lacks CRC-32C acceleration on IA-32
In mysql/mysql-server@17e497bdb793bc6b8360aa1c626dcd8bb5cfad1b MySQL 5.6.3 introduced innodb_checksum_algorithm=crc32 and implemented it for AMD64 using the SSE 4.2 instructions (incorrectly advertised as "SSE2" in a startup message). It was not implemented on IA-32 or on Windows. Since MariaDB 10.2.11 commit 2401d14e we make use of the SSE4.2 CRC-32C instructions on Windows on both IA-32 and AMD64. Let us be consistent and implement CRC-32C for IA-32 on all available platforms. GCC 4.8.2 and GCC 4.8.5 complain "error: PIC register clobbered by 'ebx' in 'asm'" so we will only enable this code for IA-32 starting with GCC 5. Also, we will clean up the implementation further after commit 1312b4eb. has_sse4_2(): Replaces ut_cpuid(). ut_crc32c_8(): Replaces ut_crc32_8_hw(). ut_crc32c_64(): Replaces ut_crc32_64_low_hw(), ut_crc32_64_hw(). ut_crc32_hw(): Rewrite. ut_crc32c_8_sw(): Replaces ut_crc32_8_sw(). ut_crc32c_64_sw(): Replaces ut_crc32_64_low_sw(), ut_crc32_64_sw(). ut_crc32_sw(): Rewrite. Avoid code bloat and do not unroll the ut_crc32c_64_sw() loop, because no benefit has been demonstrated. ut_crc32_init(): Only invoke ut_crc32_slice8_table_init() if no acceleration is available.
afdd6b1d · Marko Mäkelä · 14f1453b · afdd6b1d
Commit afdd6b1d authored May 22, 2020 by Marko Mäkelä
Show whitespace changes
Inline Side-by-side

Showing with 152 additions and 338 deletions

storage/innobase/ut/ut0crc32.cc storage/innobase/ut/ut0crc32.cc +152 -338

No files found.
--- a/storage/innobase/ut/ut0crc32.cc
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -86,7 +86,7 @@ mysys/my_perf.c, contributed by Facebook under the following license.
 #include "ut0crc32.h"

 #ifdef _MSC_VER
-#include <intrin.h>
+# include <intrin.h>
 #endif

 /* CRC32 hardware implementation. */
@@ -103,151 +103,75 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
 /* For runtime check  */
 unsigned int crc32c_aarch64_available(void);
 };
+# elif defined(_MSC_VER)
+#  define TRY_SSE4_2
+# elif defined (__GNUC__)
+#  ifdef __x86_64__
+#   define TRY_SSE4_2
+#  elif defined(__i386__) && (__GNUC__ > 4 || defined __clang__)
+#   define TRY_SSE4_2
+#  endif
 # endif

-
-# if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
-/********************************************************************//**
-Fetches CPU info */
-static
-void
-ut_cpuid(
-/*=====*/
-	uint32_t	vend[3],	/*!< out: CPU vendor */
-	uint32_t*	model,		/*!< out: CPU model */
-	uint32_t*	family,		/*!< out: CPU family */
-	uint32_t*	stepping,	/*!< out: CPU stepping */
-	uint32_t*	features_ecx,	/*!< out: CPU features ecx */
-	uint32_t*	features_edx)	/*!< out: CPU features edx */
+# ifdef TRY_SSE4_2
+/** return whether SSE4.2 instructions are available */
+static inline bool has_sse4_2()
 {
-	uint32_t	sig;
+  /* We assume that the CPUID instruction and its parameter 1 are available.
+  We do not support any precursors of the Intel 80486. */
 #  ifdef _MSC_VER
  int data[4];
-	__cpuid(data, 0);
-	/* ebx */
-	vend[0] = data[1];
-	/* edx */
-	vend[1] = data[3];
-	/* ecx */
-	vend[2] = data[2];
-
  __cpuid(data, 1);
-	/* eax */
-	sig = data[0];
-	/* ecx */
-	*features_ecx = data[2];
-	/* edx */
-	*features_edx = data[3];
+  return !!(data[2] & 1 << 20);
 #  else
-	asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0));
-	asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx)
-	    : "a" (1)
-	    : "ebx");
+  uint32_t eax, ecx;
+  asm("cpuid" : "=a"(eax), "=c"(ecx) : "a"(1) : "ebx", "edx");
+  return !!(ecx & 1 << 20);
 #  endif
-
-	*model = ((sig >> 4) & 0xF);
-	*family = ((sig >> 8) & 0xF);
-	*stepping = (sig & 0xF);
-
-	if (memcmp(vend, "GenuineIntel", 12) == 0
-	    || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
-
-		*model += (((sig >> 16) & 0xF) << 4);
-		*family += ((sig >> 20) & 0xFF);
-	}
 }

-/** Calculate CRC32 over 8-bit data using a hardware/CPU instruction.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 1 byte
-@param[in,out]	len	remaining bytes, it will be decremented with 1 */
-inline
-void
-ut_crc32_8_hw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
+/** Append 8 bits (1 byte) to a CRC-32C checksum.
+@param crc   CRC-32C checksum so far
+@param data  data to be checksummed
+@return the updated CRC-32C */
+static inline ulint ut_crc32c_8(ulint crc, byte data)
 {
 #  ifdef _MSC_VER
-	*crc = _mm_crc32_u8(*crc, (*data)[0]);
+  return _mm_crc32_u8(static_cast<uint32_t>(crc), data);
 #  elif __has_feature(memory_sanitizer)
-	*crc = __builtin_ia32_crc32qi(*crc, (*data)[0]);
+  return __builtin_ia32_crc32qi(crc, data);
 #  else
-	asm("crc32b %1, %0"
-	    /* output operands */
-	    : "+r" (*crc)
-	    /* input operands */
-	    : "rm" ((*data)[0]));
+  asm("crc32b %1, %0" : "+r" (crc) : "rm" (data));
+  return crc;
 #  endif
-
-	(*data)++;
-	(*len)--;
 }

-/** Calculate CRC32 over a 64-bit integer using a hardware/CPU instruction.
-@param[in]	crc	crc32 checksum so far
-@param[in]	data	data to be checksummed
-@return resulting checksum of crc + crc(data) */
-inline
-uint32_t
-ut_crc32_64_low_hw(
-	uint32_t	crc,
-	uint64_t	data)
+/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
+@param[in] crc    CRC-32C checksum so far
+@param[in] data   8 bytes of aligned data
+@return the updated CRC-32C */
+static inline ulint ut_crc32c_64(ulint crc, uint64_t data)
 {
-	uint64_t	crc_64bit = crc;
 #  ifdef _MSC_VER
 #   ifdef _M_X64
-	crc_64bit = _mm_crc32_u64(crc_64bit, data);
+  return _mm_crc32_u64(crc, data);
 #   elif defined(_M_IX86)
-	crc = _mm_crc32_u32(crc, static_cast<uint32_t>(data));
-	crc_64bit = _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
+  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data));
+  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
+  return crc;
 #   else
-#    error Not Supported processors type.
+#    error Unsupported processor type
 #   endif
 #  elif __has_feature(memory_sanitizer)
-	crc_64bit = __builtin_ia32_crc32di(crc_64bit, data);
+  return __builtin_ia32_crc32di(crc, data);
+#  elif defined __x86_64__
+  asm("crc32q %1, %0" : "+r" (crc) : "rm" (data));
+  return crc;
 #  else
-	asm("crc32q %1, %0"
-	    /* output operands */
-	    : "+r" (crc_64bit)
-	    /* input operands */
-	    : "rm" (data));
+  asm("crc32l %1, %0" : "+r" (crc) : "rm" (static_cast<uint32_t>(data)));
+  asm("crc32l %1, %0" : "+r" (crc) : "rm" (static_cast<uint32_t>(data >> 32)));
+  return crc;
 #  endif
-
-	return(static_cast<uint32_t>(crc_64bit));
-}
-
-/** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 8 bytes
-@param[in,out]	len	remaining bytes, it will be decremented with 8 */
-inline
-void
-ut_crc32_64_hw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-	uint64_t	data_int = *reinterpret_cast<const uint64_t*>(*data);
-
-#  ifdef WORDS_BIGENDIAN
-	/* Currently we only support x86_64 (little endian) CPUs. In case
-	some big endian CPU supports a CRC32 instruction, then maybe we will
-	need a byte order swap here. */
-#   error Dont know how to handle big endian CPUs
-	/*
-	data_int = ut_crc32_swap_byteorder(data_int);
-	*/
-#  endif /* WORDS_BIGENDIAN */
-
-	*crc = ut_crc32_64_low_hw(*crc, data_int);
-
-	*data += 8;
-	*len -= 8;
 }

 /** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions
@@ -257,84 +181,50 @@ ut_crc32_64_hw(
 @return CRC-32C (polynomial 0x11EDC6F41) */
 uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len)
 {
-	crc = ~crc;
+  ulint c= static_cast<uint32_t>(~crc);

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
-	while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) {
-		ut_crc32_8_hw(&crc, &buf, &len);
+  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
+  {
+    c= ut_crc32c_8(c, *buf++);
+    len--;
  }

-	/* Perf testing
-	./unittest/gunit/innodb/merge_innodb_tests-t --gtest_filter=ut0crc32.perf
-	on CPU "Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz"
-	with different N in "while (len >= N) {" shows:
-	N=16
-	2.867254 sec
-	2.866860 sec
-	2.867973 sec
-
-	N=32
-	2.715725 sec
-	2.713008 sec
-	2.712520 sec
-	(5.36% speedup over N=16)
-
-	N=64
-	2.634140 sec
-	2.636558 sec
-	2.636488 sec
-	(2.88% speedup over N=32)
-
-	N=128
-	2.599534 sec
-	2.599919 sec
-	2.598035 sec
-	(1.39% speedup over N=64)
-
-	N=256
-	2.576993 sec
-	2.576748 sec
-	2.575700 sec
-	(0.87% speedup over N=128)
-
-	N=512
-	2.693928 sec
-	2.691663 sec
-	2.692142 sec
-	(4.51% slowdown over N=256)
-	*/
-	while (len >= 128) {
+  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);
+
+  for (; len >= 128; len-= 128)
+  {
    /* This call is repeated 16 times. 16 * 8 = 128. */
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
+    c= ut_crc32c_64(c, *b64++);
  }

-	while (len >= 8) {
-		ut_crc32_64_hw(&crc, &buf, &len);
-	}
+  for (; len >= 8; len-= 8)
+    c= ut_crc32c_64(c, *b64++);

-	while (len > 0) {
-		ut_crc32_8_hw(&crc, &buf, &len);
-	}
+  buf= reinterpret_cast<const byte*>(b64);
+
+  while (len--)
+    c= ut_crc32c_8(c, *buf++);

-	return(~crc);
+  return ~static_cast<uint32_t>(c);
 }
-# endif /* defined(__GNUC__) && defined(__x86_64__) || (_WIN64) */
+# endif /* (defined(__GNUC__) && defined(__i386__)) || _MSC_VER */

 /* CRC32 software implementation. */

@@ -373,91 +263,44 @@ ut_crc32_slice8_table_init()
 	}
 }

-/** Calculate CRC32 over 8-bit data using a software implementation.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 1 byte
-@param[in,out]	len	remaining bytes, it will be decremented with 1 */
-inline
-void
-ut_crc32_8_sw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
+/** Append 8 bits (1 byte) to a CRC-32C checksum.
+@param crc   CRC-32C checksum so far
+@param data  data to be checksummed
+@return the updated CRC-32C */
+static inline uint32_t ut_crc32c_8_sw(uint32_t crc, byte data)
 {
-	const uint8_t	i = (*crc ^ (*data)[0]) & 0xFF;
-
-	*crc = (*crc >> 8) ^ ut_crc32_slice8_table[0][i];
+  const uint8_t i= (crc ^ data) & 0xFF;

-	(*data)++;
-	(*len)--;
+  return (crc >> 8) ^ ut_crc32_slice8_table[0][i];
 }

-/** Swap the byte order of an 8 byte integer.
-@param[in]	i	8-byte integer
-@return 8-byte integer */
-# ifdef WORDS_BIGENDIAN
-inline uint64_t ut_crc32_swap_byteorder(uint64_t i)
+/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
+@param[in] crc    CRC-32C checksum so far
+@param[in] data   8 bytes of aligned data
+@return the updated CRC-32C */
+static inline uint32_t ut_crc32c_64_sw(uint32_t crc, uint64_t data)
 {
-  return i << 56 |
-    (i & 0x000000000000FF00ULL) << 40 |
-    (i & 0x0000000000FF0000ULL) << 24 |
-    (i & 0x00000000FF000000ULL) << 8 |
-    (i & 0x000000FF00000000ULL) >> 8 |
-    (i & 0x0000FF0000000000ULL) >> 24 |
-    (i & 0x00FF000000000000ULL) >> 40 |
-    i >> 56;
-}
-# endif /* WORDS_BIGENDIAN */
-
-/** Calculate CRC32 over a 64-bit integer using a software implementation.
-@param[in]	crc	crc32 checksum so far
-@param[in]	data	data to be checksummed
-@return resulting checksum of crc + crc(data) */
-inline
-uint32_t
-ut_crc32_64_low_sw(
-	uint32_t	crc,
-	uint64_t	data)
-{
-	const uint64_t	i = crc ^ data;
-
-	return(
-		ut_crc32_slice8_table[7][(i      ) & 0xFF] ^
-		ut_crc32_slice8_table[6][(i >>  8) & 0xFF] ^
-		ut_crc32_slice8_table[5][(i >> 16) & 0xFF] ^
-		ut_crc32_slice8_table[4][(i >> 24) & 0xFF] ^
-		ut_crc32_slice8_table[3][(i >> 32) & 0xFF] ^
-		ut_crc32_slice8_table[2][(i >> 40) & 0xFF] ^
-		ut_crc32_slice8_table[1][(i >> 48) & 0xFF] ^
-		ut_crc32_slice8_table[0][(i >> 56)]
-	);
-}
-
-/** Calculate CRC32 over 64-bit byte string using a software implementation.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 8 bytes
-@param[in,out]	len	remaining bytes, it will be decremented with 8 */
-inline
-void
-ut_crc32_64_sw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-	uint64_t	data_int = *reinterpret_cast<const uint64_t*>(*data);
-
 # ifdef WORDS_BIGENDIAN
-	data_int = ut_crc32_swap_byteorder(data_int);
+  data= data << 56 |
+    (data & 0x000000000000FF00ULL) << 40 |
+    (data & 0x0000000000FF0000ULL) << 24 |
+    (data & 0x00000000FF000000ULL) << 8 |
+    (data & 0x000000FF00000000ULL) >> 8 |
+    (data & 0x0000FF0000000000ULL) >> 24 |
+    (data & 0x00FF000000000000ULL) >> 40 |
+    data >> 56;
 # endif /* WORDS_BIGENDIAN */

-	*crc = ut_crc32_64_low_sw(*crc, data_int);
-
-	*data += 8;
-	*len -= 8;
+  data^= crc;
+  return
+    ut_crc32_slice8_table[7][(data      ) & 0xFF] ^
+    ut_crc32_slice8_table[6][(data >>  8) & 0xFF] ^
+    ut_crc32_slice8_table[5][(data >> 16) & 0xFF] ^
+    ut_crc32_slice8_table[4][(data >> 24) & 0xFF] ^
+    ut_crc32_slice8_table[3][(data >> 32) & 0xFF] ^
+    ut_crc32_slice8_table[2][(data >> 40) & 0xFF] ^
+    ut_crc32_slice8_table[1][(data >> 48) & 0xFF] ^
+    ut_crc32_slice8_table[0][(data >> 56)];
 }

 /** Calculate CRC-32C using a look-up table.
@@ -467,43 +310,27 @@ ut_crc32_64_sw(
 @return CRC-32C (polynomial 0x11EDC6F41) */
 uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len)
 {
-	crc = ~crc;
+  crc= ~crc;

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
-	while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) {
-		ut_crc32_8_sw(&crc, &buf, &len);
+  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
+  {
+    crc= ut_crc32c_8_sw(crc, *buf++);
+    len--;
  }

-	while (len >= 128) {
-		/* This call is repeated 16 times. 16 * 8 = 128. */
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-	}
+  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);

-	while (len >= 8) {
-		ut_crc32_64_sw(&crc, &buf, &len);
-	}
+  for (; len >= 8; len-= 8)
+    crc= ut_crc32c_64_sw(crc, *b64++);

-	while (len > 0) {
-		ut_crc32_8_sw(&crc, &buf, &len);
-	}
+  buf= reinterpret_cast<const byte*>(b64);
+
+  while (len--)
+    crc= ut_crc32c_8_sw(crc, *buf++);

-	return(~crc);
+  return ~crc;
 }

 ut_crc32_func_t ut_crc32_low= ut_crc32_sw;
@@ -513,37 +340,24 @@ const char *ut_crc32_implementation= "Using generic crc32 instructions";
 /********************************************************************//**
 Initializes the data structures used by ut_crc32*(). Does not do any
 allocations, would not hurt if called twice, but would be pointless. */
-void
-ut_crc32_init()
+void ut_crc32_init()
 {
-#ifdef HAVE_CRC32_VPMSUM
-#else
-	ut_crc32_slice8_table_init();
-
-# if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
-	uint32_t	vend[3];
-	uint32_t	model;
-	uint32_t	family;
-	uint32_t	stepping;
-	uint32_t	features_ecx;
-	uint32_t	features_edx;
-
-	ut_cpuid(vend, &model, &family, &stepping,
-		 &features_ecx, &features_edx);
-
-	if (features_ecx & 1 << 20) {
-		ut_crc32_low = ut_crc32_hw;
-		ut_crc32_implementation = "Using SSE2 crc32 instructions";
-	}
-# endif
-
-
+#ifndef HAVE_CRC32_VPMSUM
 # if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
-	if (crc32c_aarch64_available()) {
-		ut_crc32_low = crc32c_aarch64;
-		ut_crc32_implementation = "Using Armv8 crc32 instructions";
-
+  if (crc32c_aarch64_available())
+  {
+    ut_crc32_low= crc32c_aarch64;
+    ut_crc32_implementation= "Using ARMv8 crc32 instructions";
+    return;
+  }
+# elif defined(TRY_SSE4_2)
+  if (has_sse4_2())
+  {
+    ut_crc32_low= ut_crc32_hw;
+    ut_crc32_implementation= "Using SSE4.2 crc32 instructions";
+    return;
  }
 # endif
-#endif
+  ut_crc32_slice8_table_init();
+#endif /* !HAVE_CRC32_VPMSUM */
 }