Commit 58f184a4 authored by Marko Mäkelä

MDEV-24745 Generic CRC-32C computation wrongly uses SSE4.2 instructions

In commit d25f806d (MDEV-22749)
the CRC-32C implementation of MariaDB was broken on some
IA-32 and AMD64 builds, depending on the compiler version and
build options. This was verified for IA-32 on GCC 10.2.1.

Even though we try to identify the SSE4.2 extensions and the
availability of the PCLMULQDQ instruction by executing CPUID,
the fall-back code could be generated with extended instructions,
because the entire file mysys/crc32/crc32c.cc was being compiled
with -msse4.2 -mpclmul. This would cause SIGILL on a PINSRD
instruction on affected IA-32 targets (such as some Intel Atom
processors). This might also affect old AMD64 processors
(predating the 2007 Intel Nehalem microarchitecture), if some
compiler chose to emit the offending instructions.

While it is fine to pass a target-specific option to a target-specific
compilation unit (like -mpclmul to a PCLMUL-specific compilation unit),
that is not safe for mixed-architecture compilation units.

For mixed-architecture compilation units, the correct way is to set
target attributes on the target-specific functions.

There does not seem to be a way to pass target attributes to
function template instantiation. Hence, we must replace the
ExtendImpl template with plain functions crc32c_sse42() and
crc32c_slow().

We will also remove some inconsistency between
my_crc32c_implementation() and mysys_namespace::crc32c::Choose_Extend().

The function crc32_pclmul_enabled() will be moved to mysys/crc32/crc32c.cc
so that the detection code will be compiled without -msse4.2 -mpclmul.

The AMD64 PCLMUL accelerated crc32c_3way() will be moved to a new
file crc32c_amd64.cc. In this way, only a few functions that depend
on -msse4.2 in mysys/crc32/crc32c.cc can be declared with
__attribute__((target("sse4.2"))), and most of the file can be compiled
for the generic target.

Last, the file mysys/crc32ieee.cc will be omitted on 64-bit POWER,
because it was dead code (no symbols were exported).

Reviewed by: Vladislav Vaintroub
parent 18a82901
......@@ -16,7 +16,7 @@
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys)
SET(MYSYS_SOURCES array.c charset-def.c charset.c crc32ieee.cc my_default.c
SET(MYSYS_SOURCES array.c charset-def.c charset.c my_default.c
get_password.c
errors.c hash.c list.c
mf_cache.c mf_dirname.c mf_fn_ext.c
......@@ -60,19 +60,29 @@ ENDIF()
IF(MSVC)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc)
ENDIF()
ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL)
IF(CLANG_CL)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.cc crc32/crc32c.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
MY_CHECK_C_COMPILER_FLAG(-msse4.2)
MY_CHECK_C_COMPILER_FLAG(-mpclmul)
MY_CHECK_CXX_COMPILER_FLAG(-msse4.2)
MY_CHECK_CXX_COMPILER_FLAG(-mpclmul)
CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H)
IF(have_C__msse4.2 AND have_C__mpclmul AND HAVE_CPUID_H AND HAVE_X86INTRIN_H)
IF(have_CXX__msse4.2 AND HAVE_CPUID_H)
ADD_DEFINITIONS(-DHAVE_SSE42)
IF (have_CXX__mpclmul AND HAVE_X86INTRIN_H)
ADD_DEFINITIONS(-DHAVE_PCLMUL)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c crc32/crc32c.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32c_amd64.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
ENDIF()
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
IF(CMAKE_COMPILER_IS_GNUCC)
......@@ -129,11 +139,15 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
COMPILE_FLAGS "-march=armv8-a+crc+crypto")
ENDIF()
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64" OR CMAKE_SYSTEM_NAME MATCHES AIX)
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64" OR CMAKE_SYSTEM_NAME MATCHES AIX)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_ppc64.c crc32/crc32c_ppc.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_ppc64.c crc32/crc32c_ppc.c PROPERTIES
COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector")
ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC)
ELSE()
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32ieee.cc)
ENDIF()
IF(UNIX)
......
/* Copyright (c) 2020 MariaDB
/* Copyright (c) 2020, 2021, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -55,38 +55,14 @@
#include <stdint.h>
#include <stddef.h>
#if defined(__GNUC__)
#ifdef __GNUC__
#include <x86intrin.h>
#include <cpuid.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#else
#error "unknown compiler"
#endif
/**
  Test whether a feature word has both the SSE4.2 and the PCLMULQDQ bit set.

  @param recx  feature bits to examine
  @return 1 if bit 20 and bit 1 are both set, 0 otherwise
*/
static int has_sse42_and_pclmul(uint32_t recx)
{
/* 1 << 20 is SSE42, 1 << 1 is PCLMULQDQ */
#define bits_SSE42_AND_PCLMUL (1 << 20 | 1 << 1)
  /* Any required bit that is clear in recx survives in "missing". */
  const uint32_t missing= ~recx & bits_SSE42_AND_PCLMUL;
  return missing == 0;
}
#ifdef __GNUC__
/**
  Execute CPUID with leaf 1 and check the resulting ECX value with
  has_sse42_and_pclmul().

  @return the result of has_sse42_and_pclmul() for the reported ECX value
*/
int crc32_pclmul_enabled(void)
{
  uint32_t eax_out= 0;
  uint32_t ebx_out= 0;
  uint32_t ecx_out= 0;
  uint32_t edx_out= 0;

  __cpuid(1, eax_out, ebx_out, ecx_out, edx_out);

  /* Only ECX is inspected; the other registers are ignored. */
  return has_sse42_and_pclmul(ecx_out);
}
#elif defined(_MSC_VER)
/**
  Execute CPUID with function id 1 and check element 2 of the result
  array with has_sse42_and_pclmul().

  @return the result of has_sse42_and_pclmul() for that element
*/
int crc32_pclmul_enabled(void)
{
  enum { REG_ECX= 2 };  /* index of the value handed to the feature test */
  int cpu_info[4];

  __cpuid(cpu_info, 1);
  return has_sse42_and_pclmul((uint32_t) cpu_info[REG_ECX]);
}
#endif
/**
* @brief Shifts left 128 bit register by specified number of bytes
*
......
......@@ -32,11 +32,20 @@ static inline uint32_t DecodeFixed32(const char *ptr)
#endif
#ifdef HAVE_SSE42
#include <nmmintrin.h>
#include <wmmintrin.h>
#ifdef __GNUC__
#include <cpuid.h>
#endif
# ifdef __GNUC__
# include <cpuid.h>
# if __GNUC__ < 5 && !defined __clang__
/* the headers do not really work in GCC before version 5 */
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
# else
# include <nmmintrin.h>
# endif
# define USE_SSE42 __attribute__((target("sse4.2")))
# else
# define USE_SSE42 /* nothing */
# endif
#endif
......@@ -337,19 +346,8 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
return DecodeFixed32(reinterpret_cast<const char*>(p));
}
#if defined(HAVE_SSE42) && (SIZEOF_SIZE_T == 8)
static inline uint64_t DecodeFixed64(const char *ptr)
static inline void Slow_CRC32(uint64_t* l, uint8_t const **p)
{
return uint8korr(ptr);
}
static inline uint64_t LE_LOAD64(const uint8_t *p) {
return DecodeFixed64(reinterpret_cast<const char*>(p));
}
#endif
static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
*p += 4;
*l = table3_[c & 0xff] ^
......@@ -365,27 +363,6 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
table0_[c >> 24];
}
/**
  Fold the next input bytes into the running CRC and advance the cursor.

  With HAVE_SSE42 the _mm_crc32_* intrinsics consume 8 bytes: one 64-bit
  step when SIZEOF_SIZE_T is 8, otherwise two 32-bit steps. Without
  HAVE_SSE42 the call is forwarded to Slow_CRC32().

  @param l  running CRC state, updated in place
  @param p  input cursor, advanced past the consumed bytes
*/
__attribute__((unused)) static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#if defined(HAVE_SSE42) && (SIZEOF_SIZE_T == 8)
  const uint64_t word = LE_LOAD64(*p);
  *l = _mm_crc32_u64(*l, word);
  *p += 8;
#elif defined(HAVE_SSE42)
  // Two consecutive 4-byte steps.
  for (int half = 0; half < 2; half++) {
    *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
    *p += 4;
  }
#else
  Slow_CRC32(l, p);
#endif
}
template<void (*CRC32)(uint64_t*, uint8_t const**)>
uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu;
#ifdef ALIGN
#undef ALIGN
#endif
......@@ -398,70 +375,115 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
l = table0_[c] ^ (l >> 8); \
} while (0)
static uint32_t crc32c_slow(uint32_t crc, const char* buf, size_t size)
{
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu;
// Point x at first 16-byte aligned byte in string. This might be
// just past the end of the string.
const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
if (x <= e) {
if (x <= e)
// Process bytes until finished or p is 16-byte aligned
while (p != x) {
while (p != x)
STEP1;
}
}
// Process bytes 16 at a time
while ((e-p) >= 16) {
CRC32(&l, &p);
CRC32(&l, &p);
while ((e-p) >= 16)
{
Slow_CRC32(&l, &p);
Slow_CRC32(&l, &p);
}
// Process bytes 8 at a time
while ((e-p) >= 8) {
CRC32(&l, &p);
}
while ((e-p) >= 8)
Slow_CRC32(&l, &p);
// Process the last few bytes
while (p != e) {
while (p != e)
STEP1;
}
#undef STEP1
#undef ALIGN
return static_cast<uint32_t>(l ^ 0xffffffffu);
}
// Detect if ARM64 CRC or not.
#ifndef HAVE_ARMV8_CRC
// Detect if SSE4.2 or not.
#ifndef HAVE_POWER8
#if defined HAVE_POWER8
#elif defined HAVE_ARMV8_CRC
#elif defined HAVE_SSE42
/* Mask for the SSE4.2 flag (bit 20) in the value returned by cpuid_ecx(). */
constexpr uint32_t cpuid_ecx_SSE42= 1U << 20;
/* Mask for SSE4.2 (bit 20) combined with PCLMULQDQ (bit 1); both must be
   set before the PCLMUL-accelerated code may be chosen. */
constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U<<1;
static bool isSSE42() {
#ifndef HAVE_SSE42
return false;
#elif defined(__GNUC__)
static uint32_t cpuid_ecx()
{
#ifdef __GNUC__
uint32_t reax= 0, rebx= 0, recx= 0, redx= 0;
__cpuid(1, reax, rebx, recx, redx);
return (recx & ((int)1 << 20)) != 0;
#elif defined(_MSC_VER)
int info[4];
__cpuid(info, 0x00000001);
return (info[2] & ((int)1 << 20)) != 0;
return recx;
#elif defined _MSC_VER
int regs[4];
__cpuid(regs, 1);
return regs[2];
#else
return false;
# error "unknown compiler"
#endif
}
#ifdef HAVE_SSE42
extern "C" int crc32_pclmul_enabled();
#endif
/**
  Report whether cpuid_ecx() has every bit of cpuid_ecx_SSE42_AND_PCLMUL set.

  @return 1 if all of those bits are set, 0 otherwise
*/
extern "C" int crc32_pclmul_enabled(void)
{
  const uint32_t present= cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL;
  return present == cpuid_ecx_SSE42_AND_PCLMUL;
}
static bool isPCLMULQDQ() {
#ifdef HAVE_SSE42
return crc32_pclmul_enabled();
#else
return false;
#if SIZEOF_SIZE_T == 8
extern "C" uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len);
/**
  Read a 64-bit value from the given address using uint8korr().
  (Presumably a little-endian load, going by the function name; the
  definition of uint8korr() is not part of this file.)

  @param ptr  address of the 8 bytes to read
  @return the value produced by uint8korr()
*/
USE_SSE42
static inline uint64_t LE_LOAD64(const uint8_t *ptr)
{
  const char *const bytes= reinterpret_cast<const char*>(ptr);
  return uint8korr(bytes);
}
#endif
/**
  Fold the next 8 input bytes into the running CRC with the SSE4.2 crc32
  intrinsics and advance the cursor by 8.

  One 64-bit step is used when SIZEOF_SIZE_T is 8; otherwise two 32-bit
  steps are used.

  @param l  running CRC state, updated in place
  @param p  input cursor, advanced by 8 bytes
*/
USE_SSE42
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p)
{
# if (SIZEOF_SIZE_T == 8)
  const uint64_t word= LE_LOAD64(*p);
  *l = _mm_crc32_u64(*l, word);
  *p += 8;
# else
  /* Two consecutive 4-byte steps. */
  for (int half= 0; half < 2; half++)
  {
    const uint32_t word= LE_LOAD32(*p);
    *l = _mm_crc32_u32(static_cast<unsigned int>(*l), word);
    *p += 4;
  }
# endif
}
#endif // HAVE_POWER8
#endif // HAVE_ARMV8_CRC
/**
  Extend a CRC over a buffer, using Fast_CRC32() for the bulk of the data
  and the byte-wise STEP1 macro for the unaligned head and the tail.

  @param crc   CRC value to extend
  @param buf   start of the data
  @param size  number of bytes in buf
  @return the extended CRC value
*/
USE_SSE42
static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size)
{
  // The locals "l" and "p" must keep these names: the STEP1 macro refers
  // to them directly.
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *const end = p + size;
  uint64_t l = crc ^ 0xffffffffu;

  // First address at or after p that is aligned as requested by
  // ALIGN(addr, 4) (16-byte alignment, per the original comment). It may
  // lie beyond the end of the buffer.
  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
  const uint8_t *const aligned = reinterpret_cast<const uint8_t*>(ALIGN(addr, 4));

  // Head: single bytes up to the alignment boundary, but only when that
  // boundary is inside (or at the end of) the buffer.
  if (aligned <= end)
  {
    while (p != aligned)
    {
      STEP1;
    }
  }

  // Bulk: 16 bytes per iteration.
  while ((end - p) >= 16)
  {
    Fast_CRC32(&l, &p);
    Fast_CRC32(&l, &p);
  }

  // Then 8 bytes per iteration.
  while ((end - p) >= 8)
  {
    Fast_CRC32(&l, &p);
  }

  // Tail: whatever is left, one byte at a time.
  while (p != end)
  {
    STEP1;
  }

  return static_cast<uint32_t>(l ^ 0xffffffffu);
}
#endif
/* Pointer to a CRC extension routine taking (crc, buf, size) and returning
   the updated crc; this is the type returned by Choose_Extend(). */
typedef uint32_t (*Function)(uint32_t, const char*, size_t);
......@@ -507,14 +529,6 @@ static int arch_ppc_probe(void) {
return arch_ppc_crc32;
}
#endif // __linux__
// Returns true exactly when arch_ppc_probe() reports a nonzero value.
static bool isAltiVec() {
  return arch_ppc_probe() != 0;
}
#endif
#if defined(HAVE_ARMV8_CRC)
......@@ -526,760 +540,59 @@ static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
}
#endif
extern "C" const char * my_crc32c_implementation()
static inline Function Choose_Extend()
{
#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
#if defined HAVE_POWER8 && defined HAS_ALTIVEC
if (arch_ppc_probe())
return "Using POWER8 crc32 instructions";
return ExtendPPCImpl;
#elif defined(HAVE_ARMV8_CRC)
const char *ret = crc32c_aarch64_available();
if (ret)
return ret ;
if (crc32c_aarch64_available())
return ExtendARMImpl;
#elif HAVE_SSE42
if (isSSE42())
{
if (SIZEOF_SIZE_T == 8 && isPCLMULQDQ())
return "Using crc32 + pclmulqdq instructions";
return "Using SSE4.2 crc32 instructions";
# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8
switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) {
case cpuid_ecx_SSE42_AND_PCLMUL:
return crc32c_3way;
case cpuid_ecx_SSE42:
return crc32c_sse42;
}
# else
if (cpuid_ecx() & cpuid_ecx_SSE42)
return crc32c_sse42;
# endif
#endif
return "Using generic crc32 instructions";
return crc32c_slow;
}
/* The implementation picked by Choose_Extend(); the call is made once, when
   this variable is initialized. */
static const Function ChosenExtend= Choose_Extend();
/*
* Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
* Ferry Toth
* ftoth@exalondelft.nl
*
* https://github.com/htot/crc32c
*
* Modified by Facebook
*
* Original intel whitepaper:
* "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
*
* This version is from the folly library, created by Dave Watson <davejwatson@fb.com>
*
*/
#if defined HAVE_SSE42 && defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8
// Advance three CRC accumulators by one step each. The suffixes 0, 1 and 2
// are token-pasted onto both the crc and the buf argument, so
// CRCtriplet(crc, next, k) updates crc0/crc1/crc2 from *(next0 + k),
// *(next1 + k) and *(next2 + k). Expands to three separate statements, each
// already terminated by ';'.
#define CRCtriplet(crc, buf, offset) \
crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset));
// Same as CRCtriplet, but only the accumulators with suffix 0 and 1 are
// advanced; the one with suffix 2 is left untouched.
#define CRCduplet(crc, buf, offset) \
crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
// Advance a single accumulator: buf + offset is computed in units of buf's
// own element type and the result is then read as a uint64_t.
#define CRCsinglet(crc, buf, offset) \
crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset));
// Numbers taken directly from the Intel whitepaper.
// clang-format off
static const uint64_t clmul_constants[] = {
0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
};
// Fold fewer than 8 bytes into the CRC.
//
// Only the three low bits of len are examined: a 4-byte step if bit 2 is
// set, then a 2-byte step if bit 1 is set, then a 1-byte step if bit 0 is
// set. The CRC is truncated to 32 bits on entry, so crc0 holds a 32-bit
// value on return.
//
//   len   number of bytes to consume (only len & 7 matters)
//   crc0  crc so far, updated on return
//   next  next data pointer, advanced past the consumed bytes on return
static inline void align_to_8(
    size_t len,
    uint64_t& crc0,
    const unsigned char*& next) {
  uint32_t acc = static_cast<uint32_t>(crc0);

  if ((len & 0x04) != 0) {
    acc = _mm_crc32_u32(acc, *reinterpret_cast<const uint32_t*>(next));
    next += 4;
  }

  if ((len & 0x02) != 0) {
    acc = _mm_crc32_u16(acc, *reinterpret_cast<const uint16_t*>(next));
    next += 2;
  }

  if ((len & 0x01) != 0) {
    acc = _mm_crc32_u8(acc, *next);
    next += 1;
  }

  crc0 = acc;
}
//
// CombineCRC merges three partial CRCs into one.
//
// crc0 and crc1 are each carry-less multiplied (pclmulqdq) with a constant
// picked from clmul_constants by block_size, the two products are xor'ed,
// the low 64 bits of that are xor'ed with the 64-bit word just before next2,
// and the result is fed as data into a crc32 step on crc2.
//
//   block_size  1-based index of the 128-bit entry in clmul_constants
//   crc0, crc1  partial CRCs to be multiplied
//   crc2        partial CRC that absorbs the combined value
//   next2       pointer whose preceding 64-bit word is mixed in
//   returns     the combined CRC
//
static inline uint64_t CombineCRC(
    size_t block_size,
    uint64_t crc0,
    uint64_t crc1,
    uint64_t crc2,
    const uint64_t* next2) {
  // View the constant table as 128-bit entries; entry block_size - 1 holds
  // the pair of multipliers for this block size.
  const __m128i* const constants =
      reinterpret_cast<const __m128i*>(clmul_constants);
  const __m128i multiplier = *(constants + block_size - 1);

  // Selector 0x00 uses the low half of the multiplier, 0x10 the high half.
  const __m128i product0 =
      _mm_clmulepi64_si128(_mm_set_epi64x(0, crc0), multiplier, 0x00);
  const __m128i product1 =
      _mm_clmulepi64_si128(_mm_set_epi64x(0, crc1), multiplier, 0x10);
  const __m128i combined = _mm_xor_si128(product0, product1);

  const uint64_t folded =
      static_cast<uint64_t>(_mm_cvtsi128_si64(combined)) ^ next2[-1];
  return _mm_crc32_u64(crc2, folded);
}
// Compute CRC-32C using the Intel hardware instruction.
static inline uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
const unsigned char* next = (const unsigned char*)buf;
uint64_t count;
uint64_t crc0, crc1, crc2;
crc0 = crc ^ 0xffffffffu;
if (len >= 8) {
// if len > 216 then align and use triplets
if (len > 216) {
{
// Work on the bytes (< 8) before the first 8-byte alignment addr starts
auto align_bytes = (8 - (uintptr_t)next) & 7;
len -= align_bytes;
align_to_8(align_bytes, crc0, next);
}
// Now work on the remaining blocks
count = len / 24; // number of triplets
len %= 24; // bytes remaining
uint64_t n = count >> 7; // #blocks = first block + full blocks
uint64_t block_size = count & 127;
if (block_size == 0) {
block_size = 128;
} else {
n++;
}
// points to the first byte of the next block
const uint64_t* next0 = (uint64_t*)next + block_size;
const uint64_t* next1 = next0 + block_size;
const uint64_t* next2 = next1 + block_size;
crc1 = crc2 = 0;
// Use Duff's device, a for() loop inside a switch()
// statement. This needs to execute at least once, round len
// down to nearest triplet multiple
switch (block_size) {
case 128:
do {
// jumps here for a full block of len 128
CRCtriplet(crc, next, -128);
/* fallthrough */
case 127:
// jumps here or below for the first block smaller
CRCtriplet(crc, next, -127);
/* fallthrough */
case 126:
CRCtriplet(crc, next, -126); // than 128
/* fallthrough */
case 125:
CRCtriplet(crc, next, -125);
/* fallthrough */
case 124:
CRCtriplet(crc, next, -124);
/* fallthrough */
case 123:
CRCtriplet(crc, next, -123);
/* fallthrough */
case 122:
CRCtriplet(crc, next, -122);
/* fallthrough */
case 121:
CRCtriplet(crc, next, -121);
/* fallthrough */
case 120:
CRCtriplet(crc, next, -120);
/* fallthrough */
case 119:
CRCtriplet(crc, next, -119);
/* fallthrough */
case 118:
CRCtriplet(crc, next, -118);
/* fallthrough */
case 117:
CRCtriplet(crc, next, -117);
/* fallthrough */
case 116:
CRCtriplet(crc, next, -116);
/* fallthrough */
case 115:
CRCtriplet(crc, next, -115);
/* fallthrough */
case 114:
CRCtriplet(crc, next, -114);
/* fallthrough */
case 113:
CRCtriplet(crc, next, -113);
/* fallthrough */
case 112:
CRCtriplet(crc, next, -112);
/* fallthrough */
case 111:
CRCtriplet(crc, next, -111);
/* fallthrough */
case 110:
CRCtriplet(crc, next, -110);
/* fallthrough */
case 109:
CRCtriplet(crc, next, -109);
/* fallthrough */
case 108:
CRCtriplet(crc, next, -108);
/* fallthrough */
case 107:
CRCtriplet(crc, next, -107);
/* fallthrough */
case 106:
CRCtriplet(crc, next, -106);
/* fallthrough */
case 105:
CRCtriplet(crc, next, -105);
/* fallthrough */
case 104:
CRCtriplet(crc, next, -104);
/* fallthrough */
case 103:
CRCtriplet(crc, next, -103);
/* fallthrough */
case 102:
CRCtriplet(crc, next, -102);
/* fallthrough */
case 101:
CRCtriplet(crc, next, -101);
/* fallthrough */
case 100:
CRCtriplet(crc, next, -100);
/* fallthrough */
case 99:
CRCtriplet(crc, next, -99);
/* fallthrough */
case 98:
CRCtriplet(crc, next, -98);
/* fallthrough */
case 97:
CRCtriplet(crc, next, -97);
/* fallthrough */
case 96:
CRCtriplet(crc, next, -96);
/* fallthrough */
case 95:
CRCtriplet(crc, next, -95);
/* fallthrough */
case 94:
CRCtriplet(crc, next, -94);
/* fallthrough */
case 93:
CRCtriplet(crc, next, -93);
/* fallthrough */
case 92:
CRCtriplet(crc, next, -92);
/* fallthrough */
case 91:
CRCtriplet(crc, next, -91);
/* fallthrough */
case 90:
CRCtriplet(crc, next, -90);
/* fallthrough */
case 89:
CRCtriplet(crc, next, -89);
/* fallthrough */
case 88:
CRCtriplet(crc, next, -88);
/* fallthrough */
case 87:
CRCtriplet(crc, next, -87);
/* fallthrough */
case 86:
CRCtriplet(crc, next, -86);
/* fallthrough */
case 85:
CRCtriplet(crc, next, -85);
/* fallthrough */
case 84:
CRCtriplet(crc, next, -84);
/* fallthrough */
case 83:
CRCtriplet(crc, next, -83);
/* fallthrough */
case 82:
CRCtriplet(crc, next, -82);
/* fallthrough */
case 81:
CRCtriplet(crc, next, -81);
/* fallthrough */
case 80:
CRCtriplet(crc, next, -80);
/* fallthrough */
case 79:
CRCtriplet(crc, next, -79);
/* fallthrough */
case 78:
CRCtriplet(crc, next, -78);
/* fallthrough */
case 77:
CRCtriplet(crc, next, -77);
/* fallthrough */
case 76:
CRCtriplet(crc, next, -76);
/* fallthrough */
case 75:
CRCtriplet(crc, next, -75);
/* fallthrough */
case 74:
CRCtriplet(crc, next, -74);
/* fallthrough */
case 73:
CRCtriplet(crc, next, -73);
/* fallthrough */
case 72:
CRCtriplet(crc, next, -72);
/* fallthrough */
case 71:
CRCtriplet(crc, next, -71);
/* fallthrough */
case 70:
CRCtriplet(crc, next, -70);
/* fallthrough */
case 69:
CRCtriplet(crc, next, -69);
/* fallthrough */
case 68:
CRCtriplet(crc, next, -68);
/* fallthrough */
case 67:
CRCtriplet(crc, next, -67);
/* fallthrough */
case 66:
CRCtriplet(crc, next, -66);
/* fallthrough */
case 65:
CRCtriplet(crc, next, -65);
/* fallthrough */
case 64:
CRCtriplet(crc, next, -64);
/* fallthrough */
case 63:
CRCtriplet(crc, next, -63);
/* fallthrough */
case 62:
CRCtriplet(crc, next, -62);
/* fallthrough */
case 61:
CRCtriplet(crc, next, -61);
/* fallthrough */
case 60:
CRCtriplet(crc, next, -60);
/* fallthrough */
case 59:
CRCtriplet(crc, next, -59);
/* fallthrough */
case 58:
CRCtriplet(crc, next, -58);
/* fallthrough */
case 57:
CRCtriplet(crc, next, -57);
/* fallthrough */
case 56:
CRCtriplet(crc, next, -56);
/* fallthrough */
case 55:
CRCtriplet(crc, next, -55);
/* fallthrough */
case 54:
CRCtriplet(crc, next, -54);
/* fallthrough */
case 53:
CRCtriplet(crc, next, -53);
/* fallthrough */
case 52:
CRCtriplet(crc, next, -52);
/* fallthrough */
case 51:
CRCtriplet(crc, next, -51);
/* fallthrough */
case 50:
CRCtriplet(crc, next, -50);
/* fallthrough */
case 49:
CRCtriplet(crc, next, -49);
/* fallthrough */
case 48:
CRCtriplet(crc, next, -48);
/* fallthrough */
case 47:
CRCtriplet(crc, next, -47);
/* fallthrough */
case 46:
CRCtriplet(crc, next, -46);
/* fallthrough */
case 45:
CRCtriplet(crc, next, -45);
/* fallthrough */
case 44:
CRCtriplet(crc, next, -44);
/* fallthrough */
case 43:
CRCtriplet(crc, next, -43);
/* fallthrough */
case 42:
CRCtriplet(crc, next, -42);
/* fallthrough */
case 41:
CRCtriplet(crc, next, -41);
/* fallthrough */
case 40:
CRCtriplet(crc, next, -40);
/* fallthrough */
case 39:
CRCtriplet(crc, next, -39);
/* fallthrough */
case 38:
CRCtriplet(crc, next, -38);
/* fallthrough */
case 37:
CRCtriplet(crc, next, -37);
/* fallthrough */
case 36:
CRCtriplet(crc, next, -36);
/* fallthrough */
case 35:
CRCtriplet(crc, next, -35);
/* fallthrough */
case 34:
CRCtriplet(crc, next, -34);
/* fallthrough */
case 33:
CRCtriplet(crc, next, -33);
/* fallthrough */
case 32:
CRCtriplet(crc, next, -32);
/* fallthrough */
case 31:
CRCtriplet(crc, next, -31);
/* fallthrough */
case 30:
CRCtriplet(crc, next, -30);
/* fallthrough */
case 29:
CRCtriplet(crc, next, -29);
/* fallthrough */
case 28:
CRCtriplet(crc, next, -28);
/* fallthrough */
case 27:
CRCtriplet(crc, next, -27);
/* fallthrough */
case 26:
CRCtriplet(crc, next, -26);
/* fallthrough */
case 25:
CRCtriplet(crc, next, -25);
/* fallthrough */
case 24:
CRCtriplet(crc, next, -24);
/* fallthrough */
case 23:
CRCtriplet(crc, next, -23);
/* fallthrough */
case 22:
CRCtriplet(crc, next, -22);
/* fallthrough */
case 21:
CRCtriplet(crc, next, -21);
/* fallthrough */
case 20:
CRCtriplet(crc, next, -20);
/* fallthrough */
case 19:
CRCtriplet(crc, next, -19);
/* fallthrough */
case 18:
CRCtriplet(crc, next, -18);
/* fallthrough */
case 17:
CRCtriplet(crc, next, -17);
/* fallthrough */
case 16:
CRCtriplet(crc, next, -16);
/* fallthrough */
case 15:
CRCtriplet(crc, next, -15);
/* fallthrough */
case 14:
CRCtriplet(crc, next, -14);
/* fallthrough */
case 13:
CRCtriplet(crc, next, -13);
/* fallthrough */
case 12:
CRCtriplet(crc, next, -12);
/* fallthrough */
case 11:
CRCtriplet(crc, next, -11);
/* fallthrough */
case 10:
CRCtriplet(crc, next, -10);
/* fallthrough */
case 9:
CRCtriplet(crc, next, -9);
/* fallthrough */
case 8:
CRCtriplet(crc, next, -8);
/* fallthrough */
case 7:
CRCtriplet(crc, next, -7);
/* fallthrough */
case 6:
CRCtriplet(crc, next, -6);
/* fallthrough */
case 5:
CRCtriplet(crc, next, -5);
/* fallthrough */
case 4:
CRCtriplet(crc, next, -4);
/* fallthrough */
case 3:
CRCtriplet(crc, next, -3);
/* fallthrough */
case 2:
CRCtriplet(crc, next, -2);
/* fallthrough */
case 1:
CRCduplet(crc, next, -1); // the final triplet is actually only 2
//{ CombineCRC(); }
crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
if (--n > 0) {
crc1 = crc2 = 0;
block_size = 128;
// points to the first byte of the next block
next0 = next2 + 128;
next1 = next0 + 128; // from here on all blocks are 128 long
next2 = next1 + 128;
}
/* fallthrough */
case 0:;
} while (n > 0);
}
next = (const unsigned char*)next2;
}
uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets
len = len & 7;
next += (count2 * 8);
switch (count2) {
case 27:
CRCsinglet(crc0, next, -27 * 8);
/* fallthrough */
case 26:
CRCsinglet(crc0, next, -26 * 8);
/* fallthrough */
case 25:
CRCsinglet(crc0, next, -25 * 8);
/* fallthrough */
case 24:
CRCsinglet(crc0, next, -24 * 8);
/* fallthrough */
case 23:
CRCsinglet(crc0, next, -23 * 8);
/* fallthrough */
case 22:
CRCsinglet(crc0, next, -22 * 8);
/* fallthrough */
case 21:
CRCsinglet(crc0, next, -21 * 8);
/* fallthrough */
case 20:
CRCsinglet(crc0, next, -20 * 8);
/* fallthrough */
case 19:
CRCsinglet(crc0, next, -19 * 8);
/* fallthrough */
case 18:
CRCsinglet(crc0, next, -18 * 8);
/* fallthrough */
case 17:
CRCsinglet(crc0, next, -17 * 8);
/* fallthrough */
case 16:
CRCsinglet(crc0, next, -16 * 8);
/* fallthrough */
case 15:
CRCsinglet(crc0, next, -15 * 8);
/* fallthrough */
case 14:
CRCsinglet(crc0, next, -14 * 8);
/* fallthrough */
case 13:
CRCsinglet(crc0, next, -13 * 8);
/* fallthrough */
case 12:
CRCsinglet(crc0, next, -12 * 8);
/* fallthrough */
case 11:
CRCsinglet(crc0, next, -11 * 8);
/* fallthrough */
case 10:
CRCsinglet(crc0, next, -10 * 8);
/* fallthrough */
case 9:
CRCsinglet(crc0, next, -9 * 8);
/* fallthrough */
case 8:
CRCsinglet(crc0, next, -8 * 8);
/* fallthrough */
case 7:
CRCsinglet(crc0, next, -7 * 8);
/* fallthrough */
case 6:
CRCsinglet(crc0, next, -6 * 8);
/* fallthrough */
case 5:
CRCsinglet(crc0, next, -5 * 8);
/* fallthrough */
case 4:
CRCsinglet(crc0, next, -4 * 8);
/* fallthrough */
case 3:
CRCsinglet(crc0, next, -3 * 8);
/* fallthrough */
case 2:
CRCsinglet(crc0, next, -2 * 8);
/* fallthrough */
case 1:
CRCsinglet(crc0, next, -1 * 8);
/* fallthrough */
case 0:;
}
}
{
align_to_8(len, crc0, next);
return (uint32_t)crc0 ^ 0xffffffffu;
}
static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size)
{
return ChosenExtend(crc, buf, size);
}
#else
#define NO_THREEWAY_CRC32C
#endif //HAVE_SSE42 && HAVE_PCLMUL
static inline Function Choose_Extend() {
#ifdef HAVE_POWER8
return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
extern "C" const char *my_crc32c_implementation()
{
#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
if (ChosenExtend == ExtendPPCImpl)
return "Using POWER8 crc32 instructions";
#elif defined(HAVE_ARMV8_CRC)
if(crc32c_aarch64_available()) {
return ExtendARMImpl;
} else {
return ExtendImpl<Slow_CRC32>;
}
#else
if (isSSE42()) {
if (isPCLMULQDQ()) {
#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
return crc32c_3way;
#else
return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself
#endif
}
else { // no runtime PCLMULQDQ support but has SSE42 support
return ExtendImpl<Fast_CRC32>;
}
} // end of isSSE42()
else {
return ExtendImpl<Slow_CRC32>;
}
if (const char *ret= crc32c_aarch64_available())
return ret;
#elif HAVE_SSE42
# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8
if (ChosenExtend == crc32c_3way)
return "Using crc32 + pclmulqdq instructions";
# endif
if (ChosenExtend == crc32c_sse42)
return "Using SSE4.2 crc32 instructions";
#endif
}
static const Function ChosenExtend = Choose_Extend();
static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
return ChosenExtend(crc, buf, size);
return "Using generic crc32 instructions";
}
} // namespace crc32c
} // namespace mysys_namespace
extern "C" unsigned int my_crc32c(unsigned int crc, const char *buf, size_t size)
extern "C" unsigned my_crc32c(unsigned int crc, const char *buf, size_t size)
{
return mysys_namespace::crc32c::Extend(crc,buf, size);
}
/* Copyright (c) 2020, 2021, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
/*
* Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
* Ferry Toth
* ftoth@exalondelft.nl
*
* https://github.com/htot/crc32c
*
* Modified by Facebook
*
* Original intel whitepaper:
* "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
*
* This version is from the folly library, created by Dave Watson <davejwatson@fb.com>
*
*/
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>
#include <wmmintrin.h>
// Hardware CRC-32C step macros. Each one feeds 64-bit words into a running
// CRC with _mm_crc32_u64(). Every body is wrapped in do { } while (0) so that
// an invocation followed by ';' is exactly one statement in any context
// (for example as the sole body of an unbraced if/else).

// Advance three interleaved streams by one word each. `crc` and `buf` are
// name prefixes: crc##0..crc##2 are updated from the words at
// buf##0..buf##2 plus `offset`, where `offset` counts elements of those
// pointers.
#define CRCtriplet(crc, buf, offset) \
  do { \
    crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + (offset))); \
    crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + (offset))); \
    crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + (offset))); \
  } while (0)

// Same as CRCtriplet, but only streams 0 and 1 are advanced.
#define CRCduplet(crc, buf, offset) \
  do { \
    crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + (offset))); \
    crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + (offset))); \
  } while (0)

// Advance a single stream: `crc` is updated from the 64-bit word found at
// `buf + offset`. Here `offset` counts elements of `buf`'s own pointee type;
// the resulting address is reinterpreted as a 64-bit word as-is.
#define CRCsinglet(crc, buf, offset) \
  do { \
    (crc) = _mm_crc32_u64((crc), *(const uint64_t*)((buf) + (offset))); \
  } while (0)
// Numbers taken directly from the Intel whitepaper.
//
// Layout: 256 64-bit values, consumed by CombineCRC() as 128 consecutive
// 16-byte entries; entry [block_size - 1] is the multiplier pair for a block
// of block_size words per stream (block_size is 1..128 in crc32c_3way()).
// Each row below therefore holds two entries. Within an entry the first value
// is the one CombineCRC() multiplies with crc0 and the second the one it
// multiplies with crc1.
// The table is 16-byte aligned because CombineCRC() dereferences it through a
// __m128i pointer.
// clang-format off
static const uint64_t clmul_constants alignas(16) [] = {
0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
};
// Compute the crc32c value for a buffer smaller than 8 bytes.
//
// Only the low three bits of `len` are examined: a 4-byte, a 2-byte and a
// 1-byte chunk are consumed, in that order, for each bit that is set.
//
// len:  byte count; only (len & 7) is used
// crc0: running CRC; its low 32 bits are the input state, and it is
//       overwritten with the zero-extended 32-bit result
// next: read cursor; advanced past the bytes that were consumed
//
// The multi-byte loads go through memcpy() rather than a pointer cast:
// `next` carries no alignment guarantee (this function is what callers use
// to *reach* an 8-byte boundary), so dereferencing it as uint32_t/uint16_t
// would be a misaligned, type-punned access.
static inline void align_to_8(
    size_t len,
    uint64_t& crc0, // crc so far, updated on return
    const unsigned char*& next) { // next data pointer, updated on return
  uint32_t crc32bit = static_cast<uint32_t>(crc0);
  if (len & 0x04) {
    uint32_t word;
    memcpy(&word, next, sizeof word);
    crc32bit = _mm_crc32_u32(crc32bit, word);
    next += sizeof word;
  }
  if (len & 0x02) {
    uint16_t half;
    memcpy(&half, next, sizeof half);
    crc32bit = _mm_crc32_u16(crc32bit, half);
    next += sizeof half;
  }
  if (len & 0x01) {
    crc32bit = _mm_crc32_u8(crc32bit, *next);
    next++;
  }
  crc0 = crc32bit;
}
//
// CombineCRC merges the three partial CRCs of one block into a single value.
//
// crc0 and crc1 are each carry-less multiplied (pclmulqdq) by one half of the
// 16-byte constant selected by block_size; the two products are XORed with
// the 64-bit word stored immediately before next2, and the result is fed
// through the CRC instruction on top of crc2.
//
// block_size: words per stream in the block just processed; selects entry
//             block_size - 1 of clmul_constants
// next2:      one past the last word of the third stream
// returns:    the combined CRC state
//
static inline uint64_t CombineCRC(
    size_t block_size,
    uint64_t crc0,
    uint64_t crc1,
    uint64_t crc2,
    const uint64_t* next2) {
  const __m128i* const table =
      reinterpret_cast<const __m128i*>(clmul_constants);
  const __m128i factors = table[block_size - 1];

  // imm 0x00: low half of the first operand times low half of `factors`
  const __m128i shifted0 =
      _mm_clmulepi64_si128(_mm_set_epi64x(0, crc0), factors, 0x00);
  // imm 0x10: low half of the first operand times high half of `factors`
  const __m128i shifted1 =
      _mm_clmulepi64_si128(_mm_set_epi64x(0, crc1), factors, 0x10);

  const uint64_t folded =
      static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_xor_si128(shifted0, shifted1)));

  // The last word of the third stream was deliberately left unprocessed by
  // the caller; it is mixed in here.
  return _mm_crc32_u64(crc2, folded ^ next2[-1]);
}
// Compute CRC-32C using the Intel hardware instruction.
//
// Extends the CRC-32C value `crc` over the `len` bytes at `buf` and returns
// the new value. The state is inverted with 0xffffffff on entry and again on
// return.
//
// Strategy, by input length:
//  - len > 216: consume the bytes up to the next 8-byte boundary, then process
//    24-byte triplets as three interleaved streams (crc0, crc1, crc2) in
//    blocks of at most 128 words per stream, merging the streams with
//    CombineCRC() at the end of every block;
//  - whatever is left of that, and any input of 8..216 bytes: one 8-byte word
//    at a time through CRCsinglet;
//  - the final (len & 7) bytes: align_to_8().
extern "C"
uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len)
{
const unsigned char* next = (const unsigned char*)buf;
uint64_t count;
uint64_t crc0, crc1, crc2;
crc0 = crc ^ 0xffffffffu;
if (len >= 8) {
// if len > 216 then align and use triplets
if (len > 216) {
{
// Work on the bytes (< 8) before the first 8-byte alignment addr starts
auto align_bytes = (8 - (uintptr_t)next) & 7;
len -= align_bytes;
align_to_8(align_bytes, crc0, next);
}
// Now work on the remaining blocks
count = len / 24; // number of triplets
len %= 24; // bytes remaining
uint64_t n = count >> 7; // #blocks = first block + full blocks
uint64_t block_size = count & 127;
// The first block takes the odd remainder (count mod 128); when there is no
// remainder every block, including the first, is a full 128.
if (block_size == 0) {
block_size = 128;
} else {
n++;
}
// points to the first byte of the next block
const uint64_t* next0 = (uint64_t*)next + block_size;
const uint64_t* next1 = next0 + block_size;
const uint64_t* next2 = next1 + block_size;
// Streams 1 and 2 start from zero; CombineCRC() folds them into crc0.
crc1 = crc2 = 0;
// Use Duff's device, a for() loop inside a switch()
// statement. This needs to execute at least once, round len
// down to nearest triplet multiple
// The switch jumps into the middle of the do/while body for the first
// block; every later iteration re-enters at the top (case 128). Offsets
// are negative because next0/next1/next2 point one past each stream.
switch (block_size) {
case 128:
do {
// jumps here for a full block of len 128
CRCtriplet(crc, next, -128);
/* fallthrough */
case 127:
// jumps here or below for the first block smaller
CRCtriplet(crc, next, -127);
/* fallthrough */
case 126:
CRCtriplet(crc, next, -126); // than 128
/* fallthrough */
case 125:
CRCtriplet(crc, next, -125);
/* fallthrough */
case 124:
CRCtriplet(crc, next, -124);
/* fallthrough */
case 123:
CRCtriplet(crc, next, -123);
/* fallthrough */
case 122:
CRCtriplet(crc, next, -122);
/* fallthrough */
case 121:
CRCtriplet(crc, next, -121);
/* fallthrough */
case 120:
CRCtriplet(crc, next, -120);
/* fallthrough */
case 119:
CRCtriplet(crc, next, -119);
/* fallthrough */
case 118:
CRCtriplet(crc, next, -118);
/* fallthrough */
case 117:
CRCtriplet(crc, next, -117);
/* fallthrough */
case 116:
CRCtriplet(crc, next, -116);
/* fallthrough */
case 115:
CRCtriplet(crc, next, -115);
/* fallthrough */
case 114:
CRCtriplet(crc, next, -114);
/* fallthrough */
case 113:
CRCtriplet(crc, next, -113);
/* fallthrough */
case 112:
CRCtriplet(crc, next, -112);
/* fallthrough */
case 111:
CRCtriplet(crc, next, -111);
/* fallthrough */
case 110:
CRCtriplet(crc, next, -110);
/* fallthrough */
case 109:
CRCtriplet(crc, next, -109);
/* fallthrough */
case 108:
CRCtriplet(crc, next, -108);
/* fallthrough */
case 107:
CRCtriplet(crc, next, -107);
/* fallthrough */
case 106:
CRCtriplet(crc, next, -106);
/* fallthrough */
case 105:
CRCtriplet(crc, next, -105);
/* fallthrough */
case 104:
CRCtriplet(crc, next, -104);
/* fallthrough */
case 103:
CRCtriplet(crc, next, -103);
/* fallthrough */
case 102:
CRCtriplet(crc, next, -102);
/* fallthrough */
case 101:
CRCtriplet(crc, next, -101);
/* fallthrough */
case 100:
CRCtriplet(crc, next, -100);
/* fallthrough */
case 99:
CRCtriplet(crc, next, -99);
/* fallthrough */
case 98:
CRCtriplet(crc, next, -98);
/* fallthrough */
case 97:
CRCtriplet(crc, next, -97);
/* fallthrough */
case 96:
CRCtriplet(crc, next, -96);
/* fallthrough */
case 95:
CRCtriplet(crc, next, -95);
/* fallthrough */
case 94:
CRCtriplet(crc, next, -94);
/* fallthrough */
case 93:
CRCtriplet(crc, next, -93);
/* fallthrough */
case 92:
CRCtriplet(crc, next, -92);
/* fallthrough */
case 91:
CRCtriplet(crc, next, -91);
/* fallthrough */
case 90:
CRCtriplet(crc, next, -90);
/* fallthrough */
case 89:
CRCtriplet(crc, next, -89);
/* fallthrough */
case 88:
CRCtriplet(crc, next, -88);
/* fallthrough */
case 87:
CRCtriplet(crc, next, -87);
/* fallthrough */
case 86:
CRCtriplet(crc, next, -86);
/* fallthrough */
case 85:
CRCtriplet(crc, next, -85);
/* fallthrough */
case 84:
CRCtriplet(crc, next, -84);
/* fallthrough */
case 83:
CRCtriplet(crc, next, -83);
/* fallthrough */
case 82:
CRCtriplet(crc, next, -82);
/* fallthrough */
case 81:
CRCtriplet(crc, next, -81);
/* fallthrough */
case 80:
CRCtriplet(crc, next, -80);
/* fallthrough */
case 79:
CRCtriplet(crc, next, -79);
/* fallthrough */
case 78:
CRCtriplet(crc, next, -78);
/* fallthrough */
case 77:
CRCtriplet(crc, next, -77);
/* fallthrough */
case 76:
CRCtriplet(crc, next, -76);
/* fallthrough */
case 75:
CRCtriplet(crc, next, -75);
/* fallthrough */
case 74:
CRCtriplet(crc, next, -74);
/* fallthrough */
case 73:
CRCtriplet(crc, next, -73);
/* fallthrough */
case 72:
CRCtriplet(crc, next, -72);
/* fallthrough */
case 71:
CRCtriplet(crc, next, -71);
/* fallthrough */
case 70:
CRCtriplet(crc, next, -70);
/* fallthrough */
case 69:
CRCtriplet(crc, next, -69);
/* fallthrough */
case 68:
CRCtriplet(crc, next, -68);
/* fallthrough */
case 67:
CRCtriplet(crc, next, -67);
/* fallthrough */
case 66:
CRCtriplet(crc, next, -66);
/* fallthrough */
case 65:
CRCtriplet(crc, next, -65);
/* fallthrough */
case 64:
CRCtriplet(crc, next, -64);
/* fallthrough */
case 63:
CRCtriplet(crc, next, -63);
/* fallthrough */
case 62:
CRCtriplet(crc, next, -62);
/* fallthrough */
case 61:
CRCtriplet(crc, next, -61);
/* fallthrough */
case 60:
CRCtriplet(crc, next, -60);
/* fallthrough */
case 59:
CRCtriplet(crc, next, -59);
/* fallthrough */
case 58:
CRCtriplet(crc, next, -58);
/* fallthrough */
case 57:
CRCtriplet(crc, next, -57);
/* fallthrough */
case 56:
CRCtriplet(crc, next, -56);
/* fallthrough */
case 55:
CRCtriplet(crc, next, -55);
/* fallthrough */
case 54:
CRCtriplet(crc, next, -54);
/* fallthrough */
case 53:
CRCtriplet(crc, next, -53);
/* fallthrough */
case 52:
CRCtriplet(crc, next, -52);
/* fallthrough */
case 51:
CRCtriplet(crc, next, -51);
/* fallthrough */
case 50:
CRCtriplet(crc, next, -50);
/* fallthrough */
case 49:
CRCtriplet(crc, next, -49);
/* fallthrough */
case 48:
CRCtriplet(crc, next, -48);
/* fallthrough */
case 47:
CRCtriplet(crc, next, -47);
/* fallthrough */
case 46:
CRCtriplet(crc, next, -46);
/* fallthrough */
case 45:
CRCtriplet(crc, next, -45);
/* fallthrough */
case 44:
CRCtriplet(crc, next, -44);
/* fallthrough */
case 43:
CRCtriplet(crc, next, -43);
/* fallthrough */
case 42:
CRCtriplet(crc, next, -42);
/* fallthrough */
case 41:
CRCtriplet(crc, next, -41);
/* fallthrough */
case 40:
CRCtriplet(crc, next, -40);
/* fallthrough */
case 39:
CRCtriplet(crc, next, -39);
/* fallthrough */
case 38:
CRCtriplet(crc, next, -38);
/* fallthrough */
case 37:
CRCtriplet(crc, next, -37);
/* fallthrough */
case 36:
CRCtriplet(crc, next, -36);
/* fallthrough */
case 35:
CRCtriplet(crc, next, -35);
/* fallthrough */
case 34:
CRCtriplet(crc, next, -34);
/* fallthrough */
case 33:
CRCtriplet(crc, next, -33);
/* fallthrough */
case 32:
CRCtriplet(crc, next, -32);
/* fallthrough */
case 31:
CRCtriplet(crc, next, -31);
/* fallthrough */
case 30:
CRCtriplet(crc, next, -30);
/* fallthrough */
case 29:
CRCtriplet(crc, next, -29);
/* fallthrough */
case 28:
CRCtriplet(crc, next, -28);
/* fallthrough */
case 27:
CRCtriplet(crc, next, -27);
/* fallthrough */
case 26:
CRCtriplet(crc, next, -26);
/* fallthrough */
case 25:
CRCtriplet(crc, next, -25);
/* fallthrough */
case 24:
CRCtriplet(crc, next, -24);
/* fallthrough */
case 23:
CRCtriplet(crc, next, -23);
/* fallthrough */
case 22:
CRCtriplet(crc, next, -22);
/* fallthrough */
case 21:
CRCtriplet(crc, next, -21);
/* fallthrough */
case 20:
CRCtriplet(crc, next, -20);
/* fallthrough */
case 19:
CRCtriplet(crc, next, -19);
/* fallthrough */
case 18:
CRCtriplet(crc, next, -18);
/* fallthrough */
case 17:
CRCtriplet(crc, next, -17);
/* fallthrough */
case 16:
CRCtriplet(crc, next, -16);
/* fallthrough */
case 15:
CRCtriplet(crc, next, -15);
/* fallthrough */
case 14:
CRCtriplet(crc, next, -14);
/* fallthrough */
case 13:
CRCtriplet(crc, next, -13);
/* fallthrough */
case 12:
CRCtriplet(crc, next, -12);
/* fallthrough */
case 11:
CRCtriplet(crc, next, -11);
/* fallthrough */
case 10:
CRCtriplet(crc, next, -10);
/* fallthrough */
case 9:
CRCtriplet(crc, next, -9);
/* fallthrough */
case 8:
CRCtriplet(crc, next, -8);
/* fallthrough */
case 7:
CRCtriplet(crc, next, -7);
/* fallthrough */
case 6:
CRCtriplet(crc, next, -6);
/* fallthrough */
case 5:
CRCtriplet(crc, next, -5);
/* fallthrough */
case 4:
CRCtriplet(crc, next, -4);
/* fallthrough */
case 3:
CRCtriplet(crc, next, -3);
/* fallthrough */
case 2:
CRCtriplet(crc, next, -2);
/* fallthrough */
case 1:
CRCduplet(crc, next, -1); // the final triplet is actually only 2
// The last word of the third stream (at next2 - 1) is consumed inside
// CombineCRC(), which XORs it with the folded crc0/crc1 and feeds the result
// through the CRC instruction on top of crc2.
//{ CombineCRC(); }
crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
if (--n > 0) {
crc1 = crc2 = 0;
block_size = 128;
// points to the first byte of the next block
next0 = next2 + 128;
next1 = next0 + 128; // from here on all blocks are 128 long
next2 = next1 + 128;
}
/* fallthrough */
case 0:;
} while (n > 0);
}
// next2 is one past the last word consumed by the triplet loop.
next = (const unsigned char*)next2;
}
uint64_t count2 = len >> 3; // 216 or fewer bytes is 27 or fewer singlets
len = len & 7;
// Point one past the last whole word so the singlets can use negative offsets.
next += (count2 * 8);
switch (count2) {
case 27:
CRCsinglet(crc0, next, -27 * 8);
/* fallthrough */
case 26:
CRCsinglet(crc0, next, -26 * 8);
/* fallthrough */
case 25:
CRCsinglet(crc0, next, -25 * 8);
/* fallthrough */
case 24:
CRCsinglet(crc0, next, -24 * 8);
/* fallthrough */
case 23:
CRCsinglet(crc0, next, -23 * 8);
/* fallthrough */
case 22:
CRCsinglet(crc0, next, -22 * 8);
/* fallthrough */
case 21:
CRCsinglet(crc0, next, -21 * 8);
/* fallthrough */
case 20:
CRCsinglet(crc0, next, -20 * 8);
/* fallthrough */
case 19:
CRCsinglet(crc0, next, -19 * 8);
/* fallthrough */
case 18:
CRCsinglet(crc0, next, -18 * 8);
/* fallthrough */
case 17:
CRCsinglet(crc0, next, -17 * 8);
/* fallthrough */
case 16:
CRCsinglet(crc0, next, -16 * 8);
/* fallthrough */
case 15:
CRCsinglet(crc0, next, -15 * 8);
/* fallthrough */
case 14:
CRCsinglet(crc0, next, -14 * 8);
/* fallthrough */
case 13:
CRCsinglet(crc0, next, -13 * 8);
/* fallthrough */
case 12:
CRCsinglet(crc0, next, -12 * 8);
/* fallthrough */
case 11:
CRCsinglet(crc0, next, -11 * 8);
/* fallthrough */
case 10:
CRCsinglet(crc0, next, -10 * 8);
/* fallthrough */
case 9:
CRCsinglet(crc0, next, -9 * 8);
/* fallthrough */
case 8:
CRCsinglet(crc0, next, -8 * 8);
/* fallthrough */
case 7:
CRCsinglet(crc0, next, -7 * 8);
/* fallthrough */
case 6:
CRCsinglet(crc0, next, -6 * 8);
/* fallthrough */
case 5:
CRCsinglet(crc0, next, -5 * 8);
/* fallthrough */
case 4:
CRCsinglet(crc0, next, -4 * 8);
/* fallthrough */
case 3:
CRCsinglet(crc0, next, -3 * 8);
/* fallthrough */
case 2:
CRCsinglet(crc0, next, -2 * 8);
/* fallthrough */
case 1:
CRCsinglet(crc0, next, -1 * 8);
/* fallthrough */
case 0:;
}
}
{
// Fewer than 8 bytes remain (or the whole input was shorter than 8).
align_to_8(len, crc0, next);
return (uint32_t)crc0 ^ 0xffffffffu;
}
}
/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
/* Copyright (c) 2020, 2021, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -39,25 +39,23 @@ typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);
static my_crc32_t init_crc32()
{
my_crc32_t func= my_crc32_zlib;
#ifdef HAVE_PCLMUL
if (crc32_pclmul_enabled())
func = crc32_pclmul;
return crc32_pclmul;
#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
if (crc32_aarch64_available())
func= crc32_aarch64;
return crc32_aarch64;
#endif
return func;
return my_crc32_zlib;
}
static const my_crc32_t my_checksum_func= init_crc32();
#ifndef __powerpc64__
/* For powerpc, my_checksum is defined elsewhere.*/
extern "C" unsigned int my_checksum(unsigned int crc, const void *data, size_t len)
#ifdef __powerpc64__
# error "my_checksum() is defined in mysys/crc32/crc32_ppc64.c"
#endif
extern "C"
unsigned int my_checksum(unsigned int crc, const void *data, size_t len)
{
return my_checksum_func(crc, data, len);
}
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment