Commit 611cd6b9 authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-33817 preparation: Restructuring and unit tests

In our unit test, let us rely on our own reference
implementation using the reflected
CRC-32 ISO 3309 and CRC-32C polynomials. Let us also
test with various lengths.

Let us refactor the CRC-32 and CRC-32C implementations
so that no special compilation flags will be needed and
that some function call indirection will be avoided.

pmull_supported: Remove. We will have pointers to two separate
functions crc32c_aarch64_pmull() and crc32c_aarch64().
parent b84d335d
......@@ -60,29 +60,21 @@ IF (WIN32)
ENDIF()
IF(MSVC)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c crc32/crc32c_x86.cc)
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc)
ENDIF()
ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL)
IF(CLANG_CL)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
MY_CHECK_CXX_COMPILER_FLAG(-msse4.2)
MY_CHECK_CXX_COMPILER_FLAG(-mpclmul)
CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H)
IF(have_CXX__msse4.2 AND HAVE_CPUID_H)
ADD_DEFINITIONS(-DHAVE_SSE42)
IF (have_CXX__mpclmul AND HAVE_X86INTRIN_H)
ADD_DEFINITIONS(-DHAVE_PCLMUL)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32c_amd64.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c crc32/crc32c_x86.cc)
IF(CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS "5")
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES
COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc)
IF(CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS "5")
SET_SOURCE_FILES_PROPERTIES(crc32/crc32c_amd64.cc PROPERTIES
COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
......
#include <my_global.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
static int pmull_supported;
typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t);
#if defined(HAVE_ARMV8_CRC)
#ifdef HAVE_ARMV8_CRC
#if defined(__APPLE__)
#include <sys/sysctl.h>
# ifdef HAVE_ARMV8_CRYPTO
static unsigned crc32c_aarch64_pmull(unsigned, const void *, size_t);
# endif
# ifdef __APPLE__
# include <sys/sysctl.h>
int crc32_aarch64_available(void)
{
......@@ -18,17 +23,17 @@ int crc32_aarch64_available(void)
return ret;
}
const char *crc32c_aarch64_available(void)
my_crc32_t crc32c_aarch64_available(void)
{
if (crc32_aarch64_available() == 0)
return NULL;
pmull_supported = 1;
return "Using ARMv8 crc32 + pmull instructions";
# ifdef HAVE_ARMV8_CRYPTO
if (crc32_aarch64_available())
return crc32c_aarch64_pmull;
# endif
return NULL;
}
#else
#include <sys/auxv.h>
#if defined(__FreeBSD__)
# else
# include <sys/auxv.h>
# ifdef __FreeBSD__
static unsigned long getauxval(unsigned int key)
{
unsigned long val;
......@@ -36,17 +41,17 @@ static unsigned long getauxval(unsigned int key)
return 0ul;
return val;
}
#else
# include <asm/hwcap.h>
#endif
# else
# include <asm/hwcap.h>
# endif
#ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)
#endif
# ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)
# endif
#ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)
#endif
# ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)
# endif
/* ARM made crc32 default from ARMv8.1 but optional in ARMv8A
* Runtime check API.
......@@ -56,22 +61,37 @@ int crc32_aarch64_available(void)
unsigned long auxv= getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
}
# endif
# ifndef __APPLE__
static unsigned crc32c_aarch64(unsigned, const void *, size_t);
const char *crc32c_aarch64_available(void)
my_crc32_t crc32c_aarch64_available(void)
{
unsigned long auxv= getauxval(AT_HWCAP);
if (!(auxv & HWCAP_CRC32))
return NULL;
# ifdef HAVE_ARMV8_CRYPTO
/* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). */
if (auxv & HWCAP_PMULL)
return crc32c_aarch64_pmull;
# endif
return crc32c_aarch64;
}
# endif
pmull_supported= (auxv & HWCAP_PMULL) != 0;
if (pmull_supported)
const char *crc32c_aarch64_impl(my_crc32_t c)
{
# ifdef HAVE_ARMV8_CRYPTO
if (c == crc32c_aarch64_pmull)
return "Using ARMv8 crc32 + pmull instructions";
else
# endif
# ifndef __APPLE__
if (c == crc32c_aarch64)
return "Using ARMv8 crc32 instructions";
# endif
return NULL;
}
#endif /* __APPLE__ */
#endif /* HAVE_ARMV8_CRC */
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
......@@ -157,131 +177,14 @@ asm(".arch_extension crypto");
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
#ifndef __APPLE__
static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len)
{
uint32_t crc0, crc1, crc2;
int64_t length= (int64_t)len;
const unsigned char *buffer= buf;
crc^= 0xffffffff;
/* Pmull runtime check here.
* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
*
* Consider the condition that the target platform does support hardware crc32
* but not support PMULL. In this condition, it should leverage the aarch64
* crc32 instruction (__crc32c) and just only skip parallel computation (pmull/vmull)
* rather than skip all hardware crc32 instruction of computation.
*/
if (pmull_supported)
{
/* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */
#ifdef HAVE_ARMV8_CRYPTO
/* Crypto extension Support
* Parallel computation with 1024 Bytes (per block)
* Intrinsics Support
*/
# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1= 0xe417f38a, k2= 0x8f158014;
uint64_t t0, t1;
/* Process per block size of 1024 Bytes
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
*/
while ((length-= 1024) >= 0)
{
/* Prefetch 3*1024 data for avoiding L2 cache miss */
PREF1KL2(buffer, 1024*3);
/* Do first 8 bytes here for better pipelining */
crc0= __crc32cd(crc, *(const uint64_t *)buffer);
crc1= 0;
crc2= 0;
buffer+= sizeof(uint64_t);
/* Process block inline
* Process crc0 last to avoid dependency with above
*/
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer+= 42*3*sizeof(uint64_t);
/* Prefetch data for following block to avoid L1 cache miss */
PREF1KL1(buffer, 1024);
/* Last 8 bytes
* Merge crc0 and crc1 into crc2
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1= (uint64_t)vmull_p64(crc1, k2);
t0= (uint64_t)vmull_p64(crc0, k1);
crc= __crc32cd(crc2, *(const uint64_t *)buffer);
crc1= __crc32cd(0, t1);
crc^= crc1;
crc0= __crc32cd(0, t0);
crc^= crc0;
buffer+= sizeof(uint64_t);
}
# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/*No intrinsics*/
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
"mov x16, #0x8014 \n\t"
"movk x16, #0x8f15, lsl 16 \n\t"
"mov v0.2d[0], x16 \n\t"
:::"x16");
while ((length-= 1024) >= 0)
{
PREF1KL2(buffer, 1024*3);
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
crc1= 0;
crc2= 0;
buffer+= sizeof(uint64_t);
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer+= 42*3*sizeof(uint64_t);
PREF1KL1(buffer, 1024);
__asm__("mov v2.2d[0], %x[c1] \n\t"
"pmull v2.1q, v2.1d, v0.1d \n\t"
"mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"crc32cx %w[c], %w[c2], %x[v] \n\t"
"mov %x[c1], v2.2d[0] \n\t"
"crc32cx %w[c1], wzr, %x[c1] \n\t"
"eor %w[c], %w[c], %w[c1] \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c0], wzr, %x[c0] \n\t"
"eor %w[c], %w[c], %w[c0] \n\t"
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
:[v]"r"(*((const uint64_t *)buffer)));
buffer+= sizeof(uint64_t);
}
# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/* Done if Input data size is aligned with 1024 */
if (!(length+= 1024))
return ~crc;
#endif /* HAVE_ARMV8_CRYPTO */
} // end if pmull_supported
while ((length-= sizeof(uint64_t)) >= 0)
{
CRC32CX(crc, *(uint64_t *)buffer);
......@@ -306,6 +209,143 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
return ~crc;
}
#endif
#ifdef HAVE_ARMV8_CRYPTO
/**
  Compute CRC-32C using ARMv8 crc32 instructions, accelerated by the
  crypto extension's carry-less multiply (pmull) for 1024-byte blocks.

  @param crc  previous CRC value (already finalized, i.e. post-inverted)
  @param buf  data to checksum
  @param len  length of the data, in bytes
  @return updated CRC-32C value
*/
static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
{
  /* signed so that "(length-= 1024) >= 0" can detect exhaustion */
  int64_t length= (int64_t)len;
  const unsigned char *buffer= buf;

  /* standard CRC pre-inversion; undone by "return ~crc" below */
  crc^= 0xffffffff;

  /* Crypto extension Support
   * Parallel computation with 1024 Bytes (per block)
   * Intrinsics Support
   */
# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
  /* Process per block size of 1024 Bytes
   * A block size = 8 + 42*3*sizeof(uint64_t) + 8
   */
  for (const poly64_t k1= 0xe417f38a, k2= 0x8f158014; (length-= 1024) >= 0; )
  {
    uint32_t crc0, crc1, crc2;
    uint64_t t0, t1;
    /* Prefetch 3*1024 data for avoiding L2 cache miss */
    PREF1KL2(buffer, 1024*3);
    /* Do first 8 bytes here for better pipelining */
    crc0= __crc32cd(crc, *(const uint64_t *)buffer);
    crc1= 0;
    crc2= 0;
    buffer+= sizeof(uint64_t);

    /* Process block inline
     * Process crc0 last to avoid dependency with above
     */
    CRC32C7X3X8(buffer, 0);
    CRC32C7X3X8(buffer, 1);
    CRC32C7X3X8(buffer, 2);
    CRC32C7X3X8(buffer, 3);
    CRC32C7X3X8(buffer, 4);
    CRC32C7X3X8(buffer, 5);
    buffer+= 42*3*sizeof(uint64_t);
    /* Prefetch data for following block to avoid L1 cache miss */
    PREF1KL1(buffer, 1024);

    /* Last 8 bytes
     * Merge crc0 and crc1 into crc2
     * crc1 multiply by K2
     * crc0 multiply by K1
     */
    t1= (uint64_t)vmull_p64(crc1, k2);
    t0= (uint64_t)vmull_p64(crc0, k1);
    crc= __crc32cd(crc2, *(const uint64_t *)buffer);
    crc1= __crc32cd(0, t1);
    crc^= crc1;
    crc0= __crc32cd(0, t0);
    crc^= crc0;
    buffer+= sizeof(uint64_t);
  }
# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
  /* No intrinsics: load the folding constants k1, k2 into v1, v0 */
  __asm__("mov x16, #0xf38a \n\t"
          "movk x16, #0xe417, lsl 16 \n\t"
          "mov v1.2d[0], x16 \n\t"
          "mov x16, #0x8014 \n\t"
          "movk x16, #0x8f15, lsl 16 \n\t"
          "mov v0.2d[0], x16 \n\t"
          :::"x16");

  while ((length-= 1024) >= 0)
  {
    uint32_t crc0, crc1, crc2;
    PREF1KL2(buffer, 1024*3);
    __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
            :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
    crc1= 0;
    crc2= 0;
    buffer+= sizeof(uint64_t);
    CRC32C7X3X8(buffer, 0);
    CRC32C7X3X8(buffer, 1);
    CRC32C7X3X8(buffer, 2);
    CRC32C7X3X8(buffer, 3);
    CRC32C7X3X8(buffer, 4);
    CRC32C7X3X8(buffer, 5);
    buffer+= 42*3*sizeof(uint64_t);
    PREF1KL1(buffer, 1024);
    /* Fold crc1 (by k2) and crc0 (by k1) and merge them into crc via crc2 */
    __asm__("mov v2.2d[0], %x[c1] \n\t"
            "pmull v2.1q, v2.1d, v0.1d \n\t"
            "mov v3.2d[0], %x[c0] \n\t"
            "pmull v3.1q, v3.1d, v1.1d \n\t"
            "crc32cx %w[c], %w[c2], %x[v] \n\t"
            "mov %x[c1], v2.2d[0] \n\t"
            "crc32cx %w[c1], wzr, %x[c1] \n\t"
            "eor %w[c], %w[c], %w[c1] \n\t"
            "mov %x[c0], v3.2d[0] \n\t"
            "crc32cx %w[c0], wzr, %x[c0] \n\t"
            "eor %w[c], %w[c], %w[c0] \n\t"
            :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
            :[v]"r"(*((const uint64_t *)buffer)));
    buffer+= sizeof(uint64_t);
  }
# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
  /* Done if Input data size is aligned with 1024 */
  length+= 1024;
  if (length)
  {
    while ((length-= sizeof(uint64_t)) >= 0)
    {
      CRC32CX(crc, *(uint64_t *)buffer);
      buffer+= sizeof(uint64_t);
    }

    /* The following is more efficient than the straight loop.
     * Note: length is negative here, but after the 8-byte loop its low
     * 3 bits equal the number of remaining tail bytes, so the bit tests
     * against sizeof(uint32_t)/sizeof(uint16_t)/sizeof(uint8_t) are valid.
     */
    if (length & sizeof(uint32_t))
    {
      CRC32CW(crc, *(uint32_t *)buffer);
      buffer+= sizeof(uint32_t);
    }

    if (length & sizeof(uint16_t))
    {
      CRC32CH(crc, *(uint16_t *)buffer);
      buffer+= sizeof(uint16_t);
    }

    if (length & sizeof(uint8_t))
      CRC32CB(crc, *buffer);
  }
  return ~crc;
}
#endif /* HAVE_ARMV8_CRYPTO */
/* There are multiple approaches to calculate crc.
Approach-1: Process 8 bytes then 4 bytes then 2 bytes and then 1 bytes
......
......@@ -56,11 +56,16 @@
#include <stddef.h>
#ifdef __GNUC__
#include <x86intrin.h>
# include <emmintrin.h>
# include <smmintrin.h>
# include <tmmintrin.h>
# include <wmmintrin.h>
# define USE_PCLMUL __attribute__((target("sse4.2,pclmul")))
#elif defined(_MSC_VER)
#include <intrin.h>
# include <intrin.h>
# define USE_PCLMUL /* nothing */
#else
#error "unknown compiler"
# error "unknown compiler"
#endif
/**
......@@ -71,6 +76,7 @@
*
* @return \a reg << (\a num * 8)
*/
USE_PCLMUL
static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num)
{
static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= {
......@@ -111,6 +117,7 @@ struct crcr_pclmulqdq_ctx
*
* @return New 16 byte folded data
*/
USE_PCLMUL
static inline __m128i crcr32_folding_round(const __m128i data_block,
const __m128i precomp, const __m128i fold)
{
......@@ -128,6 +135,7 @@ static inline __m128i crcr32_folding_round(const __m128i data_block,
*
* @return data reduced to 64 bits
*/
USE_PCLMUL
static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp)
{
__m128i tmp0, tmp1, tmp2;
......@@ -152,6 +160,7 @@ static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i pre
*
* @return data reduced to 32 bits
*/
USE_PCLMUL
static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp)
{
static const MY_ALIGNED(16) uint32_t mask1[4]= {
......@@ -188,6 +197,7 @@ static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i prec
*
* @return CRC for given \a data block (32 bits wide).
*/
USE_PCLMUL
static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len,
uint32_t crc,
const struct crcr_pclmulqdq_ctx *params)
......
......@@ -19,52 +19,23 @@
#include <stddef.h>
#include <stdint.h>
#include <my_global.h>
#include <my_byteorder.h>
static inline uint32_t DecodeFixed32(const char *ptr)
{
return uint4korr(ptr);
}
#include <stdint.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef HAVE_SSE42
# ifdef __GNUC__
# include <cpuid.h>
# if __GNUC__ < 5 && !defined __clang__
/* the headers do not really work in GCC before version 5 */
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
# else
# include <nmmintrin.h>
# endif
# define USE_SSE42 __attribute__((target("sse4.2")))
# else
# define USE_SSE42 /* nothing */
# endif
#endif
#ifdef __powerpc64__
#include "crc32c_ppc.h"
#if __linux__
#include <sys/auxv.h>
# include "crc32c_ppc.h"
# ifdef __linux__
# include <sys/auxv.h>
#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif
# ifndef PPC_FEATURE2_VEC_CRYPTO
# define PPC_FEATURE2_VEC_CRYPTO 0x02000000
# endif
#ifndef AT_HWCAP2
#define AT_HWCAP2 26
# ifndef AT_HWCAP2
# define AT_HWCAP2 26
# endif
# endif
#endif
#endif /* __linux__ */
#endif
typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t);
namespace mysys_namespace {
namespace crc32c {
......@@ -75,6 +46,7 @@ static int arch_ppc_crc32 = 0;
#endif /* __powerpc64__ */
#endif
alignas(CPU_LEVEL1_DCACHE_LINESIZE)
static const uint32_t table0_[256] = {
0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
......@@ -341,8 +313,9 @@ static const uint32_t table3_[256] = {
};
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
static inline uint32_t LE_LOAD32(const uint8_t *p) {
return DecodeFixed32(reinterpret_cast<const char*>(p));
static inline uint32_t LE_LOAD32(const uint8_t *p)
{
return uint4korr(reinterpret_cast<const char*>(p));
}
static inline void Slow_CRC32(uint64_t* l, uint8_t const **p)
......@@ -362,10 +335,7 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p)
table0_[c >> 24];
}
#ifdef ALIGN
#undef ALIGN
#endif
// Align n to (1 << m) byte boundary
#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
......@@ -374,70 +344,30 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p)
l = table0_[c] ^ (l >> 8); \
} while (0)
static uint32_t crc32c_slow(uint32_t crc, const char* buf, size_t size)
{
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu;
// Point x at first 16-byte aligned byte in string. This might be
// just past the end of the string.
const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
if (x <= e)
// Process bytes until finished or p is 16-byte aligned
while (p != x)
STEP1;
// Process bytes 16 at a time
while ((e-p) >= 16)
{
Slow_CRC32(&l, &p);
Slow_CRC32(&l, &p);
}
// Process bytes 8 at a time
while ((e-p) >= 8)
Slow_CRC32(&l, &p);
// Process the last few bytes
while (p != e)
STEP1;
return static_cast<uint32_t>(l ^ 0xffffffffu);
}
#if defined HAVE_POWER8
#elif defined HAVE_ARMV8_CRC
#elif defined HAVE_SSE42
constexpr uint32_t cpuid_ecx_SSE42= 1U << 20;
constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U<<1;
static uint32_t cpuid_ecx()
{
#ifdef __GNUC__
uint32_t reax= 0, rebx= 0, recx= 0, redx= 0;
__cpuid(1, reax, rebx, recx, redx);
return recx;
#elif defined _MSC_VER
int regs[4];
__cpuid(regs, 1);
return regs[2];
#else
# error "unknown compiler"
#undef USE_SSE42
#if defined _MSC_VER && (defined _M_X64 || defined _M_IX86)
# include <intrin.h>
# include <immintrin.h>
# define USE_SSE42 /* nothing */
#elif defined __GNUC__ && (defined __i386__||defined __x86_64__)
# if __GNUC__ < 5 && !defined __clang_major__
/* the headers do not really work in GCC before version 5 */
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
# else
# include <nmmintrin.h>
# endif
# define USE_SSE42 __attribute__((target("sse4.2")))
#endif
}
extern "C" int crc32_pclmul_enabled(void)
{
return !(~cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL);
}
#if SIZEOF_SIZE_T == 8
extern "C" uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len);
USE_SSE42
#ifdef USE_SSE42
# if SIZEOF_SIZE_T == 8
static inline uint64_t LE_LOAD64(const uint8_t *ptr)
{
return uint8korr(reinterpret_cast<const char*>(ptr));
}
#endif
# endif
USE_SSE42
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p)
......@@ -453,10 +383,11 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const **p)
# endif
}
extern "C"
USE_SSE42
static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size)
unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size)
{
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *p = static_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu;
......@@ -484,107 +415,111 @@ static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size)
}
#endif
typedef uint32_t (*Function)(uint32_t, const char*, size_t);
static unsigned crc32c_slow(unsigned crc, const void* buf, size_t size)
{
const uint8_t *p = static_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint64_t l = crc ^ 0xffffffffu;
#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) {
return crc32c_ppc(crc, (const unsigned char *)buf, size);
// Point x at first 16-byte aligned byte in string. This might be
// just past the end of the string.
const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
if (x <= e)
// Process bytes until finished or p is 16-byte aligned
while (p != x)
STEP1;
// Process bytes 16 at a time
while ((e-p) >= 16)
{
Slow_CRC32(&l, &p);
Slow_CRC32(&l, &p);
}
// Process bytes 8 at a time
while ((e-p) >= 8)
Slow_CRC32(&l, &p);
// Process the last few bytes
while (p != e)
STEP1;
return static_cast<uint32_t>(l ^ 0xffffffffu);
}
#if __linux__
#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
# ifdef __linux__
static int arch_ppc_probe(void) {
arch_ppc_crc32 = 0;
#if defined(__powerpc64__)
# if defined(__powerpc64__)
if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
#endif /* __powerpc64__ */
# endif /* __powerpc64__ */
return arch_ppc_crc32;
}
#elif __FreeBSD_version >= 1200000
#include <machine/cpu.h>
#include <sys/auxv.h>
#include <sys/elf_common.h>
# elif defined __FreeBSD_version && __FreeBSD_version >= 1200000
# include <machine/cpu.h>
# include <sys/auxv.h>
# include <sys/elf_common.h>
static int arch_ppc_probe(void) {
unsigned long cpufeatures;
arch_ppc_crc32 = 0;
#if defined(__powerpc64__)
# if defined(__powerpc64__)
elf_aux_info(AT_HWCAP2, &cpufeatures, sizeof(cpufeatures));
if (cpufeatures & PPC_FEATURE2_HAS_VEC_CRYPTO) arch_ppc_crc32 = 1;
#endif /* __powerpc64__ */
# endif /* __powerpc64__ */
return arch_ppc_crc32;
}
#elif defined(_AIX) || defined(__OpenBSD__)
# elif defined(_AIX) || defined(__OpenBSD__)
static int arch_ppc_probe(void) {
arch_ppc_crc32 = 0;
#if defined(__powerpc64__)
# if defined(__powerpc64__)
// AIX 7.1+/OpenBSD has vector crypto features on all POWER 8+
arch_ppc_crc32 = 1;
#endif /* __powerpc64__ */
# endif /* __powerpc64__ */
return arch_ppc_crc32;
}
#endif // __linux__
# endif
#endif
#if defined(HAVE_ARMV8_CRC)
extern "C" const char *crc32c_aarch64_available(void);
extern "C" uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
return crc32c_aarch64(crc, (const unsigned char *)buf, (size_t) size);
}
extern "C" my_crc32_t crc32c_aarch64_available(void);
extern "C" const char *crc32c_aarch64_impl(my_crc32_t);
#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86
extern "C" my_crc32_t crc32c_x86_available(void);
extern "C" const char *crc32c_x86_impl(my_crc32_t);
#endif
static inline Function Choose_Extend()
static inline my_crc32_t Choose_Extend()
{
#if defined HAVE_POWER8 && defined HAS_ALTIVEC
if (arch_ppc_probe())
return ExtendPPCImpl;
#elif defined(HAVE_ARMV8_CRC)
if (crc32c_aarch64_available())
return ExtendARMImpl;
#elif HAVE_SSE42
# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8
switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) {
case cpuid_ecx_SSE42_AND_PCLMUL:
return crc32c_3way;
case cpuid_ecx_SSE42:
return crc32c_sse42;
}
# else
if (cpuid_ecx() & cpuid_ecx_SSE42)
return crc32c_sse42;
# endif
return crc32c_ppc;
#elif defined HAVE_ARMV8_CRC
if (my_crc32_t crc= crc32c_aarch64_available())
return crc;
#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86
if (my_crc32_t crc= crc32c_x86_available())
return crc;
#endif
return crc32c_slow;
}
static const Function ChosenExtend= Choose_Extend();
static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size)
{
return ChosenExtend(crc, buf, size);
}
static const my_crc32_t ChosenExtend= Choose_Extend();
extern "C" const char *my_crc32c_implementation()
{
#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
if (ChosenExtend == ExtendPPCImpl)
#if defined HAVE_POWER8 && defined HAS_ALTIVEC
if (ChosenExtend == crc32c_ppc)
return "Using POWER8 crc32 instructions";
#elif defined(HAVE_ARMV8_CRC)
if (const char *ret= crc32c_aarch64_available())
#elif defined HAVE_ARMV8_CRC
if (const char *ret= crc32c_aarch64_impl(ChosenExtend))
return ret;
#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86
if (const char *ret= crc32c_x86_impl(ChosenExtend))
return ret;
#elif HAVE_SSE42
# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8
if (ChosenExtend == crc32c_3way)
return "Using crc32 + pclmulqdq instructions";
# endif
if (ChosenExtend == crc32c_sse42)
return "Using SSE4.2 crc32 instructions";
#endif
return "Using generic crc32 instructions";
}
......@@ -593,5 +528,5 @@ extern "C" const char *my_crc32c_implementation()
extern "C" unsigned my_crc32c(unsigned int crc, const char *buf, size_t size)
{
return mysys_namespace::crc32c::Extend(crc,buf, size);
return mysys_namespace::crc32c::ChosenExtend(crc,buf, size);
}
......@@ -47,6 +47,11 @@
#include <nmmintrin.h>
#include <wmmintrin.h>
#ifdef _MSC_VER
# define USE_PCLMUL /* nothing */
#else
# define USE_PCLMUL __attribute__((target("sse4.2,pclmul")))
#endif
#define CRCtriplet(crc, buf, offset) \
crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
......@@ -131,6 +136,7 @@ static const uint64_t clmul_constants alignas(16) [] = {
};
// Compute the crc32c value for buffer smaller than 8
USE_PCLMUL
static inline void align_to_8(
size_t len,
uint64_t& crc0, // crc so far, updated on return
......@@ -155,6 +161,7 @@ static inline void align_to_8(
// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
// chosen constant and xor's these with the remaining CRC.
//
USE_PCLMUL
static inline uint64_t CombineCRC(
size_t block_size,
uint64_t crc0,
......@@ -176,6 +183,7 @@ static inline uint64_t CombineCRC(
// Compute CRC-32C using the Intel hardware instruction.
extern "C"
USE_PCLMUL
uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len)
{
const unsigned char* next = (const unsigned char*)buf;
......
......@@ -11,8 +11,7 @@
extern "C" {
#endif
extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
unsigned len);
extern unsigned crc32c_ppc(unsigned crc, const void *buffer, size_t len);
#ifdef __cplusplus
}
......
#include <my_global.h>
#include <cstddef>
#include <cstdint>
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif
extern "C" unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size);
constexpr uint32_t cpuid_ecx_SSE42= 1U << 20;
constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U << 1;
/**
  Read the CPU feature bits from CPUID leaf 1, register ECX.
  Relevant bits (see cpuid_ecx_SSE42, cpuid_ecx_SSE42_AND_PCLMUL above):
  bit 20 = SSE4.2 (hardware crc32), bit 1 = PCLMULQDQ.
  @return the ECX register contents for CPUID leaf 1
*/
static uint32_t cpuid_ecx()
{
#ifdef __GNUC__
  uint32_t reax= 0, rebx= 0, recx= 0, redx= 0;
  __cpuid(1, reax, rebx, recx, redx);
  return recx;
#elif defined _MSC_VER
  /* MSVC's __cpuid() fills an int[4] with EAX, EBX, ECX, EDX */
  int regs[4];
  __cpuid(regs, 1);
  return regs[2];
#else
# error "unknown compiler"
#endif
}
typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t);
extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t);
extern "C" unsigned int crc32c_3way(unsigned int, const void *, size_t);
/**
  Check whether the pclmul-accelerated CRC-32 can be used.
  @return crc32_pclmul if the CPU supports both SSE4.2 and PCLMULQDQ,
          otherwise nullptr
*/
extern "C" my_crc32_t crc32_pclmul_enabled(void)
{
  const uint32_t need= cpuid_ecx_SSE42_AND_PCLMUL;
  return (cpuid_ecx() & need) == need ? crc32_pclmul : nullptr;
}
/**
  Select the best CRC-32C implementation for this x86 CPU.
  @return crc32c_3way (64-bit builds, SSE4.2+PCLMULQDQ), crc32c_sse42
          (SSE4.2 only), or nullptr when no hardware support exists
*/
extern "C" my_crc32_t crc32c_x86_available(void)
{
#if SIZEOF_SIZE_T == 8
  const uint32_t features= cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL;
  /* Prefer the 3-way interleaved pclmulqdq variant when available */
  if (features == cpuid_ecx_SSE42_AND_PCLMUL)
    return crc32c_3way;
  if (features == cpuid_ecx_SSE42)
    return crc32c_sse42;
#else
  if (cpuid_ecx() & cpuid_ecx_SSE42)
    return crc32c_sse42;
#endif
  return nullptr;
}
/**
  Describe the CRC-32C implementation that was selected.
  @param c  the function pointer returned by crc32c_x86_available()
  @return human-readable description, or nullptr if c is not one of ours
*/
extern "C" const char *crc32c_x86_impl(my_crc32_t c)
{
  if (c == crc32c_sse42)
    return "Using SSE4.2 crc32 instructions";
#if SIZEOF_SIZE_T == 8
  if (c == crc32c_3way)
    return "Using crc32 + pclmulqdq instructions";
#endif
  return nullptr;
}
......@@ -28,7 +28,7 @@
* any later version, or
* b) the Apache License, Version 2.0
*/
#include <stddef.h>
#include <altivec.h>
......@@ -57,12 +57,13 @@ static unsigned int __attribute__ ((aligned (32)))
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p,
unsigned long len)
unsigned CRC32_FUNCTION(unsigned crc, const void *buffer, size_t len)
{
unsigned int prealign;
unsigned int tail;
const unsigned char *p = buffer;
#ifdef CRC_XOR
crc ^= 0xffffffff;
#endif
......
......@@ -26,23 +26,22 @@ static unsigned int my_crc32_zlib(unsigned int crc, const void *data,
return (unsigned int) crc32(crc, (const Bytef *)data, (unsigned int) len);
}
#ifdef HAVE_PCLMUL
extern "C" int crc32_pclmul_enabled();
extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t);
#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);
#if defined _M_IX86 || defined _M_X64 || defined __i386__ || defined __x86_64__
extern "C" my_crc32_t crc32_pclmul_enabled();
#elif defined HAVE_ARMV8_CRC
extern "C" int crc32_aarch64_available();
extern "C" unsigned int crc32_aarch64(unsigned int, const void *, size_t);
#endif
typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);
static my_crc32_t init_crc32()
{
#ifdef HAVE_PCLMUL
if (crc32_pclmul_enabled())
return crc32_pclmul;
#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
#if defined _M_IX86 || defined _M_X64 || defined __i386__ || defined __x86_64__
if (my_crc32_t crc= crc32_pclmul_enabled())
return crc;
#elif defined HAVE_ARMV8_CRC
if (crc32_aarch64_available())
return crc32_aarch64;
#endif
......
/* Copyright (c) MariaDB 2020
/* Copyright (c) MariaDB 2020, 2024
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
......@@ -19,51 +19,127 @@
#include <tap.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>
/*
Check that optimized crc32 (ieee, or ethernet polynomial) returns the same
result as zlib (not so well optimized, yet, but trustworthy)
The following lookup table oriented computation of CRC-32
is based on the Public Domain / Creative Commons CC0 Perl code from
http://billauer.co.il/blog/2011/05/perl-crc32-crc-xs-module/
*/
#define DO_TEST_CRC32(crc,str) \
ok(crc32(crc,(const Bytef *)str,(uint)(sizeof(str)-1)) == my_checksum(crc, str, sizeof(str)-1), "crc32 '%s'",str)
/* Check that CRC32-C calculation returns correct result*/
#define DO_TEST_CRC32C(crc,str,expected) \
do { \
unsigned int v = my_crc32c(crc, str, sizeof(str)-1); \
printf("crc32(%u,'%s',%zu)=%u\n",crc,str,sizeof(str)-1,v); \
ok(expected == my_crc32c(crc, str, sizeof(str)-1),"crc32c '%s'",str); \
}while(0)
/** Lookup tables */
static uint32 tab_3309[256], tab_castagnoli[256];
/** Initialize a lookup table for a CRC-32 polynomial */
/** Build a 256-entry lookup table for a bit-reflected CRC-32 polynomial */
static void init_lookup(uint32 *tab, uint32 polynomial)
{
  unsigned i, bit;
  for (i= 0; i < 256; i++)
  {
    uint32 r= i;
    /* run each possible input octet through 8 steps of the shift register */
    for (bit= 0; bit < 8; bit++)
      r= (r & 1) ? (r >> 1) ^ polynomial : r >> 1;
    tab[i]= r;
  }
}
/** Compute a CRC-32 one octet at a time based on a lookup table */
/** Compute a bit-reflected CRC-32 one octet at a time using a lookup table */
static uint crc_(uint32 crc, const void *buf, size_t len, const uint32 *tab)
{
  const unsigned char *p= buf;
  size_t i;
  crc^= 0xffffffff;   /* standard CRC-32 pre-inversion */
  for (i= 0; i < len; i++)
    crc= ((crc >> 8) & 0xffffff) ^ tab[(crc ^ p[i]) & 0xff];
  return crc ^ 0xffffffff;   /* and post-inversion */
}
/** Reference CRC-32 using the reflected ISO 3309 polynomial table */
static uint crc32(uint32 crc, const void *buf, size_t len)
{
  return crc_(crc, buf, len, tab_3309);
}
/** Reference CRC-32C using the reflected Castagnoli polynomial table */
static uint crc32c(uint32 crc, const void *buf, size_t len)
{
  return crc_(crc, buf, len, tab_castagnoli);
}
#define LONG_STR "1234567890234568900212345678901231213123321212123123123123123"\
"............................................................................." \
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy" \
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
static char buf[16384];
typedef uint (*check)(uint32, const void*, size_t);
/**
  Compare two checksum implementations over prefixes of buf.
  @return 0 if c1 and c2 agree for every length from sizeof(buf) down to 1,
          otherwise the largest length at which they disagree
*/
static size_t test_buf(check c1, check c2)
{
  size_t len= sizeof buf;
  while (len != 0 && c1(0, buf, len) == c2(0, buf, len))
    len--;
  return len;
}
#define DO_TEST_CRC32(crc,str,len) \
ok(crc32(crc,str,len) == my_checksum(crc, str, len), \
"crc32(%u,'%.*s')", crc, (int) len, str)
/* Check that CRC-32C calculation returns correct result*/
#define DO_TEST_CRC32C(crc,str,len) \
ok(crc32c(crc,str,len) == my_crc32c(crc, str, len), \
"crc32c(%u,'%.*s')", crc, (int) len, str)
static const char STR[]=
"123456789012345678900212345678901231213123321212123123123123123"
"..........................................................................."
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz";
int main(int argc __attribute__((unused)),char *argv[])
{
MY_INIT(argv[0]);
plan(14);
init_lookup(tab_3309, 0xedb88320);
init_lookup(tab_castagnoli, 0x82f63b78);
plan(36);
printf("%s\n",my_crc32c_implementation());
DO_TEST_CRC32(0,"");
DO_TEST_CRC32(1,"");
DO_TEST_CRC32(0,"12345");
DO_TEST_CRC32(1,"12345");
DO_TEST_CRC32(0,"1234567890123456789");
DO_TEST_CRC32(0, LONG_STR);
DO_TEST_CRC32(0,STR,0);
DO_TEST_CRC32(1,STR,0);
DO_TEST_CRC32(0,STR,3);
DO_TEST_CRC32(0,STR,5);
DO_TEST_CRC32(1,STR,5);
DO_TEST_CRC32(0,STR,15);
DO_TEST_CRC32(0,STR,16);
DO_TEST_CRC32(0,STR,19);
DO_TEST_CRC32(0,STR,32);
DO_TEST_CRC32(0,STR,63);
DO_TEST_CRC32(0,STR,64);
DO_TEST_CRC32(0,STR,65);
DO_TEST_CRC32(0,STR,255);
DO_TEST_CRC32(0,STR,256);
DO_TEST_CRC32(0,STR,257);
DO_TEST_CRC32(0,STR,(sizeof(STR)-1));
ok(0 == my_checksum(0, NULL, 0) , "crc32 data = NULL, length = 0");
DO_TEST_CRC32C(0,"", 0);
DO_TEST_CRC32C(1,"", 1);
DO_TEST_CRC32C(0, "12345", 416359221);
DO_TEST_CRC32C(1, "12345", 549473433);
DO_TEST_CRC32C(0, "1234567890123456789", 2366987449U);
DO_TEST_CRC32C(0, LONG_STR, 3009234172U);
DO_TEST_CRC32C(0,STR,0);
DO_TEST_CRC32C(1,STR,0);
DO_TEST_CRC32C(0,STR,3);
DO_TEST_CRC32C(0,STR,5);
DO_TEST_CRC32C(1,STR,5);
DO_TEST_CRC32C(0,STR,15);
DO_TEST_CRC32C(0,STR,16);
DO_TEST_CRC32C(0,STR,19);
DO_TEST_CRC32C(0,STR,32);
DO_TEST_CRC32C(0,STR,63);
DO_TEST_CRC32C(0,STR,64);
DO_TEST_CRC32C(0,STR,65);
DO_TEST_CRC32C(0,STR,255);
DO_TEST_CRC32C(0,STR,256);
DO_TEST_CRC32C(0,STR,257);
DO_TEST_CRC32C(0,STR,(sizeof(STR)-1));
ok(0 == my_crc32c(0, NULL, 0), "crc32c data = NULL, length = 0");
memset(buf, 0x5a, sizeof buf);
ok(0 == test_buf(my_checksum, crc32), "crc32 with various lengths");
ok(0 == test_buf(my_crc32c, crc32c), "crc32c with various lengths");
my_end(0);
return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment