Commit b6cd22c2 authored by Lynn Boger

hash/crc32: improve performance for ppc64le

This change improves the performance of crc32 for ppc64le by using
vpmsum and other vector instructions in the algorithm.

The test case was updated to cover more sizes.

Fixes #19570

benchmark                                              old ns/op     new ns/op     delta
BenchmarkCRC32/poly=IEEE/size=15/align=0-8             90.5          81.8          -9.61%
BenchmarkCRC32/poly=IEEE/size=15/align=1-8             89.7          81.7          -8.92%
BenchmarkCRC32/poly=IEEE/size=40/align=0-8             93.2          61.1          -34.44%
BenchmarkCRC32/poly=IEEE/size=40/align=1-8             92.8          60.9          -34.38%
BenchmarkCRC32/poly=IEEE/size=512/align=0-8            501           55.8          -88.86%
BenchmarkCRC32/poly=IEEE/size=512/align=1-8            502           132           -73.71%
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8            947           69.9          -92.62%
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8            946           144           -84.78%
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8            3602          186           -94.84%
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8            3603          263           -92.70%
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8           28404         1338          -95.29%
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8           28856         1405          -95.13%
BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8       89.7          81.8          -8.81%
BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8       89.8          81.9          -8.80%
BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8       93.8          61.4          -34.54%
BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8       94.3          61.3          -34.99%
BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8      503           56.4          -88.79%
BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8      502           132           -73.71%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8      941           70.2          -92.54%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8      943           145           -84.62%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8      3588          186           -94.82%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8      3595          264           -92.66%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8     28266         1323          -95.32%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8     28344         1404          -95.05%
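
These are the hash/crc32 package benchmarks, in ns/op. A run along the
following lines should reproduce them on a ppc64le machine (exact numbers
will vary by hardware):

	go test -bench=CRC32 hash/crc32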

Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5
Reviewed-on: https://go-review.googlesource.com/38184
Reviewed-by: David Chase <drchase@google.com>
parent 16663a85
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!amd64p32,!s390x
+// +build !amd64,!amd64p32,!s390x,!ppc64le
package crc32
......
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
import (
"unsafe"
)
const (
vecMinLen = 16
vecAlignMask = 15 // align to 16 bytes
crcIEEE = 1
crcCast = 2
)
//go:noescape
func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
// vectorCrc32 requires the buffer to be 16-byte aligned and more than 16 bytes long.
//go:noescape
func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
var archCastagnoliTable8 *slicing8Table
func archInitCastagnoli() {
archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if len(p) >= 4*vecMinLen {
// If not aligned then process the initial unaligned bytes
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
newlen := vecMinLen - align
crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p[:newlen])
p = p[newlen:]
}
// p should be aligned now
aligned := len(p) & ^vecAlignMask
crc = vectorCrc32(crc, crcCast, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p)
}
func archAvailableIEEE() bool {
return true
}
func archAvailableCastagnoli() bool {
return true
}
var archIeeeTable8 *slicing8Table
func archInitIEEE() {
// We still use slicing-by-8 for small buffers.
archIeeeTable8 = slicingMakeTable(IEEE)
}
// archUpdateIEEE calculates the checksum of p using vectorCrc32 when the
// buffer is large enough, falling back to slicing-by-8 otherwise.
func archUpdateIEEE(crc uint32, p []byte) uint32 {
// Check if vector code should be used. If not aligned, then handle those
// first up to the aligned bytes.
if len(p) >= 4*vecMinLen {
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
newlen := vecMinLen - align
crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, p[:newlen])
p = p[newlen:]
}
aligned := len(p) & ^vecAlignMask
crc = vectorCrc32(crc, crcIEEE, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return ppc64SlicingUpdateBy8(crc, archIeeeTable8, p)
}
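
For callers nothing changes: the exported hash/crc32 API dispatches to
archUpdateIEEE and archUpdateCastagnoli automatically on ppc64le. A minimal
usage sketch (standard library API only; nothing below is specific to this
change):

package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	data := []byte("hello, vpmsum")

	// IEEE is the default polynomial; large aligned buffers take the
	// vector path via archUpdateIEEE.
	fmt.Printf("IEEE:       %08x\n", crc32.ChecksumIEEE(data))

	// Castagnoli has its own constant table and goes through
	// archUpdateCastagnoli.
	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: %08x\n", crc32.Checksum(data, tab))
}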
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.
// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.
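// Background: VPMSUMD performs two independent 64x64-bit carryless (GF(2))
// multiplies, the primitive the folding loop below is built on. A scalar Go
// sketch of one doubleword lane (illustrative only, not part of this change):
//
//	func clmul64(a, b uint64) (hi, lo uint64) {
//		for i := uint(0); i < 64; i++ {
//			if b&(1<<i) != 0 {
//				lo ^= a << i
//				hi ^= a >> (64 - i) // 0 when i == 0; Go defines oversized shifts
//			}
//		}
//		return hi, lo
//	}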
#include "textflag.h"
#define POWER8_OFFSET 132
#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22
#define const1 V24
#define const2 V25
#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29
#define MAX_SIZE 32*1024
#define REFLECT
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
MOVWZ crc+0(FP), R3 // incoming crc
MOVD table8+8(FP), R4 // *Table
MOVD p+16(FP), R5
MOVD p_len+24(FP), R6 // p len
CMP $0,R6 // len == 0?
BNE start
MOVW R3,ret+40(FP) // return crc
RET
start:
NOR R3,R3,R7 // ^crc
MOVWZ R7,R7 // 32 bits
CMP R6,$16
MOVD R6,CTR
BLT short
SRAD $3,R6,R8 // 8 byte chunks
MOVD R8,CTR
loop:
MOVWZ 0(R5),R8 // bytes 0-3 of p (little endian)
MOVWZ 4(R5),R9 // 4-7 bytes of p
MOVD R4,R10 // &tab[0]
XOR R7,R8,R7 // crc ^= byte[0:3]
RLDICL $40,R9,$56,R17 // p[7]
SLD $2,R17,R17 // p[7]*4
RLDICL $40,R7,$56,R8 // crc>>24
ADD R17,R10,R17 // &tab[0][p[7]]
SLD $2,R8,R8 // crc>>24*4
RLDICL $48,R9,$56,R18 // p[6]
SLD $2,R18,R18 // p[6]*4
ADD $1024,R10,R10 // tab[1]
MOVWZ 0(R17),R21 // tab[0][p[7]]
RLDICL $56,R9,$56,R19 // p[5]
ADD R10,R18,R18 // &tab[1][p[6]]
SLD $2,R19,R19 // p[5]*4
MOVWZ 0(R18),R22 // tab[1][p[6]]
ADD $1024,R10,R10 // tab[2]
XOR R21,R22,R21 // xor done R22
ADD R19,R10,R19 // &tab[2][p[5]]
ANDCC $255,R9,R20 // p[4]
SLD $2,R20,R20 // p[4]*4
MOVWZ 0(R19),R23 // tab[2][p[5]]
ADD $1024,R10,R10 // &tab[3]
ADD R20,R10,R20 // &tab[3][p[4]]
XOR R21,R23,R21 // xor done R23
ADD $1024,R10,R10 // &tab[4]
MOVWZ 0(R20),R24 // tab[3][p[4]]
ADD R10,R8,R23 // &tab[4][crc>>24]
XOR R21,R24,R21 // xor done R24
MOVWZ 0(R23),R25 // tab[4][crc>>24]
RLDICL $48,R7,$56,R24 // crc>>16&0xFF
XOR R21,R25,R21 // xor done R25
ADD $1024,R10,R10 // &tab[5]
SLD $2,R24,R24 // crc>>16&0xFF*4
ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
XOR R21,R26,R21 // xor done R26
RLDICL $56,R7,$56,R25 // crc>>8
ADD $1024,R10,R10 // &tab[6]
SLD $2,R25,R25 // crc>>8&0xFF*4
ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
MOVBZ R7,R26 // crc&0xFF
ADD $1024,R10,R10 // &tab[7]
MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
SLD $2,R26,R26 // crc&0xFF*4
XOR R21,R27,R21 // xor done R27
ADD R26,R10,R26 // &tab[7][crc&0xFF]
ADD $8,R5 // p = p[8:]
MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
XOR R21,R28,R21 // xor done R28
MOVWZ R21,R7 // crc for next round
BC 16,0,loop // next 8 bytes
ANDCC $7,R6,R8 // any leftover bytes
BEQ done // none --> done
MOVD R8,CTR // byte count
short:
MOVBZ 0(R5),R8 // get v
MOVBZ R7,R9 // byte(crc) -> R9
MOVWZ R7,R14
SRD $8,R14,R14 // crc>>8
XOR R8,R9,R8 // byte(crc)^v -> R8
ADD $1,R5 // ptr to next v
SLD $2,R8 // convert index-> bytes
ADD R8,R4,R9 // &tab[byte(crc)^v]
MOVWZ 0(R9),R10 // tab[byte(crc)^v]
XOR R10,R14,R7 // loop crc in R7
MOVWZ R7,R7 // 32 bits
BC 16,0,short
done:
NOR R7,R7,R7 // ^crc
MOVW R7,ret+40(FP) // return crc
RET
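// For reference, the routine above is classic slicing-by-8. A Go rendering
// of the same algorithm (a sketch using this package's slicing8Table type;
// the name slicingUpdateBy8Go is made up for illustration):
//
//	func slicingUpdateBy8Go(crc uint32, tab *slicing8Table, p []byte) uint32 {
//		crc = ^crc
//		for len(p) >= 8 {
//			crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//			crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//				tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
//				tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
//			p = p[8:]
//		}
//		for _, v := range p { // the byte loop at the "short" label above
//			crc = tab[0][byte(crc)^v] ^ (crc >> 8)
//		}
//		return ^crc
//	}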
#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908
GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
MOVWZ crc+0(FP), R3 // incoming crc
MOVWZ ctab+4(FP), R14 // crc poly id
MOVD p+8(FP), R4
MOVD p_len+16(FP), R5 // p len
// R3 = incoming crc
// R14 = constant table identifier
// R4 = address of bytes
// R5 = length of bytes
// defines for index loads
MOVD $16,off16
MOVD $32,off32
MOVD $48,off48
MOVD $64,off64
MOVD $80,off80
MOVD $96,off96
MOVD $112,off112
MOVD $0,R15
MOVD R3,R10 // save initial crc
NOR R3,R3,R3 // ^crc
MOVWZ R3,R3 // 32 bits
VXOR zeroes,zeroes,zeroes // clear the V reg
VSPLTISW $-1,V0
VSLDOI $4,V29,V0,mask_32bit
VSLDOI $8,V29,V0,mask_64bit
VXOR V8,V8,V8
MTVSRD R3,VS40 // crc initial value VS40 = V8
#ifdef REFLECT
VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
VSLDOI $4,V8,zeroes,V8
#endif
#ifdef BYTESWAP_DATA
MOVD byteswapcons(SB),R3
LVX (R3),byteswap
#endif
CMPU R5,$256 // length of bytes
BLT short
RLDICR $0,R5,$56,R6 // chunk to process
// First step for larger sizes
l1: MOVD $32768,R7
MOVD R7,R9
CMP R6,R7 // compare R6, R7 (MAX SIZE)
BGT top // R6 > MAX, process a MAX-size chunk first
MOVD R6,R7
top:
SUB R7,R6,R6
// mainloop does 128 bytes at a time
SRD $7,R7
// determine the offset into the constants table to start with.
// Each constant is 128 bytes, used against 16 bytes of data.
SLD $4,R7,R8
SRD $3,R9,R9
SUB R8,R9,R8
// The last iteration is reduced in a separate step
ADD $-1,R7
MOVD R7,CTR
// Determine which constant table (depends on poly)
CMP R14,$1
BNE castTable
MOVD IEEEConst(SB),R3
BR startConst
castTable:
MOVD CastConst(SB),R3
startConst:
ADD R3,R8,R3 // starting point in constants table
VXOR V0,V0,V0 // clear the V regs
VXOR V1,V1,V1
VXOR V2,V2,V2
VXOR V3,V3,V3
VXOR V4,V4,V4
VXOR V5,V5,V5
VXOR V6,V6,V6
VXOR V7,V7,V7
LVX (R3),const1 // loading constant values
CMP R15,$1 // Identify warm up pass
BEQ next
// First warm up pass: load the bytes to process
LVX (R4),V16
LVX (R4+off16),V17
LVX (R4+off32),V18
LVX (R4+off48),V19
LVX (R4+off64),V20
LVX (R4+off80),V21
LVX (R4+off96),V22
LVX (R4+off112),V23
ADD $128,R4 // bump up to next 128 bytes in buffer
VXOR V16,V8,V16 // xor in initial CRC in V8
next:
BC 18,0,first_warm_up_done
ADD $16,R3 // bump up to next constants
LVX (R3),const2 // table values
VPMSUMD V16,const1,V8 // second warm up pass
LVX (R4),V16 // load from buffer
OR $0,R2,R2
VPMSUMD V17,const1,V9 // vpmsumd with constants
LVX (R4+off16),V17 // load next from buffer
OR $0,R2,R2
VPMSUMD V18,const1,V10 // vpmsumd with constants
LVX (R4+off32),V18 // load next from buffer
OR $0,R2,R2
VPMSUMD V19,const1,V11 // vpmsumd with constants
LVX (R4+off48),V19 // load next from buffer
OR $0,R2,R2
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // load next from buffer
OR $0,R2,R2
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // load next from buffer
OR $0,R2,R2
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // load next from buffer
OR $0,R2,R2
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // load next from buffer
ADD $128,R4 // bump up to next 128 bytes in buffer
BC 18,0,first_cool_down
cool_top:
LVX (R3),const1 // constants
ADD $16,R3 // inc to next constants
OR $0,R2,R2
VXOR V0,V8,V0 // xor in previous vpmsumd
VPMSUMD V16,const2,V8 // vpmsumd with constants
LVX (R4),V16 // buffer
OR $0,R2,R2
VXOR V1,V9,V1 // xor in previous
VPMSUMD V17,const2,V9 // vpmsumd with constants
LVX (R4+off16),V17 // next in buffer
OR $0,R2,R2
VXOR V2,V10,V2 // xor in previous
VPMSUMD V18,const2,V10 // vpmsumd with constants
LVX (R4+off32),V18 // next in buffer
OR $0,R2,R2
VXOR V3,V11,V3 // xor in previous
VPMSUMD V19,const2,V11 // vpmsumd with constants
LVX (R4+off48),V19 // next in buffer
LVX (R3),const2 // get next constant
OR $0,R2,R2
VXOR V4,V12,V4 // xor in previous
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // next in buffer
OR $0,R2,R2
VXOR V5,V13,V5 // xor in previous
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // next in buffer
OR $0,R2,R2
VXOR V6,V14,V6 // xor in previous
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // next in buffer
OR $0,R2,R2
VXOR V7,V15,V7 // xor in previous
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // next in buffer
ADD $128,R4 // bump up buffer pointer
BC 16,0,cool_top // are we done?
first_cool_down:
// load the constants
// xor in the previous value
// vpmsumd the result with constants
LVX (R3),const1
ADD $16,R3
VXOR V0,V8,V0
VPMSUMD V16,const1,V8
OR $0,R2,R2
VXOR V1,V9,V1
VPMSUMD V17,const1,V9
OR $0,R2,R2
VXOR V2,V10,V2
VPMSUMD V18,const1,V10
OR $0,R2,R2
VXOR V3,V11,V3
VPMSUMD V19,const1,V11
OR $0,R2,R2
VXOR V4,V12,V4
VPMSUMD V20,const1,V12
OR $0,R2,R2
VXOR V5,V13,V5
VPMSUMD V21,const1,V13
OR $0,R2,R2
VXOR V6,V14,V6
VPMSUMD V22,const1,V14
OR $0,R2,R2
VXOR V7,V15,V7
VPMSUMD V23,const1,V15
OR $0,R2,R2
second_cool_down:
VXOR V0,V8,V0
VXOR V1,V9,V1
VXOR V2,V10,V2
VXOR V3,V11,V3
VXOR V4,V12,V4
VXOR V5,V13,V5
VXOR V6,V14,V6
VXOR V7,V15,V7
#ifdef REFLECT
VSLDOI $4,V0,zeroes,V0
VSLDOI $4,V1,zeroes,V1
VSLDOI $4,V2,zeroes,V2
VSLDOI $4,V3,zeroes,V3
VSLDOI $4,V4,zeroes,V4
VSLDOI $4,V5,zeroes,V5
VSLDOI $4,V6,zeroes,V6
VSLDOI $4,V7,zeroes,V7
#endif
LVX (R4),V8
LVX (R4+off16),V9
LVX (R4+off32),V10
LVX (R4+off48),V11
LVX (R4+off64),V12
LVX (R4+off80),V13
LVX (R4+off96),V14
LVX (R4+off112),V15
ADD $128,R4
VXOR V0,V8,V16
VXOR V1,V9,V17
VXOR V2,V10,V18
VXOR V3,V11,V19
VXOR V4,V12,V20
VXOR V5,V13,V21
VXOR V6,V14,V22
VXOR V7,V15,V23
MOVD $1,R15
CMP $0,R6
ADD $128,R6
BNE l1
ANDCC $127,R5
SUBC R5,$128,R6
ADD R3,R6,R3
SRD $4,R5,R7
MOVD R7,CTR
LVX (R3),V0
LVX (R3+off16),V1
LVX (R3+off32),V2
LVX (R3+off48),V3
LVX (R3+off64),V4
LVX (R3+off80),V5
LVX (R3+off96),V6
LVX (R3+off112),V7
ADD $128,R3
VPMSUMW V16,V0,V0
VPMSUMW V17,V1,V1
VPMSUMW V18,V2,V2
VPMSUMW V19,V3,V3
VPMSUMW V20,V4,V4
VPMSUMW V21,V5,V5
VPMSUMW V22,V6,V6
VPMSUMW V23,V7,V7
// now reduce the tail
CMP $0,R7
BEQ next1
LVX (R4),V16
LVX (R3),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off16),V16
LVX (R3+off16),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off32),V16
LVX (R3+off32),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off48),V16
LVX (R3+off48),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off64),V16
LVX (R3+off64),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off80),V16
LVX (R3+off80),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off96),V16
LVX (R3+off96),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
next1:
VXOR V0,V1,V0
VXOR V2,V3,V2
VXOR V4,V5,V4
VXOR V6,V7,V6
VXOR V0,V2,V0
VXOR V4,V6,V4
VXOR V0,V4,V0
barrett_reduction:
CMP R14,$1
BNE barcstTable
MOVD IEEEBarConst(SB),R3
BR startbarConst
barcstTable:
MOVD CastBarConst(SB),R3
startbarConst:
LVX (R3),const1
LVX (R3+off16),const2
VSLDOI $8,V0,V0,V1
VXOR V0,V1,V0
#ifdef REFLECT
VSPLTISB $1,V1
VSL V0,V1,V0
#endif
VAND V0,mask_64bit,V0
#ifndef REFLECT
VPMSUMD V0,const1,V1
VSLDOI $8,zeroes,V1,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $8,V0,zeroes,V0
#else
VAND V0,mask_32bit,V1
VPMSUMD V1,const1,V1
VAND V1,mask_32bit,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $4,V0,zeroes,V0
#endif
MFVSRD VS32,R3 // VS32 = V0
NOR R3,R3,R3 // return ^crc
MOVW R3,ret+32(FP)
RET
first_warm_up_done:
LVX (R3),const1
ADD $16,R3
VPMSUMD V16,const1,V8
VPMSUMD V17,const1,V9
VPMSUMD V18,const1,V10
VPMSUMD V19,const1,V11
VPMSUMD V20,const1,V12
VPMSUMD V21,const1,V13
VPMSUMD V22,const1,V14
VPMSUMD V23,const1,V15
BR second_cool_down
short:
CMP $0,R5
BEQ zero
// compute short constants
CMP R14,$1
BNE castshTable
MOVD IEEEConst(SB),R3
ADD $4080,R3
BR startshConst
castshTable:
MOVD CastConst(SB),R3
ADD $4080,R3
startshConst:
SUBC R5,$256,R6 // sub from 256
ADD R3,R6,R3
// calculate where to start
SRD $4,R5,R7
MOVD R7,CTR
VXOR V19,V19,V19
VXOR V20,V20,V20
LVX (R4),V0
LVX (R3),V16
VXOR V0,V8,V0
VPMSUMW V0,V16,V0
BC 18,0,v0
LVX (R4+off16),V1
LVX (R3+off16),V17
VPMSUMW V1,V17,V1
BC 18,0,v1
LVX (R4+off32),V2
LVX (R3+off32),V16
VPMSUMW V2,V16,V2
BC 18,0,v2
LVX (R4+off48),V3
LVX (R3+off48),V17
VPMSUMW V3,V17,V3
BC 18,0,v3
LVX (R4+off64),V4
LVX (R3+off64),V16
VPMSUMW V4,V16,V4
BC 18,0,v4
LVX (R4+off80),V5
LVX (R3+off80),V17
VPMSUMW V5,V17,V5
BC 18,0,v5
LVX (R4+off96),V6
LVX (R3+off96),V16
VPMSUMW V6,V16,V6
BC 18,0,v6
LVX (R4+off112),V7
LVX (R3+off112),V17
VPMSUMW V7,V17,V7
BC 18,0,v7
ADD $128,R3
ADD $128,R4
LVX (R4),V8
LVX (R3),V16
VPMSUMW V8,V16,V8
BC 18,0,v8
LVX (R4+off16),V9
LVX (R3+off16),V17
VPMSUMW V9,V17,V9
BC 18,0,v9
LVX (R4+off32),V10
LVX (R3+off32),V16
VPMSUMW V10,V16,V10
BC 18,0,v10
LVX (R4+off48),V11
LVX (R3+off48),V17
VPMSUMW V11,V17,V11
BC 18,0,v11
LVX (R4+off64),V12
LVX (R3+off64),V16
VPMSUMW V12,V16,V12
BC 18,0,v12
LVX (R4+off80),V13
LVX (R3+off80),V17
VPMSUMW V13,V17,V13
BC 18,0,v13
LVX (R4+off96),V14
LVX (R3+off96),V16
VPMSUMW V14,V16,V14
BC 18,0,v14
LVX (R4+off112),V15
LVX (R3+off112),V17
VPMSUMW V15,V17,V15
VXOR V19,V15,V19
v14: VXOR V20,V14,V20
v13: VXOR V19,V13,V19
v12: VXOR V20,V12,V20
v11: VXOR V19,V11,V19
v10: VXOR V20,V10,V20
v9: VXOR V19,V9,V19
v8: VXOR V20,V8,V20
v7: VXOR V19,V7,V19
v6: VXOR V20,V6,V20
v5: VXOR V19,V5,V19
v4: VXOR V20,V4,V20
v3: VXOR V19,V3,V19
v2: VXOR V20,V2,V20
v1: VXOR V19,V1,V19
v0: VXOR V20,V0,V20
VXOR V19,V20,V0
BR barrett_reduction
zero:
// This case is the original crc, so just return it
MOVW R10,ret+32(FP)
RET
(diff of the generated constants file crc32_table_ppc64le.s omitted: too large to display)
@@ -76,8 +76,9 @@ func testCrossCheck(t *testing.T, crcFunc1, crcFunc2 func(crc uint32, b []byte)
// The AMD64 implementation has some cutoffs at lengths 168*3=504 and
// 1344*3=4032. We should make sure lengths around these values are in the
// list.
-	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 100, 128,
-		500, 501, 502, 503, 504, 505, 512, 1000, 1024, 2000,
+	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 63, 64, 65, 100,
+		127, 128, 129, 255, 256, 257, 300, 312, 384, 416, 448, 480,
+		500, 501, 502, 503, 504, 505, 512, 513, 1000, 1024, 2000,
4030, 4031, 4032, 4033, 4036, 4040, 4048, 4096, 5000, 10000}
for _, length := range lengths {
p := make([]byte, length)
......
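
The added lengths exercise every combination of unaligned head, vector body,
and scalar tail around the 16-byte alignment and 64-byte (4*vecMinLen)
cutoffs. The heart of the test is a cross-check that two implementations
agree; in sketch form (crossCheck is a hypothetical helper, the real wiring
lives in testCrossCheck above):

// crossCheck reports whether two update functions agree on every prefix
// of p, covering all head/body/tail splits in the vector path.
func crossCheck(update1, update2 func(crc uint32, p []byte) uint32, p []byte) bool {
	for n := 0; n <= len(p); n++ {
		if update1(0, p[:n]) != update2(0, p[:n]) {
			return false
		}
	}
	return true
}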
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Generate the constant table associated with the poly used by the
// vpmsumd crc32 algorithm.
//
// go run gen_const_ppc64le.go
//
// generates crc32_table_ppc64le.s
// The following is derived from code written by Anton Blanchard
// <anton@au.ibm.com> found at https://github.com/antonblanchard/crc32-vpmsum.
// The original is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under the golang license.
// This code was written in Go based on the original C implementation.
// This is a tool needed to generate the appropriate constants needed for
// the vpmsum algorithm. It is included to generate new constant tables if
// new polynomial values are included in the future.
package main
import (
"bytes"
"fmt"
"io/ioutil"
)
var blocking = 32 * 1024
func reflect_bits(b uint64, nr uint) uint64 {
var ref uint64
for bit := uint64(0); bit < uint64(nr); bit++ {
if (b & uint64(1)) == 1 {
ref |= (1 << (uint64(nr-1) - bit))
}
b = (b >> 1)
}
return ref
}
func get_remainder(poly uint64, deg uint, n uint) uint64 {
rem, _ := xnmodp(n, poly, deg)
return rem
}
func get_quotient(poly uint64, bits, n uint) uint64 {
_, div := xnmodp(n, poly, bits)
return div
}
// xnmodp returns two values, mod and div:
// mod is the binary representation of the polynomial x**n mod (x**deg + "poly"),
// that is, the modulus polynomial without its highest-order term.
// div is the binary representation of the polynomial x**n / (x**deg + "poly").
func xnmodp(n uint, poly uint64, deg uint) (uint64, uint64) {
var mod, mask, high, div uint64
if n < deg {
div = 0
return poly, div
}
mask = 1<<deg - 1
poly &= mask
mod = poly
div = 1
deg--
n--
for n > deg {
high = (mod >> deg) & 1
div = (div << 1) | high
mod <<= 1
if high != 0 {
mod ^= poly
}
n--
}
return mod & mask, div
}
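
// A worked example of xnmodp (values verified by long division over GF(2)):
// take the CRC-3 polynomial x^3 + x + 1, so deg = 3 and the low-order bits
// are 0b011. Then:
//
//	mod, div := xnmodp(5, 0b011, 3)
//	// mod == 0b111: x^5 mod (x^3+x+1) = x^2 + x + 1
//	// div == 0b101: x^5 / (x^3+x+1)  = x^2 + 1
//
// Likewise reflect_bits mirrors the low nr bits, e.g.
// reflect_bits(0b1101, 4) == 0b1011.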
func main() {
w := new(bytes.Buffer)
fmt.Fprintf(w, "// autogenerated: do not edit!\n")
fmt.Fprintf(w, "// generated from crc32/gen_const_ppc64le.go\n")
fmt.Fprintln(w)
fmt.Fprintf(w, "#include \"textflag.h\"\n")
// These are the polynomials supported in vector now.
// If adding others, include the polynomial and a name
// to identify it.
genCrc32ConstTable(w, 0xedb88320, "IEEE")
genCrc32ConstTable(w, 0x82f63b78, "Cast")
genCrc32ConstTable(w, 0xeb31d82e, "Koop")
b := w.Bytes()
err := ioutil.WriteFile("crc32_table_ppc64le.s", b, 0666)
if err != nil {
fmt.Printf("can't write output: %s\n", err)
}
}
func genCrc32ConstTable(w *bytes.Buffer, poly uint32, polyid string) {
ref_poly := reflect_bits(uint64(poly), 32)
fmt.Fprintf(w, "\n\t/* Reduce %d kbits to 1024 bits */\n", blocking*8)
j := 0
for i := (blocking * 8) - 1024; i > 0; i -= 1024 {
a := reflect_bits(get_remainder(ref_poly, 32, uint(i)), 32) << 1
b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32) << 1
fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s */\n", uint(i+64), "", uint(i), "")
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, j*8, b)
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, (j+1)*8, a)
j += 2
fmt.Fprintf(w, "\n")
}
for i := (1024 * 2) - 128; i >= 0; i -= 128 {
a := reflect_bits(get_remainder(ref_poly, 32, uint(i+32)), 32)
b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32)
c := reflect_bits(get_remainder(ref_poly, 32, uint(i+96)), 32)
d := reflect_bits(get_remainder(ref_poly, 32, uint(i+128)), 32)
fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s */\n", i+128, "", i+96, "", i+64, "", i+32, "")
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, j*8, c, d)
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, (j+1)*8, a, b)
j += 2
fmt.Fprintf(w, "\n")
}
fmt.Fprintf(w, "GLOBL ·%sConst(SB),RODATA,$4336\n", polyid)
fmt.Fprintf(w, "\n /* Barrett constant m - (4^32)/n */\n")
fmt.Fprintf(w, "DATA ·%sBarConst(SB)/8,$0x%016x\n", polyid, reflect_bits(get_quotient(ref_poly, 32, 64), 33))
fmt.Fprintf(w, "DATA ·%sBarConst+8(SB)/8,$0x0000000000000000\n", polyid)
fmt.Fprintf(w, "DATA ·%sBarConst+16(SB)/8,$0x%016x\n", polyid, reflect_bits((uint64(1)<<32)|ref_poly, 33)) // reflected?
fmt.Fprintf(w, "DATA ·%sBarConst+24(SB)/8,$0x0000000000000000\n", polyid)
fmt.Fprintf(w, "GLOBL ·%sBarConst(SB),RODATA,$32\n", polyid)
}