Commit b6cd22c2 authored by Lynn Boger

hash/crc32: improve performance for ppc64le

This change improves the performance of crc32 for ppc64le by using
vpmsum and other vector instructions in the algorithm.

The test case was updated to cover more sizes.

Fixes #19570

benchmark                                              old ns/op     new ns/op     delta
BenchmarkCRC32/poly=IEEE/size=15/align=0-8             90.5          81.8          -9.61%
BenchmarkCRC32/poly=IEEE/size=15/align=1-8             89.7          81.7          -8.92%
BenchmarkCRC32/poly=IEEE/size=40/align=0-8             93.2          61.1          -34.44%
BenchmarkCRC32/poly=IEEE/size=40/align=1-8             92.8          60.9          -34.38%
BenchmarkCRC32/poly=IEEE/size=512/align=0-8            501           55.8          -88.86%
BenchmarkCRC32/poly=IEEE/size=512/align=1-8            502           132           -73.71%
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8            947           69.9          -92.62%
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8            946           144           -84.78%
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8            3602          186           -94.84%
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8            3603          263           -92.70%
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8           28404         1338          -95.29%
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8           28856         1405          -95.13%
BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8       89.7          81.8          -8.81%
BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8       89.8          81.9          -8.80%
BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8       93.8          61.4          -34.54%
BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8       94.3          61.3          -34.99%
BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8      503           56.4          -88.79%
BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8      502           132           -73.71%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8      941           70.2          -92.54%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8      943           145           -84.62%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8      3588          186           -94.82%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8      3595          264           -92.66%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8     28266         1323          -95.32%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8     28344         1404          -95.05%
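
These are the hash/crc32 package benchmarks, in ns/op. A run along the
following lines should reproduce them on a ppc64le machine (exact numbers
will vary by hardware):

	go test -bench=CRC32 hash/crc32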

Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5
Reviewed-on: https://go-review.googlesource.com/38184
Reviewed-by: David Chase <drchase@google.com>
parent 16663a85
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!amd64p32,!s390x
+// +build !amd64,!amd64p32,!s390x,!ppc64le
package crc32
......
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
import (
"unsafe"
)
const (
vecMinLen = 16
vecAlignMask = 15 // align to 16 bytes
crcIEEE = 1
crcCast = 2
)
//go:noescape
func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
// vectorCrc32 requires the buffer to be 16-byte aligned and more than 16 bytes long.
//go:noescape
func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
var archCastagnoliTable8 *slicing8Table
func archInitCastagnoli() {
archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if len(p) >= 4*vecMinLen {
// If not aligned then process the initial unaligned bytes
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
newlen := vecMinLen - align
crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p[:newlen])
p = p[newlen:]
}
// p should be aligned now
aligned := len(p) & ^vecAlignMask
crc = vectorCrc32(crc, crcCast, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p)
}
func archAvailableIEEE() bool {
return true
}
func archAvailableCastagnoli() bool {
return true
}
var archIeeeTable8 *slicing8Table
func archInitIEEE() {
// We still use slicing-by-8 for small buffers.
archIeeeTable8 = slicingMakeTable(IEEE)
}
// archUpdateIEEE calculates the checksum of p using vectorCrc32 when the
// buffer is large enough, falling back to slicing-by-8 otherwise.
func archUpdateIEEE(crc uint32, p []byte) uint32 {
// Check if vector code should be used. If not aligned, then handle those
// first up to the aligned bytes.
if len(p) >= 4*vecMinLen {
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
newlen := vecMinLen - align
crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, p[:newlen])
p = p[newlen:]
}
aligned := len(p) & ^vecAlignMask
crc = vectorCrc32(crc, crcIEEE, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return ppc64SlicingUpdateBy8(crc, archIeeeTable8, p)
}
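
For callers nothing changes: the exported hash/crc32 API dispatches to
archUpdateIEEE and archUpdateCastagnoli automatically on ppc64le. A minimal
usage sketch (standard library API only; nothing below is specific to this
change):

package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	data := []byte("hello, vpmsum")

	// IEEE is the default polynomial; large aligned buffers take the
	// vector path via archUpdateIEEE.
	fmt.Printf("IEEE:       %08x\n", crc32.ChecksumIEEE(data))

	// Castagnoli has its own constant table and goes through
	// archUpdateCastagnoli.
	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: %08x\n", crc32.Checksum(data, tab))
}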
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.
// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.
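// Background: VPMSUMD performs two independent 64x64-bit carryless (GF(2))
// multiplies, the primitive the folding loop below is built on. A scalar Go
// sketch of one doubleword lane (illustrative only, not part of this change):
//
//	func clmul64(a, b uint64) (hi, lo uint64) {
//		for i := uint(0); i < 64; i++ {
//			if b&(1<<i) != 0 {
//				lo ^= a << i
//				hi ^= a >> (64 - i) // 0 when i == 0; Go defines oversized shifts
//			}
//		}
//		return hi, lo
//	}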
#include "textflag.h"
#define POWER8_OFFSET 132
#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22
#define const1 V24
#define const2 V25
#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29
#define MAX_SIZE 32*1024
#define REFLECT
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
MOVWZ crc+0(FP), R3 // incoming crc
MOVD table8+8(FP), R4 // *Table
MOVD p+16(FP), R5
MOVD p_len+24(FP), R6 // p len
CMP $0,R6 // len == 0?
BNE start
MOVW R3,ret+40(FP) // return crc
RET
start:
NOR R3,R3,R7 // ^crc
MOVWZ R7,R7 // 32 bits
CMP R6,$16
MOVD R6,CTR
BLT short
SRAD $3,R6,R8 // 8 byte chunks
MOVD R8,CTR
loop:
MOVWZ 0(R5),R8 // bytes 0-3 of p (little endian)
MOVWZ 4(R5),R9 // 4-7 bytes of p
MOVD R4,R10 // &tab[0]
XOR R7,R8,R7 // crc ^= byte[0:3]
RLDICL $40,R9,$56,R17 // p[7]
SLD $2,R17,R17 // p[7]*4
RLDICL $40,R7,$56,R8 // crc>>24
ADD R17,R10,R17 // &tab[0][p[7]]
SLD $2,R8,R8 // crc>>24*4
RLDICL $48,R9,$56,R18 // p[6]
SLD $2,R18,R18 // p[6]*4
ADD $1024,R10,R10 // tab[1]
MOVWZ 0(R17),R21 // tab[0][p[7]]
RLDICL $56,R9,$56,R19 // p[5]
ADD R10,R18,R18 // &tab[1][p[6]]
SLD $2,R19,R19 // p[5]*4
MOVWZ 0(R18),R22 // tab[1][p[6]]
ADD $1024,R10,R10 // tab[2]
XOR R21,R22,R21 // xor done R22
ADD R19,R10,R19 // &tab[2][p[5]]
ANDCC $255,R9,R20 // p[4]
SLD $2,R20,R20 // p[4]*4
MOVWZ 0(R19),R23 // tab[2][p[5]]
ADD $1024,R10,R10 // &tab[3]
ADD R20,R10,R20 // &tab[3][p[4]]
XOR R21,R23,R21 // xor done R23
ADD $1024,R10,R10 // &tab[4]
MOVWZ 0(R20),R24 // tab[3][p[4]]
ADD R10,R8,R23 // &tab[4][crc>>24]
XOR R21,R24,R21 // xor done R24
MOVWZ 0(R23),R25 // tab[4][crc>>24]
RLDICL $48,R7,$56,R24 // crc>>16&0xFF
XOR R21,R25,R21 // xor done R25
ADD $1024,R10,R10 // &tab[5]
SLD $2,R24,R24 // crc>>16&0xFF*4
ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
XOR R21,R26,R21 // xor done R26
RLDICL $56,R7,$56,R25 // crc>>8
ADD $1024,R10,R10 // &tab[6]
SLD $2,R25,R25 // crc>>8&0xFF*4
ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
MOVBZ R7,R26 // crc&0xFF
ADD $1024,R10,R10 // &tab[7]
MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
SLD $2,R26,R26 // crc&0xFF*4
XOR R21,R27,R21 // xor done R27
ADD R26,R10,R26 // &tab[7][crc&0xFF]
ADD $8,R5 // p = p[8:]
MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
XOR R21,R28,R21 // xor done R28
MOVWZ R21,R7 // crc for next round
BC 16,0,loop // next 8 bytes
ANDCC $7,R6,R8 // any leftover bytes
BEQ done // none --> done
MOVD R8,CTR // byte count
short:
MOVBZ 0(R5),R8 // get v
MOVBZ R7,R9 // byte(crc) -> R9
MOVWZ R7,R14
SRD $8,R14,R14 // crc>>8
XOR R8,R9,R8 // byte(crc)^v -> R8
ADD $1,R5 // ptr to next v
SLD $2,R8 // convert index-> bytes
ADD R8,R4,R9 // &tab[byte(crc)^v]
MOVWZ 0(R9),R10 // tab[byte(crc)^v]
XOR R10,R14,R7 // loop crc in R7
MOVWZ R7,R7 // 32 bits
BC 16,0,short
done:
NOR R7,R7,R7 // ^crc
MOVW R7,ret+40(FP) // return crc
RET
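// For reference, the routine above is classic slicing-by-8. A Go rendering
// of the same algorithm (a sketch using this package's slicing8Table type;
// the name slicingUpdateBy8Go is made up for illustration):
//
//	func slicingUpdateBy8Go(crc uint32, tab *slicing8Table, p []byte) uint32 {
//		crc = ^crc
//		for len(p) >= 8 {
//			crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//			crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//				tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
//				tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
//			p = p[8:]
//		}
//		for _, v := range p { // the byte loop at the "short" label above
//			crc = tab[0][byte(crc)^v] ^ (crc >> 8)
//		}
//		return ^crc
//	}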
#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908
GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
MOVWZ crc+0(FP), R3 // incoming crc
MOVWZ ctab+4(FP), R14 // crc poly id
MOVD p+8(FP), R4
MOVD p_len+16(FP), R5 // p len
// R3 = incoming crc
// R14 = constant table identifier
// R4 = address of bytes
// R5 = length of bytes
// defines for index loads
MOVD $16,off16
MOVD $32,off32
MOVD $48,off48
MOVD $64,off64
MOVD $80,off80
MOVD $96,off96
MOVD $112,off112
MOVD $0,R15
MOVD R3,R10 // save initial crc
NOR R3,R3,R3 // ^crc
MOVWZ R3,R3 // 32 bits
VXOR zeroes,zeroes,zeroes // clear the V reg
VSPLTISW $-1,V0
VSLDOI $4,V29,V0,mask_32bit
VSLDOI $8,V29,V0,mask_64bit
VXOR V8,V8,V8
MTVSRD R3,VS40 // crc initial value VS40 = V8
#ifdef REFLECT
VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
VSLDOI $4,V8,zeroes,V8
#endif
#ifdef BYTESWAP_DATA
MOVD byteswapcons(SB),R3
LVX (R3),byteswap
#endif
CMPU R5,$256 // length of bytes
BLT short
RLDICR $0,R5,$56,R6 // chunk to process
// First step for larger sizes
l1: MOVD $32768,R7
MOVD R7,R9
CMP R6,R7 // compare R6, R7 (MAX SIZE)
BGT top // R6 > MAX, process a MAX-size chunk first
MOVD R6,R7
top:
SUB R7,R6,R6
// mainloop does 128 bytes at a time
SRD $7,R7
// determine the offset into the constants table to start with.
// Each constant is 128 bytes, used against 16 bytes of data.
SLD $4,R7,R8
SRD $3,R9,R9
SUB R8,R9,R8
// The last iteration is reduced in a separate step
ADD $-1,R7
MOVD R7,CTR
// Determine which constant table (depends on poly)
CMP R14,$1
BNE castTable
MOVD IEEEConst(SB),R3
BR startConst
castTable:
MOVD CastConst(SB),R3
startConst:
ADD R3,R8,R3 // starting point in constants table
VXOR V0,V0,V0 // clear the V regs
VXOR V1,V1,V1
VXOR V2,V2,V2
VXOR V3,V3,V3
VXOR V4,V4,V4
VXOR V5,V5,V5
VXOR V6,V6,V6
VXOR V7,V7,V7
LVX (R3),const1 // loading constant values
CMP R15,$1 // Identify warm up pass
BEQ next
// First warm up pass: load the bytes to process
LVX (R4),V16
LVX (R4+off16),V17
LVX (R4+off32),V18
LVX (R4+off48),V19
LVX (R4+off64),V20
LVX (R4+off80),V21
LVX (R4+off96),V22
LVX (R4+off112),V23
ADD $128,R4 // bump up to next 128 bytes in buffer
VXOR V16,V8,V16 // xor in initial CRC in V8
next:
BC 18,0,first_warm_up_done
ADD $16,R3 // bump up to next constants
LVX (R3),const2 // table values
VPMSUMD V16,const1,V8 // second warm up pass
LVX (R4),V16 // load from buffer
OR $0,R2,R2
VPMSUMD V17,const1,V9 // vpmsumd with constants
LVX (R4+off16),V17 // load next from buffer
OR $0,R2,R2
VPMSUMD V18,const1,V10 // vpmsumd with constants
LVX (R4+off32),V18 // load next from buffer
OR $0,R2,R2
VPMSUMD V19,const1,V11 // vpmsumd with constants
LVX (R4+off48),V19 // load next from buffer
OR $0,R2,R2
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // load next from buffer
OR $0,R2,R2
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // load next from buffer
OR $0,R2,R2
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // load next from buffer
OR $0,R2,R2
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // load next from buffer
ADD $128,R4 // bump up to next 128 bytes in buffer
BC 18,0,first_cool_down
cool_top:
LVX (R3),const1 // constants
ADD $16,R3 // inc to next constants
OR $0,R2,R2
VXOR V0,V8,V0 // xor in previous vpmsumd
VPMSUMD V16,const2,V8 // vpmsumd with constants
LVX (R4),V16 // buffer
OR $0,R2,R2
VXOR V1,V9,V1 // xor in previous
VPMSUMD V17,const2,V9 // vpmsumd with constants
LVX (R4+off16),V17 // next in buffer
OR $0,R2,R2
VXOR V2,V10,V2 // xor in previous
VPMSUMD V18,const2,V10 // vpmsumd with constants
LVX (R4+off32),V18 // next in buffer
OR $0,R2,R2
VXOR V3,V11,V3 // xor in previous
VPMSUMD V19,const2,V11 // vpmsumd with constants
LVX (R4+off48),V19 // next in buffer
LVX (R3),const2 // get next constant
OR $0,R2,R2
VXOR V4,V12,V4 // xor in previous
VPMSUMD V20,const1,V12 // vpmsumd with constants
LVX (R4+off64),V20 // next in buffer
OR $0,R2,R2
VXOR V5,V13,V5 // xor in previous
VPMSUMD V21,const1,V13 // vpmsumd with constants
LVX (R4+off80),V21 // next in buffer
OR $0,R2,R2
VXOR V6,V14,V6 // xor in previous
VPMSUMD V22,const1,V14 // vpmsumd with constants
LVX (R4+off96),V22 // next in buffer
OR $0,R2,R2
VXOR V7,V15,V7 // xor in previous
VPMSUMD V23,const1,V15 // vpmsumd with constants
LVX (R4+off112),V23 // next in buffer
ADD $128,R4 // bump up buffer pointer
BC 16,0,cool_top // are we done?
first_cool_down:
// load the constants
// xor in the previous value
// vpmsumd the result with constants
LVX (R3),const1
ADD $16,R3
VXOR V0,V8,V0
VPMSUMD V16,const1,V8
OR $0,R2,R2
VXOR V1,V9,V1
VPMSUMD V17,const1,V9
OR $0,R2,R2
VXOR V2,V10,V2
VPMSUMD V18,const1,V10
OR $0,R2,R2
VXOR V3,V11,V3
VPMSUMD V19,const1,V11
OR $0,R2,R2
VXOR V4,V12,V4
VPMSUMD V20,const1,V12
OR $0,R2,R2
VXOR V5,V13,V5
VPMSUMD V21,const1,V13
OR $0,R2,R2
VXOR V6,V14,V6
VPMSUMD V22,const1,V14
OR $0,R2,R2
VXOR V7,V15,V7
VPMSUMD V23,const1,V15
OR $0,R2,R2
second_cool_down:
VXOR V0,V8,V0
VXOR V1,V9,V1
VXOR V2,V10,V2
VXOR V3,V11,V3
VXOR V4,V12,V4
VXOR V5,V13,V5
VXOR V6,V14,V6
VXOR V7,V15,V7
#ifdef REFLECT
VSLDOI $4,V0,zeroes,V0
VSLDOI $4,V1,zeroes,V1
VSLDOI $4,V2,zeroes,V2
VSLDOI $4,V3,zeroes,V3
VSLDOI $4,V4,zeroes,V4
VSLDOI $4,V5,zeroes,V5
VSLDOI $4,V6,zeroes,V6
VSLDOI $4,V7,zeroes,V7
#endif
LVX (R4),V8
LVX (R4+off16),V9
LVX (R4+off32),V10
LVX (R4+off48),V11
LVX (R4+off64),V12
LVX (R4+off80),V13
LVX (R4+off96),V14
LVX (R4+off112),V15
ADD $128,R4
VXOR V0,V8,V16
VXOR V1,V9,V17
VXOR V2,V10,V18
VXOR V3,V11,V19
VXOR V4,V12,V20
VXOR V5,V13,V21
VXOR V6,V14,V22
VXOR V7,V15,V23
MOVD $1,R15
CMP $0,R6
ADD $128,R6
BNE l1
ANDCC $127,R5
SUBC R5,$128,R6
ADD R3,R6,R3
SRD $4,R5,R7
MOVD R7,CTR
LVX (R3),V0
LVX (R3+off16),V1
LVX (R3+off32),V2
LVX (R3+off48),V3
LVX (R3+off64),V4
LVX (R3+off80),V5
LVX (R3+off96),V6
LVX (R3+off112),V7
ADD $128,R3
VPMSUMW V16,V0,V0
VPMSUMW V17,V1,V1
VPMSUMW V18,V2,V2
VPMSUMW V19,V3,V3
VPMSUMW V20,V4,V4
VPMSUMW V21,V5,V5
VPMSUMW V22,V6,V6
VPMSUMW V23,V7,V7
// now reduce the tail
CMP $0,R7
BEQ next1
LVX (R4),V16
LVX (R3),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off16),V16
LVX (R3+off16),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off32),V16
LVX (R3+off32),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off48),V16
LVX (R3+off48),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off64),V16
LVX (R3+off64),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off80),V16
LVX (R3+off80),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
BC 18,0,next1
LVX (R4+off96),V16
LVX (R3+off96),V17
VPMSUMW V16,V17,V16
VXOR V0,V16,V0
next1:
VXOR V0,V1,V0
VXOR V2,V3,V2
VXOR V4,V5,V4
VXOR V6,V7,V6
VXOR V0,V2,V0
VXOR V4,V6,V4
VXOR V0,V4,V0
barrett_reduction:
CMP R14,$1
BNE barcstTable
MOVD IEEEBarConst(SB),R3
BR startbarConst
barcstTable:
MOVD CastBarConst(SB),R3
startbarConst:
LVX (R3),const1
LVX (R3+off16),const2
VSLDOI $8,V0,V0,V1
VXOR V0,V1,V0
#ifdef REFLECT
VSPLTISB $1,V1
VSL V0,V1,V0
#endif
VAND V0,mask_64bit,V0
#ifndef REFLECT
VPMSUMD V0,const1,V1
VSLDOI $8,zeroes,V1,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $8,V0,zeroes,V0
#else
VAND V0,mask_32bit,V1
VPMSUMD V1,const1,V1
VAND V1,mask_32bit,V1
VPMSUMD V1,const2,V1
VXOR V0,V1,V0
VSLDOI $4,V0,zeroes,V0
#endif
MFVSRD VS32,R3 // VS32 = V0
NOR R3,R3,R3 // return ^crc
MOVW R3,ret+32(FP)
RET
first_warm_up_done:
LVX (R3),const1
ADD $16,R3
VPMSUMD V16,const1,V8
VPMSUMD V17,const1,V9
VPMSUMD V18,const1,V10
VPMSUMD V19,const1,V11
VPMSUMD V20,const1,V12
VPMSUMD V21,const1,V13
VPMSUMD V22,const1,V14
VPMSUMD V23,const1,V15
BR second_cool_down
short:
CMP $0,R5
BEQ zero
// compute short constants
CMP R14,$1
BNE castshTable
MOVD IEEEConst(SB),R3
ADD $4080,R3
BR startshConst
castshTable:
MOVD CastConst(SB),R3
ADD $4080,R3
startshConst:
SUBC R5,$256,R6 // sub from 256
ADD R3,R6,R3
// calculate where to start
SRD $4,R5,R7
MOVD R7,CTR
VXOR V19,V19,V19
VXOR V20,V20,V20
LVX (R4),V0
LVX (R3),V16
VXOR V0,V8,V0
VPMSUMW V0,V16,V0
BC 18,0,v0
LVX (R4+off16),V1
LVX (R3+off16),V17
VPMSUMW V1,V17,V1
BC 18,0,v1
LVX (R4+off32),V2
LVX (R3+off32),V16
VPMSUMW V2,V16,V2
BC 18,0,v2
LVX (R4+off48),V3
LVX (R3+off48),V17
VPMSUMW V3,V17,V3
BC 18,0,v3
LVX (R4+off64),V4
LVX (R3+off64),V16
VPMSUMW V4,V16,V4
BC 18,0,v4
LVX (R4+off80),V5
LVX (R3+off80),V17
VPMSUMW V5,V17,V5
BC 18,0,v5
LVX (R4+off96),V6
LVX (R3+off96),V16
VPMSUMW V6,V16,V6
BC 18,0,v6
LVX (R4+off112),V7
LVX (R3+off112),V17
VPMSUMW V7,V17,V7
BC 18,0,v7
ADD $128,R3
ADD $128,R4
LVX (R4),V8
LVX (R3),V16
VPMSUMW V8,V16,V8
BC 18,0,v8
LVX (R4+off16),V9
LVX (R3+off16),V17
VPMSUMW V9,V17,V9
BC 18,0,v9
LVX (R4+off32),V10
LVX (R3+off32),V16
VPMSUMW V10,V16,V10
BC 18,0,v10
LVX (R4+off48),V11
LVX (R3+off48),V17
VPMSUMW V11,V17,V11
BC 18,0,v11
LVX (R4+off64),V12
LVX (R3+off64),V16
VPMSUMW V12,V16,V12
BC 18,0,v12
LVX (R4+off80),V13
LVX (R3+off80),V17
VPMSUMW V13,V17,V13
BC 18,0,v13
LVX (R4+off96),V14
LVX (R3+off96),V16
VPMSUMW V14,V16,V14
BC 18,0,v14
LVX (R4+off112),V15
LVX (R3+off112),V17
VPMSUMW V15,V17,V15
VXOR V19,V15,V19
v14: VXOR V20,V14,V20
v13: VXOR V19,V13,V19
v12: VXOR V20,V12,V20
v11: VXOR V19,V11,V19
v10: VXOR V20,V10,V20
v9: VXOR V19,V9,V19
v8: VXOR V20,V8,V20
v7: VXOR V19,V7,V19
v6: VXOR V20,V6,V20
v5: VXOR V19,V5,V19
v4: VXOR V20,V4,V20
v3: VXOR V19,V3,V19
v2: VXOR V20,V2,V20
v1: VXOR V19,V1,V19
v0: VXOR V20,V0,V20
VXOR V19,V20,V0
BR barrett_reduction
zero:
// This case is the original crc, so just return it
MOVW R10,ret+32(FP)
RET
(diff of the generated constants file crc32_table_ppc64le.s omitted: too large to display)
@@ -76,8 +76,9 @@ func testCrossCheck(t *testing.T, crcFunc1, crcFunc2 func(crc uint32, b []byte)
// The AMD64 implementation has some cutoffs at lengths 168*3=504 and
// 1344*3=4032. We should make sure lengths around these values are in the
// list.
-	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 100, 128,
-		500, 501, 502, 503, 504, 505, 512, 1000, 1024, 2000,
+	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 63, 64, 65, 100,
+		127, 128, 129, 255, 256, 257, 300, 312, 384, 416, 448, 480,
+		500, 501, 502, 503, 504, 505, 512, 513, 1000, 1024, 2000,
4030, 4031, 4032, 4033, 4036, 4040, 4048, 4096, 5000, 10000}
for _, length := range lengths {
p := make([]byte, length)
......
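
The added lengths exercise every combination of unaligned head, vector body,
and scalar tail around the 16-byte alignment and 64-byte (4*vecMinLen)
cutoffs. The heart of the test is a cross-check that two implementations
agree; in sketch form (crossCheck is a hypothetical helper, the real wiring
lives in testCrossCheck above):

// crossCheck reports whether two update functions agree on every prefix
// of p, covering all head/body/tail splits in the vector path.
func crossCheck(update1, update2 func(crc uint32, p []byte) uint32, p []byte) bool {
	for n := 0; n <= len(p); n++ {
		if update1(0, p[:n]) != update2(0, p[:n]) {
			return false
		}
	}
	return true
}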
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Generate the constant table associated with the poly used by the
// vpmsumd crc32 algorithm.
//
// go run gen_const_ppc64le.go
//
// generates crc32_table_ppc64le.s
// The following is derived from code written by Anton Blanchard
// <anton@au.ibm.com> found at https://github.com/antonblanchard/crc32-vpmsum.
// The original is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under the golang license.
// This code was written in Go based on the original C implementation.
// This is a tool needed to generate the appropriate constants needed for
// the vpmsum algorithm. It is included to generate new constant tables if
// new polynomial values are included in the future.
package main
import (
"bytes"
"fmt"
"io/ioutil"
)
var blocking = 32 * 1024
func reflect_bits(b uint64, nr uint) uint64 {
var ref uint64
for bit := uint64(0); bit < uint64(nr); bit++ {
if (b & uint64(1)) == 1 {
ref |= (1 << (uint64(nr-1) - bit))
}
b = (b >> 1)
}
return ref
}
func get_remainder(poly uint64, deg uint, n uint) uint64 {
rem, _ := xnmodp(n, poly, deg)
return rem
}
func get_quotient(poly uint64, bits, n uint) uint64 {
_, div := xnmodp(n, poly, bits)
return div
}
// xnmodp returns two values, mod and div:
// mod is the binary representation of the polynomial x**n mod (x**deg + "poly"),
// that is, the modulus polynomial without its highest-order term.
// div is the binary representation of the polynomial x**n / (x**deg + "poly").
func xnmodp(n uint, poly uint64, deg uint) (uint64, uint64) {
var mod, mask, high, div uint64
if n < deg {
div = 0
return poly, div
}
mask = 1<<deg - 1
poly &= mask
mod = poly
div = 1
deg--
n--
for n > deg {
high = (mod >> deg) & 1
div = (div << 1) | high
mod <<= 1
if high != 0 {
mod ^= poly
}
n--
}
return mod & mask, div
}
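
// A worked example of xnmodp (values verified by long division over GF(2)):
// take the CRC-3 polynomial x^3 + x + 1, so deg = 3 and the low-order bits
// are 0b011. Then:
//
//	mod, div := xnmodp(5, 0b011, 3)
//	// mod == 0b111: x^5 mod (x^3+x+1) = x^2 + x + 1
//	// div == 0b101: x^5 / (x^3+x+1)  = x^2 + 1
//
// Likewise reflect_bits mirrors the low nr bits, e.g.
// reflect_bits(0b1101, 4) == 0b1011.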
func main() {
w := new(bytes.Buffer)
fmt.Fprintf(w, "// autogenerated: do not edit!\n")
fmt.Fprintf(w, "// generated from crc32/gen_const_ppc64le.go\n")
fmt.Fprintln(w)
fmt.Fprintf(w, "#include \"textflag.h\"\n")
// These are the polynomials supported in vector now.
// If adding others, include the polynomial and a name
// to identify it.
genCrc32ConstTable(w, 0xedb88320, "IEEE")
genCrc32ConstTable(w, 0x82f63b78, "Cast")
genCrc32ConstTable(w, 0xeb31d82e, "Koop")
b := w.Bytes()
err := ioutil.WriteFile("crc32_table_ppc64le.s", b, 0666)
if err != nil {
fmt.Printf("can't write output: %s\n", err)
}
}
func genCrc32ConstTable(w *bytes.Buffer, poly uint32, polyid string) {
ref_poly := reflect_bits(uint64(poly), 32)
fmt.Fprintf(w, "\n\t/* Reduce %d kbits to 1024 bits */\n", blocking*8)
j := 0
for i := (blocking * 8) - 1024; i > 0; i -= 1024 {
a := reflect_bits(get_remainder(ref_poly, 32, uint(i)), 32) << 1
b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32) << 1
fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s */\n", uint(i+64), "", uint(i), "")
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, j*8, b)
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, (j+1)*8, a)
j += 2
fmt.Fprintf(w, "\n")
}
for i := (1024 * 2) - 128; i >= 0; i -= 128 {
a := reflect_bits(get_remainder(ref_poly, 32, uint(i+32)), 32)
b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32)
c := reflect_bits(get_remainder(ref_poly, 32, uint(i+96)), 32)
d := reflect_bits(get_remainder(ref_poly, 32, uint(i+128)), 32)
fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s */\n", i+128, "", i+96, "", i+64, "", i+32, "")
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, j*8, c, d)
fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, (j+1)*8, a, b)
j += 2
fmt.Fprintf(w, "\n")
}
fmt.Fprintf(w, "GLOBL ·%sConst(SB),RODATA,$4336\n", polyid)
fmt.Fprintf(w, "\n /* Barrett constant m - (4^32)/n */\n")
fmt.Fprintf(w, "DATA ·%sBarConst(SB)/8,$0x%016x\n", polyid, reflect_bits(get_quotient(ref_poly, 32, 64), 33))
fmt.Fprintf(w, "DATA ·%sBarConst+8(SB)/8,$0x0000000000000000\n", polyid)
fmt.Fprintf(w, "DATA ·%sBarConst+16(SB)/8,$0x%016x\n", polyid, reflect_bits((uint64(1)<<32)|ref_poly, 33)) // reflected?
fmt.Fprintf(w, "DATA ·%sBarConst+24(SB)/8,$0x0000000000000000\n", polyid)
fmt.Fprintf(w, "GLOBL ·%sBarConst(SB),RODATA,$32\n", polyid)
}