Commit eeca3ba9 authored by Lynn Boger, committed by Michael Munday

sync/atomic, runtime/internal/atomic: improve ppc64x atomics

The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:

- For the sequences built around lwarx and stwcx (and the other sizes),
the old pattern was:
sync, lwarx, <optional op>, stwcx, loop back to the sync, sync, isync
The leading sync is moved before (outside) the lwarx/stwcx loop and the
trailing sync is removed (see the first sketch after this list), so it
becomes:
sync, lwarx, <optional op>, stwcx, loop back to the lwarx, isync

- For Or8 and And8, the shifting and masking used to address the
containing word-aligned location were removed, and the loops now use
lbarx, stbcx on the byte directly instead of register shifting, xor,
then lwarx, stwcx (see the second sketch after this list).

- New instructions LWSYNC, LBAR, STBCCC were tested and added.
runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
instead of the WORD encoding (see the encoding check after this list).
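
First sketch: the barrier change, assembled from the Xadd hunk further
down together with the loop structure described above. It is
illustrative only, not a verbatim copy of asm_ppc64x.s; R3/R4/R5 follow
the register use in that hunk.

old Xadd loop (retries from the sync, extra sync before the isync):
    SYNC
    LWAR   (R4), R3
    ADD    R5, R3
    STWCCC R3, (R4)
    BNE    -4(PC)     // branches back to the SYNC
    SYNC
    ISYNC

new Xadd loop (sync hoisted out of the loop, trailing sync dropped):
    SYNC
    LWAR   (R4), R3
    ADD    R5, R3
    STWCCC R3, (R4)
    BNE    -3(PC)     // branches back to the LWAR only
    ISYNC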
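
Second sketch: the new Or8 inner loop, taken from the Or8 hunk below
(R3 = ptr, R4 = val, R6 = scratch, as in that hunk); the barrier before
the loop is elided here, and And8 is identical apart from using AND.

again:
    LBAR   (R3), R6   // load the byte and take a reservation on it
    OR     R4, R6
    STBCCC R6, (R3)   // store the byte conditionally
    BNE    again
    ISYNC
    RET

There is no longer any need to round the pointer down to a word
boundary or to shift val into position within that word.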
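
Encoding check: a standalone Go sketch, not the assembler's actual
OPVCC helper. It only confirms that sync (primary opcode 31, extended
opcode 598) with the L field set produces the 0x7c2004ac word that the
publicationBarrier hunk replaces with the LWSYNC mnemonic.

package main

import "fmt"

func main() {
	sync := uint32(31<<26 | 598<<1) // base sync encoding, 0x7c0004ac
	lwsync := sync | 1<<21          // L=1 selects lwsync: 0x7c2004ac
	fmt.Printf("sync=%#x lwsync=%#x\n", sync, lwsync)
}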

Fixes #15469

Ran some of the benchmarks in the runtime and sync directories.
Results varied from run to run, but comparing the best times
(ns/op) for base and new shows an overall improvement:

runtime.test:
BenchmarkChanNonblocking-128         0.88          0.89          +1.14%
BenchmarkChanUncontended-128         569           511           -10.19%
BenchmarkChanContended-128           63110         53231         -15.65%
BenchmarkChanSync-128                691           598           -13.46%
BenchmarkChanSyncWork-128            11355         11649         +2.59%
BenchmarkChanProdCons0-128           2402          2090          -12.99%
BenchmarkChanProdCons10-128          1348          1363          +1.11%
BenchmarkChanProdCons100-128         1002          746           -25.55%
BenchmarkChanProdConsWork0-128       2554          2720          +6.50%
BenchmarkChanProdConsWork10-128      1909          1804          -5.50%
BenchmarkChanProdConsWork100-128     1624          1580          -2.71%
BenchmarkChanCreation-128            237           212           -10.55%
BenchmarkChanSem-128                 705           667           -5.39%
BenchmarkChanPopular-128             5081190       4497566       -11.49%

BenchmarkCreateGoroutines-128             532           473           -11.09%
BenchmarkCreateGoroutinesParallel-128     35.0          34.7          -0.86%
BenchmarkCreateGoroutinesCapture-128      4923          4200          -14.69%

sync.test:
BenchmarkUncontendedSemaphore-128      112           94.2          -15.89%
BenchmarkContendedSemaphore-128        133           128           -3.76%
BenchmarkMutexUncontended-128          1.90          1.67          -12.11%
BenchmarkMutex-128                     353           310           -12.18%
BenchmarkMutexSlack-128                304           283           -6.91%
BenchmarkMutexWork-128                 554           541           -2.35%
BenchmarkMutexWorkSlack-128            567           556           -1.94%
BenchmarkMutexNoSpin-128               275           242           -12.00%
BenchmarkMutexSpin-128                 1129          1030          -8.77%
BenchmarkOnce-128                      1.08          0.96          -11.11%
BenchmarkPool-128                      29.8          27.4          -8.05%
BenchmarkPoolOverflow-128              40564         36583         -9.81%
BenchmarkSemaUncontended-128           3.14          2.63          -16.24%
BenchmarkSemaSyntNonblock-128          1087          1069          -1.66%
BenchmarkSemaSyntBlock-128             897           893           -0.45%
BenchmarkSemaWorkNonblock-128          1034          1028          -0.58%
BenchmarkSemaWorkBlock-128             949           886           -6.64%

Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Minux Ma <minux@golang.org>
parent 0960c7c7
......@@ -342,8 +342,10 @@ const (
AFSUBS
AFSUBSCC
AMOVMW
ALBAR
ALSW
ALWAR
ALWSYNC
AMOVWBR
AMOVB
AMOVBU
......@@ -401,6 +403,7 @@ const (
ASRAW
ASRAWCC
ASRWCC
ASTBCCC
ASTSW
ASTWCCC
ASUB
......
......@@ -118,8 +118,10 @@ var Anames = []string{
"FSUBS",
"FSUBSCC",
"MOVMW",
"LBAR",
"LSW",
"LWAR",
"LWSYNC",
"MOVWBR",
"MOVB",
"MOVBU",
......@@ -177,6 +179,7 @@ var Anames = []string{
"SRAW",
"SRAWCC",
"SRWCC",
"STBCCC",
"STSW",
"STWCCC",
"SUB",
......
......@@ -933,6 +933,7 @@ func buildop(ctxt *obj.Link) {
case AECOWX: /* indexed store: op s,(b+a); op s,(b) */
opset(ASTWCCC, r0)
opset(ASTBCCC, r0)
opset(ASTDCCC, r0)
......@@ -1202,6 +1203,7 @@ func buildop(ctxt *obj.Link) {
case ASYNC:
opset(AISYNC, r0)
opset(ALWSYNC, r0)
opset(APTESYNC, r0)
opset(ATLBSYNC, r0)
......@@ -1228,6 +1230,7 @@ func buildop(ctxt *obj.Link) {
opset(AFMOVSU, r0)
case AECIWX:
opset(ALBAR, r0)
opset(ALWAR, r0)
opset(ALDAR, r0)
......@@ -3001,6 +3004,9 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
case ASYNC:
return OPVCC(31, 598, 0, 0)
case ALWSYNC:
return OPVCC(31, 598, 0, 0) | 1<<21
case APTESYNC:
return OPVCC(31, 598, 0, 0) | 2<<21
......@@ -3246,6 +3252,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 311, 0, 0) /* lhzux */
case AECIWX:
return OPVCC(31, 310, 0, 0) /* eciwx */
case ALBAR:
return OPVCC(31, 52, 0, 0) /* lbarx */
case ALWAR:
return OPVCC(31, 20, 0, 0) /* lwarx */
case ALDAR:
......@@ -3342,6 +3350,8 @@ func opstorex(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 661, 0, 0) /* stswx */
case AMOVWBR:
return OPVCC(31, 662, 0, 0) /* stwbrx */
case ASTBCCC:
return OPVCC(31, 694, 0, 1) /* stbcx. */
case ASTWCCC:
return OPVCC(31, 150, 0, 1) /* stwcx. */
case ASTDCCC:
......
......@@ -301,6 +301,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
}
case ALWAR,
ALBAR,
ASTBCCC,
ASTWCCC,
AECIWX,
AECOWX,
......@@ -323,6 +325,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
ASYNC,
ATLBSYNC,
APTESYNC,
ALWSYNC,
ATW,
AWORD,
ARFI,
......
......@@ -10,5 +10,5 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
// LWSYNC is the "export" barrier recommended by Power ISA
// v2.07 book II, appendix B.2.2.2.
// LWSYNC is a load/load, load/store, and store/store barrier.
WORD $0x7c2004ac // LWSYNC
LWSYNC
RET
......@@ -17,21 +17,20 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
MOVD ptr+0(FP), R3
MOVWZ old+8(FP), R4
MOVWZ new+12(FP), R5
cas_again:
SYNC
cas_again:
LWAR (R3), R6
CMPW R6, R4
BNE cas_fail
STWCCC R5, (R3)
BNE cas_again
MOVD $1, R3
SYNC
ISYNC
MOVB R3, ret+16(FP)
RET
cas_fail:
MOVD $0, R3
BR -5(PC)
MOVB R0, ret+16(FP)
RET
// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
// Atomically:
......@@ -45,21 +44,20 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
MOVD ptr+0(FP), R3
MOVD old+8(FP), R4
MOVD new+16(FP), R5
cas64_again:
SYNC
cas64_again:
LDAR (R3), R6
CMP R6, R4
BNE cas64_fail
STDCCC R5, (R3)
BNE cas64_again
MOVD $1, R3
SYNC
ISYNC
MOVB R3, ret+24(FP)
RET
cas64_fail:
MOVD $0, R3
BR -5(PC)
MOVB R0, ret+24(FP)
RET
TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
BR runtime∕internal∕atomic·Cas64(SB)
......@@ -103,8 +101,7 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
LWAR (R4), R3
ADD R5, R3
STWCCC R3, (R4)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
......@@ -116,8 +113,7 @@ TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
LDAR (R4), R3
ADD R5, R3
STDCCC R3, (R4)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
......@@ -128,8 +124,7 @@ TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
SYNC
LWAR (R4), R3
STWCCC R5, (R4)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
......@@ -140,8 +135,7 @@ TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
SYNC
LDAR (R4), R3
STDCCC R5, (R4)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
......@@ -171,26 +165,12 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
// R5 = (R3 << 0) & ~3
RLDCR $0, R3, $~3, R5
// Compute val shift.
#ifdef GOARCH_ppc64
// Big endian. ptr = ptr ^ 3
XOR $3, R3
#endif
// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
RLDC $3, R3, $(3*8), R6
// Shift val for aligned ptr. R4 = val << R6
SLD R6, R4, R4
again:
SYNC
LWAR (R5), R6
again:
LBAR (R3), R6
OR R4, R6
STWCCC R6, (R5)
STBCCC R6, (R3)
BNE again
SYNC
ISYNC
RET
......@@ -198,28 +178,11 @@ again:
TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
// R5 = (R3 << 0) & ~3
RLDCR $0, R3, $~3, R5
// Compute val shift.
#ifdef GOARCH_ppc64
// Big endian. ptr = ptr ^ 3
XOR $3, R3
#endif
// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
RLDC $3, R3, $(3*8), R6
// Shift val for aligned ptr. R4 = val << R6 | ^(0xFF << R6)
MOVD $0xFF, R7
SLD R6, R4
SLD R6, R7
XOR $-1, R7
OR R7, R4
again:
SYNC
LWAR (R5), R6
again:
LBAR (R3), R6
AND R4, R6
STWCCC R6, (R5)
STBCCC R6, (R3)
BNE again
SYNC
ISYNC
RET
......@@ -15,8 +15,7 @@ TEXT ·SwapUint32(SB),NOSPLIT,$0-20
SYNC
LWAR (R3), R5
STWCCC R4, (R3)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVW R5, old+16(FP)
RET
......@@ -30,8 +29,7 @@ TEXT ·SwapUint64(SB),NOSPLIT,$0-24
SYNC
LDAR (R3), R5
STDCCC R4, (R3)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVD R5, old+16(FP)
RET
......@@ -49,10 +47,9 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
SYNC
LWAR (R3), R6
CMPW R6, R4
BNE 8(PC)
BNE 7(PC)
STWCCC R5, (R3)
BNE -5(PC)
SYNC
BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+16(FP)
......@@ -73,10 +70,9 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
SYNC
LDAR (R3), R6
CMP R6, R4
BNE 8(PC)
BNE 7(PC)
STDCCC R5, (R3)
BNE -5(PC)
SYNC
BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+24(FP)
......@@ -94,8 +90,7 @@ TEXT ·AddUint32(SB),NOSPLIT,$0-20
LWAR (R3), R5
ADD R4, R5
STWCCC R5, (R3)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVW R5, ret+16(FP)
RET
......@@ -113,8 +108,7 @@ TEXT ·AddUint64(SB),NOSPLIT,$0-24
LDAR (R3), R5
ADD R4, R5
STDCCC R5, (R3)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVD R5, ret+16(FP)
RET
......