Commit eeca3ba9 authored by Lynn Boger, committed by Michael Munday

sync/atomic, runtime/internal/atomic: improve ppc64x atomics

The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:

- For the sequences built around lwarx and stwcx (and the other sizes),
the old pattern was:
sync, lwarx, <optional op>, stwcx, loop back to the sync, sync, isync
The leading sync is moved before (outside) the lwarx/stwcx loop and the
trailing sync is removed (see the first sketch after this list), so it
becomes:
sync, lwarx, <optional op>, stwcx, loop back to the lwarx, isync

- For Or8 and And8, the shifting and masking used to address the
containing word-aligned location were removed, and the loops now use
lbarx, stbcx on the byte directly instead of register shifting, xor,
then lwarx, stwcx (see the second sketch after this list).

- New instructions LWSYNC, LBAR, STBCCC were tested and added.
runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
instead of the WORD encoding (see the encoding check after this list).
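
First sketch: the barrier change, assembled from the Xadd hunk further
down together with the loop structure described above. It is
illustrative only, not a verbatim copy of asm_ppc64x.s; R3/R4/R5 follow
the register use in that hunk.

old Xadd loop (retries from the sync, extra sync before the isync):
    SYNC
    LWAR   (R4), R3
    ADD    R5, R3
    STWCCC R3, (R4)
    BNE    -4(PC)     // branches back to the SYNC
    SYNC
    ISYNC

new Xadd loop (sync hoisted out of the loop, trailing sync dropped):
    SYNC
    LWAR   (R4), R3
    ADD    R5, R3
    STWCCC R3, (R4)
    BNE    -3(PC)     // branches back to the LWAR only
    ISYNC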
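
Second sketch: the new Or8 inner loop, taken from the Or8 hunk below
(R3 = ptr, R4 = val, R6 = scratch, as in that hunk); the barrier before
the loop is elided here, and And8 is identical apart from using AND.

again:
    LBAR   (R3), R6   // load the byte and take a reservation on it
    OR     R4, R6
    STBCCC R6, (R3)   // store the byte conditionally
    BNE    again
    ISYNC
    RET

There is no longer any need to round the pointer down to a word
boundary or to shift val into position within that word.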
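
Encoding check: a standalone Go sketch, not the assembler's actual
OPVCC helper. It only confirms that sync (primary opcode 31, extended
opcode 598) with the L field set produces the 0x7c2004ac word that the
publicationBarrier hunk replaces with the LWSYNC mnemonic.

package main

import "fmt"

func main() {
	sync := uint32(31<<26 | 598<<1) // base sync encoding, 0x7c0004ac
	lwsync := sync | 1<<21          // L=1 selects lwsync: 0x7c2004ac
	fmt.Printf("sync=%#x lwsync=%#x\n", sync, lwsync)
}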

Fixes #15469

Ran some of the benchmarks in the runtime and sync directories.
Results varied from run to run, but comparing the best times
(ns/op) for base and new shows an overall improvement:

runtime.test:
BenchmarkChanNonblocking-128         0.88          0.89          +1.14%
BenchmarkChanUncontended-128         569           511           -10.19%
BenchmarkChanContended-128           63110         53231         -15.65%
BenchmarkChanSync-128                691           598           -13.46%
BenchmarkChanSyncWork-128            11355         11649         +2.59%
BenchmarkChanProdCons0-128           2402          2090          -12.99%
BenchmarkChanProdCons10-128          1348          1363          +1.11%
BenchmarkChanProdCons100-128         1002          746           -25.55%
BenchmarkChanProdConsWork0-128       2554          2720          +6.50%
BenchmarkChanProdConsWork10-128      1909          1804          -5.50%
BenchmarkChanProdConsWork100-128     1624          1580          -2.71%
BenchmarkChanCreation-128            237           212           -10.55%
BenchmarkChanSem-128                 705           667           -5.39%
BenchmarkChanPopular-128             5081190       4497566       -11.49%

BenchmarkCreateGoroutines-128             532           473           -11.09%
BenchmarkCreateGoroutinesParallel-128     35.0          34.7          -0.86%
BenchmarkCreateGoroutinesCapture-128      4923          4200          -14.69%

sync.test:
BenchmarkUncontendedSemaphore-128      112           94.2          -15.89%
BenchmarkContendedSemaphore-128        133           128           -3.76%
BenchmarkMutexUncontended-128          1.90          1.67          -12.11%
BenchmarkMutex-128                     353           310           -12.18%
BenchmarkMutexSlack-128                304           283           -6.91%
BenchmarkMutexWork-128                 554           541           -2.35%
BenchmarkMutexWorkSlack-128            567           556           -1.94%
BenchmarkMutexNoSpin-128               275           242           -12.00%
BenchmarkMutexSpin-128                 1129          1030          -8.77%
BenchmarkOnce-128                      1.08          0.96          -11.11%
BenchmarkPool-128                      29.8          27.4          -8.05%
BenchmarkPoolOverflow-128              40564         36583         -9.81%
BenchmarkSemaUncontended-128           3.14          2.63          -16.24%
BenchmarkSemaSyntNonblock-128          1087          1069          -1.66%
BenchmarkSemaSyntBlock-128             897           893           -0.45%
BenchmarkSemaWorkNonblock-128          1034          1028          -0.58%
BenchmarkSemaWorkBlock-128             949           886           -6.64%

Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Minux Ma <minux@golang.org>
parent 0960c7c7
......@@ -342,8 +342,10 @@ const (
AFSUBS
AFSUBSCC
AMOVMW
ALBAR
ALSW
ALWAR
ALWSYNC
AMOVWBR
AMOVB
AMOVBU
......@@ -401,6 +403,7 @@ const (
ASRAW
ASRAWCC
ASRWCC
ASTBCCC
ASTSW
ASTWCCC
ASUB
......
......@@ -118,8 +118,10 @@ var Anames = []string{
"FSUBS",
"FSUBSCC",
"MOVMW",
"LBAR",
"LSW",
"LWAR",
"LWSYNC",
"MOVWBR",
"MOVB",
"MOVBU",
......@@ -177,6 +179,7 @@ var Anames = []string{
"SRAW",
"SRAWCC",
"SRWCC",
"STBCCC",
"STSW",
"STWCCC",
"SUB",
......
......@@ -933,6 +933,7 @@ func buildop(ctxt *obj.Link) {
case AECOWX: /* indexed store: op s,(b+a); op s,(b) */
opset(ASTWCCC, r0)
opset(ASTBCCC, r0)
opset(ASTDCCC, r0)
......@@ -1202,6 +1203,7 @@ func buildop(ctxt *obj.Link) {
case ASYNC:
opset(AISYNC, r0)
opset(ALWSYNC, r0)
opset(APTESYNC, r0)
opset(ATLBSYNC, r0)
......@@ -1228,6 +1230,7 @@ func buildop(ctxt *obj.Link) {
opset(AFMOVSU, r0)
case AECIWX:
opset(ALBAR, r0)
opset(ALWAR, r0)
opset(ALDAR, r0)
......@@ -3001,6 +3004,9 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
case ASYNC:
return OPVCC(31, 598, 0, 0)
case ALWSYNC:
return OPVCC(31, 598, 0, 0) | 1<<21
case APTESYNC:
return OPVCC(31, 598, 0, 0) | 2<<21
......@@ -3246,6 +3252,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 311, 0, 0) /* lhzux */
case AECIWX:
return OPVCC(31, 310, 0, 0) /* eciwx */
case ALBAR:
return OPVCC(31, 52, 0, 0) /* lbarx */
case ALWAR:
return OPVCC(31, 20, 0, 0) /* lwarx */
case ALDAR:
......@@ -3342,6 +3350,8 @@ func opstorex(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 661, 0, 0) /* stswx */
case AMOVWBR:
return OPVCC(31, 662, 0, 0) /* stwbrx */
case ASTBCCC:
return OPVCC(31, 694, 0, 1) /* stbcx. */
case ASTWCCC:
return OPVCC(31, 150, 0, 1) /* stwcx. */
case ASTDCCC:
......
......@@ -301,6 +301,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
}
case ALWAR,
ALBAR,
ASTBCCC,
ASTWCCC,
AECIWX,
AECOWX,
......@@ -323,6 +325,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
ASYNC,
ATLBSYNC,
APTESYNC,
ALWSYNC,
ATW,
AWORD,
ARFI,
......
......@@ -10,5 +10,5 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
// LWSYNC is the "export" barrier recommended by Power ISA
// v2.07 book II, appendix B.2.2.2.
// LWSYNC is a load/load, load/store, and store/store barrier.
WORD $0x7c2004ac // LWSYNC
LWSYNC
RET
......@@ -17,21 +17,20 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
MOVD ptr+0(FP), R3
MOVWZ old+8(FP), R4
MOVWZ new+12(FP), R5
cas_again:
SYNC
cas_again:
LWAR (R3), R6
CMPW R6, R4
BNE cas_fail
STWCCC R5, (R3)
BNE cas_again
MOVD $1, R3
SYNC
ISYNC
MOVB R3, ret+16(FP)
RET
cas_fail:
MOVD $0, R3
BR -5(PC)
MOVB R0, ret+16(FP)
RET
// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
// Atomically:
......@@ -45,21 +44,20 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
MOVD ptr+0(FP), R3
MOVD old+8(FP), R4
MOVD new+16(FP), R5
cas64_again:
SYNC
cas64_again:
LDAR (R3), R6
CMP R6, R4
BNE cas64_fail
STDCCC R5, (R3)
BNE cas64_again
MOVD $1, R3
SYNC
ISYNC
MOVB R3, ret+24(FP)
RET
cas64_fail:
MOVD $0, R3
BR -5(PC)
MOVB R0, ret+24(FP)
RET
TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
BR runtime∕internal∕atomic·Cas64(SB)
......@@ -103,8 +101,7 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
LWAR (R4), R3
ADD R5, R3
STWCCC R3, (R4)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
......@@ -116,8 +113,7 @@ TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
LDAR (R4), R3
ADD R5, R3
STDCCC R3, (R4)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
......@@ -128,8 +124,7 @@ TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
SYNC
LWAR (R4), R3
STWCCC R5, (R4)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
......@@ -140,8 +135,7 @@ TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
SYNC
LDAR (R4), R3
STDCCC R5, (R4)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
......@@ -171,26 +165,12 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
// R5 = (R3 << 0) & ~3
RLDCR $0, R3, $~3, R5
// Compute val shift.
#ifdef GOARCH_ppc64
// Big endian. ptr = ptr ^ 3
XOR $3, R3
#endif
// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
RLDC $3, R3, $(3*8), R6
// Shift val for aligned ptr. R4 = val << R6
SLD R6, R4, R4
again:
SYNC
LWAR (R5), R6
again:
LBAR (R3), R6
OR R4, R6
STWCCC R6, (R5)
STBCCC R6, (R3)
BNE again
SYNC
ISYNC
RET
......@@ -198,28 +178,11 @@ again:
TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
// R5 = (R3 << 0) & ~3
RLDCR $0, R3, $~3, R5
// Compute val shift.
#ifdef GOARCH_ppc64
// Big endian. ptr = ptr ^ 3
XOR $3, R3
#endif
// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
RLDC $3, R3, $(3*8), R6
// Shift val for aligned ptr. R4 = val << R6 | ^(0xFF << R6)
MOVD $0xFF, R7
SLD R6, R4
SLD R6, R7
XOR $-1, R7
OR R7, R4
again:
SYNC
LWAR (R5), R6
again:
LBAR (R3), R6
AND R4, R6
STWCCC R6, (R5)
STBCCC R6, (R3)
BNE again
SYNC
ISYNC
RET
......@@ -15,8 +15,7 @@ TEXT ·SwapUint32(SB),NOSPLIT,$0-20
SYNC
LWAR (R3), R5
STWCCC R4, (R3)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVW R5, old+16(FP)
RET
......@@ -30,8 +29,7 @@ TEXT ·SwapUint64(SB),NOSPLIT,$0-24
SYNC
LDAR (R3), R5
STDCCC R4, (R3)
BNE -3(PC)
SYNC
BNE -2(PC)
ISYNC
MOVD R5, old+16(FP)
RET
......@@ -49,10 +47,9 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
SYNC
LWAR (R3), R6
CMPW R6, R4
BNE 8(PC)
BNE 7(PC)
STWCCC R5, (R3)
BNE -5(PC)
SYNC
BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+16(FP)
......@@ -73,10 +70,9 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
SYNC
LDAR (R3), R6
CMP R6, R4
BNE 8(PC)
BNE 7(PC)
STDCCC R5, (R3)
BNE -5(PC)
SYNC
BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+24(FP)
......@@ -94,8 +90,7 @@ TEXT ·AddUint32(SB),NOSPLIT,$0-20
LWAR (R3), R5
ADD R4, R5
STWCCC R5, (R3)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVW R5, ret+16(FP)
RET
......@@ -113,8 +108,7 @@ TEXT ·AddUint64(SB),NOSPLIT,$0-24
LDAR (R3), R5
ADD R4, R5
STDCCC R5, (R3)
BNE -4(PC)
SYNC
BNE -3(PC)
ISYNC
MOVD R5, ret+16(FP)
RET
......