Commit 2c3956c9 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] ppc64: strengthen I/O and memory barriers

From: Paul Mackerras <paulus@samba.org>

After I sent the recent patch to include/asm-ppc64/io.h which put stronger
barriers in the I/O accessor macros, Paul McKenney pointed out to me that a
writex/outx could still slide out from inside a spinlocked region.  This patch
makes the barriers a bit stronger so that this can't happen.  It means that we
need to use a sync instruction for wmb (a full "heavyweight" sync), since
drivers rely on wmb for ordering between writes to system memory and writes to
a device.
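
As an illustration (a hypothetical driver fragment, not part of this patch --
the device, register and field names are made up, but writel(), wmb() and the
spinlock calls are the usual kernel interfaces):

	spin_lock_irqsave(&dev->lock, flags);
	desc->addr = dma_addr;		/* cacheable stores to system memory */
	desc->len = len;
	wmb();				/* descriptor must be visible before the doorbell */
	writel(DOORBELL_GO, dev->regs + DOORBELL);	/* non-cacheable store to the device */
	spin_unlock_irqrestore(&dev->lock, flags);

With wmb() implemented as eieio, the cacheable descriptor stores and the
non-cacheable doorbell store are not ordered with respect to each other, and
without a trailing sync in the accessor the MMIO store could also slide past
the spin_unlock; using sync in both places closes those holes.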

I have left smp_wmb() as a lighter-weight barrier (the amusingly-named eieio
instruction) that orders stores but doesn't impose any ordering between cacheable
and non-cacheable accesses.  I am assuming here that smp_wmb is only
used for ordering stores to system memory so that another cpu will see them in
order.  It can't be used for enforcing any ordering that a device will see,
because it is just a gcc barrier on UP.
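
A made-up example of the only kind of use smp_wmb() is still meant for --
ordering two stores to cacheable memory so that another CPU sees them in
order (the shared structure and its fields are hypothetical):

	/* producer, CPU 0 */
	shared->data = value;
	smp_wmb();		/* order data before ready */
	shared->ready = 1;

	/* consumer, CPU 1 */
	if (shared->ready) {
		smp_rmb();	/* pairs with the producer's smp_wmb() */
		val = shared->data;
	}

No device ever observes these stores, so eieio on SMP (and a plain compiler
barrier on UP) is sufficient.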

This also changes the spinlock/rwlock unlock code to use lwsync ("light-weight
sync") rather than eieio, since eieio doesn't order loads, and we need to
ensure that loads stay inside the spinlocked region.
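
A hypothetical sketch of the case this protects (counter and counter_lock are
made-up names):

	spin_lock(&counter_lock);
	old = counter;			/* load inside the critical section */
	counter = old + 1;		/* store inside the critical section */
	spin_unlock(&counter_lock);	/* now lwsync, then the store of 0 to the lock word */

With eieio in the unlock path only the store to counter was ordered before the
store that releases the lock; the load of counter could still be performed after
another CPU had already acquired the lock and updated counter.  lwsync orders
both the load and the store ahead of the releasing store.
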
parent 24f59760
@@ -316,6 +316,8 @@ _GLOBAL(_insb)
 	eieio
 	stbu r5,1(r4)
 	bdnz 00b
+	twi 0,r5,0
+	isync
 	blr
 _GLOBAL(_outsb)
@@ -325,8 +327,8 @@ _GLOBAL(_outsb)
 	blelr-
 00:	lbzu r5,1(r4)
 	stb r5,0(r3)
-	eieio
 	bdnz 00b
+	sync
 	blr
 _GLOBAL(_insw)
@@ -338,6 +340,8 @@ _GLOBAL(_insw)
 	eieio
 	sthu r5,2(r4)
 	bdnz 00b
+	twi 0,r5,0
+	isync
 	blr
 _GLOBAL(_outsw)
@@ -346,9 +350,9 @@ _GLOBAL(_outsw)
 	subi r4,r4,2
 	blelr-
 00:	lhzu r5,2(r4)
-	eieio
 	sthbrx r5,0,r3
 	bdnz 00b
+	sync
 	blr
 _GLOBAL(_insl)
@@ -360,6 +364,8 @@ _GLOBAL(_insl)
 	eieio
 	stwu r5,4(r4)
 	bdnz 00b
+	twi 0,r5,0
+	isync
 	blr
 _GLOBAL(_outsl)
@@ -369,8 +375,8 @@ _GLOBAL(_outsl)
 	blelr-
 00:	lwzu r5,4(r4)
 	stwbrx r5,0,r3
-	eieio
 	bdnz 00b
+	sync
 	blr
 /* _GLOBAL(ide_insw) now in drivers/ide/ide-iops.c */
@@ -383,6 +389,8 @@ _GLOBAL(_insw_ns)
 	eieio
 	sthu r5,2(r4)
 	bdnz 00b
+	twi 0,r5,0
+	isync
 	blr
 /* _GLOBAL(ide_outsw) now in drivers/ide/ide-iops.c */
@@ -393,8 +401,8 @@ _GLOBAL(_outsw_ns)
 	blelr-
 00:	lhzu r5,2(r4)
 	sth r5,0(r3)
-	eieio
 	bdnz 00b
+	sync
 	blr
 _GLOBAL(_insl_ns)
@@ -406,6 +414,8 @@ _GLOBAL(_insl_ns)
 	eieio
 	stwu r5,4(r4)
 	bdnz 00b
+	twi 0,r5,0
+	isync
 	blr
 _GLOBAL(_outsl_ns)
@@ -415,8 +425,8 @@ _GLOBAL(_outsl_ns)
 	blelr-
 00:	lwzu r5,4(r4)
 	stw r5,0(r3)
-	eieio
 	bdnz 00b
+	sync
 	blr
 _GLOBAL(abs)
......
@@ -240,14 +240,14 @@ static inline int in_8(volatile unsigned char *addr)
 {
 	int ret;
-	__asm__ __volatile__("eieio; lbz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("lbz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			: "=r" (ret) : "m" (*addr));
 	return ret;
 }
 static inline void out_8(volatile unsigned char *addr, int val)
 {
-	__asm__ __volatile__("sync; stb%U0%X0 %1,%0"
+	__asm__ __volatile__("stb%U0%X0 %1,%0; sync"
 			: "=m" (*addr) : "r" (val));
 }
@@ -255,7 +255,7 @@ static inline int in_le16(volatile unsigned short *addr)
 {
 	int ret;
-	__asm__ __volatile__("eieio; lhbrx %0,0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("lhbrx %0,0,%1; twi 0,%0,0; isync"
 			: "=r" (ret) : "r" (addr), "m" (*addr));
 	return ret;
 }
@@ -264,20 +264,20 @@ static inline int in_be16(volatile unsigned short *addr)
 {
 	int ret;
-	__asm__ __volatile__("eieio; lhz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("lhz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			: "=r" (ret) : "m" (*addr));
 	return ret;
 }
 static inline void out_le16(volatile unsigned short *addr, int val)
 {
-	__asm__ __volatile__("sync; sthbrx %1,0,%2"
+	__asm__ __volatile__("sthbrx %1,0,%2; sync"
 			: "=m" (*addr) : "r" (val), "r" (addr));
 }
 static inline void out_be16(volatile unsigned short *addr, int val)
 {
-	__asm__ __volatile__("sync; sth%U0%X0 %1,%0"
+	__asm__ __volatile__("sth%U0%X0 %1,%0; sync"
 			: "=m" (*addr) : "r" (val));
 }
@@ -285,7 +285,7 @@ static inline unsigned in_le32(volatile unsigned *addr)
 {
 	unsigned ret;
-	__asm__ __volatile__("eieio; lwbrx %0,0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("lwbrx %0,0,%1; twi 0,%0,0; isync"
 			: "=r" (ret) : "r" (addr), "m" (*addr));
 	return ret;
 }
@@ -294,20 +294,20 @@ static inline unsigned in_be32(volatile unsigned *addr)
 {
 	unsigned ret;
-	__asm__ __volatile__("eieio; lwz%U1%X1 %0,%1; twi 0,%0,0; isync"
+	__asm__ __volatile__("lwz%U1%X1 %0,%1; twi 0,%0,0; isync"
 			: "=r" (ret) : "m" (*addr));
 	return ret;
 }
 static inline void out_le32(volatile unsigned *addr, int val)
 {
-	__asm__ __volatile__("sync; stwbrx %1,0,%2" : "=m" (*addr)
+	__asm__ __volatile__("stwbrx %1,0,%2; sync" : "=m" (*addr)
 			: "r" (val), "r" (addr));
 }
 static inline void out_be32(volatile unsigned *addr, int val)
 {
-	__asm__ __volatile__("sync; stw%U0%X0 %1,%0; eieio"
+	__asm__ __volatile__("stw%U0%X0 %1,%0; eieio"
 			: "=m" (*addr) : "r" (val));
 }
@@ -316,7 +316,7 @@ static inline unsigned long in_le64(volatile unsigned long *addr)
 	unsigned long tmp, ret;
 	__asm__ __volatile__(
-		"eieio; ld %1,0(%2)\n"
+		"ld %1,0(%2)\n"
 		"twi 0,%1,0\n"
 		"isync\n"
 		"rldimi %0,%1,5*8,1*8\n"
@@ -334,7 +334,7 @@ static inline unsigned long in_be64(volatile unsigned long *addr)
 {
 	unsigned long ret;
-	__asm__ __volatile__("eieio; ld %0,0(%1); twi 0,%0,0; isync"
+	__asm__ __volatile__("ld %0,0(%1); twi 0,%0,0; isync"
 			: "=r" (ret) : "m" (*addr));
 	return ret;
 }
@@ -351,13 +351,14 @@ static inline void out_le64(volatile unsigned long *addr, int val)
 		"rldicl %1,%1,32,0\n"
 		"rlwimi %0,%1,8,8,31\n"
 		"rlwimi %0,%1,24,16,23\n"
-		"sync; std %0,0(%2)\n"
+		"std %0,0(%2)\n"
+		"sync"
 		: "=r" (tmp) : "r" (val), "b" (addr) , "m" (*addr));
 }
 static inline void out_be64(volatile unsigned long *addr, int val)
 {
-	__asm__ __volatile__("sync; std %1,0(%0)" : "=m" (*addr) : "r" (val));
+	__asm__ __volatile__("std %1,0(%0); sync" : "=m" (*addr) : "r" (val));
 }
 #ifndef CONFIG_PPC_ISERIES
......
@@ -28,7 +28,7 @@ typedef struct {
 static __inline__ void _raw_spin_unlock(spinlock_t *lock)
 {
-	__asm__ __volatile__("eieio # spin_unlock": : :"memory");
+	__asm__ __volatile__("lwsync # spin_unlock": : :"memory");
 	lock->lock = 0;
 }
@@ -159,7 +159,7 @@ static __inline__ int is_write_locked(rwlock_t *rw)
 static __inline__ void _raw_write_unlock(rwlock_t *rw)
 {
-	__asm__ __volatile__("eieio # write_unlock": : :"memory");
+	__asm__ __volatile__("lwsync # write_unlock": : :"memory");
 	rw->lock = 0;
 }
@@ -223,7 +223,7 @@ static __inline__ void _raw_read_unlock(rwlock_t *rw)
 	unsigned int tmp;
 	__asm__ __volatile__(
-	"eieio # read_unlock\n\
+	"lwsync # read_unlock\n\
 1:	lwarx %0,0,%1\n\
 	addic %0,%0,-1\n\
 	stwcx. %0,0,%1\n\
......
@@ -29,22 +29,26 @@
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
  *
- * We can use the eieio instruction for wmb, but since it doesn't
- * give any ordering guarantees about loads, we have to use the
- * stronger but slower sync instruction for mb and rmb.
+ * We have to use the sync instructions for mb(), since lwsync doesn't
+ * order loads with respect to previous stores.  Lwsync is fine for
+ * rmb(), though.
+ * For wmb(), we use sync since wmb is used in drivers to order
+ * stores to system memory with respect to writes to the device.
+ * However, smp_wmb() can be a lighter-weight eieio barrier on
+ * SMP since it is only used to order updates to system memory.
  */
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("lwsync" : : : "memory")
-#define wmb()  __asm__ __volatile__ ("eieio" : : : "memory")
+#define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define read_barrier_depends()  do { } while(0)
-#define set_mb(var, value)	do { var = value; mb(); } while (0)
-#define set_wmb(var, value)	do { var = value; wmb(); } while (0)
+#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define set_wmb(var, value)	do { var = value; smp_wmb(); } while (0)
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()
 #define smp_rmb()	rmb()
-#define smp_wmb()	wmb()
+#define smp_wmb()	__asm__ __volatile__ ("eieio" : : : "memory")
 #define smp_read_barrier_depends()	read_barrier_depends()
 #else
 #define smp_mb()	__asm__ __volatile__("": : :"memory")
......