Commit 6acfce46 authored by Andi Kleen, committed by Linus Torvalds

[PATCH] library functions updates for x86-64

Update various lib/ functions for x86-64:

 - new memcpy/memset/csum-copy/copy_page/user
 - add bitstr support function for IOMMU
 - Other minor fixes and updates.
parent 59932a95
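
The IOMMU bit-string support mentioned in the list above is the new find_next_zero_string() in lib/ (shown below), together with the set_bit_string()/clear_bit_string() inlines added to bitops.h at the end of this diff. As a rough sketch of the intended use, an IOMMU-style mapping layer could reserve and release contiguous runs of aperture pages roughly as follows; the helper names, the aperture size, and the absence of locking are assumptions made for illustration, not part of this patch:

/* Hypothetical usage sketch -- not part of this patch.  One bit per
 * IOMMU aperture page; real code would serialize access with a lock. */
#include <asm/bitops.h>

#define APERTURE_PAGES 65536				/* assumed aperture size */
static unsigned long iommu_map[APERTURE_PAGES / 64];	/* 64 bits per long on x86-64 */

static long alloc_iommu_range(int npages)		/* hypothetical helper */
{
	/* find a run of npages clear bits, then mark it busy */
	long bit = find_next_zero_string(iommu_map, 0, APERTURE_PAGES, npages);
	if (bit != -1)
		set_bit_string(iommu_map, bit, npages);
	return bit;	/* first bit of the run, or -1 if no free run exists */
}

static void free_iommu_range(long bit, int npages)	/* hypothetical helper */
{
	clear_bit_string(iommu_map, bit, npages);	/* release the run */
}

Note that set_bit_string() uses the non-atomic __set_bit() while clear_bit_string() uses clear_bit(), so callers are expected to provide their own serialization around allocation.
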
@@ -2,14 +2,12 @@
# Makefile for x86_64-specific library files.
#
USE_STANDARD_AS_RULE := true
EXTRA_CFLAGS_csum-partial.o := -funroll-loops
L_TARGET = lib.a
obj-y = csum-partial.o csum-copy.o csum-wrappers.o delay.o \
	usercopy.o getuser.o putuser.o \
	thunk.o io.o clear_page.o copy_page.o
	thunk.o io.o clear_page.o copy_page.o bitstr.o
obj-y += memcpy.o
obj-y += memmove.o
obj-y += memset.o
......
#include <asm/bitops.h>

/* Find string of zero bits in a bitmap */
unsigned long
find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
{
	unsigned long n, end, i;
again:
	n = find_next_zero_bit(bitmap, nbits, start);
	if (n == -1)
		return -1;
	/* could test bitsliced, but it's hardly worth it */
	end = n+len;
	if (end >= nbits)
		return -1;
	for (i = n+1; i < end; i++) {
		if (test_bit(i, bitmap)) {
			start = i+1;
			goto again;
		}
	}
	return n;
}
/*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/linkage.h>
/*
@@ -13,7 +12,7 @@ ENTRY(clear_page)
movl $4096/128,%ecx
movl $128,%edx
loop:
#define PUT(x) movnti %rax,x*8(%rdi)
#define PUT(x) movq %rax,x*8(%rdi)
PUT(0)
PUT(1)
PUT(2)
......
/*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/linkage.h>
#include <linux/config.h>
#ifdef CONFIG_PREEMPT
@@ -18,7 +19,9 @@
* Warning: in case of super lazy FP save this needs to be preempt_stop
*/
ENTRY(copy_page)
.globl copy_page
.p2align
copy_page:
prefetchnta (%rsi)
prefetchnta 64(%rsi)
@@ -37,22 +40,22 @@ loop:
prefetchnta 64(%rsi)
loop_no_prefetch:
movdqa (%rsi),%xmm0
movdqa 1*16(%rsi),%xmm1
movdqa 2*16(%rsi),%xmm2
movdqa 3*16(%rsi),%xmm3
movdqa 16(%rsi),%xmm1
movdqa 32(%rsi),%xmm2
movdqa 48(%rsi),%xmm3
movntdq %xmm0,(%rdi)
movntdq %xmm1,16(%rdi)
movntdq %xmm2,2*16(%rdi)
movntdq %xmm3,3*16(%rdi)
movntdq %xmm2,32(%rdi)
movntdq %xmm3,48(%rdi)
movdqa 4*16(%rsi),%xmm0
movdqa 5*16(%rsi),%xmm1
movdqa 6*16(%rsi),%xmm2
movdqa 7*16(%rsi),%xmm3
movntdq %xmm0,4*16(%rdi)
movntdq %xmm1,5*16(%rdi)
movntdq %xmm2,6*16(%rdi)
movntdq %xmm3,7*16(%rdi)
movdqa 64(%rsi),%xmm0
movdqa 80(%rsi),%xmm1
movdqa 96(%rsi),%xmm2
movdqa 112(%rsi),%xmm3
movntdq %xmm0,64(%rdi)
movntdq %xmm1,80(%rdi)
movntdq %xmm2,96(%rdi)
movntdq %xmm3,112(%rdi)
addq %rdx,%rdi
addq %rdx,%rsi
......
@@ -6,8 +6,12 @@
#define FIX_ALIGNMENT 1
#include <asm/thread_info.h>
#define movnti movq /* write to cache for now */
#define prefetch prefetcht2
#include <asm/current.h>
#include <asm/offset.h>
#include <asm/thread_info.h>
/* Standard copy_to_user with segment limit checking */
.globl copy_to_user
@@ -62,7 +66,7 @@ copy_user_generic:
the small movements in ioctls etc., but not penalize the bigger
filesystem data copies too much. */
pushq %rbx
prefetcht0 (%rsi)
prefetch (%rsi)
xorl %eax,%eax /*zero for the exception handler */
#ifdef FIX_ALIGNMENT
@@ -82,7 +86,7 @@ after_bad_alignment:
jz loop_no_prefetch
loop:
prefetchnta 64(%rsi)
prefetch 64(%rsi)
loop_no_prefetch:
s1: movq (%rsi),%r11
@@ -118,10 +122,11 @@ handle_tail:
movl $8,%ebx
loop_8:
s9: movq (%rsi),%r8
d9: movnti %r8,(%rdi)
d9: movq %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %edx,%ecx
@@ -132,7 +137,8 @@ s10: movb (%rsi),%bl
d10: movb %bl,(%rdi)
incq %rdi
incq %rsi
loop loop_1
decl %ecx
jnz loop_1
ende:
sfence
@@ -153,7 +159,8 @@ s11: movb (%rsi),%bl
d11: movb %bl,(%rdi)
incq %rsi
incq %rdi
loop align_1
decl %ecx
jnz align_1
jmp after_bad_alignment
small_align:
addq %r9,%rdx
......
@@ -28,6 +28,10 @@
* Wrappers need to take care of valid exception sum and zeroing.
*/
/* for now - should vary this based on direction */
#define prefetch prefetcht2
#define movnti movq
.macro source
10:
.section __ex_table,"a"
@@ -163,7 +167,8 @@ loop_8:
movnti %rbx,(%rsi)
leaq (%rsi,%rdx),%rsi /* preserve carry */
leaq (%rdi,%rdx),%rdi
loop loop_8
decl %ecx
jnz loop_8
adcq %r9,%rax /* add in carry */
fold:
@@ -188,7 +193,8 @@ loop_1:
movw %bx,(%rsi)
addq %rdx,%rdi
addq %rdx,%rsi
loop loop_1
decl %ecx
jnz loop_1
adcw %r9w,%ax /* add in carry */
/* handle last odd byte */
@@ -235,7 +241,8 @@ align_loop:
movw %bx,(%rsi)
addq %r10,%rdi
addq %r10,%rsi
loop align_loop
decl %ecx
jnz align_loop
jmp after_bad_alignment
/* weird case. need to swap the sum at the end because the spec requires
......
@@ -11,19 +11,22 @@
* Output:
* rax original destination
*/
// #define FIX_ALIGNMENT
.globl __memcpy
.globl memcpy
.p2align
__memcpy:
memcpy:
pushq %rbx
prefetcht0 (%rsi) /*for more hopefully the hw prefetch will kick in*/
movq %rdi,%rax
#ifdef FIX_ALIGNMENT
movl %edi,%ecx
andl $7,%ecx
jnz bad_alignment
after_bad_alignment:
#endif
movq %rdx,%rcx
movl $64,%ebx
@@ -31,30 +34,28 @@ after_bad_alignment:
jz handle_tail
loop_64:
/* no prefetch because we assume the hw prefetcher does it already
and we have no specific temporal hint to give. XXX or give a nta
hint for the source? */
movq (%rsi),%r11
movq 8(%rsi),%r8
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movnti %r11,(%rdi)
movnti %r8,1*8(%rdi)
movnti %r9,2*8(%rdi)
movnti %r10,3*8(%rdi)
movq %r11,(%rdi)
movq %r8,1*8(%rdi)
movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10
movnti %r11,4*8(%rdi)
movnti %r8,5*8(%rdi)
movnti %r9,6*8(%rdi)
movnti %r10,7*8(%rdi)
movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)
movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)
addq %rbx,%rsi
addq %rbx,%rdi
loop loop_64
decl %ecx
jnz loop_64
handle_tail:
movl %edx,%ecx
@@ -64,10 +65,11 @@ handle_tail:
movl $8,%ebx
loop_8:
movq (%rsi),%r8
movnti %r8,(%rdi)
movq %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %edx,%ecx
@@ -78,13 +80,16 @@ loop_1:
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop loop_1
decl %ecx
jnz loop_1
ende:
sfence
popq %rbx
ret
#ifdef FIX_ALIGNMENT
/* align destination */
/* This is simpleminded. For bigger blocks it may make sense to align
src and dst to their aligned subset and handle the rest separately */
@@ -100,8 +105,10 @@ align_1:
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop align_1
decl %ecx
jnz align_1
jmp after_bad_alignment
small_alignment:
addq %r9,%rdx
jmp handle_7
#endif
/* Copyright 2002 Andi Kleen */
/* Copyright 2002 Andi Kleen, SuSE Labs */
/*
* ISO C memset - set a memory block to a byte value.
@@ -34,16 +34,17 @@ after_bad_alignment:
jz handle_tail
loop_64:
movnti %rax,(%rdi)
movnti %rax,8(%rdi)
movnti %rax,16(%rdi)
movnti %rax,24(%rdi)
movnti %rax,32(%rdi)
movnti %rax,40(%rdi)
movnti %rax,48(%rdi)
movnti %rax,56(%rdi)
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
addq %r8,%rdi
loop loop_64
decl %ecx
jnz loop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
@@ -53,9 +54,10 @@ handle_tail:
jz handle_7
shrl $3,%ecx
loop_8:
movnti %rax,(%rdi)
movq %rax,(%rdi)
addq $8,%rdi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %r11d,%ecx
@@ -64,7 +66,8 @@ handle_7:
loop_1:
movb %al,(%rdi)
addq $1,%rdi
loop loop_1
decl %ecx
jnz loop_1
ende:
movq %r10,%rax
@@ -73,7 +76,7 @@ ende:
bad_alignment:
cmpq $7,%r11
jbe handle_7
movnti %rax,(%rdi) /* unaligned store */
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
......
@@ -34,6 +34,7 @@
thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
thunk rwsem_wake_thunk,rwsem_wake
thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
#endif
thunk do_softirq_thunk,do_softirq
......
@@ -11,39 +11,49 @@
* Copy a null terminated string from userspace.
*/
long __strncpy_from_user(char *dst, const char *src, long count)

#define __do_strncpy_from_user(dst,src,count,res) \
do { \
	long __d0, __d1, __d2; \
	__asm__ __volatile__( \
		" testq %1,%1\n" \
		" jz 2f\n" \
		"0: lodsb\n" \
		" stosb\n" \
		" testb %%al,%%al\n" \
		" jz 1f\n" \
		" decq %1\n" \
		" jnz 0b\n" \
		"1: subq %1,%0\n" \
		"2:\n" \
		".section .fixup,\"ax\"\n" \
		"3: movq %5,%0\n" \
		" jmp 2b\n" \
		".previous\n" \
		".section __ex_table,\"a\"\n" \
		" .align 4\n" \
		" .quad 0b,3b\n" \
		".previous" \
		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
		  "=&D" (__d2) \
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
		: "memory"); \
} while (0)

long
__strncpy_from_user(char *dst, const char *src, long count)
{
	long res;
	long __d0, __d1, __d2;
	asm volatile( \
		" testq %1,%1\n"
		" jz 2f\n"
		"0: lodsb\n"
		" stosb\n"
		" testb %%al,%%al\n"
		" loopnz 0b\n"
		"1: subq %1,%0\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		"3: movq %5,%0\n"
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 8\n"
		" .quad 0b,3b\n"
		".previous"
		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),
		  "=&D" (__d2)
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst)
		: "memory");
	__do_strncpy_from_user(dst, src, count, res);
	return res;
}

long strncpy_from_user(char *dst, const char *src, long count)
long
strncpy_from_user(char *dst, const char *src, long count)
{
	long res = -EFAULT;
	if (access_ok(VERIFY_READ, src, 1))
		return __strncpy_from_user(dst, src, count);
	return -EFAULT;
		__do_strncpy_from_user(dst, src, count, res);
	return res;
}
/*
@@ -60,13 +70,13 @@ unsigned long __clear_user(void *addr, unsigned long size)
" jz 4f\n"
"0: movnti %[zero],(%[dst])\n"
" addq %[eight],%[dst]\n"
" loop 0b\n"
" decl %%ecx ; jnz 0b\n"
"4: movq %[size1],%%rcx\n"
" testl %%ecx,%%ecx\n"
" jz 2f\n"
"1: movb %b[zero],(%[dst])\n"
" incq %[dst]\n"
" loop 1b\n"
" decl %%ecx ; jnz 1b\n"
"2: sfence\n"
".section .fixup,\"ax\"\n"
"3: lea 0(%[size1],%[size8],8),%[size8]\n"
......
@@ -384,6 +384,32 @@ static __inline__ int find_next_bit(void * addr, int size, int offset)
	return (offset + set + res);
}

/*
 * Find string of zero bits in a bitmap. -1 when not found.
 */
extern unsigned long
find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len);

static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
				  int len)
{
	unsigned long end = i + len;
	while (i < end) {
		__set_bit(i, bitmap);
		i++;
	}
}

static inline void clear_bit_string(unsigned long *bitmap, unsigned long i,
				    int len)
{
	unsigned long end = i + len;
	while (i < end) {
		clear_bit(i, bitmap);
		i++;
	}
}
/**
* ffz - find first zero in word.
* @word: The word to search
......