Commit 6acfce46 authored by Andi Kleen, committed by Linus Torvalds

[PATCH] library functions updates for x86-64

Update various lib/ functions for x86-64:

 - new memcpy/memset/csum-copy/copy_page/user
 - add bitstr support function for IOMMU
 - Other minor fixes and updates.
parent 59932a95
arch/x86_64/lib/Makefile
@@ -2,14 +2,12 @@
 # Makefile for x86_64-specific library files.
 #
-USE_STANDARD_AS_RULE := true
 EXTRA_CFLAGS_csum-partial.o := -funroll-loops
 L_TARGET = lib.a
 obj-y = csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o \
-	thunk.o io.o clear_page.o copy_page.o
+	thunk.o io.o clear_page.o copy_page.o bitstr.o
 obj-y += memcpy.o
 obj-y += memmove.o
 obj-y += memset.o
arch/x86_64/lib/bitstr.c (new file)
#include <asm/bitops.h>

/* Find string of zero bits in a bitmap */
unsigned long
find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
{
	unsigned long n, end, i;

 again:
	n = find_next_zero_bit(bitmap, nbits, start);
	if (n == -1)
		return -1;

	/* could test bitsliced, but it's hardly worth it */
	end = n + len;
	if (end >= nbits)
		return -1;
	for (i = n+1; i < end; i++) {
		if (test_bit(i, bitmap)) {
			start = i+1;
			goto again;
		}
	}
	return n;
}
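The new find_next_zero_string() pairs with the set_bit_string()/clear_bit_string() helpers added to bitops.h further down to give the IOMMU code a simple range allocator over a bitmap. As a rough illustration of the intended usage pattern, here is a small self-contained userspace sketch; the helper implementations below are stand-ins written for this example, not the kernel versions.

/*
 * Illustrative userspace sketch of the bitmap range-allocation pattern:
 * find a run of 'len' clear bits, mark it used, later release it.
 * Names mirror the kernel helpers but everything here is a stand-in.
 */
#include <stdio.h>

#define NBITS 128
static unsigned long bitmap[NBITS / (8 * sizeof(unsigned long))];

static int test_bit(long i, const unsigned long *map)
{
	return (map[i / (8 * sizeof(long))] >> (i % (8 * sizeof(long)))) & 1;
}

static void set_bit_string(unsigned long *map, unsigned long i, int len)
{
	for (unsigned long end = i + len; i < end; i++)
		map[i / (8 * sizeof(long))] |= 1UL << (i % (8 * sizeof(long)));
}

static void clear_bit_string(unsigned long *map, unsigned long i, int len)
{
	for (unsigned long end = i + len; i < end; i++)
		map[i / (8 * sizeof(long))] &= ~(1UL << (i % (8 * sizeof(long))));
}

/* Same contract as find_next_zero_string(): first index of a run of
 * 'len' zero bits at or after 'start', or -1 if no such run fits. */
static long find_zero_string(unsigned long *map, long start, long nbits, int len)
{
	for (long n = start; n + len < nbits; n++) {
		long i;
		for (i = 0; i < len && !test_bit(n + i, map); i++)
			;
		if (i == len)
			return n;
	}
	return -1;
}

int main(void)
{
	set_bit_string(bitmap, 0, 3);             /* pretend bits 0-2 are busy */
	long n = find_zero_string(bitmap, 0, NBITS, 4);
	if (n >= 0) {
		set_bit_string(bitmap, n, 4);     /* allocate a 4-bit run */
		printf("allocated run at bit %ld\n", n);
		clear_bit_string(bitmap, n, 4);   /* free it again */
	}
	return 0;
}

A kernel caller would additionally hold whatever lock protects the bitmap, since the find/set sequence is not atomic as a pair.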
arch/x86_64/lib/clear_page.S
 /*
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
 #include <linux/linkage.h>
 /*
@@ -13,7 +12,7 @@ ENTRY(clear_page)
 	movl $4096/128,%ecx
 	movl $128,%edx
 loop:
-#define PUT(x) movnti %rax,x*8(%rdi)
+#define PUT(x) movq %rax,x*8(%rdi)
 	PUT(0)
 	PUT(1)
 	PUT(2)
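The clear_page change swaps the non-temporal movnti store for a plain movq, so freshly cleared pages land in the cache again (the same "write to cache for now" idea as the "#define movnti movq" added in copy_user.S below). Roughly the same trade-off, expressed from C with SSE2 intrinsics instead of the kernel's assembly, is sketched here; the function names are invented for the example.

/* Illustrative only: cached vs. non-temporal 64-bit stores on x86-64. */
#include <emmintrin.h>   /* SSE2: _mm_stream_si64, _mm_sfence */
#include <stddef.h>

/* Plain stores: data lands in the cache, good if it is read back soon. */
static void clear_page_cached(long long *page)
{
	for (size_t i = 0; i < 4096 / sizeof(long long); i++)
		page[i] = 0;                         /* typically a plain movq */
}

/* Non-temporal stores: bypass the cache, good for write-once buffers. */
static void clear_page_nt(long long *page)
{
	for (size_t i = 0; i < 4096 / sizeof(long long); i++)
		_mm_stream_si64(&page[i], 0);        /* movnti */
	_mm_sfence();                                /* order the NT stores */
}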
arch/x86_64/lib/copy_page.S
 /*
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
 #include <linux/linkage.h>
 #include <linux/config.h>
 #ifdef CONFIG_PREEMPT
@@ -18,7 +19,9 @@
  * Warning: in case of super lazy FP save this needs to be preempt_stop
  */
-ENTRY(copy_page)
+	.globl copy_page
+	.p2align
+copy_page:
 	prefetchnta (%rsi)
 	prefetchnta 64(%rsi)
@@ -37,22 +40,22 @@ loop:
 	prefetchnta 64(%rsi)
 loop_no_prefetch:
 	movdqa (%rsi),%xmm0
-	movdqa 1*16(%rsi),%xmm1
-	movdqa 2*16(%rsi),%xmm2
-	movdqa 3*16(%rsi),%xmm3
+	movdqa 16(%rsi),%xmm1
+	movdqa 32(%rsi),%xmm2
+	movdqa 48(%rsi),%xmm3
 	movntdq %xmm0,(%rdi)
 	movntdq %xmm1,16(%rdi)
-	movntdq %xmm2,2*16(%rdi)
-	movntdq %xmm3,3*16(%rdi)
-	movdqa 4*16(%rsi),%xmm0
-	movdqa 5*16(%rsi),%xmm1
-	movdqa 6*16(%rsi),%xmm2
-	movdqa 7*16(%rsi),%xmm3
-	movntdq %xmm0,4*16(%rdi)
-	movntdq %xmm1,5*16(%rdi)
-	movntdq %xmm2,6*16(%rdi)
-	movntdq %xmm3,7*16(%rdi)
+	movntdq %xmm2,32(%rdi)
+	movntdq %xmm3,48(%rdi)
+	movdqa 64(%rsi),%xmm0
+	movdqa 80(%rsi),%xmm1
+	movdqa 96(%rsi),%xmm2
+	movdqa 112(%rsi),%xmm3
+	movntdq %xmm0,64(%rdi)
+	movntdq %xmm1,80(%rdi)
+	movntdq %xmm2,96(%rdi)
+	movntdq %xmm3,112(%rdi)
 	addq %rdx,%rdi
 	addq %rdx,%rsi
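copy_page keeps cached movdqa loads from the source and non-temporal movntdq stores to the destination; only the offset spelling changes (1*16 becomes 16, and so on). For readers who prefer C, a rough intrinsics rendering of one 128-byte iteration of that loop might look like the sketch below (not the kernel code; dst and src must be 16-byte aligned, which whole pages always are).

/* Sketch of the SSE2 page copy: aligned cached loads from src,
 * non-temporal stores to dst, 128 bytes per iteration. */
#include <emmintrin.h>
#include <stddef.h>

static void copy_page_sse2(void *dst, const void *src)
{
	const __m128i *s = (const __m128i *)src;   /* must be 16-byte aligned */
	__m128i *d = (__m128i *)dst;               /* must be 16-byte aligned */

	for (size_t i = 0; i < 4096 / 16; i += 8) {
		__m128i x0 = _mm_load_si128(s + i + 0);   /* movdqa  */
		__m128i x1 = _mm_load_si128(s + i + 1);
		__m128i x2 = _mm_load_si128(s + i + 2);
		__m128i x3 = _mm_load_si128(s + i + 3);
		_mm_stream_si128(d + i + 0, x0);          /* movntdq */
		_mm_stream_si128(d + i + 1, x1);
		_mm_stream_si128(d + i + 2, x2);
		_mm_stream_si128(d + i + 3, x3);
		__m128i x4 = _mm_load_si128(s + i + 4);
		__m128i x5 = _mm_load_si128(s + i + 5);
		__m128i x6 = _mm_load_si128(s + i + 6);
		__m128i x7 = _mm_load_si128(s + i + 7);
		_mm_stream_si128(d + i + 4, x4);
		_mm_stream_si128(d + i + 5, x5);
		_mm_stream_si128(d + i + 6, x6);
		_mm_stream_si128(d + i + 7, x7);
	}
	_mm_sfence();   /* make the non-temporal stores globally visible */
}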
arch/x86_64/lib/copy_user.S
@@ -6,8 +6,12 @@
 #define FIX_ALIGNMENT 1
-#include <asm/thread_info.h>
+#define movnti movq /* write to cache for now */
+#define prefetch prefetcht2
+
+#include <asm/current.h>
 #include <asm/offset.h>
+#include <asm/thread_info.h>
 /* Standard copy_to_user with segment limit checking */
 	.globl copy_to_user
@@ -62,7 +66,7 @@ copy_user_generic:
 	   the small movements in ioctls etc., but not penalize the bigger
 	   filesystem data copies too much. */
 	pushq %rbx
-	prefetcht0 (%rsi)
+	prefetch (%rsi)
 	xorl %eax,%eax	/*zero for the exception handler */
 #ifdef FIX_ALIGNMENT
@@ -82,7 +86,7 @@ after_bad_alignment:
 	jz loop_no_prefetch
 loop:
-	prefetchnta 64(%rsi)
+	prefetch 64(%rsi)
 loop_no_prefetch:
 s1:	movq (%rsi),%r11
@@ -118,10 +122,11 @@ handle_tail:
 	movl $8,%ebx
 loop_8:
 s9:	movq (%rsi),%r8
-d9:	movnti %r8,(%rdi)
+d9:	movq %r8,(%rdi)
 	addq %rbx,%rdi
 	addq %rbx,%rsi
-	loop loop_8
+	decl %ecx
+	jnz loop_8
 handle_7:
 	movl %edx,%ecx
@@ -132,7 +137,8 @@ s10: movb (%rsi),%bl
 d10:	movb %bl,(%rdi)
 	incq %rdi
 	incq %rsi
-	loop loop_1
+	decl %ecx
+	jnz loop_1
 ende:
 	sfence
@@ -153,7 +159,8 @@ s11: movb (%rsi),%bl
 d11:	movb %bl,(%rdi)
 	incq %rsi
 	incq %rdi
-	loop align_1
+	decl %ecx
+	jnz align_1
 	jmp after_bad_alignment
 small_align:
 	addq %r9,%rdx
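A change repeated throughout these assembly files (copy_user.S here, and csum-copy.S, memcpy.S, memset.S, usercopy.c below) is replacing the legacy "loop" instruction with an explicit decl/jnz pair; "loop" is microcoded and slow on most modern x86 implementations, while a plain decrement-and-branch is cheap. A minimal inline-asm sketch of the resulting loop shape, written purely for illustration:

#include <stddef.h>
#include <stdint.h>

/* Copy 'words' 8-byte quantities using the decl/jnz loop shape the patch
 * switches to.  Illustrative only; real code would just call memcpy. */
static void copy_words(uint64_t *dst, const uint64_t *src, unsigned int words)
{
	uint64_t tmp;

	if (!words)
		return;
	asm volatile(
		"1:\n\t"
		"movq (%[s]),%[t]\n\t"
		"movq %[t],(%[d])\n\t"
		"addq $8,%[s]\n\t"
		"addq $8,%[d]\n\t"
		"decl %[n]\n\t"		/* was: loop 1b */
		"jnz 1b"
		: [s] "+r" (src), [d] "+r" (dst), [n] "+r" (words), [t] "=&r" (tmp)
		:
		: "memory", "cc");
}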
arch/x86_64/lib/csum-copy.S
@@ -28,6 +28,10 @@
  * Wrappers need to take care of valid exception sum and zeroing.
  */
+/* for now - should vary this based on direction */
+#define prefetch prefetcht2
+#define movnti movq
 	.macro source
 10:
 	.section __ex_table,"a"
@@ -163,7 +167,8 @@ loop_8:
 	movnti %rbx,(%rsi)
 	leaq (%rsi,%rdx),%rsi /* preserve carry */
 	leaq (%rdi,%rdx),%rdi
-	loop loop_8
+	decl %ecx
+	jnz loop_8
 	adcq %r9,%rax /* add in carry */
 fold:
@@ -188,7 +193,8 @@ loop_1:
 	movw %bx,(%rsi)
 	addq %rdx,%rdi
 	addq %rdx,%rsi
-	loop loop_1
+	decl %ecx
+	jnz loop_1
 	adcw %r9w,%ax /* add in carry */
 	/* handle last odd byte */
@@ -235,7 +241,8 @@ align_loop:
 	movw %bx,(%rsi)
 	addq %r10,%rdi
 	addq %r10,%rsi
-	loop align_loop
+	decl %ecx
+	jnz align_loop
 	jmp after_bad_alignment
 /* weird case. need to swap the sum at the end because the spec requires
arch/x86_64/lib/memcpy.S
@@ -11,19 +11,22 @@
  * Output:
  * rax original destination
  */
+// #define FIX_ALIGNMENT
 	.globl __memcpy
 	.globl memcpy
 	.p2align
 __memcpy:
 memcpy:
 	pushq %rbx
-	prefetcht0 (%rsi)	/*for more hopefully the hw prefetch will kick in*/
 	movq %rdi,%rax
+#ifdef FIX_ALIGNMENT
 	movl %edi,%ecx
 	andl $7,%ecx
 	jnz bad_alignment
 after_bad_alignment:
+#endif
 	movq %rdx,%rcx
 	movl $64,%ebx
@@ -31,30 +34,28 @@ after_bad_alignment:
 	jz handle_tail
 loop_64:
+	/* no prefetch because we assume the hw prefetcher does it already
+	   and we have no specific temporal hint to give. XXX or give a nta
+	   hint for the source? */
 	movq (%rsi),%r11
 	movq 8(%rsi),%r8
 	movq 2*8(%rsi),%r9
 	movq 3*8(%rsi),%r10
-	movnti %r11,(%rdi)
-	movnti %r8,1*8(%rdi)
-	movnti %r9,2*8(%rdi)
-	movnti %r10,3*8(%rdi)
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
 	movq 4*8(%rsi),%r11
 	movq 5*8(%rsi),%r8
 	movq 6*8(%rsi),%r9
 	movq 7*8(%rsi),%r10
-	movnti %r11,4*8(%rdi)
-	movnti %r8,5*8(%rdi)
-	movnti %r9,6*8(%rdi)
-	movnti %r10,7*8(%rdi)
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
 	addq %rbx,%rsi
 	addq %rbx,%rdi
-	loop loop_64
+	decl %ecx
+	jnz loop_64
 handle_tail:
 	movl %edx,%ecx
@@ -64,10 +65,11 @@ handle_tail:
 	movl $8,%ebx
 loop_8:
 	movq (%rsi),%r8
-	movnti %r8,(%rdi)
+	movq %r8,(%rdi)
 	addq %rbx,%rdi
 	addq %rbx,%rsi
-	loop loop_8
+	decl %ecx
+	jnz loop_8
 handle_7:
 	movl %edx,%ecx
@@ -78,13 +80,16 @@ loop_1:
 	movb %r8b,(%rdi)
 	incq %rdi
 	incq %rsi
-	loop loop_1
+	decl %ecx
+	jnz loop_1
 ende:
 	sfence
 	popq %rbx
 	ret
+#ifdef FIX_ALIGNMENT
 	/* align destination */
 	/* This is simpleminded. For bigger blocks it may make sense to align
 	   src and dst to their aligned subset and handle the rest separately */
@@ -100,8 +105,10 @@ align_1:
 	movb %r8b,(%rdi)
 	incq %rdi
 	incq %rsi
-	loop align_1
+	decl %ecx
+	jnz align_1
 	jmp after_bad_alignment
 small_alignment:
 	addq %r9,%rdx
 	jmp handle_7
+#endif
arch/x86_64/lib/memset.S
-/* Copyright 2002 Andi Kleen */
+/* Copyright 2002 Andi Kleen, SuSE Labs */
 /*
  * ISO C memset - set a memory block to a byte value.
@@ -34,16 +34,17 @@ after_bad_alignment:
 	jz handle_tail
 loop_64:
-	movnti %rax,(%rdi)
-	movnti %rax,8(%rdi)
-	movnti %rax,16(%rdi)
-	movnti %rax,24(%rdi)
-	movnti %rax,32(%rdi)
-	movnti %rax,40(%rdi)
-	movnti %rax,48(%rdi)
-	movnti %rax,56(%rdi)
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
 	addq %r8,%rdi
-	loop loop_64
+	decl %ecx
+	jnz loop_64
 	/* Handle tail in loops. The loops should be faster than hard
 	   to predict jump tables. */
@@ -53,9 +54,10 @@ handle_tail:
 	jz handle_7
 	shrl $3,%ecx
 loop_8:
-	movnti %rax,(%rdi)
+	movq %rax,(%rdi)
 	addq $8,%rdi
-	loop loop_8
+	decl %ecx
+	jnz loop_8
 handle_7:
 	movl %r11d,%ecx
@@ -64,7 +66,8 @@ handle_7:
 loop_1:
 	movb %al,(%rdi)
 	addq $1,%rdi
-	loop loop_1
+	decl %ecx
+	jnz loop_1
 ende:
 	movq %r10,%rax
@@ -73,7 +76,7 @@ ende:
 bad_alignment:
 	cmpq $7,%r11
 	jbe handle_7
-	movnti %rax,(%rdi)	/* unaligned store */
+	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
arch/x86_64/lib/thunk.S
@@ -34,6 +34,7 @@
 	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
 	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
 	thunk rwsem_wake_thunk,rwsem_wake
+	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif
 	thunk do_softirq_thunk,do_softirq
arch/x86_64/lib/usercopy.c
@@ -11,39 +11,49 @@
  * Copy a null terminated string from userspace.
  */
-long __strncpy_from_user(char *dst, const char *src, long count)
+#define __do_strncpy_from_user(dst,src,count,res) \
+do { \
+	long __d0, __d1, __d2; \
+	__asm__ __volatile__( \
+		" testq %1,%1\n" \
+		" jz 2f\n" \
+		"0: lodsb\n" \
+		" stosb\n" \
+		" testb %%al,%%al\n" \
+		" jz 1f\n" \
+		" decq %1\n" \
+		" jnz 0b\n" \
+		"1: subq %1,%0\n" \
+		"2:\n" \
+		".section .fixup,\"ax\"\n" \
+		"3: movq %5,%0\n" \
+		" jmp 2b\n" \
+		".previous\n" \
+		".section __ex_table,\"a\"\n" \
+		" .align 4\n" \
+		" .quad 0b,3b\n" \
+		".previous" \
+		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+		  "=&D" (__d2) \
+		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+		: "memory"); \
+} while (0)
+
+long
+__strncpy_from_user(char *dst, const char *src, long count)
 {
 	long res;
-	long __d0, __d1, __d2;
-	asm volatile( \
-		" testq %1,%1\n"
-		" jz 2f\n"
-		"0: lodsb\n"
-		" stosb\n"
-		" testb %%al,%%al\n"
-		" loopnz 0b\n"
-		"1: subq %1,%0\n"
-		"2:\n"
-		".section .fixup,\"ax\"\n"
-		"3: movq %5,%0\n"
-		" jmp 2b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		" .align 8\n"
-		" .quad 0b,3b\n"
-		".previous"
-		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),
-		  "=&D" (__d2)
-		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst)
-		: "memory");
+	__do_strncpy_from_user(dst, src, count, res);
 	return res;
 }

-long strncpy_from_user(char *dst, const char *src, long count)
+long
+strncpy_from_user(char *dst, const char *src, long count)
 {
+	long res = -EFAULT;
 	if (access_ok(VERIFY_READ, src, 1))
-		return __strncpy_from_user(dst, src, count);
-	return -EFAULT;
+		__do_strncpy_from_user(dst, src, count, res);
+	return res;
 }
 /*
@@ -60,13 +70,13 @@ unsigned long __clear_user(void *addr, unsigned long size)
 		" jz 4f\n"
 		"0: movnti %[zero],(%[dst])\n"
 		" addq %[eight],%[dst]\n"
-		" loop 0b\n"
+		" decl %%ecx ; jnz 0b\n"
 		"4: movq %[size1],%%rcx\n"
 		" testl %%ecx,%%ecx\n"
 		" jz 2f\n"
 		"1: movb %b[zero],(%[dst])\n"
 		" incq %[dst]\n"
-		" loop 1b\n"
+		" decl %%ecx ; jnz 1b\n"
 		"2: sfence\n"
 		".section .fixup,\"ax\"\n"
 		"3: lea 0(%[size1],%[size8],8),%[size8]\n"
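The strncpy_from_user() rework above keeps the usual return contract: the length of the copied string excluding the terminating NUL, count if no NUL was found within count bytes, or -EFAULT for a bad user pointer. A userspace model of that contract, written only to illustrate the semantics (it is not kernel code and does no fault handling):

#include <stdio.h>

static long model_strncpy_from_user(char *dst, const char *src, long count)
{
	long i;

	for (i = 0; i < count; i++) {
		dst[i] = src[i];
		if (!src[i])
			return i;	/* length without the NUL */
	}
	return count;			/* ran out of room, no NUL seen */
}

int main(void)
{
	char buf[8];

	printf("%ld\n", model_strncpy_from_user(buf, "abc", sizeof buf)); /* 3 */
	printf("%ld\n", model_strncpy_from_user(buf, "longer-than-8", 8)); /* 8 */
	return 0;
}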
include/asm-x86_64/bitops.h
@@ -384,6 +384,32 @@ static __inline__ int find_next_bit(void * addr, int size, int offset)
 	return (offset + set + res);
 }

+/*
+ * Find string of zero bits in a bitmap. -1 when not found.
+ */
+extern unsigned long
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len);
+
+static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
+				  int len)
+{
+	unsigned long end = i + len;
+	while (i < end) {
+		__set_bit(i, bitmap);
+		i++;
+	}
+}
+
+static inline void clear_bit_string(unsigned long *bitmap, unsigned long i,
+				    int len)
+{
+	unsigned long end = i + len;
+	while (i < end) {
+		clear_bit(i, bitmap);
+		i++;
+	}
+}
+
 /**
  * ffz - find first zero in word.
  * @word: The word to search