Commit 6acfce46 authored by Andi Kleen, committed by Linus Torvalds

[PATCH] library functions updates for x86-64

Update various lib/ functions for x86-64:

 - new memcpy/memset/csum-copy/copy_page/user
 - add bitstr support function for IOMMU
 - Other minor fixes and updates.
parent 59932a95
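
The IOMMU bit-string support mentioned in the list above is the new find_next_zero_string() in lib/ (shown below), together with the set_bit_string()/clear_bit_string() inlines added to bitops.h at the end of this diff. As a rough sketch of the intended use, an IOMMU-style mapping layer could reserve and release contiguous runs of aperture pages roughly as follows; the helper names, the aperture size, and the absence of locking are assumptions made for illustration, not part of this patch:

/* Hypothetical usage sketch -- not part of this patch.  One bit per
 * IOMMU aperture page; real code would serialize access with a lock. */
#include <asm/bitops.h>

#define APERTURE_PAGES 65536				/* assumed aperture size */
static unsigned long iommu_map[APERTURE_PAGES / 64];	/* 64 bits per long on x86-64 */

static long alloc_iommu_range(int npages)		/* hypothetical helper */
{
	/* find a run of npages clear bits, then mark it busy */
	long bit = find_next_zero_string(iommu_map, 0, APERTURE_PAGES, npages);
	if (bit != -1)
		set_bit_string(iommu_map, bit, npages);
	return bit;	/* first bit of the run, or -1 if no free run exists */
}

static void free_iommu_range(long bit, int npages)	/* hypothetical helper */
{
	clear_bit_string(iommu_map, bit, npages);	/* release the run */
}

Note that set_bit_string() uses the non-atomic __set_bit() while clear_bit_string() uses clear_bit(), so callers are expected to provide their own serialization around allocation.
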
@@ -2,14 +2,12 @@
# Makefile for x86_64-specific library files.
#
USE_STANDARD_AS_RULE := true
EXTRA_CFLAGS_csum-partial.o := -funroll-loops
L_TARGET = lib.a
obj-y = csum-partial.o csum-copy.o csum-wrappers.o delay.o \
	usercopy.o getuser.o putuser.o \
	thunk.o io.o clear_page.o copy_page.o
	thunk.o io.o clear_page.o copy_page.o bitstr.o
obj-y += memcpy.o
obj-y += memmove.o
obj-y += memset.o
......
#include <asm/bitops.h>

/* Find string of zero bits in a bitmap */
unsigned long
find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
{
	unsigned long n, end, i;
again:
	n = find_next_zero_bit(bitmap, nbits, start);
	if (n == -1)
		return -1;
	/* could test bitsliced, but it's hardly worth it */
	end = n+len;
	if (end >= nbits)
		return -1;
	for (i = n+1; i < end; i++) {
		if (test_bit(i, bitmap)) {
			start = i+1;
			goto again;
		}
	}
	return n;
}
/*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/linkage.h>
/*
@@ -13,7 +12,7 @@ ENTRY(clear_page)
movl $4096/128,%ecx
movl $128,%edx
loop:
#define PUT(x) movnti %rax,x*8(%rdi)
#define PUT(x) movq %rax,x*8(%rdi)
PUT(0)
PUT(1)
PUT(2)
......
/*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/linkage.h>
#include <linux/config.h>
#ifdef CONFIG_PREEMPT
@@ -18,7 +19,9 @@
* Warning: in case of super lazy FP save this needs to be preempt_stop
*/
ENTRY(copy_page)
.globl copy_page
.p2align
copy_page:
prefetchnta (%rsi)
prefetchnta 64(%rsi)
@@ -37,22 +40,22 @@ loop:
prefetchnta 64(%rsi)
loop_no_prefetch:
movdqa (%rsi),%xmm0
movdqa 1*16(%rsi),%xmm1
movdqa 2*16(%rsi),%xmm2
movdqa 3*16(%rsi),%xmm3
movdqa 16(%rsi),%xmm1
movdqa 32(%rsi),%xmm2
movdqa 48(%rsi),%xmm3
movntdq %xmm0,(%rdi)
movntdq %xmm1,16(%rdi)
movntdq %xmm2,2*16(%rdi)
movntdq %xmm3,3*16(%rdi)
movntdq %xmm2,32(%rdi)
movntdq %xmm3,48(%rdi)
movdqa 4*16(%rsi),%xmm0
movdqa 5*16(%rsi),%xmm1
movdqa 6*16(%rsi),%xmm2
movdqa 7*16(%rsi),%xmm3
movntdq %xmm0,4*16(%rdi)
movntdq %xmm1,5*16(%rdi)
movntdq %xmm2,6*16(%rdi)
movntdq %xmm3,7*16(%rdi)
movdqa 64(%rsi),%xmm0
movdqa 80(%rsi),%xmm1
movdqa 96(%rsi),%xmm2
movdqa 112(%rsi),%xmm3
movntdq %xmm0,64(%rdi)
movntdq %xmm1,80(%rdi)
movntdq %xmm2,96(%rdi)
movntdq %xmm3,112(%rdi)
addq %rdx,%rdi
addq %rdx,%rsi
......
@@ -6,8 +6,12 @@
#define FIX_ALIGNMENT 1
#include <asm/thread_info.h>
#define movnti movq /* write to cache for now */
#define prefetch prefetcht2
#include <asm/current.h>
#include <asm/offset.h>
#include <asm/thread_info.h>
/* Standard copy_to_user with segment limit checking */
.globl copy_to_user
@@ -62,7 +66,7 @@ copy_user_generic:
the small movements in ioctls etc., but not penalize the bigger
filesystem data copies too much. */
pushq %rbx
prefetcht0 (%rsi)
prefetch (%rsi)
xorl %eax,%eax /*zero for the exception handler */
#ifdef FIX_ALIGNMENT
@@ -82,7 +86,7 @@ after_bad_alignment:
jz loop_no_prefetch
loop:
prefetchnta 64(%rsi)
prefetch 64(%rsi)
loop_no_prefetch:
s1: movq (%rsi),%r11
@@ -118,10 +122,11 @@ handle_tail:
movl $8,%ebx
loop_8:
s9: movq (%rsi),%r8
d9: movnti %r8,(%rdi)
d9: movq %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %edx,%ecx
@@ -132,7 +137,8 @@ s10: movb (%rsi),%bl
d10: movb %bl,(%rdi)
incq %rdi
incq %rsi
loop loop_1
decl %ecx
jnz loop_1
ende:
sfence
@@ -153,7 +159,8 @@ s11: movb (%rsi),%bl
d11: movb %bl,(%rdi)
incq %rsi
incq %rdi
loop align_1
decl %ecx
jnz align_1
jmp after_bad_alignment
small_align:
addq %r9,%rdx
......
@@ -28,6 +28,10 @@
* Wrappers need to take care of valid exception sum and zeroing.
*/
/* for now - should vary this based on direction */
#define prefetch prefetcht2
#define movnti movq
.macro source
10:
.section __ex_table,"a"
@@ -163,7 +167,8 @@ loop_8:
movnti %rbx,(%rsi)
leaq (%rsi,%rdx),%rsi /* preserve carry */
leaq (%rdi,%rdx),%rdi
loop loop_8
decl %ecx
jnz loop_8
adcq %r9,%rax /* add in carry */
fold:
@@ -188,7 +193,8 @@ loop_1:
movw %bx,(%rsi)
addq %rdx,%rdi
addq %rdx,%rsi
loop loop_1
decl %ecx
jnz loop_1
adcw %r9w,%ax /* add in carry */
/* handle last odd byte */
@@ -235,7 +241,8 @@ align_loop:
movw %bx,(%rsi)
addq %r10,%rdi
addq %r10,%rsi
loop align_loop
decl %ecx
jnz align_loop
jmp after_bad_alignment
/* weird case. need to swap the sum at the end because the spec requires
......
@@ -11,19 +11,22 @@
* Output:
* rax original destination
*/
// #define FIX_ALIGNMENT
.globl __memcpy
.globl memcpy
.p2align
__memcpy:
memcpy:
pushq %rbx
prefetcht0 (%rsi) /*for more hopefully the hw prefetch will kick in*/
movq %rdi,%rax
#ifdef FIX_ALIGNMENT
movl %edi,%ecx
andl $7,%ecx
jnz bad_alignment
after_bad_alignment:
#endif
movq %rdx,%rcx
movl $64,%ebx
@@ -31,30 +34,28 @@ after_bad_alignment:
jz handle_tail
loop_64:
/* no prefetch because we assume the hw prefetcher does it already
and we have no specific temporal hint to give. XXX or give a nta
hint for the source? */
movq (%rsi),%r11
movq 8(%rsi),%r8
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movnti %r11,(%rdi)
movnti %r8,1*8(%rdi)
movnti %r9,2*8(%rdi)
movnti %r10,3*8(%rdi)
movq %r11,(%rdi)
movq %r8,1*8(%rdi)
movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10
movnti %r11,4*8(%rdi)
movnti %r8,5*8(%rdi)
movnti %r9,6*8(%rdi)
movnti %r10,7*8(%rdi)
movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)
movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)
addq %rbx,%rsi
addq %rbx,%rdi
loop loop_64
decl %ecx
jnz loop_64
handle_tail:
movl %edx,%ecx
@@ -64,10 +65,11 @@ handle_tail:
movl $8,%ebx
loop_8:
movq (%rsi),%r8
movnti %r8,(%rdi)
movq %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %edx,%ecx
@@ -78,13 +80,16 @@ loop_1:
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop loop_1
decl %ecx
jnz loop_1
ende:
sfence
popq %rbx
ret
#ifdef FIX_ALIGNMENT
/* align destination */
/* This is simpleminded. For bigger blocks it may make sense to align
src and dst to their aligned subset and handle the rest separately */
@@ -100,8 +105,10 @@ align_1:
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop align_1
decl %ecx
jnz align_1
jmp after_bad_alignment
small_alignment:
addq %r9,%rdx
jmp handle_7
#endif
/* Copyright 2002 Andi Kleen */
/* Copyright 2002 Andi Kleen, SuSE Labs */
/*
* ISO C memset - set a memory block to a byte value.
@@ -34,16 +34,17 @@ after_bad_alignment:
jz handle_tail
loop_64:
movnti %rax,(%rdi)
movnti %rax,8(%rdi)
movnti %rax,16(%rdi)
movnti %rax,24(%rdi)
movnti %rax,32(%rdi)
movnti %rax,40(%rdi)
movnti %rax,48(%rdi)
movnti %rax,56(%rdi)
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
addq %r8,%rdi
loop loop_64
decl %ecx
jnz loop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
@@ -53,9 +54,10 @@ handle_tail:
jz handle_7
shrl $3,%ecx
loop_8:
movnti %rax,(%rdi)
movq %rax,(%rdi)
addq $8,%rdi
loop loop_8
decl %ecx
jnz loop_8
handle_7:
movl %r11d,%ecx
@@ -64,7 +66,8 @@ handle_7:
loop_1:
movb %al,(%rdi)
addq $1,%rdi
loop loop_1
decl %ecx
jnz loop_1
ende:
movq %r10,%rax
@@ -73,7 +76,7 @@ ende:
bad_alignment:
cmpq $7,%r11
jbe handle_7
movnti %rax,(%rdi) /* unaligned store */
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
......
@@ -34,6 +34,7 @@
thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
thunk rwsem_wake_thunk,rwsem_wake
thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
#endif
thunk do_softirq_thunk,do_softirq
......
@@ -11,39 +11,49 @@
* Copy a null terminated string from userspace.
*/
long __strncpy_from_user(char *dst, const char *src, long count)

#define __do_strncpy_from_user(dst,src,count,res) \
do { \
	long __d0, __d1, __d2; \
	__asm__ __volatile__( \
		" testq %1,%1\n" \
		" jz 2f\n" \
		"0: lodsb\n" \
		" stosb\n" \
		" testb %%al,%%al\n" \
		" jz 1f\n" \
		" decq %1\n" \
		" jnz 0b\n" \
		"1: subq %1,%0\n" \
		"2:\n" \
		".section .fixup,\"ax\"\n" \
		"3: movq %5,%0\n" \
		" jmp 2b\n" \
		".previous\n" \
		".section __ex_table,\"a\"\n" \
		" .align 4\n" \
		" .quad 0b,3b\n" \
		".previous" \
		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
		  "=&D" (__d2) \
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
		: "memory"); \
} while (0)

long
__strncpy_from_user(char *dst, const char *src, long count)
{
	long res;
	long __d0, __d1, __d2;
	asm volatile( \
		" testq %1,%1\n"
		" jz 2f\n"
		"0: lodsb\n"
		" stosb\n"
		" testb %%al,%%al\n"
		" loopnz 0b\n"
		"1: subq %1,%0\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		"3: movq %5,%0\n"
		" jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		" .align 8\n"
		" .quad 0b,3b\n"
		".previous"
		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),
		  "=&D" (__d2)
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst)
		: "memory");
	__do_strncpy_from_user(dst, src, count, res);
	return res;
}

long strncpy_from_user(char *dst, const char *src, long count)
long
strncpy_from_user(char *dst, const char *src, long count)
{
	long res = -EFAULT;
	if (access_ok(VERIFY_READ, src, 1))
		return __strncpy_from_user(dst, src, count);
	return -EFAULT;
		__do_strncpy_from_user(dst, src, count, res);
	return res;
}
/*
@@ -60,13 +70,13 @@ unsigned long __clear_user(void *addr, unsigned long size)
" jz 4f\n"
"0: movnti %[zero],(%[dst])\n"
" addq %[eight],%[dst]\n"
" loop 0b\n"
" decl %%ecx ; jnz 0b\n"
"4: movq %[size1],%%rcx\n"
" testl %%ecx,%%ecx\n"
" jz 2f\n"
"1: movb %b[zero],(%[dst])\n"
" incq %[dst]\n"
" loop 1b\n"
" decl %%ecx ; jnz 1b\n"
"2: sfence\n"
".section .fixup,\"ax\"\n"
"3: lea 0(%[size1],%[size8],8),%[size8]\n"
......
@@ -384,6 +384,32 @@ static __inline__ int find_next_bit(void * addr, int size, int offset)
	return (offset + set + res);
}

/*
 * Find string of zero bits in a bitmap. -1 when not found.
 */
extern unsigned long
find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len);

static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
				  int len)
{
	unsigned long end = i + len;
	while (i < end) {
		__set_bit(i, bitmap);
		i++;
	}
}

static inline void clear_bit_string(unsigned long *bitmap, unsigned long i,
				    int len)
{
	unsigned long end = i + len;
	while (i < end) {
		clear_bit(i, bitmap);
		i++;
	}
}
/**
* ffz - find first zero in word.
* @word: The word to search
......