Commit 5e9377ec authored by Stuart Menefy, committed by Paul Mundt

sh: Optimise memcpy_to/fromio for SH4

Optimise memcpy_to/fromio. This is used extensively by MTD, so it is a
worthwhile performance gain. The main savings come from not repeatedly
calling readl/writel, and from doing word-at-a-time instead of
byte-at-a-time transfers. Also, using "movca.l" on SH4 gives a small
performance win.
Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
parent 8af57f8b
/* /*
* linux/arch/sh/kernel/io.c * arch/sh/kernel/io.c - Machine independent I/O functions.
* *
* Copyright (C) 2000 Stuart Menefy * Copyright (C) 2000 - 2009 Stuart Menefy
* Copyright (C) 2005 Paul Mundt * Copyright (C) 2005 Paul Mundt
* *
* Provide real functions which expand to whatever the header file defined.
* Also definitions of machine independent IO functions.
*
* This file is subject to the terms and conditions of the GNU General Public * This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive * License. See the file "COPYING" in the main directory of this archive
* for more details. * for more details.
...@@ -18,33 +15,87 @@ ...@@ -18,33 +15,87 @@
/* /*
* Copy data from IO memory space to "real" memory space. * Copy data from IO memory space to "real" memory space.
* This needs to be optimized.
*/ */
void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count)
{ {
unsigned char *p = to; /*
while (count) { * Would it be worthwhile doing byte and long transfers first
count--; * to try and get aligned?
*p = readb(from); */
p++; #ifdef CONFIG_CPU_SH4
if ((count >= 0x20) &&
(((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) {
int tmp2, tmp3, tmp4, tmp5, tmp6;
__asm__ __volatile__(
"1: \n\t"
"mov.l @%7+, r0 \n\t"
"mov.l @%7+, %2 \n\t"
"movca.l r0, @%0 \n\t"
"mov.l @%7+, %3 \n\t"
"mov.l @%7+, %4 \n\t"
"mov.l @%7+, %5 \n\t"
"mov.l @%7+, %6 \n\t"
"mov.l @%7+, r7 \n\t"
"mov.l @%7+, r0 \n\t"
"mov.l %2, @(0x04,%0) \n\t"
"mov #0x20, %2 \n\t"
"mov.l %3, @(0x08,%0) \n\t"
"sub %2, %1 \n\t"
"mov.l %4, @(0x0c,%0) \n\t"
"cmp/hi %1, %2 ! T if 32 > count \n\t"
"mov.l %5, @(0x10,%0) \n\t"
"mov.l %6, @(0x14,%0) \n\t"
"mov.l r7, @(0x18,%0) \n\t"
"mov.l r0, @(0x1c,%0) \n\t"
"bf.s 1b \n\t"
" add #0x20, %0 \n\t"
: "=&r" (to), "=&r" (count),
"=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
"=&r" (tmp5), "=&r" (tmp6), "=&r" (from)
: "7"(from), "0" (to), "1" (count)
: "r0", "r7", "t", "memory");
}
#endif
if ((((u32)to | (u32)from) & 0x3) == 0) {
for (; count > 3; count -= 4) {
*(u32 *)to = *(volatile u32 *)from;
to += 4;
from += 4;
}
}
for (; count > 0; count--) {
*(u8 *)to = *(volatile u8 *)from;
to++;
from++; from++;
} }
mb();
} }
EXPORT_SYMBOL(memcpy_fromio); EXPORT_SYMBOL(memcpy_fromio);
/* /*
* Copy data from "real" memory space to IO memory space. * Copy data from "real" memory space to IO memory space.
* This needs to be optimized.
*/ */
void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count)
{ {
const unsigned char *p = from; if ((((u32)to | (u32)from) & 0x3) == 0) {
while (count) { for ( ; count > 3; count -= 4) {
count--; *(volatile u32 *)to = *(u32 *)from;
writeb(*p, to); to += 4;
p++; from += 4;
}
}
for (; count > 0; count--) {
*(volatile u8 *)to = *(u8 *)from;
to++; to++;
from++;
} }
mb();
} }
EXPORT_SYMBOL(memcpy_toio); EXPORT_SYMBOL(memcpy_toio);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment