[PATCH] optimize ia32 memmove (ed109bc5) · Commits · Kirill Smelkov / linux

Commit ed109bc5 authored Dec 29, 2003 by

Andrew Morton Committed by Linus Torvalds Dec 29, 2003

[PATCH] optimize ia32 memmove

From: Manfred Spraul <manfred@colorfullife.com>

The i386 memmove implementation is not optimized: it uses movsb, which is
far slower than movsd.  The optimization is trivial: if dest is less than
source, a forward copy is safe even when the regions overlap, so simply call
memcpy().  markw tried it on a 4xXeon with dbt2; it saved around 300 million
cpu ticks in cache_flusharray():

oprofile, GLOBAL_POWER_EVENTS, count 100k
Before:
c0144ed1 <cache_flusharray>: /* cache_flusharray total:  21823  0.0165 */
     6 4.5e-06 :c0144f8e:       cmp    %esi,%ebx
    11 8.3e-06 :c0144f90:       jae    c0144f9e <cache_flusharray+0xcd>
     3 2.3e-06 :c0144f92:       mov    %ebx,%edi
  7305  0.0055 :c0144f94:       repz movsb %ds:(%esi),%es:(%edi)
   201 1.5e-04 :c0144f96:       add    $0x10,%esp

After:
c0144f1d <cache_flusharray>: /* cache_flusharray total:  17959  0.0136 */
  1270 9.6e-04 :c0144f1d:       push   %ebp
[snip]
     6 4.6e-06 :c0144fdc:       cmp    %esi,%ebx
    13 9.9e-06 :c0144fde:       jae    c0145000 <cache_flusharray+0xe3>
     2 1.5e-06 :c0144fe0:       mov    %edx,%eax
     1 7.6e-07 :c0144fe2:       mov    %ebx,%edi
    11 8.4e-06 :c0144fe4:       shr    $0x2,%eax
     1 7.6e-07 :c0144fe7:       mov    %eax,%ecx
  4129  0.0031 :c0144fe9:       repz movsl %ds:(%esi),%es:(%edi)
   261 2.0e-04 :c0144feb:       test   $0x2,%dl
    27 2.1e-05 :c0144fee:       je     c0144ff2 <cache_flusharray+0xd5>
               :c0144ff0:       movsw  %ds:(%esi),%es:(%edi)
    95 7.2e-05 :c0144ff2:       test   $0x1,%dl
    96 7.3e-05 :c0144ff5:       je     c0144ff8 <cache_flusharray+0xdb>
               :c0144ff7:       movsb  %ds:(%esi),%es:(%edi)
   121 9.2e-05 :c0144ff8:       add    $0x1c,%esp

parent e2c3c9e2

Hide whitespace changes

Inline Side-by-side

View file @ ed109bc5

@@ -299,14 +299,9 @@ extern void __struct_cpy_bug (void);
 static inline void * memmove(void * dest,const void * src, size_t n)
 {
 int d0, d1, d2;
-if (dest<src)
-__asm__ __volatile__(
-	"rep\n\t"
-	"movsb"
-	: "=&c" (d0), "=&S" (d1), "=&D" (d2)
-	:"0" (n),"1" (src),"2" (dest)
-	: "memory");
-else
+if (dest<src) {
+	memcpy(dest,src,n);
+} else
 __asm__ __volatile__(
 	"std\n\t"
 	"rep\n\t"
-...

Please register or sign in to comment