Commit 889ac863 authored by David Mosberger's avatar David Mosberger

More McKinley tuning and minor do_csum() cleanup.

parent d4bbe676
......@@ -214,62 +214,80 @@ GLOBAL_ENTRY(save_switch_stack)
.save @priunat,r17
mov r17=ar.unat // preserve caller's
.body
adds r3=80,sp
#ifdef CONFIG_ITANIUM
adds r2=16+128,sp
adds r3=16+64,sp
adds r14=SW(R4)+16,sp
;;
st8.spill [r14]=r4,16 // spill r4
lfetch.fault.excl.nt1 [r3],128
mov ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0
adds r2=16+128,sp
;;
lfetch.fault.excl.nt1 [r2],128
lfetch.fault.excl.nt1 [r3],128
adds r14=SW(R4)+16,sp
;;
lfetch.fault.excl [r2]
lfetch.fault.excl [r3]
adds r15=SW(R5)+16,sp
#else
add r2=16+3*128,sp
add r3=16,sp
add r14=SW(R4)+16,sp
;;
st8.spill [r14]=r4,SW(R6)-SW(R4) // spill r4 and prefetch offset 0x1c0
lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x010
;;
lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x090
lfetch.fault.excl.nt1 [r2],128 // prefetch offset 0x190
;;
lfetch.fault.excl.nt1 [r3] // prefetch offset 0x110
lfetch.fault.excl.nt1 [r2] // prefetch offset 0x210
adds r15=SW(R5)+16,sp
#endif
;;
st8.spill [r15]=r5,SW(R7)-SW(R5) // spill r5
mov.m ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0
add r2=SW(F2)+16,sp // r2 = &sw->f2
;;
mov r18=ar.fpsr // preserve fpsr
mov r19=ar.rnat
add r2=SW(F2)+16,sp // r2 = &sw->f2
.mem.offset 0,0; st8.spill [r14]=r4,16 // spill r4
.mem.offset 8,0; st8.spill [r15]=r5,16 // spill r5
add r3=SW(F3)+16,sp // r3 = &sw->f3
st8.spill [r14]=r6,SW(B0)-SW(R6) // spill r6
mov.m r18=ar.fpsr // preserve fpsr
add r3=SW(F3)+16,sp // r3 = &sw->f3
;;
stf.spill [r2]=f2,32
stf.spill [r3]=f3,32
mov.m r19=ar.rnat
mov r21=b0
.mem.offset 0,0; st8.spill [r14]=r6,16 // spill r6
.mem.offset 8,0; st8.spill [r15]=r7,16 // spill r7
stf.spill [r3]=f3,32
st8.spill [r15]=r7,SW(B2)-SW(R7) // spill r7
mov r22=b1
;;
// since we're done with the spills, read and save ar.unat:
mov r29=ar.unat // M-unit
mov r20=ar.bspstore // M-unit
mov.m r29=ar.unat
mov.m r20=ar.bspstore
mov r23=b2
stf.spill [r2]=f4,32
stf.spill [r3]=f5,32
mov r24=b3
;;
st8 [r14]=r21,16 // save b0
st8 [r15]=r22,16 // save b1
st8 [r14]=r21,SW(B1)-SW(B0) // save b0
st8 [r15]=r23,SW(B3)-SW(B2) // save b2
mov r25=b4
stf.spill [r2]=f10,32
stf.spill [r3]=f11,32
mov r26=b5
;;
st8 [r14]=r23,16 // save b2
st8 [r15]=r24,16 // save b3
st8 [r14]=r22,SW(B4)-SW(B1) // save b1
st8 [r15]=r24,SW(AR_PFS)-SW(B3) // save b3
mov r21=ar.lc // I-unit
stf.spill [r2]=f12,32
stf.spill [r3]=f13,32
;;
st8 [r14]=r25,16 // save b4
st8 [r15]=r26,16 // save b5
st8 [r14]=r25,SW(B5)-SW(B4) // save b4
st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS) // save ar.pfs
stf.spill [r2]=f14,32
stf.spill [r3]=f15,32
;;
st8 [r14]=r16 // save ar.pfs
st8 [r15]=r21 // save ar.lc
st8 [r14]=r26 // save b5
st8 [r15]=r21 // save ar.lc
stf.spill [r2]=f16,32
stf.spill [r3]=f17,32
;;
......@@ -284,26 +302,26 @@ GLOBAL_ENTRY(save_switch_stack)
;;
stf.spill [r2]=f24,32
stf.spill [r3]=f25,32
add r14=SW(CALLER_UNAT)+16,sp
;;
stf.spill [r2]=f26,32
stf.spill [r3]=f27,32
add r15=SW(AR_FPSR)+16,sp
;;
stf.spill [r2]=f28,32
stf.spill [r3]=f29,32
st8 [r14]=r17 // save caller_unat
st8 [r15]=r18 // save fpsr
mov r21=pr
;;
stf.spill [r2]=f30,(SW(AR_UNAT)-SW(F30))
stf.spill [r3]=f31,(SW(AR_RNAT)-SW(F31))
stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
stf.spill [r3]=f31,SW(PR)-SW(F31)
add r14=SW(CALLER_UNAT)+16,sp
;;
st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT) // save ar.unat
st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
mov r21=pr
;;
st8 [r2]=r29,16 // save ar.unat
st8 [r3]=r19,16 // save ar.rnat
st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
st8 [r3]=r21 // save predicate registers
;;
st8 [r2]=r20 // save ar.bspstore
st8 [r3]=r21 // save predicate registers
st8 [r2]=r20 // save ar.bspstore
st8 [r14]=r18 // save fpsr
mov ar.rsc=3 // put RSE back into eager mode, pl 0
br.cond.sptk.many b7
END(save_switch_stack)
......@@ -647,23 +665,38 @@ dont_preserve_current_frame:
/*
* To prevent leaking bits between the kernel and user-space,
* we must clear the stacked registers in the "invalid" partition here.
* Not pretty, but at least it's fast (3.34 registers/cycle).
* Architecturally, this loop could go at 4.67 registers/cycle, but that would
* oversubscribe Itanium.
* Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
* 5 registers/cycle on McKinley).
*/
# define pRecurse p6
# define pReturn p7
#ifdef CONFIG_ITANIUM
# define Nregs 10
#else
# define Nregs 14
#endif
alloc loc0=ar.pfs,2,Nregs-2,2,0
shr.u loc1=r18,9 // RNaTslots <= dirtySize / (64*8) + 1
sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize
;;
#if 1
.align 32 // see comment below about gas bug...
#endif
mov ar.rsc=r19 // load ar.rsc to be used for "loadrs"
shladd in0=loc1,3,r17
mov in1=0
#if 0
// gas-2.12.90 is unable to generate a stop bit after .align, which is bad,
// because alloc must be at the beginning of an insn-group.
.align 32
#else
nop 0
nop 0
nop 0
#endif
;;
// .align 32 // gas-2.11.90 is unable to generate a stop bit after .align
rse_clear_invalid:
#ifdef CONFIG_ITANIUM
// cycle 0
{ .mii
alloc loc0=ar.pfs,2,Nregs-2,2,0
......@@ -692,9 +725,31 @@ rse_clear_invalid:
mov loc7=0
(pReturn) br.ret.sptk.many b6
}
#else /* !CONFIG_ITANIUM */
alloc loc0=ar.pfs,2,Nregs-2,2,0
cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
add out0=-Nregs*8,in0
add out1=1,in1 // increment recursion count
mov loc1=0
mov loc2=0
;;
mov loc3=0
mov loc4=0
mov loc9=0
mov loc5=0
mov loc6=0
(pRecurse) br.call.sptk.many b6=rse_clear_invalid
;;
mov loc7=0
mov loc8=0
cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
mov loc10=0
mov loc11=0
(pReturn) br.ret.sptk.many b6
#endif /* !CONFIG_ITANIUM */
# undef pRecurse
# undef pReturn
;;
alloc r17=ar.pfs,0,0,0,0 // drop current register frame
;;
loadrs
......
......@@ -28,18 +28,19 @@
* on interrupts.
*/
#define MINSTATE_START_SAVE_MIN_VIRT \
dep r1=-1,r1,61,3; /* r1 = current (virtual) */ \
(pUser) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
dep r1=-1,r1,61,3; /* r1 = current (virtual) */ \
;; \
(pUser) mov.m rARRNAT=ar.rnat; \
(pUser) addl rKRBS=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \
(pUser) mov rARRNAT=ar.rnat; \
(pKern) mov r1=sp; /* get sp */ \
;; \
(pUser) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
(pUser) lfetch.fault.excl.nt1 [rKRBS]; \
(pUser) mov rARBSPSTORE=ar.bspstore; /* save ar.bspstore */ \
(pUser) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
;; \
(pKern) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
(pUser) mov ar.bspstore=rKRBS; /* switch to kernel RBS */ \
(pKern) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
;; \
(pUser) mov r18=ar.bsp; \
(pUser) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
......@@ -125,52 +126,58 @@
;; \
SAVE_IFS; \
MINSTATE_START_SAVE_MIN \
add r17=L1_CACHE_BYTES,r1 /* really: biggest cache-line size */ \
;; \
mov r16=r1; /* initialize first base pointer */ \
adds r17=8,r1; /* initialize second base pointer */ \
st8 [r1]=rCRIPSR; /* save cr.ipsr */ \
lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
add r16=16,r1; /* initialize first base pointer */ \
;; \
st8 [r16]=rCRIPSR,16; /* save cr.ipsr */ \
st8 [r17]=rCRIIP,16; /* save cr.iip */ \
lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
;; \
lfetch.fault.excl.nt1 [r17]; \
adds r17=8,r1; /* initialize second base pointer */ \
(pKern) mov r18=r0; /* make sure r18 isn't NaT */ \
;; \
st8 [r17]=rCRIIP,16; /* save cr.iip */ \
st8 [r16]=rCRIFS,16; /* save cr.ifs */ \
st8 [r17]=rARUNAT,16; /* save ar.unat */ \
(pUser) sub r18=r18,rKRBS; /* r18=RSE.ndirty*8 */ \
;; \
st8 [r17]=rARUNAT,16; /* save ar.unat */ \
st8 [r16]=rARPFS,16; /* save ar.pfs */ \
shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
;; \
st8 [r17]=rARRSC,16; /* save ar.rsc */ \
tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT \
;; /* avoid RAW on r16 & r17 */ \
(pKern) adds r16=16,r16; /* skip over ar_rnat field */ \
(pKern) adds r17=16,r17; /* skip over ar_bspstore field */ \
(pUser) st8 [r16]=rARRNAT,16; /* save ar.rnat */ \
(pKern) adds r16=16,r16; /* skip over ar_rnat field */ \
;; /* avoid RAW on r16 & r17 */ \
(pUser) st8 [r17]=rARBSPSTORE,16; /* save ar.bspstore */ \
;; \
st8 [r16]=rARPR,16; /* save predicates */ \
st8 [r17]=rB6,16; /* save b6 */ \
shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
(pKern) adds r17=16,r17; /* skip over ar_bspstore field */ \
;; \
st8 [r17]=rB6,16; /* save b6 */ \
st8 [r16]=r18,16; /* save ar.rsc value for "loadrs" */ \
st8.spill [r17]=rR1,16; /* save original r1 */ \
tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT \
;; \
.mem.offset 8,0; st8.spill [r17]=rR1,16; /* save original r1 */ \
.mem.offset 0,0; st8.spill [r16]=r2,16; \
;; \
.mem.offset 8,0; st8.spill [r17]=r3,16; \
.mem.offset 0,0; st8.spill [r16]=r12,16; \
adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
;; \
.mem.offset 0,0; st8.spill [r16]=r12,16; \
.mem.offset 8,0; st8.spill [r17]=r13,16; \
.mem.offset 8,0; st8.spill [r17]=r13,16; \
.mem.offset 0,0; st8.spill [r16]=r14,16; \
cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
;; \
.mem.offset 0,0; st8.spill [r16]=r14,16; \
.mem.offset 8,0; st8.spill [r17]=r15,16; \
.mem.offset 8,0; st8.spill [r17]=r15,16; \
.mem.offset 0,0; st8.spill [r16]=r8,16; \
dep r14=-1,r0,61,3; \
;; \
.mem.offset 0,0; st8.spill [r16]=r8,16; \
.mem.offset 8,0; st8.spill [r17]=r9,16; \
.mem.offset 8,0; st8.spill [r17]=r9,16; \
.mem.offset 0,0; st8.spill [r16]=r10,16; \
adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
;; \
.mem.offset 0,0; st8.spill [r16]=r10,16; \
.mem.offset 8,0; st8.spill [r17]=r11,16; \
.mem.offset 8,0; st8.spill [r17]=r11,16; \
mov r13=IA64_KR(CURRENT); /* establish `current' */ \
;; \
EXTRA; \
......@@ -190,10 +197,12 @@
*/
#define SAVE_REST \
.mem.offset 0,0; st8.spill [r2]=r16,16; \
.mem.offset 8,0; st8.spill [r3]=r17,16; \
;; \
.mem.offset 8,0; st8.spill [r3]=r17,16; \
.mem.offset 0,0; st8.spill [r2]=r18,16; \
;; \
.mem.offset 8,0; st8.spill [r3]=r19,16; \
.mem.offset 0,0; st8.spill [r2]=r20,16; \
;; \
mov r16=ar.ccv; /* M-unit */ \
movl r18=FPSR_DEFAULT /* L-unit */ \
......@@ -201,30 +210,29 @@
mov r17=ar.fpsr; /* M-unit */ \
mov ar.fpsr=r18; /* M-unit */ \
;; \
.mem.offset 0,0; st8.spill [r2]=r20,16; \
.mem.offset 8,0; st8.spill [r3]=r21,16; \
.mem.offset 0,0; st8.spill [r2]=r22,16; \
mov r18=b0; \
;; \
.mem.offset 0,0; st8.spill [r2]=r22,16; \
.mem.offset 8,0; st8.spill [r3]=r23,16; \
.mem.offset 0,0; st8.spill [r2]=r24,16; \
mov r19=b7; \
;; \
.mem.offset 0,0; st8.spill [r2]=r24,16; \
.mem.offset 8,0; st8.spill [r3]=r25,16; \
;; \
.mem.offset 0,0; st8.spill [r2]=r26,16; \
.mem.offset 8,0; st8.spill [r3]=r27,16; \
;; \
.mem.offset 8,0; st8.spill [r3]=r27,16; \
.mem.offset 0,0; st8.spill [r2]=r28,16; \
.mem.offset 8,0; st8.spill [r3]=r29,16; \
;; \
.mem.offset 8,0; st8.spill [r3]=r29,16; \
.mem.offset 0,0; st8.spill [r2]=r30,16; \
.mem.offset 8,0; st8.spill [r3]=r31,16; \
;; \
.mem.offset 8,0; st8.spill [r3]=r31,16; \
st8 [r2]=r16,16; /* ar.ccv */ \
st8 [r3]=r17,16; /* ar.fpsr */ \
;; \
st8 [r3]=r17,16; /* ar.fpsr */ \
st8 [r2]=r18,16; /* b0 */ \
;; \
st8 [r3]=r19,16+8; /* b7 */ \
;; \
stf.spill [r2]=f6,32; \
......
......@@ -8,9 +8,11 @@
* in0: address of buffer to checksum (char *)
* in1: length of the buffer (int)
*
* Copyright (C) 1999, 2001 Hewlett-Packard Co
* Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
* Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* 02/04/08 David Mosberger <davidm@hpl.hp.com>
* More cleanup and tuning.
* 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
* Clean up and optimize and the software pipeline, loading two
* back-to-back 8-byte words per loop. Clean up the initialization
......@@ -71,8 +73,6 @@
// calculating the Internet checksum.
//
// NOT YET DONE:
// - use the lfetch instruction to augment the chances of the data being in
// the cache when we need it.
// - Maybe another algorithm which would take care of the folding at the
// end in a different manner
// - Work with people more knowledgeable than me on the network stack
......@@ -102,10 +102,6 @@
#define buf in0
#define len in1
#ifndef CONFIG_IA64_LOAD_LATENCY
#define CONFIG_IA64_LOAD_LATENCY 2
#endif
#define LOAD_LATENCY 2 // XXX fix me
#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
......@@ -122,45 +118,46 @@ GLOBAL_ENTRY(do_csum)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,2,16,1,16
.rotr word1[4], word2[4],result1[4],result2[4]
.rotp p[PIPE_DEPTH]
.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
mov ret0=r0 // in case we have zero length
cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
;; // avoid WAW on CFM
mov tmp3=0x7 // a temporary mask/value
;;
add tmp1=buf,len // last byte's address
(p6) br.ret.spnt.many rp // return if true (hope we can avoid that)
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
(p6) br.ret.spnt.many rp // return if zero or negative length
and firstoff=7,buf // how many bytes off for first1 element
tbit.nz p15,p0=buf,0 // is buf an odd address ?
mov hmask=-1 // intialize head mask
;;
andcm first1=buf,tmp3 // 8byte aligned down address of first1 element
tbit.nz p15,p0=buf,0 // is buf an odd address?
and first1=-8,buf // 8-byte align down address of first1 element
and firstoff=7,buf // how many bytes off for first1 element
mov tmask=-1 // initialize tail mask
adds tmp2=-1,tmp1 // last-1
;;
adds tmp2=-1,tmp1 // last-1
and lastoff=7,tmp1 // how many bytes off for last element
andcm last=tmp2,tmp3 // address of word containing last byte
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
;;
sub tmp1=8,lastoff // complement to lastoff
and last=-8,tmp2 // address of word containing last byte
;;
sub tmp3=last,first1 // tmp3=distance from first1 to last
.save ar.lc, saved_lc
mov saved_lc=ar.lc // save lc
cmp.eq p8,p9=last,first1 // everything fits in one word ?
sub tmp1=8,lastoff // complement to lastoff
ld8 firstval=[first1],8 // load,ahead of time, "first1" word
ld8 firstval=[first1],8 // load, ahead of time, "first1" word
and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
shl tmp2=firstoff,3 // number of bits
;;
and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
(p9) ld8 lastval=[last] // load,ahead of time, "last" word, if needed
(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
shl tmp1=tmp1,3 // number of bits
(p9) adds tmp3=-8,tmp3 // effectively loaded
;;
(p8) mov lastval=r0 // we don't need lastval if first1==last
shl tmp1=tmp1,3 // number of bits
shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
;;
shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
.save ar.lc, saved_lc
mov saved_lc=ar.lc // save lc
;;
.body
#define count tmp3
......@@ -171,8 +168,8 @@ GLOBAL_ENTRY(do_csum)
;;
// If count is odd, finish this 8-byte word so that we can
// load two back-to-back 8-byte words per loop thereafter.
tbit.nz p10,p11=count,0 // if (count is odd)
and word1[0]=firstval,hmask // and mask it as appropriate
tbit.nz p10,p11=count,0 // if (count is odd)
;;
(p8) mov result1[0]=word1[0]
(p9) add result1[0]=word1[0],word2[0]
......@@ -181,9 +178,8 @@ GLOBAL_ENTRY(do_csum)
;;
(p6) adds result1[0]=1,result1[0]
(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
;;
(p11) br.cond.dptk .do_csum16 // if (count is even)
;;
// Here count is odd.
ld8 word1[1]=[first1],8 // load an 8-byte word
cmp.eq p9,p10=1,count // if (count == 1)
......@@ -194,11 +190,9 @@ GLOBAL_ENTRY(do_csum)
cmp.ltu p6,p0=result1[0],word1[1]
;;
(p6) adds result1[0]=1,result1[0]
;;
(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
// Fall through to caluculate the checksum, feeding result1[0] as
// the initial value in result1[0].
;;
//
// Calculate the checksum loading two 8-byte words per loop.
//
......@@ -207,45 +201,36 @@ GLOBAL_ENTRY(do_csum)
shr.u count=count,1 // we do 16 bytes per loop
;;
cmp.eq p9,p10=r0,count // if (count == 0)
adds count=-1,count
brp.loop.imp 1f,2f
;;
adds count=-1,count
mov ar.ec=PIPE_DEPTH
;;
mov ar.lc=count // set lc
;;
// result1[0] must be initialized in advance.
mov result2[0]=r0
;;
mov pr.rot=1<<16
;;
mov carry1=r0
mov carry2=r0
;;
add first2=8,first1
;;
(p9) br.cond.sptk .do_csum_exit
;;
nop.m 0
nop.i 0
;;
.align 32
1:
(ELD_1) cmp.ltu p31,p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(p32) adds carry1=1,carry1
(ELD_1) cmp.ltu p47,p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(p48) adds carry2=1,carry2
(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p16) ld8 word1[0]=[first1],16
(p16) ld8 word2[0]=[first2],16
[2:]
(p[0]) ld8 word1[0]=[first1],16
(p[0]) ld8 word2[0]=[first2],16
br.ctop.sptk 1b
;;
// Since len is a 32-bit value, carry cannot be larger than
// a 64-bit value.
(p32) adds carry1=1,carry1 // since we miss the last one
(p48) adds carry2=1,carry2
// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1 // since we miss the last one
(pC2[1])adds carry2=1,carry2
;;
add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
......@@ -263,18 +248,15 @@ GLOBAL_ENTRY(do_csum)
(p6) adds result1[0]=1,result1[0]
;;
.do_csum_exit:
movl tmp3=0xffffffff
;;
// XXX Fixme
//
// now fold 64 into 16 bits taking care of carry
// that's not very good because it has lots of sequentiality
//
and tmp1=result1[0],tmp3
mov tmp3=0xffff
zxt4 tmp1=result1[0]
shr.u tmp2=result1[0],32
;;
add result1[0]=tmp1,tmp2
shr.u tmp3=tmp3,16
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment