Commit 889ac863, authored Apr 10, 2002 by David Mosberger
Parent: d4bbe676

    More McKinley tuning and minor do_csum() cleanup.

Showing 3 changed files with 178 additions and 133 deletions:
	arch/ia64/kernel/entry.S	+94  -39
	arch/ia64/kernel/minstate.h	+42  -34
	arch/ia64/lib/do_csum.S		+42  -60
arch/ia64/kernel/entry.S
...
...
@@ -214,62 +214,80 @@ GLOBAL_ENTRY(save_switch_stack)
	.save @priunat,r17
	mov r17=ar.unat		// preserve caller's
	.body
	adds r3=80,sp
#ifdef CONFIG_ITANIUM
	adds r2=16+128,sp
	adds r3=16+64,sp
	adds r14=SW(R4)+16,sp
	;;
	st8.spill [r14]=r4,16		// spill r4
	lfetch.fault.excl.nt1 [r3],128
	mov ar.rsc=0		// put RSE in mode: enforced lazy, little endian, pl 0
	adds r2=16+128,sp
	;;
	lfetch.fault.excl.nt1 [r2],128
	lfetch.fault.excl.nt1 [r3],128
	adds r14=SW(R4)+16,sp
	;;
	lfetch.fault.excl [r2]
	lfetch.fault.excl [r3]
	adds r15=SW(R5)+16,sp
#else
	add r2=16+3*128,sp
	add r3=16,sp
	add r14=SW(R4)+16,sp
	;;
	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
	lfetch.fault.excl.nt1 [r3],128	//	prefetch offset 0x010
	;;
	lfetch.fault.excl.nt1 [r3],128	//	prefetch offset 0x090
	lfetch.fault.excl.nt1 [r2],128	//	prefetch offset 0x190
	;;
	lfetch.fault.excl.nt1 [r3]	//	prefetch offset 0x110
	lfetch.fault.excl.nt1 [r2]	//	prefetch offset 0x210
	adds r15=SW(R5)+16,sp
#endif
	;;
	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
	mov.m ar.rsc=0		// put RSE in mode: enforced lazy, little endian, pl 0
	add r2=SW(F2)+16,sp	// r2 = &sw->f2
	;;
	mov r18=ar.fpsr		// preserve fpsr
	mov r19=ar.rnat
	add r2=SW(F2)+16,sp	// r2 = &sw->f2
.mem.offset 0,0; st8.spill [r14]=r4,16		// spill r4
.mem.offset 8,0; st8.spill [r15]=r5,16		// spill r5
	add r3=SW(F3)+16,sp	// r3 = &sw->f3
	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
	mov.m r18=ar.fpsr	// preserve fpsr
	add r3=SW(F3)+16,sp	// r3 = &sw->f3
	;;
	stf.spill [r2]=f2,32
	stf.spill [r3]=f3,32
	mov.m r19=ar.rnat
	mov r21=b0
.mem.offset 0,0; st8.spill [r14]=r6,16		// spill r6
.mem.offset 8,0; st8.spill [r15]=r7,16		// spill r7
	stf.spill [r3]=f3,32
	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
	mov r22=b1
	;;
	// since we're done with the spills, read and save ar.unat:
	mov r29=ar.unat		// M-unit
	mov r20=ar.bspstore	// M-unit
	mov.m r29=ar.unat
	mov.m r20=ar.bspstore
	mov r23=b2
	stf.spill [r2]=f4,32
	stf.spill [r3]=f5,32
	mov r24=b3
	;;
	st8 [r14]=r21,16	// save b0
	st8 [r15]=r22,16	// save b1
	st8 [r14]=r21,SW(B1)-SW(B0)	// save b0
	st8 [r15]=r23,SW(B3)-SW(B2)	// save b2
	mov r25=b4
	stf.spill [r2]=f10,32
	stf.spill [r3]=f11,32
	mov r26=b5
	;;
	st8 [r14]=r23,16	// save b2
	st8 [r15]=r24,16	// save b3
	st8 [r14]=r22,SW(B4)-SW(B1)	// save b1
	st8 [r15]=r24,SW(AR_PFS)-SW(B3)	// save b3
	mov r21=ar.lc		// I-unit
	stf.spill [r2]=f12,32
	stf.spill [r3]=f13,32
	;;
	st8 [r14]=r25,16	// save b4
	st8 [r15]=r26,16	// save b5
	st8 [r14]=r25,SW(B5)-SW(B4)	// save b4
	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
	stf.spill [r2]=f14,32
	stf.spill [r3]=f15,32
	;;
	st8 [r14]=r16		// save ar.pfs
	st8 [r15]=r21		// save ar.lc
	st8 [r14]=r26		// save b5
	st8 [r15]=r21		// save ar.lc
	stf.spill [r2]=f16,32
	stf.spill [r3]=f17,32
	;;
...
...
@@ -284,26 +302,26 @@ GLOBAL_ENTRY(save_switch_stack)
	;;
	stf.spill [r2]=f24,32
	stf.spill [r3]=f25,32
	add r14=SW(CALLER_UNAT)+16,sp
	;;
	stf.spill [r2]=f26,32
	stf.spill [r3]=f27,32
	add r15=SW(AR_FPSR)+16,sp
	;;
	stf.spill [r2]=f28,32
	stf.spill [r3]=f29,32
	st8 [r14]=r17		// save caller_unat
	st8 [r15]=r18		// save fpsr
	mov r21=pr
	;;
	stf.spill [r2]=f30,(SW(AR_UNAT)-SW(F30))
	stf.spill [r3]=f31,(SW(AR_RNAT)-SW(F31))
	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
	stf.spill [r3]=f31,SW(PR)-SW(F31)
	add r14=SW(CALLER_UNAT)+16,sp
	;;
	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)	// save ar.unat
	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT)	// save caller_unat
	mov r21=pr
	;;
	st8 [r2]=r29,16		// save ar.unat
	st8 [r3]=r19,16		// save ar.rnat
	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT)	// save ar.rnat
	st8 [r3]=r21		// save predicate registers
	;;
	st8 [r2]=r20		// save ar.bspstore
	st8 [r3]=r21		// save predicate registers
	st8 [r2]=r20		// save ar.bspstore
	st8 [r14]=r18		// save fpsr
	mov ar.rsc=3		// put RSE back into eager mode, pl 0
	br.cond.sptk.many b7
END(save_switch_stack)
...
...
@@ -647,23 +665,38 @@ dont_preserve_current_frame:
	/*
	 * To prevent leaking bits between the kernel and user-space,
	 * we must clear the stacked registers in the "invalid" partition here.
	 * Not pretty, but at least it's fast (3.34 registers/cycle).
	 * Architecturally, this loop could go at 4.67 registers/cycle, but that would
	 * oversubscribe Itanium.
	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
	 * 5 registers/cycle on McKinley).
	 */
#	define pRecurse	p6
#	define pReturn	p7
#ifdef CONFIG_ITANIUM
#	define Nregs	10
#else
#	define Nregs	14
#endif
	alloc loc0=ar.pfs,2,Nregs-2,2,0
	shr.u loc1=r18,9		// RNaTslots <= dirtySize / (64*8) + 1
	sub r17=r17,r18			// r17 = (physStackedSize + 8) - dirtySize
	;;
#if 1
	.align 32		// see comment below about gas bug...
#endif
	mov ar.rsc=r19			// load ar.rsc to be used for "loadrs"
	shladd in0=loc1,3,r17
	mov in1=0
#if 0
	// gas-2.12.90 is unable to generate a stop bit after .align, which is bad,
	// because alloc must be at the beginning of an insn-group.
	.align 32
#else
	nop 0
	nop 0
	nop 0
#endif
	;;
//	.align 32	// gas-2.11.90 is unable to generate a stop bit after .align
rse_clear_invalid:
#ifdef CONFIG_ITANIUM
	// cycle 0
 { .mii
	alloc loc0=ar.pfs,2,Nregs-2,2,0
...
...
@@ -692,9 +725,31 @@ rse_clear_invalid:
	mov loc7=0
(pReturn) br.ret.sptk.many b6
 }
#else /* !CONFIG_ITANIUM */
	alloc loc0=ar.pfs,2,Nregs-2,2,0
	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
	add out0=-Nregs*8,in0
	add out1=1,in1			// increment recursion count
	mov loc1=0
	mov loc2=0
	;;
	mov loc3=0
	mov loc4=0
	mov loc9=0
	mov loc5=0
	mov loc6=0
(pRecurse) br.call.sptk.many b6=rse_clear_invalid
	;;
	mov loc7=0
	mov loc8=0
	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
	mov loc10=0
	mov loc11=0
(pReturn) br.ret.sptk.many b6
#endif /* !CONFIG_ITANIUM */
#	undef pRecurse
#	undef pReturn
	;;
	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
	;;
	loadrs
...
...
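The register-clearing code above sizes itself from the dirty-partition bookkeeping: "shr.u loc1=r18,9" estimates the number of RNaT collection slots, "shladd in0=loc1,3,r17" turns the invalid-partition size plus those slots into a byte count to wipe, and "cmp.lt pRecurse,p0=Nregs*8,in0" recurses while more than Nregs registers remain, with Nregs=10 on Itanium and 14 on McKinley. What follows is only a minimal C model of that arithmetic; every name in it is invented for illustration, and the real code keeps these values in r17/r18/loc1/in0 and recurses in assembly rather than looping.

#include <stdio.h>

#define NREGS_ITANIUM	10	/* registers cleared per recursion level on Itanium */
#define NREGS_MCKINLEY	14	/* registers cleared per recursion level on McKinley */

/* bytes rse_clear_invalid has to wipe, cf. "shr.u loc1=r18,9" and "shladd in0=loc1,3,r17" */
static unsigned long bytes_to_clear(unsigned long phys_stacked_size_p8, unsigned long dirty_size)
{
	unsigned long rnat_slots = dirty_size >> 9;	/* RNaTslots <= dirtySize/(64*8)+1 */
	unsigned long invalid = phys_stacked_size_p8 - dirty_size;

	return invalid + rnat_slots * 8;
}

/* recursion depth implied by "cmp.lt pRecurse,p0=Nregs*8,in0" */
static unsigned int recursion_levels(unsigned long bytes, unsigned int nregs)
{
	unsigned int levels = 1;

	while (bytes > nregs * 8) {
		bytes -= nregs * 8;
		levels++;
	}
	return levels;
}

int main(void)
{
	/* 96 physical stacked registers (plus 8 bytes), empty dirty partition */
	unsigned long bytes = bytes_to_clear(96 * 8 + 8, 0);

	printf("%lu bytes -> %u levels on McKinley\n",
	       bytes, recursion_levels(bytes, NREGS_MCKINLEY));
	return 0;
}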
arch/ia64/kernel/minstate.h
...
...
@@ -28,18 +28,19 @@
 * on interrupts.
 */
#define MINSTATE_START_SAVE_MIN_VIRT								\
	dep r1=-1,r1,61,3;			/* r1 = current (virtual) */			\
(pUser)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
	dep r1=-1,r1,61,3;			/* r1 = current (virtual) */			\
	;;											\
(pUser)	mov.m rARRNAT=ar.rnat;									\
(pUser)	addl rKRBS=IA64_RBS_OFFSET,r1;		/* compute base of RBS */			\
(pUser)	mov rARRNAT=ar.rnat;									\
(pKern) mov r1=sp;				/* get sp */					\
	;;											\
(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
(pUser)	lfetch.fault.excl.nt1 [rKRBS];								\
(pUser)	mov rARBSPSTORE=ar.bspstore;		/* save ar.bspstore */				\
(pUser)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
	;;											\
(pKern) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
(pUser)	mov ar.bspstore=rKRBS;			/* switch to kernel RBS */			\
(pKern) addl r1=-IA64_PT_REGS_SIZE,r1;		/* if in kernel mode, use sp (r12) */		\
	;;											\
(pUser)	mov r18=ar.bsp;										\
(pUser)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
...
...
@@ -125,52 +126,58 @@
	;;											\
	SAVE_IFS;										\
	MINSTATE_START_SAVE_MIN									\
	add r17=L1_CACHE_BYTES,r1		/* really: biggest cache-line size */		\
	;;											\
	mov r16=r1;				/* initialize first base pointer */		\
	adds r17=8,r1;				/* initialize second base pointer */		\
	st8 [r1]=rCRIPSR;			/* save cr.ipsr */				\
	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
	add r16=16,r1;				/* initialize first base pointer */		\
	;;											\
	st8 [r16]=rCRIPSR,16;			/* save cr.ipsr */				\
	st8 [r17]=rCRIIP,16;			/* save cr.iip */				\
	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
	;;											\
	lfetch.fault.excl.nt1 [r17];								\
	adds r17=8,r1;				/* initialize second base pointer */		\
(pKern)	mov r18=r0;				/* make sure r18 isn't NaT */			\
	;;											\
	st8 [r17]=rCRIIP,16;			/* save cr.iip */				\
	st8 [r16]=rCRIFS,16;			/* save cr.ifs */				\
	st8 [r17]=rARUNAT,16;			/* save ar.unat */				\
(pUser)	sub r18=r18,rKRBS;			/* r18=RSE.ndirty*8 */				\
	;;											\
	st8 [r17]=rARUNAT,16;			/* save ar.unat */				\
	st8 [r16]=rARPFS,16;			/* save ar.pfs */				\
	shl r18=r18,16;				/* compute ar.rsc to be used for "loadrs" */	\
	;;											\
	st8 [r17]=rARRSC,16;			/* save ar.rsc */				\
	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT							\
	;;			/* avoid RAW on r16 & r17 */					\
(pKern)	adds r16=16,r16;			/* skip over ar_rnat field */			\
(pKern)	adds r17=16,r17;			/* skip over ar_bspstore field */		\
(pUser)	st8 [r16]=rARRNAT,16;			/* save ar.rnat */				\
(pKern)	adds r16=16,r16;			/* skip over ar_rnat field */			\
	;;			/* avoid RAW on r16 & r17 */					\
(pUser)	st8 [r17]=rARBSPSTORE,16;		/* save ar.bspstore */				\
	;;											\
	st8 [r16]=rARPR,16;			/* save predicates */				\
	st8 [r17]=rB6,16;			/* save b6 */					\
	shl r18=r18,16;				/* compute ar.rsc to be used for "loadrs" */	\
(pKern)	adds r17=16,r17;			/* skip over ar_bspstore field */		\
	;;											\
	st8 [r17]=rB6,16;			/* save b6 */					\
	st8 [r16]=r18,16;			/* save ar.rsc value for "loadrs" */		\
	st8.spill [r17]=rR1,16;			/* save original r1 */				\
	tbit.nz p15,p0=rCRIPSR,IA64_PSR_I_BIT							\
	;;											\
.mem.offset 8,0; st8.spill [r17]=rR1,16;	/* save original r1 */				\
.mem.offset 0,0; st8.spill [r16]=r2,16;								\
	;;											\
.mem.offset 8,0; st8.spill [r17]=r3,16;								\
.mem.offset 0,0; st8.spill [r16]=r12,16;							\
	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
	;;											\
.mem.offset 0,0; st8.spill [r16]=r12,16;							\
.mem.offset 8,0; st8.spill [r17]=r13,16;							\
.mem.offset 8,0; st8.spill [r17]=r13,16;							\
.mem.offset 0,0; st8.spill [r16]=r14,16;							\
	cmp.eq pNonSys,pSys=r0,r0		/* initialize pSys=0, pNonSys=1 */		\
	;;											\
.mem.offset 0,0; st8.spill [r16]=r14,16;							\
.mem.offset 8,0; st8.spill [r17]=r15,16;							\
.mem.offset 8,0; st8.spill [r17]=r15,16;							\
.mem.offset 0,0; st8.spill [r16]=r8,16;								\
	dep r14=-1,r0,61,3;									\
	;;											\
.mem.offset 0,0; st8.spill [r16]=r8,16;								\
.mem.offset 8,0; st8.spill [r17]=r9,16;								\
.mem.offset 8,0; st8.spill [r17]=r9,16;								\
.mem.offset 0,0; st8.spill [r16]=r10,16;							\
	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
	;;											\
.mem.offset 0,0; st8.spill [r16]=r10,16;							\
.mem.offset 8,0; st8.spill [r17]=r11,16;							\
.mem.offset 8,0; st8.spill [r17]=r11,16;							\
	mov r13=IA64_KR(CURRENT);		/* establish `current' */			\
	;;											\
	EXTRA;											\
...
...
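One detail worth calling out in the DO_SAVE_MIN changes above: "(pUser) sub r18=r18,rKRBS" leaves RSE.ndirty*8 in r18, and "shl r18=r18,16" turns that byte count into the ar.rsc value later handed to "loadrs". A small C sketch of the same computation follows; the names are invented for illustration, and it only assumes that the loadrs field starts at bit 16 of ar.rsc, which is what the shift in the macro relies on.

#include <stdint.h>

#define AR_RSC_LOADRS_SHIFT	16	/* ar.rsc.loadrs lives in bits 16..29 */

/* value to load into ar.rsc before executing "loadrs" */
static inline uint64_t rsc_for_loadrs(uint64_t ar_bsp, uint64_t kernel_rbs_base)
{
	/* RSE.ndirty*8: bytes (including NaT collections) between bspstore and bsp */
	uint64_t ndirty_bytes = ar_bsp - kernel_rbs_base;

	/* shift the byte count into the loadrs field; mode/pl/be stay 0 (enforced lazy, pl 0, LE) */
	return ndirty_bytes << AR_RSC_LOADRS_SHIFT;
}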
@@ -190,10 +197,12 @@
 */
#define SAVE_REST				\
.mem.offset 0,0; st8.spill [r2]=r16,16;		\
.mem.offset 8,0; st8.spill [r3]=r17,16;		\
	;;					\
.mem.offset 8,0; st8.spill [r3]=r17,16;		\
.mem.offset 0,0; st8.spill [r2]=r18,16;		\
	;;					\
.mem.offset 8,0; st8.spill [r3]=r19,16;		\
.mem.offset 0,0; st8.spill [r2]=r20,16;		\
	;;					\
	mov r16=ar.ccv;		/* M-unit */	\
	movl r18=FPSR_DEFAULT	/* L-unit */	\
...
...
@@ -201,30 +210,29 @@
	mov r17=ar.fpsr;	/* M-unit */	\
	mov ar.fpsr=r18;	/* M-unit */	\
	;;					\
.mem.offset 0,0; st8.spill [r2]=r20,16;		\
.mem.offset 8,0; st8.spill [r3]=r21,16;		\
.mem.offset 0,0; st8.spill [r2]=r22,16;		\
	mov r18=b0;				\
	;;					\
.mem.offset 0,0; st8.spill [r2]=r22,16;		\
.mem.offset 8,0; st8.spill [r3]=r23,16;		\
.mem.offset 0,0; st8.spill [r2]=r24,16;		\
	mov r19=b7;				\
	;;					\
.mem.offset 0,0; st8.spill [r2]=r24,16;		\
.mem.offset 8,0; st8.spill [r3]=r25,16;		\
	;;					\
.mem.offset 0,0; st8.spill [r2]=r26,16;		\
.mem.offset 8,0; st8.spill [r3]=r27,16;		\
	;;					\
.mem.offset 8,0; st8.spill [r3]=r27,16;		\
.mem.offset 0,0; st8.spill [r2]=r28,16;		\
.mem.offset 8,0; st8.spill [r3]=r29,16;		\
	;;					\
.mem.offset 8,0; st8.spill [r3]=r29,16;		\
.mem.offset 0,0; st8.spill [r2]=r30,16;		\
.mem.offset 8,0; st8.spill [r3]=r31,16;		\
	;;					\
.mem.offset 8,0; st8.spill [r3]=r31,16;		\
	st8 [r2]=r16,16;	/* ar.ccv */	\
	st8 [r3]=r17,16;	/* ar.fpsr */	\
	;;					\
	st8 [r3]=r17,16;	/* ar.fpsr */	\
	st8 [r2]=r18,16;	/* b0 */	\
	;;					\
	st8 [r3]=r19,16+8;	/* b7 */	\
	;;					\
	stf.spill [r2]=f6,32;			\
...
...
arch/ia64/lib/do_csum.S
...
...
@@ -8,9 +8,11 @@
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999, 2001 Hewlett-Packard Co
 *	Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 02/04/08	David Mosberger <davidm@hpl.hp.com>
 *		More cleanup and tuning.
 * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
 *		Clean up and optimize and the software pipeline, loading two
 *		back-to-back 8-byte words per loop. Clean up the initialization
...
...
@@ -71,8 +73,6 @@
// calculating the Internet checksum.
//
// NOT YET DONE:
//	- use the lfetch instruction to augment the chances of the data being in
//	  the cache when we need it.
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
...
...
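For orientation, what do_csum() produces from buf/len is the 16-bit one's-complement sum used by the Internet checksum. The sketch below is a deliberately naive C rendering of that semantics, not the kernel's implementation: it assumes a 2-byte-aligned, little-endian buffer and leaves out the head/tail masking and the software pipelining that the assembly below spends most of its effort on; all names are invented for illustration.

#include <stddef.h>

/* naive reference: one's-complement sum of a buffer, folded to 16 bits */
static unsigned int ref_do_csum(const unsigned char *buf, size_t len)
{
	const unsigned short *p = (const unsigned short *)buf;	/* assumes 2-byte alignment */
	unsigned long sum = 0;

	while (len >= 2) {			/* sum 16-bit words */
		sum += *p++;
		len -= 2;
	}
	if (len)				/* odd trailing byte (little-endian) */
		sum += *(const unsigned char *)p;

	while (sum >> 16)			/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned int)sum;
}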
@@ -102,10 +102,6 @@
#define buf in0
#define len in1
#ifndef CONFIG_IA64_LOAD_LATENCY
#define CONFIG_IA64_LOAD_LATENCY 2
#endif
#define LOAD_LATENCY 2 // XXX fix me
#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
...
...
@@ -122,45 +118,46 @@ GLOBAL_ENTRY(do_csum)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,2,16,1,16
	.rotr word1[4], word2[4],result1[4],result2[4]
	.rotp p[PIPE_DEPTH]
	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
	mov ret0=r0		// in case we have zero length
	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;			// avoid WAW on CFM
	mov tmp3=0x7		// a temporary mask/value
	;;
	add tmp1=buf,len	// last byte's address
(p6)	br.ret.spnt.many rp	// return if true (hope we can avoid that)
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
(p6)	br.ret.spnt.many rp	// return if zero or negative length
	and firstoff=7,buf	// how many bytes off for first1 element
	tbit.nz p15,p0=buf,0	// is buf an odd address?
	mov hmask=-1		// initialize head mask
	;;
	andcm first1=buf,tmp3	// 8byte aligned down address of first1 element
	tbit.nz p15,p0=buf,0	// is buf an odd address?
	and first1=-8,buf	// 8-byte align down address of first1 element
	and firstoff=7,buf	// how many bytes off for first1 element
	mov tmask=-1		// initialize tail mask
	adds tmp2=-1,tmp1	// last-1
	;;
	adds tmp2=-1,tmp1	// last-1
	and lastoff=7,tmp1	// how many bytes off for last element
	andcm last=tmp2,tmp3	// address of word containing last byte
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
	;;
	sub tmp1=8,lastoff	// complement to lastoff
	and last=-8,tmp2	// address of word containing last byte
	;;
	sub tmp3=last,first1	// tmp3=distance from first1 to last
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// save lc
	cmp.eq p8,p9=last,first1	// everything fits in one word ?
	sub tmp1=8,lastoff	// complement to lastoff
	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
	shl tmp2=firstoff,3	// number of bits
	;;
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
	shl tmp1=tmp1,3		// number of bits
(p9)	adds tmp3=-8,tmp3	// effectively loaded
	;;
(p8)	mov lastval=r0		// we don't need lastval if first1==last
	shl tmp1=tmp1,3		// number of bits
	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
	;;
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// save lc
	;;
	.body
#define count tmp3
...
...
@@ -171,8 +168,8 @@ GLOBAL_ENTRY(do_csum)
	;;
	// If count is odd, finish this 8-byte word so that we can
	// load two back-to-back 8-byte words per loop thereafter.
	tbit.nz p10,p11=count,0		// if (count is odd)
	and word1[0]=firstval,hmask	// and mask it as appropriate
	tbit.nz p10,p11=count,0		// if (count is odd)
	;;
(p8)	mov result1[0]=word1[0]
(p9)	add result1[0]=word1[0],word2[0]
...
...
@@ -181,9 +178,8 @@ GLOBAL_ENTRY(do_csum)
	;;
(p6)	adds result1[0]=1,result1[0]
(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
	;;
(p11)	br.cond.dptk .do_csum16		// if (count is even)
	;;
	// Here count is odd.
	ld8 word1[1]=[first1],8		// load an 8-byte word
	cmp.eq p9,p10=1,count		// if (count == 1)
...
...
@@ -194,11 +190,9 @@ GLOBAL_ENTRY(do_csum)
	cmp.ltu p6,p0=result1[0],word1[1]
	;;
(p6)	adds result1[0]=1,result1[0]
	;;
(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
	// Fall through to calculate the checksum, feeding result1[0] as
	// the initial value in result1[0].
	;;
	//
	// Calculate the checksum loading two 8-byte words per loop.
	//
...
...
@@ -207,45 +201,36 @@ GLOBAL_ENTRY(do_csum)
	shr.u count=count,1	// we do 16 bytes per loop
	;;
	cmp.eq p9,p10=r0,count	// if (count == 0)
	adds count=-1,count
	brp.loop.imp 1f,2f
	;;
	adds count=-1,count
	mov ar.ec=PIPE_DEPTH
	;;
	mov ar.lc=count		// set lc
	;;
	// result1[0] must be initialized in advance.
	mov result2[0]=r0
	;;
	mov pr.rot=1<<16
	;;
	mov carry1=r0
	mov carry2=r0
	;;
	add first2=8,first1
	;;
(p9)	br.cond.sptk .do_csum_exit
	;;
	nop.m 0
	nop.i 0
	;;
	.align 32
1:
(ELD_1)	cmp.ltu p31,p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(p32)	adds carry1=1,carry1
(ELD_1)	cmp.ltu p47,p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(p48)	adds carry2=1,carry2
(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])	adds carry1=1,carry1
(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])	adds carry2=1,carry2
(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p16)	ld8 word1[0]=[first1],16
(p16)	ld8 word2[0]=[first2],16
[2:]
(p[0])	ld8 word1[0]=[first1],16
(p[0])	ld8 word2[0]=[first2],16
	br.ctop.sptk 1b
	;;
	// Since len is a 32-bit value, carry cannot be larger than
	// a 64-bit value.
(p32)	adds carry1=1,carry1	// since we miss the last one
(p48)	adds carry2=1,carry2
	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])	adds carry1=1,carry1	// since we miss the last one
(pC2[1])	adds carry2=1,carry2
	;;
	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
...
...
@@ -263,18 +248,15 @@ GLOBAL_ENTRY(do_csum)
(p6)	adds result1[0]=1,result1[0]
	;;
.do_csum_exit:
	movl tmp3=0xffffffff
	;;
	// XXX Fixme
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	and tmp1=result1[0],tmp3
	mov tmp3=0xffff
	zxt4 tmp1=result1[0]
	shr.u tmp2=result1[0],32
	;;
	add result1[0]=tmp1,tmp2
	shr.u tmp3=tmp3,16
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
...
...
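The .do_csum_exit sequence above is the 64-to-16-bit fold the comment complains about: it repeatedly adds the upper half of the running sum into the lower half so the one's-complement carries are preserved. A branch-free C equivalent, with invented names, looks roughly like this:

#include <stdint.h>

/* fold a 64-bit one's-complement sum down to 16 bits, re-adding the carries */
static uint16_t fold64(uint64_t sum)
{
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* 64 bits -> at most 33 bits */
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* absorb the possible carry */
	sum = (sum & 0xffff) + (sum >> 16);		/* 32 bits -> at most 17 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* absorb the possible carry */
	return (uint16_t)sum;
}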