Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
linux
Commits
3c4cefb3
Commit
3c4cefb3
authored
Mar 26, 2002
by
David Mosberger
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix ia64-version of thread_info. Add McKinley-optimized copy_page().
parent
8c9ac7c2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
188 additions
and
1 deletion
+188
-1
arch/ia64/lib/copy_page_mck.S
arch/ia64/lib/copy_page_mck.S
+184
-0
include/asm-ia64/thread_info.h
include/asm-ia64/thread_info.h
+4
-1
No files found.
arch/ia64/lib/copy_page_mck.S
0 → 100644
View file @
3c4cefb3
/*
*
McKinley
-
optimized
version
of
copy_page
()
.
*
*
Copyright
(
C
)
2002
Hewlett
-
Packard
Co
*
David
Mosberger
<
davidm
@
hpl
.
hp
.
com
>
*
*
Inputs
:
*
in0
:
address
of
target
page
*
in1
:
address
of
source
page
*
Output
:
*
no
return
value
*
*
General
idea
:
*
-
use
regular
loads
and
stores
to
prefetch
data
to
avoid
consuming
M
-
slot
just
for
*
lfetches
=>
good
for
in
-
cache
performance
*
-
avoid
l2
bank
-
conflicts
by
not
storing
into
the
same
16
-
byte
bank
within
a
single
*
cycle
*
*
Principle
of
operation
:
*
We
use
a
software
-
pipelined
loop
to
control
the
overall
operation
.
The
pipeline
*
has
2
*
PREFETCH_DIST
+
2
stages
.
The
first
PREFETCH_DIST
stages
are
used
for
prefetching
*
source
cache
-
lines
.
The
second
PREFETCH_DIST
stages
are
used
for
prefetching
destination
*
cache
-
lines
,
the
two
last
stages
are
used
to
copy
the
cache
-
line
words
not
copied
by
*
the
prefetches
.
The
four
relevant
points
in
the
pipeline
are
called
A
,
B
,
C
,
D
:
*
p
[
A
]
is
TRUE
if
a
source
-
line
should
be
prefetched
,
p
[
B
]
is
TRUE
if
a
destination
-
line
*
should
be
prefetched
,
p
[
C
]
is
TRUE
if
at
least
one
more
cacheline
needs
to
be
copied
,
*
and
p
[
D
]
is
TRUE
if
a
cacheline
needs
to
be
copied
.
*
*
Note
that
L1
has
a
line
-
size
of
64
bytes
and
L2
a
line
-
size
of
128
bytes
.
To
avoid
*
secondary
misses
in
L2
,
we
prefetch
both
source
and
destination
with
a
line
-
size
*
of
128
bytes
.
When
both
of
these
lines
are
in
the
L2
and
the
first
half
of
the
*
source
line
is
in
L1
,
we
start
copying
the
remaining
words
.
The
second
half
of
the
*
source
line
is
prefetched
in
the
previous
iteration
,
so
that
by
the
time
we
start
*
accessing
it
,
it
's also present in the L1.
*
*
This
all
sounds
very
complicated
,
but
thanks
to
the
modulo
-
scheduled
loop
support
,
*
the
resulting
code
is
very
regular
and
quite
easy
to
follow
(
once
you
get
the
idea
)
.
*
*
As
a
secondary
optimization
,
the
first
2
*
PREFETCH_DIST
iterations
are
implemented
*
as
the
separate
.
prefetch_loop
.
Logically
,
this
loop
performs
exactly
like
the
*
main
-
loop
(
.
line_copy
),
but
has
all
known
-
to
-
be
-
predicated
-
off
instructions
removed
,
*
so
that
each
loop
iteration
is
faster
(
again
,
good
for
cached
case
)
.
*
*
When
reading
the
code
,
it
helps
to
keep
the
following
picture
in
mind
:
*
*
bank
0
bank
1
*
+------+------+---
*
| v[x] |
t1
|
^
*
| t2 |
t3
| |
*
| t4 |
t5
| |
*
| t6 |
t7
| |
128
bytes
*
| n8 |
t9
| |
(
L2
cache
line
)
*
| t10 |
t11
| |
*
| t12 |
t13
| |
*
| t14 |
t15
|
v
*
+------+------+---
*
*
Here
,
v
[
x
]
is
copied
by
the
(
memory
)
prefetch
.
n8
is
loaded
in
the
previous
iteration
*
to
fetch
the
second
-
half
of
the
L2
cache
line
into
L1
,
and
the
tX
words
are
copied
in
*
an
order
that
avoids
bank
conflicts
.
*/
#include <asm/asmmacro.h>
#include <asm/page.h>
// Distance, in loop iterations (one 128-byte line per iteration), between the
// load that touches a source line (stage A) and the store that touches the
// corresponding destination line (stage B).
#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
// Word-copy pointers: post-incremented by the ld8/st8 pairs in .line_copy.
// src0/dst0 start at byte offset 8 of the page, src1/dst1 at byte offset 24.
#define src0 r2
#define src1 r3
#define dst0 r9
#define dst1 r10
// "Prefetch" pointers, each advanced by 128 bytes per access.
// *_pre_mem start at offset 0 of the page, *_pre_l2 at offset 64 (8*8).
#define src_pre_mem r11
#define dst_pre_mem r14
#define src_pre_l2 r15
#define dst_pre_l2 r16
// Temporaries for the 8-byte words of a 128-byte line (tN carries word N).
// Only t1-t4, n8, t13 and t15 have a register of their own; every other tN
// reuses the register of an earlier word, so the order of loads and stores in
// .line_copy must not be changed without re-checking these aliases.
#define t1 r17
#define t2 r18
#define t3 r19
#define t4 r20
#define t5 t1 // alias!
#define t6 t2 // alias!
#define t7 t3 // alias!
// n8 carries word 8 (byte offset 64) of a line; it is loaded via src_pre_l2
// and stored via dst_pre_l2.
#define n8 r21
#define t9 t5 // alias!
#define t10 t4 // alias!
#define t11 t7 // alias!
#define t12 t6 // alias!
#define t14 t10 // alias!
#define t13 r22
#define t15 r23
// Holders for the caller's ar.lc and predicate registers, restored on exit.
#define saved_lc r24
#define saved_pr r25
// Indices into the rotating predicate array p[] for the pipeline points
// A, B, C and D used by the loops.
#define A 0
#define B (PREFETCH_DIST)
#define C (B + PREFETCH_DIST)
#define D (C + 1)
// N: number of pipeline stages (loaded into ar.ec for .line_copy).
#define N (D + 1)
// Nrot: N rounded up to a multiple of 8; used as the register-frame size and
// rotating-region size in the alloc instruction.
#define Nrot ((N + 7) & ~7)
/*
 * copy_page -- copy one page, 128 bytes per loop iteration.
 *
 * Inputs:	in0 = address of target page
 *		in1 = address of source page
 * Output:	none
 *
 * Structure:
 *   1. .prefetch_loop runs 2*PREFETCH_DIST iterations and contains only the
 *      stage-A load and stage-B store (word 0 of each 128-byte line).
 *   2. .line_copy runs the remaining PAGE_SIZE/128 - 2*PREFETCH_DIST kernel
 *      iterations plus the epilogue stages (ar.ec = N) and additionally copies
 *      words 1..15 of every line.
 *
 * Word N of a line is carried in tN (word 0 in v[], word 8 in n8).  Several
 * tN share one physical register, so the instruction order and the stop
 * bits (;;) below are significant.
 */
GLOBAL_ENTRY(copy_page)
	.prologue
	// 2 inputs, Nrot-2 locals, 0 outputs, Nrot rotating registers.
	// r8 receives the previous ar.pfs; it is not referenced again here.
	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot

	.rotr v[2*PREFETCH_DIST]		// rotating GRs carrying word 0 from stage A to stage B
	.rotp p[N]				// rotating predicates, one per pipeline stage

	.save ar.lc, saved_lc
	mov saved_lc = ar.lc			// preserve caller's loop count
	.save pr, saved_pr
	mov saved_pr = pr			// preserve caller's predicates
	.body

	mov src_pre_mem = in1
	mov pr.rot = 0x10000			// bit 16 set: p16 (= p[A]) is 1, other rotating predicates 0
	mov ar.ec = 1				// special unrolled loop

	mov dst_pre_mem = in0
	mov ar.lc = 2*PREFETCH_DIST - 1		// 2*PREFETCH_DIST iterations of .prefetch_loop

	add src_pre_l2 = 8*8, in1		// word 8 (second 64-byte half) of first source line
	add dst_pre_l2 = 8*8, in0		// word 8 of first destination line
	add src0 = 8, in1			// first t1 src
	add src1 = 3*8, in1			// first t3 src
	add dst0 = 8, in0			// first t1 dst
	add dst1 = 3*8, in0			// first t3 dst
	;;
	// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 load word 0 of a source line
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 store it PREFETCH_DIST iterations later
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
	mov ar.lc = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1	// remaining lines still to be started at stage A
	mov ar.ec = N				// # of stages in pipeline
	;;
	.align 32
.line_copy:
	// group 1: load words 2 and 4; store word 0 (stage B) and word 8.
	// n8 is stored here before the p[C] load in group 2 overwrites it.
(p[D])	ld8 t2 = [src0], 3*8			// M0
(p[D])	ld8 t4 = [src1], 3*8			// M1
(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
(p[D])	st8 [dst_pre_l2] = n8, 128		// M3 prefetch dst from L2
	;;
	// group 2: start a new source line (stage A), fetch word 8 of the next
	// line to be copied (stage C); store words 1 and 3, which were loaded
	// under p[C] at the end of the previous iteration.
(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
(p[C])	ld8 n8 = [src_pre_l2], 128		// M1 prefetch src from L2
(p[D])	st8 [dst0] = t1, 8			// M2
(p[D])	st8 [dst1] = t3, 8			// M3
	;;
	// group 3: load words 5 and 7 (reusing t1's/t3's registers, stored in
	// group 2); store words 2 and 4.
(p[D])	ld8 t5 = [src0], 8
(p[D])	ld8 t7 = [src1], 3*8
(p[D])	st8 [dst0] = t2, 3*8
(p[D])	st8 [dst1] = t4, 3*8
	;;
	// group 4: load words 6 and 10; store words 5 and 7.
(p[D])	ld8 t6 = [src0], 3*8
(p[D])	ld8 t10 = [src1], 8
(p[D])	st8 [dst0] = t5, 8
(p[D])	st8 [dst1] = t7, 3*8
	;;
	// group 5: load words 9 and 11; store words 6 and 10.
(p[D])	ld8 t9 = [src0], 3*8
(p[D])	ld8 t11 = [src1], 3*8
(p[D])	st8 [dst0] = t6, 3*8
(p[D])	st8 [dst1] = t10, 8
	;;
	// group 6: load words 12 and 14; store words 9 and 11.
(p[D])	ld8 t12 = [src0], 8
(p[D])	ld8 t14 = [src1], 8
(p[D])	st8 [dst0] = t9, 3*8
(p[D])	st8 [dst1] = t11, 3*8
	;;
	// group 7: load words 13 and 15; the 4*8 post-increment moves src0/src1
	// to words 1/3 of the following line.  Store words 12 and 14.
(p[D])	ld8 t13 = [src0], 4*8
(p[D])	ld8 t15 = [src1], 4*8
(p[D])	st8 [dst0] = t12, 8
(p[D])	st8 [dst1] = t14, 8
	;;
	// group 8: under p[C], pre-load words 1 and 3 of the next line (stored
	// in group 2 of the next iteration); store words 13 and 15, moving
	// dst0/dst1 to words 1/3 of the following line.
(p[C])	ld8 t1 = [src0], 8
(p[C])	ld8 t3 = [src1], 8
(p[D])	st8 [dst0] = t13, 4*8
(p[D])	st8 [dst1] = t15, 4*8
	br.ctop.sptk .line_copy
	;;
	mov ar.lc = saved_lc			// restore caller's loop count
	mov pr = saved_pr, -1			// mask -1: restore all predicate registers
	br.ret.sptk.many rp
END(copy_page)
include/asm-ia64/thread_info.h
View file @
3c4cefb3
...
@@ -12,7 +12,8 @@
...
@@ -12,7 +12,8 @@
#define TI_EXEC_DOMAIN 0x00
#define TI_EXEC_DOMAIN 0x00
#define TI_FLAGS 0x08
#define TI_FLAGS 0x08
#define TI_CPU 0x0c
#define TI_CPU 0x0c
#define TI_ADDR_LIMI 0x10
#define TI_ADDR_LIMIT 0x10
#define TI_PRE_COUNT 0x18
#ifndef __ASSEMBLY__
#ifndef __ASSEMBLY__
...
@@ -26,6 +27,7 @@ struct thread_info {
...
@@ -26,6 +27,7 @@ struct thread_info {
__u32
flags
;
/* thread_info flags (see TIF_*) */
__u32
flags
;
/* thread_info flags (see TIF_*) */
__u32
cpu
;
/* current CPU */
__u32
cpu
;
/* current CPU */
mm_segment_t
addr_limit
;
/* user-level address space limit */
mm_segment_t
addr_limit
;
/* user-level address space limit */
__s32
preempt_count
;
/* 0=preemptable, <0=BUG; will also serve as bh-counter */
};
};
#define INIT_THREAD_SIZE
/* tell sched.h not to declare the thread_union */
#define INIT_THREAD_SIZE
/* tell sched.h not to declare the thread_union */
...
@@ -37,6 +39,7 @@ struct thread_info {
...
@@ -37,6 +39,7 @@ struct thread_info {
flags: 0, \
flags: 0, \
cpu: 0, \
cpu: 0, \
addr_limit: KERNEL_DS, \
addr_limit: KERNEL_DS, \
preempt_count: 0, \
}
}
/* how to get the thread information struct from C */
/* how to get the thread information struct from C */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment