Commit e34079bb authored by Dave Cheney

runtime: avoid r9/r10 during memmove

Fixes #3718.

Requires CL 6300043.

R=rsc, minux.ma, extraterrestrial.neighbour
CC=golang-dev
https://golang.org/cl/6305100
parent 07826038
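Background: on ARM the Go runtime keeps the g and m pointers in R9 and R10, which is why the old code spilled and restored them around its MOVM bursts (see the removed "save g and m" block below). This change renumbers the assembler's symbolic registers so the copy loops stay within R0-R8, moves FROM and N/TMP up to R11/R12, and spills TS or TE to the stack instead. A minimal smoke test for the rewritten paths, assuming only that copy on byte slices is serviced by runtime·memmove (an illustration, not the test from CL 6300043):

package main

import (
	"bytes"
	"fmt"
)

func main() {
	src := make([]byte, 256)
	for i := range src {
		src[i] = byte(i)
	}
	// Sweep destination alignment and length so the aligned, unaligned
	// and tail paths of memmove all run.
	for off := 0; off < 8; off++ {
		for n := 0; n <= 64; n++ {
			dst := make([]byte, 256)
			copy(dst[off:off+n], src[:n])
			if !bytes.Equal(dst[off:off+n], src[:n]) {
				fmt.Println("mismatch at", off, n)
				return
			}
		}
	}
	// Overlapping move with to > from forces the backward copy, whose
	// 32-byte loop now uses R0-R7.
	buf := append([]byte(nil), src...)
	copy(buf[3:], buf[:250])
	if !bytes.Equal(buf[3:253], src[:250]) {
		fmt.Println("backward mismatch")
		return
	}
	fmt.Println("ok")
}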
@@ -23,19 +23,40 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+// TE or TS are spilled to the stack during bulk register moves.
 TS = 0
-TE = 1
-FROM = 2
-N = 3
-TMP = 3		/* N and TMP don't overlap */
-TMP1 = 4
+TE = 8
 
-// TODO(kaib): This can be done with the existing registers of LR is re-used. Same for memset.
-TEXT runtime·memmove(SB), 7, $8
-	// save g and m
-	MOVW	R9, 4(R13)
-	MOVW	R10, 8(R13)
+// Warning: the linker will use R11 to synthesize certain instructions. Please
+// take care and double check with objdump.
+FROM = 11
+N = 12
+TMP = 12	/* N and TMP don't overlap */
+TMP1 = 5
 
+RSHIFT = 5
+LSHIFT = 6
+OFFSET = 7
+
+BR0 = 0		/* shared with TS */
+BW0 = 1
+BR1 = 1
+BW1 = 2
+BR2 = 2
+BW2 = 3
+BR3 = 3
+BW3 = 4
+
+FW0 = 1
+FR0 = 2
+FW1 = 2
+FR1 = 3
+FW2 = 3
+FR2 = 4
+FW3 = 4
+FR3 = 8		/* shared with TE */
+
+TEXT runtime·memmove(SB), 7, $4
 _memmove:
 	MOVW	to+0(FP), R(TS)
 	MOVW	from+4(FP), R(FROM)
@@ -64,15 +85,17 @@ _b4aligned:		/* is source now aligned? */
 	BNE	_bunaligned
 
 	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
+	MOVW	R(TS), savedts+4(SP)
 _b32loop:
 	CMP	R(TMP), R(TE)
 	BLS	_b4tail
 
-	MOVM.DB.W (R(FROM)), [R4-R11]
-	MOVM.DB.W [R4-R11], (R(TE))
+	MOVM.DB.W (R(FROM)), [R0-R7]
+	MOVM.DB.W [R0-R7], (R(TE))
 	B	_b32loop
 
 _b4tail:			/* do remaining words if possible */
+	MOVW	savedts+4(SP), R(TS)
 	ADD	$3, R(TS), R(TMP)
 _b4loop:
 	CMP	R(TMP), R(TE)
@@ -107,22 +130,24 @@ _f4aligned:		/* is source now aligned? */
 	BNE	_funaligned
 
 	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
+	MOVW	R(TE), savedte+4(SP)
 _f32loop:
 	CMP	R(TMP), R(TS)
 	BHS	_f4tail
 
-	MOVM.IA.W (R(FROM)), [R4-R11]
-	MOVM.IA.W [R4-R11], (R(TS))
+	MOVM.IA.W (R(FROM)), [R1-R8]
+	MOVM.IA.W [R1-R8], (R(TS))
 	B	_f32loop
 
 _f4tail:
+	MOVW	savedte+4(SP), R(TE)
 	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
 	CMP	R(TMP), R(TS)
 	BHS	_f1tail
 
 	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
-	MOVW.P	R4, 4(R(TS))		/* implicit write back */
+	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
 	B	_f4loop
 
 _f1tail:
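For orientation, the aligned fast path above is a three-stage copy: 32-byte MOVM bursts (now [R0-R7] backward with TS spilled, [R1-R8] forward with TE spilled), then single words through R(TMP1), then a byte tail. A rough Go rendering of that staging, purely illustrative (the function name and sizes here are made up):

package main

import (
	"bytes"
	"fmt"
)

// blockCopy mirrors the staging of the aligned fast path: big chunks, then
// words, then bytes. The 32 corresponds to the eight-register MOVM burst,
// the 4 to the word loop, and the final loop to _b1tail/_f1tail.
func blockCopy(dst, src []byte) {
	n := len(src)
	i := 0
	for ; n-i >= 32; i += 32 { // _b32loop / _f32loop
		copy(dst[i:i+32], src[i:i+32])
	}
	for ; n-i >= 4; i += 4 { // _b4loop / _f4loop
		copy(dst[i:i+4], src[i:i+4])
	}
	for ; i < n; i++ { // byte tail
		dst[i] = src[i]
	}
}

func main() {
	src := make([]byte, 71) // not a multiple of 32 or 4, so every stage runs
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, len(src))
	blockCopy(dst, src)
	fmt.Println(bytes.Equal(dst, src)) // true
}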
@@ -134,25 +159,9 @@ _f1tail:
 	B	_f1tail
 
 _return:
-	// restore g and m
-	MOVW	4(R13), R9
-	MOVW	8(R13), R10
 	MOVW	to+0(FP), R0
 	RET
 
-RSHIFT = 4
-LSHIFT = 5
-OFFSET = 6
-
-BR0 = 7
-BW0 = 8
-BR1 = 8
-BW1 = 9
-BR2 = 9
-BW2 = 10
-BR3 = 10
-BW3 = 11
-
 _bunaligned:
 	CMP	$2, R(TMP)	/* is R(TMP) < 2 ? */
@@ -172,7 +181,8 @@ _bunaligned:
 	CMP	R(TMP), R(TE)
 	BLS	_b1tail
 
-	AND	$~0x03, R(FROM)		/* align source */
+	BIC	$3, R(FROM)		/* align source */
+	MOVW	R(TS), savedts+4(SP)
 	MOVW	(R(FROM)), R(BR0)	/* prime first block register */
 
 _bu16loop:
@@ -196,18 +206,10 @@ _bu16loop:
 	B	_bu16loop
 
 _bu1tail:
+	MOVW	savedts+4(SP), R(TS)
 	ADD	R(OFFSET), R(FROM)
 	B	_b1tail
 
-FW0 = 7
-FR0 = 8
-FW1 = 8
-FR1 = 9
-FW2 = 9
-FR2 = 10
-FW3 = 10
-FR3 = 11
-
 _funaligned:
 	CMP	$2, R(TMP)
@@ -227,7 +229,8 @@ _funaligned:
 	CMP	R(TMP), R(TS)
 	BHS	_f1tail
 
-	AND	$~0x03, R(FROM)		/* align source */
+	BIC	$3, R(FROM)		/* align source */
+	MOVW	R(TE), savedte+4(SP)
 	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */
 
 _fu16loop:
@@ -235,7 +238,7 @@ _fu16loop:
 	BHS	_fu1tail
 
 	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
-	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR3)]
+	MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
 	ORR	R(FR0)<<R(LSHIFT), R(FW0)
 
 	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
@@ -247,9 +250,10 @@ _fu16loop:
 	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
 	ORR	R(FR3)<<R(LSHIFT), R(FW3)
 
-	MOVM.IA.W [R(FW0)-R(FW3)], (R(TS))
+	MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
 	B	_fu16loop
 
 _fu1tail:
+	MOVW	savedte+4(SP), R(TE)
 	SUB	R(OFFSET), R(FROM)
 	B	_f1tail
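The unaligned loops keep the source pointer word-aligned (the BIC) and build each output word from the tail of the previously loaded word and the head of the next, using RSHIFT/LSHIFT derived from the misalignment. With the new numbering FR3 shares a register with TE and BR0 with TS, hence the extra spills, and the MOVM register lists are written out explicitly because FR0-FR3 no longer form a contiguous range. A little-endian Go sketch of that shift-and-merge step, with hypothetical names and a deliberate one-word over-read for brevity:

package main

import (
	"encoding/binary"
	"fmt"
)

// unalignedCopy copies len(dst) bytes that start `offset` bytes into a
// word-aligned source, the situation _bu16loop/_fu16loop handle. Each
// output word is the tail of the previous aligned load OR'd with the head
// of the next one. Unlike the assembly, this sketch reads one word past
// the copied region instead of finishing with a byte tail.
func unalignedCopy(dst, src []byte, offset uint) {
	rshift := 8 * offset  // R(RSHIFT): bits already consumed from the current word
	lshift := 32 - rshift // R(LSHIFT): bits still needed from the next word
	prev := binary.LittleEndian.Uint32(src) // prime the first block register
	for i := 0; i+4 <= len(dst); i += 4 {
		next := binary.LittleEndian.Uint32(src[i+4:])
		binary.LittleEndian.PutUint32(dst[i:], prev>>rshift|next<<lshift)
		prev = next // the word just read seeds the next output, as the FR/FW aliasing does
	}
}

func main() {
	src := []byte("0123456789abcdefghijklmnopqrstuv")
	dst := make([]byte, 16)
	unalignedCopy(dst, src, 3)
	fmt.Printf("%s\n", dst) // 3456789abcdefghi
}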