Commit 69ddb7a4 authored by Rob Pike

[dev.cc] all: edit assembly source for ARM to be more regular

Several .s files for ARM used constructs that the new assembler will not support.
These include:

- mentioning SP or PC as a hardware register
	These are always pseudo-registers except that in some contexts
	they're not, and it's confusing because the context should not affect
	which register you mean. Change the references to the hardware
	registers to be explicit: R13 for SP, R15 for PC.
- constant creation using assignment
	The files say a=b when they could instead say #define a b.
	There is no reason to have both mechanisms.
- R(0) to refer to R0.
	Some macros use this extensively. Again, it's easy just to
	use a #define to rename a register; see the sketch after this list.
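
For illustration, here is a minimal before/after sketch combining all three
rewrites (fragments adapted from the files changed below; the grouping is
illustrative, not a single real function):

	Before:
		TMP = 11
		MOVW R(TMP), R0
		MOVW SP, gobuf_sp(R0)
		MOVW $0xffff0fc0, PC

	After:
		#define RTMP R11
		MOVW RTMP, R0
		MOVW R13, gobuf_sp(R0)	// R13 is the hardware SP
		MOVW $0xffff0fc0, R15	// R15 is the hardware PC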

Change-Id: I002335ace8e876c5b63c71c2560533eb835346d2
Reviewed-on: https://go-review.googlesource.com/4822
Reviewed-by: Dave Cheney <dave@cheney.net>
parent 2ecefd41
@@ -7,56 +7,56 @@
 #include "textflag.h"

 // Registers
-dst = 0
-src = 1
-n = 2
-state = 3
-pi = 4
-pj = 5
-i = 6
-j = 7
-k = 8
-t = 11
-t2 = 12
+#define Rdst R0
+#define Rsrc R1
+#define Rn R2
+#define Rstate R3
+#define Rpi R4
+#define Rpj R5
+#define Ri R6
+#define Rj R7
+#define Rk R8
+#define Rt R11
+#define Rt2 R12

 // func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
 TEXT ·xorKeyStream(SB),NOSPLIT,$0
-	MOVW 0(FP), R(dst)
-	MOVW 4(FP), R(src)
-	MOVW 8(FP), R(n)
-	MOVW 12(FP), R(state)
-	MOVW 16(FP), R(pi)
-	MOVW 20(FP), R(pj)
-	MOVBU (R(pi)), R(i)
-	MOVBU (R(pj)), R(j)
-	MOVW $0, R(k)
+	MOVW 0(FP), Rdst
+	MOVW 4(FP), Rsrc
+	MOVW 8(FP), Rn
+	MOVW 12(FP), Rstate
+	MOVW 16(FP), Rpi
+	MOVW 20(FP), Rpj
+	MOVBU (Rpi), Ri
+	MOVBU (Rpj), Rj
+	MOVW $0, Rk

 loop:
 	// i += 1; j += state[i]
-	ADD $1, R(i)
-	AND $0xff, R(i)
-	MOVBU R(i)<<2(R(state)), R(t)
-	ADD R(t), R(j)
-	AND $0xff, R(j)
+	ADD $1, Ri
+	AND $0xff, Ri
+	MOVBU Ri<<2(Rstate), Rt
+	ADD Rt, Rj
+	AND $0xff, Rj

 	// swap state[i] <-> state[j]
-	MOVBU R(j)<<2(R(state)), R(t2)
-	MOVB R(t2), R(i)<<2(R(state))
-	MOVB R(t), R(j)<<2(R(state))
+	MOVBU Rj<<2(Rstate), Rt2
+	MOVB Rt2, Ri<<2(Rstate)
+	MOVB Rt, Rj<<2(Rstate)

 	// dst[k] = src[k] ^ state[state[i] + state[j]]
-	ADD R(t2), R(t)
-	AND $0xff, R(t)
-	MOVBU R(t)<<2(R(state)), R(t)
-	MOVBU R(k)<<0(R(src)), R(t2)
-	EOR R(t), R(t2)
-	MOVB R(t2), R(k)<<0(R(dst))
+	ADD Rt2, Rt
+	AND $0xff, Rt
+	MOVBU Rt<<2(Rstate), Rt
+	MOVBU Rk<<0(Rsrc), Rt2
+	EOR Rt, Rt2
+	MOVB Rt2, Rk<<0(Rdst)

-	ADD $1, R(k)
-	CMP R(k), R(n)
+	ADD $1, Rk
+	CMP Rk, Rn
 	BNE loop

 done:
-	MOVB R(i), (R(pi))
-	MOVB R(j), (R(pj))
+	MOVB Ri, (Rpi)
+	MOVB Rj, (Rpj)
 	RET
@@ -23,20 +23,20 @@
 // the round macros instead of by explicit move instructions.

 // Register definitions
-data = 0 // Pointer to incoming data
-const = 1 // Current constant for SHA round
-a = 2 // SHA1 accumulator
-b = 3 // SHA1 accumulator
-c = 4 // SHA1 accumulator
-d = 5 // SHA1 accumulator
-e = 6 // SHA1 accumulator
-t0 = 7 // Temporary
-t1 = 8 // Temporary
+#define Rdata R0 // Pointer to incoming data
+#define Rconst R1 // Current constant for SHA round
+#define Ra R2 // SHA1 accumulator
+#define Rb R3 // SHA1 accumulator
+#define Rc R4 // SHA1 accumulator
+#define Rd R5 // SHA1 accumulator
+#define Re R6 // SHA1 accumulator
+#define Rt0 R7 // Temporary
+#define Rt1 R8 // Temporary
 // r9, r10 are forbidden
 // r11 is OK provided you check the assembler that no synthetic instructions use it
-t2 = 11 // Temporary
-ctr = 12 // loop counter
-w = 14 // point to w buffer
+#define Rt2 R11 // Temporary
+#define Rctr R12 // loop counter
+#define Rw R14 // point to w buffer

 // func block(dig *digest, p []byte)
 // 0(FP) is *digest
@@ -45,173 +45,173 @@ w = 14 // point to w buffer
 //12(FP) is p.cap
 //
 // Stack frame
-p_end = -4 // -4(SP) pointer to the end of data
-p_data = p_end - 4 // -8(SP) current data pointer
-w_buf = p_data - 4*80 // -328(SP) 80 words temporary buffer w uint32[80]
-saved = w_buf - 4*5 // -348(SP) saved sha1 registers a,b,c,d,e - these must be last
+#define p_end -4 // -4(SP) pointer to the end of data
+#define p_data (p_end - 4) // -8(SP) current data pointer
+#define w_buf (p_data - 4*80) // -328(SP) 80 words temporary buffer w uint32[80]
+#define saved (w_buf - 4*5) // -348(SP) saved sha1 registers a,b,c,d,e - these must be last
 // Total size +4 for saved LR is 352

 // w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
 // e += w[i]
-#define LOAD(e) \
-	MOVBU 2(R(data)), R(t0) ; \
-	MOVBU 3(R(data)), R(t1) ; \
-	MOVBU 1(R(data)), R(t2) ; \
-	ORR R(t0)<<8, R(t1), R(t0) ; \
-	MOVBU.P 4(R(data)), R(t1) ; \
-	ORR R(t2)<<16, R(t0), R(t0) ; \
-	ORR R(t1)<<24, R(t0), R(t0) ; \
-	MOVW.P R(t0), 4(R(w)) ; \
-	ADD R(t0), R(e), R(e)
+#define LOAD(Re) \
+	MOVBU 2(Rdata), Rt0 ; \
+	MOVBU 3(Rdata), Rt1 ; \
+	MOVBU 1(Rdata), Rt2 ; \
+	ORR Rt0<<8, Rt1, Rt0 ; \
+	MOVBU.P 4(Rdata), Rt1 ; \
+	ORR Rt2<<16, Rt0, Rt0 ; \
+	ORR Rt1<<24, Rt0, Rt0 ; \
+	MOVW.P Rt0, 4(Rw) ; \
+	ADD Rt0, Re, Re

 // tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
 // w[i&0xf] = tmp<<1 | tmp>>(32-1)
 // e += w[i&0xf]
-#define SHUFFLE(e) \
-	MOVW (-16*4)(R(w)), R(t0) ; \
-	MOVW (-14*4)(R(w)), R(t1) ; \
-	MOVW (-8*4)(R(w)), R(t2) ; \
-	EOR R(t0), R(t1), R(t0) ; \
-	MOVW (-3*4)(R(w)), R(t1) ; \
-	EOR R(t2), R(t0), R(t0) ; \
-	EOR R(t0), R(t1), R(t0) ; \
-	MOVW R(t0)@>(32-1), R(t0) ; \
-	MOVW.P R(t0), 4(R(w)) ; \
-	ADD R(t0), R(e), R(e)
+#define SHUFFLE(Re) \
+	MOVW (-16*4)(Rw), Rt0 ; \
+	MOVW (-14*4)(Rw), Rt1 ; \
+	MOVW (-8*4)(Rw), Rt2 ; \
+	EOR Rt0, Rt1, Rt0 ; \
+	MOVW (-3*4)(Rw), Rt1 ; \
+	EOR Rt2, Rt0, Rt0 ; \
+	EOR Rt0, Rt1, Rt0 ; \
+	MOVW Rt0@>(32-1), Rt0 ; \
+	MOVW.P Rt0, 4(Rw) ; \
+	ADD Rt0, Re, Re

 // t1 = (b & c) | ((~b) & d)
-#define FUNC1(a, b, c, d, e) \
-	MVN R(b), R(t1) ; \
-	AND R(b), R(c), R(t0) ; \
-	AND R(d), R(t1), R(t1) ; \
-	ORR R(t0), R(t1), R(t1)
+#define FUNC1(Ra, Rb, Rc, Rd, Re) \
+	MVN Rb, Rt1 ; \
+	AND Rb, Rc, Rt0 ; \
+	AND Rd, Rt1, Rt1 ; \
+	ORR Rt0, Rt1, Rt1

 // t1 = b ^ c ^ d
-#define FUNC2(a, b, c, d, e) \
-	EOR R(b), R(c), R(t1) ; \
-	EOR R(d), R(t1), R(t1)
+#define FUNC2(Ra, Rb, Rc, Rd, Re) \
+	EOR Rb, Rc, Rt1 ; \
+	EOR Rd, Rt1, Rt1

 // t1 = (b & c) | (b & d) | (c & d) =
 // t1 = (b & c) | ((b | c) & d)
-#define FUNC3(a, b, c, d, e) \
-	ORR R(b), R(c), R(t0) ; \
-	AND R(b), R(c), R(t1) ; \
-	AND R(d), R(t0), R(t0) ; \
-	ORR R(t0), R(t1), R(t1)
+#define FUNC3(Ra, Rb, Rc, Rd, Re) \
+	ORR Rb, Rc, Rt0 ; \
+	AND Rb, Rc, Rt1 ; \
+	AND Rd, Rt0, Rt0 ; \
+	ORR Rt0, Rt1, Rt1

 #define FUNC4 FUNC2

 // a5 := a<<5 | a>>(32-5)
 // b = b<<30 | b>>(32-30)
 // e = a5 + t1 + e + const
-#define MIX(a, b, c, d, e) \
-	ADD R(t1), R(e), R(e) ; \
-	MOVW R(b)@>(32-30), R(b) ; \
-	ADD R(a)@>(32-5), R(e), R(e) ; \
-	ADD R(const), R(e), R(e)
+#define MIX(Ra, Rb, Rc, Rd, Re) \
+	ADD Rt1, Re, Re ; \
+	MOVW Rb@>(32-30), Rb ; \
+	ADD Ra@>(32-5), Re, Re ; \
+	ADD Rconst, Re, Re

-#define ROUND1(a, b, c, d, e) \
-	LOAD(e) ; \
-	FUNC1(a, b, c, d, e) ; \
-	MIX(a, b, c, d, e)
+#define ROUND1(Ra, Rb, Rc, Rd, Re) \
+	LOAD(Re) ; \
+	FUNC1(Ra, Rb, Rc, Rd, Re) ; \
+	MIX(Ra, Rb, Rc, Rd, Re)

-#define ROUND1x(a, b, c, d, e) \
-	SHUFFLE(e) ; \
-	FUNC1(a, b, c, d, e) ; \
-	MIX(a, b, c, d, e)
+#define ROUND1x(Ra, Rb, Rc, Rd, Re) \
+	SHUFFLE(Re) ; \
+	FUNC1(Ra, Rb, Rc, Rd, Re) ; \
+	MIX(Ra, Rb, Rc, Rd, Re)

-#define ROUND2(a, b, c, d, e) \
-	SHUFFLE(e) ; \
-	FUNC2(a, b, c, d, e) ; \
-	MIX(a, b, c, d, e)
+#define ROUND2(Ra, Rb, Rc, Rd, Re) \
+	SHUFFLE(Re) ; \
+	FUNC2(Ra, Rb, Rc, Rd, Re) ; \
+	MIX(Ra, Rb, Rc, Rd, Re)

-#define ROUND3(a, b, c, d, e) \
-	SHUFFLE(e) ; \
-	FUNC3(a, b, c, d, e) ; \
-	MIX(a, b, c, d, e)
+#define ROUND3(Ra, Rb, Rc, Rd, Re) \
+	SHUFFLE(Re) ; \
+	FUNC3(Ra, Rb, Rc, Rd, Re) ; \
+	MIX(Ra, Rb, Rc, Rd, Re)

-#define ROUND4(a, b, c, d, e) \
-	SHUFFLE(e) ; \
-	FUNC4(a, b, c, d, e) ; \
-	MIX(a, b, c, d, e)
+#define ROUND4(Ra, Rb, Rc, Rd, Re) \
+	SHUFFLE(Re) ; \
+	FUNC4(Ra, Rb, Rc, Rd, Re) ; \
+	MIX(Ra, Rb, Rc, Rd, Re)

 // func block(dig *digest, p []byte)
 TEXT ·block(SB), 0, $352-16
-	MOVW p+4(FP), R(data) // pointer to the data
-	MOVW p_len+8(FP), R(t0) // number of bytes
-	ADD R(data), R(t0)
-	MOVW R(t0), p_end(SP) // pointer to end of data
+	MOVW p+4(FP), Rdata // pointer to the data
+	MOVW p_len+8(FP), Rt0 // number of bytes
+	ADD Rdata, Rt0
+	MOVW Rt0, p_end(SP) // pointer to end of data

 	// Load up initial SHA1 accumulator
-	MOVW dig+0(FP), R(t0)
-	MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)]
+	MOVW dig+0(FP), Rt0
+	MOVM.IA (Rt0), [Ra,Rb,Rc,Rd,Re]

 loop:
 	// Save registers at SP+4 onwards
-	MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13)
+	MOVM.IB [Ra,Rb,Rc,Rd,Re], (R13)

-	MOVW $w_buf(SP), R(w)
-	MOVW $0x5A827999, R(const)
-	MOVW $3, R(ctr)
-loop1:	ROUND1(a, b, c, d, e)
-	ROUND1(e, a, b, c, d)
-	ROUND1(d, e, a, b, c)
-	ROUND1(c, d, e, a, b)
-	ROUND1(b, c, d, e, a)
-	SUB.S $1, R(ctr)
+	MOVW $w_buf(SP), Rw
+	MOVW $0x5A827999, Rconst
+	MOVW $3, Rctr
+loop1:	ROUND1(Ra, Rb, Rc, Rd, Re)
+	ROUND1(Re, Ra, Rb, Rc, Rd)
+	ROUND1(Rd, Re, Ra, Rb, Rc)
+	ROUND1(Rc, Rd, Re, Ra, Rb)
+	ROUND1(Rb, Rc, Rd, Re, Ra)
+	SUB.S $1, Rctr
 	BNE loop1

-	ROUND1(a, b, c, d, e)
-	ROUND1x(e, a, b, c, d)
-	ROUND1x(d, e, a, b, c)
-	ROUND1x(c, d, e, a, b)
-	ROUND1x(b, c, d, e, a)
+	ROUND1(Ra, Rb, Rc, Rd, Re)
+	ROUND1x(Re, Ra, Rb, Rc, Rd)
+	ROUND1x(Rd, Re, Ra, Rb, Rc)
+	ROUND1x(Rc, Rd, Re, Ra, Rb)
+	ROUND1x(Rb, Rc, Rd, Re, Ra)

-	MOVW $0x6ED9EBA1, R(const)
-	MOVW $4, R(ctr)
-loop2:	ROUND2(a, b, c, d, e)
-	ROUND2(e, a, b, c, d)
-	ROUND2(d, e, a, b, c)
-	ROUND2(c, d, e, a, b)
-	ROUND2(b, c, d, e, a)
-	SUB.S $1, R(ctr)
+	MOVW $0x6ED9EBA1, Rconst
+	MOVW $4, Rctr
+loop2:	ROUND2(Ra, Rb, Rc, Rd, Re)
+	ROUND2(Re, Ra, Rb, Rc, Rd)
+	ROUND2(Rd, Re, Ra, Rb, Rc)
+	ROUND2(Rc, Rd, Re, Ra, Rb)
+	ROUND2(Rb, Rc, Rd, Re, Ra)
+	SUB.S $1, Rctr
 	BNE loop2

-	MOVW $0x8F1BBCDC, R(const)
-	MOVW $4, R(ctr)
-loop3:	ROUND3(a, b, c, d, e)
-	ROUND3(e, a, b, c, d)
-	ROUND3(d, e, a, b, c)
-	ROUND3(c, d, e, a, b)
-	ROUND3(b, c, d, e, a)
-	SUB.S $1, R(ctr)
+	MOVW $0x8F1BBCDC, Rconst
+	MOVW $4, Rctr
+loop3:	ROUND3(Ra, Rb, Rc, Rd, Re)
+	ROUND3(Re, Ra, Rb, Rc, Rd)
+	ROUND3(Rd, Re, Ra, Rb, Rc)
+	ROUND3(Rc, Rd, Re, Ra, Rb)
+	ROUND3(Rb, Rc, Rd, Re, Ra)
+	SUB.S $1, Rctr
 	BNE loop3

-	MOVW $0xCA62C1D6, R(const)
-	MOVW $4, R(ctr)
-loop4:	ROUND4(a, b, c, d, e)
-	ROUND4(e, a, b, c, d)
-	ROUND4(d, e, a, b, c)
-	ROUND4(c, d, e, a, b)
-	ROUND4(b, c, d, e, a)
-	SUB.S $1, R(ctr)
+	MOVW $0xCA62C1D6, Rconst
+	MOVW $4, Rctr
+loop4:	ROUND4(Ra, Rb, Rc, Rd, Re)
+	ROUND4(Re, Ra, Rb, Rc, Rd)
+	ROUND4(Rd, Re, Ra, Rb, Rc)
+	ROUND4(Rc, Rd, Re, Ra, Rb)
+	ROUND4(Rb, Rc, Rd, Re, Ra)
+	SUB.S $1, Rctr
 	BNE loop4

 	// Accumulate - restoring registers from SP+4
-	MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)]
-	ADD R(t0), R(a)
-	ADD R(t1), R(b)
-	ADD R(t2), R(c)
-	ADD R(ctr), R(d)
-	ADD R(w), R(e)
+	MOVM.IB (R13), [Rt0,Rt1,Rt2,Rctr,Rw]
+	ADD Rt0, Ra
+	ADD Rt1, Rb
+	ADD Rt2, Rc
+	ADD Rctr, Rd
+	ADD Rw, Re

-	MOVW p_end(SP), R(t0)
-	CMP R(t0), R(data)
+	MOVW p_end(SP), Rt0
+	CMP Rt0, Rdata
 	BLO loop

 	// Save final SHA1 accumulator
-	MOVW dig+0(FP), R(t0)
-	MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0))
+	MOVW dig+0(FP), Rt0
+	MOVM.IA [Ra,Rb,Rc,Rd,Re], (Rt0)

 	RET
@@ -107,7 +107,7 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB),NOSPLIT,$-4-4
 	MOVW 0(FP), R0 // gobuf
-	MOVW SP, gobuf_sp(R0)
+	MOVW R13, gobuf_sp(R0)
 	MOVW LR, gobuf_pc(R0)
 	MOVW g, gobuf_g(R0)
 	MOVW $0, R11
@@ -133,7 +133,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4
 	// after this point: it must be straight-line code until the
 	// final B instruction.
 	// See large comment in sigprof for more details.
-	MOVW gobuf_sp(R1), SP // restore SP
+	MOVW gobuf_sp(R1), R13 // restore SP==R13
 	MOVW gobuf_lr(R1), LR
 	MOVW gobuf_ret(R1), R0
 	MOVW gobuf_ctxt(R1), R7
@@ -152,7 +152,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4
 // to keep running g.
 TEXT runtime·mcall(SB),NOSPLIT,$-4-4
 	// Save caller state in g->sched.
-	MOVW SP, (g_sched+gobuf_sp)(g)
+	MOVW R13, (g_sched+gobuf_sp)(g)
 	MOVW LR, (g_sched+gobuf_pc)(g)
 	MOVW $0, R11
 	MOVW R11, (g_sched+gobuf_lr)(g)
@@ -170,8 +170,8 @@ TEXT runtime·mcall(SB),NOSPLIT,$-4-4
 	CMP $0, R11
 	BL.NE runtime·save_g(SB)
 	MOVW fn+0(FP), R0
-	MOVW (g_sched+gobuf_sp)(g), SP
-	SUB $8, SP
+	MOVW (g_sched+gobuf_sp)(g), R13
+	SUB $8, R13
 	MOVW R1, 4(SP)
 	MOVW R0, R7
 	MOVW 0(R0), R0
@@ -217,7 +217,7 @@ switch:
 	MOVW $runtime·systemstack_switch(SB), R3
 	ADD $4, R3, R3 // get past push {lr}
 	MOVW R3, (g_sched+gobuf_pc)(g)
-	MOVW SP, (g_sched+gobuf_sp)(g)
+	MOVW R13, (g_sched+gobuf_sp)(g)
 	MOVW LR, (g_sched+gobuf_lr)(g)
 	MOVW g, (g_sched+gobuf_g)(g)

@@ -231,7 +231,7 @@ switch:
 	SUB $4, R3, R3
 	MOVW $runtime·mstart(SB), R4
 	MOVW R4, 0(R3)
-	MOVW R3, SP
+	MOVW R3, R13

 	// call target function
 	MOVW R0, R7
@@ -242,7 +242,7 @@ switch:
 	MOVW g_m(g), R1
 	MOVW m_curg(R1), R0
 	BL setg<>(SB)
-	MOVW (g_sched+gobuf_sp)(g), SP
+	MOVW (g_sched+gobuf_sp)(g), R13
 	MOVW $0, R3
 	MOVW R3, (g_sched+gobuf_sp)(g)
 	RET
@@ -284,21 +284,21 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0
 	// Called from f.
 	// Set g->sched to context in f.
 	MOVW R7, (g_sched+gobuf_ctxt)(g)
-	MOVW SP, (g_sched+gobuf_sp)(g)
+	MOVW R13, (g_sched+gobuf_sp)(g)
 	MOVW LR, (g_sched+gobuf_pc)(g)
 	MOVW R3, (g_sched+gobuf_lr)(g)

 	// Called from f.
 	// Set m->morebuf to f's caller.
 	MOVW R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC
-	MOVW SP, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
+	MOVW R13, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
 	MOVW $4(SP), R3 // f's argument pointer
 	MOVW g, (m_morebuf+gobuf_g)(R8)

 	// Call newstack on m->g0's stack.
 	MOVW m_g0(R8), R0
 	BL setg<>(SB)
-	MOVW (g_sched+gobuf_sp)(g), SP
+	MOVW (g_sched+gobuf_sp)(g), R13
 	BL runtime·newstack(SB)

 	// Not reached, but make sure the return PC from the call to newstack
@@ -362,7 +362,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \
 	/* copy arguments to stack */ \
 	MOVW argptr+8(FP), R0; \
 	MOVW argsize+12(FP), R2; \
-	ADD $4, SP, R1; \
+	ADD $4, R13, R1; \
 	CMP $0, R2; \
 	B.EQ 5(PC); \
 	MOVBU.P 1(R0), R5; \
@@ -378,7 +378,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \
 	MOVW argptr+8(FP), R0; \
 	MOVW argsize+12(FP), R2; \
 	MOVW retoffset+16(FP), R3; \
-	ADD $4, SP, R1; \
+	ADD $4, R13, R1; \
 	ADD R3, R1; \
 	ADD R3, R0; \
 	SUB R3, R2; \
@@ -443,8 +443,8 @@ TEXT runtime·jmpdefer(SB),NOSPLIT,$0-8
 	MOVW 0(SP), LR
 	MOVW $-4(LR), LR // BL deferreturn
 	MOVW fv+0(FP), R7
-	MOVW argp+4(FP), SP
-	MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR
+	MOVW argp+4(FP), R13
+	MOVW $-4(SP), R13 // SP is 4 below argp, due to saved LR
 	MOVW 0(R7), R1
 	B (R1)
...
@@ -25,31 +25,31 @@
 #include "textflag.h"

-TO = 8
-TOE = 11
-N = 12
-TMP = 12 /* N and TMP don't overlap */
+#define TO R8
+#define TOE R11
+#define N R12
+#define TMP R12 /* N and TMP don't overlap */

 TEXT runtime·memclr(SB),NOSPLIT,$0-8
-	MOVW ptr+0(FP), R(TO)
-	MOVW n+4(FP), R(N)
-	MOVW $0, R(0)
+	MOVW ptr+0(FP), TO
+	MOVW n+4(FP), N
+	MOVW $0, R0

-	ADD R(N), R(TO), R(TOE) /* to end pointer */
+	ADD N, TO, TOE /* to end pointer */

-	CMP $4, R(N) /* need at least 4 bytes to copy */
+	CMP $4, N /* need at least 4 bytes to copy */
 	BLT _1tail

 _4align: /* align on 4 */
-	AND.S $3, R(TO), R(TMP)
+	AND.S $3, TO, TMP
 	BEQ _4aligned

-	MOVBU.P R(0), 1(R(TO)) /* implicit write back */
+	MOVBU.P R0, 1(TO) /* implicit write back */
 	B _4align

 _4aligned:
-	SUB $31, R(TOE), R(TMP) /* do 32-byte chunks if possible */
-	CMP R(TMP), R(TO)
+	SUB $31, TOE, TMP /* do 32-byte chunks if possible */
+	CMP TMP, TO
 	BHS _4tail

 	MOVW R0, R1 /* replicate */
@@ -61,26 +61,26 @@ _4aligned:
 	MOVW R0, R7

 _f32loop:
-	CMP R(TMP), R(TO)
+	CMP TMP, TO
 	BHS _4tail

-	MOVM.IA.W [R0-R7], (R(TO))
+	MOVM.IA.W [R0-R7], (TO)
 	B _f32loop

 _4tail:
-	SUB $3, R(TOE), R(TMP) /* do remaining words if possible */
+	SUB $3, TOE, TMP /* do remaining words if possible */
 _4loop:
-	CMP R(TMP), R(TO)
+	CMP TMP, TO
 	BHS _1tail

-	MOVW.P R(0), 4(R(TO)) /* implicit write back */
+	MOVW.P R0, 4(TO) /* implicit write back */
 	B _4loop

 _1tail:
-	CMP R(TO), R(TOE)
+	CMP TO, TOE
 	BEQ _return

-	MOVBU.P R(0), 1(R(TO)) /* implicit write back */
+	MOVBU.P R0, 1(TO) /* implicit write back */
 	B _1tail

 _return:
...
@@ -26,138 +26,138 @@
 #include "textflag.h"

 // TE or TS are spilled to the stack during bulk register moves.
-TS = 0
-TE = 8
+#define TS R0
+#define TE R8

 // Warning: the linker will use R11 to synthesize certain instructions. Please
 // take care and double check with objdump.
-FROM = 11
-N = 12
-TMP = 12 /* N and TMP don't overlap */
-TMP1 = 5
+#define FROM R11
+#define N R12
+#define TMP R12 /* N and TMP don't overlap */
+#define TMP1 R5

-RSHIFT = 5
-LSHIFT = 6
-OFFSET = 7
+#define RSHIFT R5
+#define LSHIFT R6
+#define OFFSET R7

-BR0 = 0 /* shared with TS */
-BW0 = 1
-BR1 = 1
-BW1 = 2
-BR2 = 2
-BW2 = 3
-BR3 = 3
-BW3 = 4
+#define BR0 R0 /* shared with TS */
+#define BW0 R1
+#define BR1 R1
+#define BW1 R2
+#define BR2 R2
+#define BW2 R3
+#define BR3 R3
+#define BW3 R4

-FW0 = 1
-FR0 = 2
-FW1 = 2
-FR1 = 3
-FW2 = 3
-FR2 = 4
-FW3 = 4
-FR3 = 8 /* shared with TE */
+#define FW0 R1
+#define FR0 R2
+#define FW1 R2
+#define FR1 R3
+#define FW2 R3
+#define FR2 R4
+#define FW3 R4
+#define FR3 R8 /* shared with TE */

 TEXT runtime·memmove(SB), NOSPLIT, $4-12
 _memmove:
-	MOVW to+0(FP), R(TS)
-	MOVW from+4(FP), R(FROM)
-	MOVW n+8(FP), R(N)
+	MOVW to+0(FP), TS
+	MOVW from+4(FP), FROM
+	MOVW n+8(FP), N

-	ADD R(N), R(TS), R(TE) /* to end pointer */
+	ADD N, TS, TE /* to end pointer */

-	CMP R(FROM), R(TS)
+	CMP FROM, TS
 	BLS _forward

 _back:
-	ADD R(N), R(FROM) /* from end pointer */
+	ADD N, FROM /* from end pointer */

-	CMP $4, R(N) /* need at least 4 bytes to copy */
+	CMP $4, N /* need at least 4 bytes to copy */
 	BLT _b1tail

 _b4align: /* align destination on 4 */
-	AND.S $3, R(TE), R(TMP)
+	AND.S $3, TE, TMP
 	BEQ _b4aligned

-	MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
-	MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
+	MOVBU.W -1(FROM), TMP /* pre-indexed */
+	MOVBU.W TMP, -1(TE) /* pre-indexed */
 	B _b4align

 _b4aligned: /* is source now aligned? */
-	AND.S $3, R(FROM), R(TMP)
+	AND.S $3, FROM, TMP
 	BNE _bunaligned

-	ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */
-	MOVW R(TS), savedts-4(SP)
+	ADD $31, TS, TMP /* do 32-byte chunks if possible */
+	MOVW TS, savedts-4(SP)
 _b32loop:
-	CMP R(TMP), R(TE)
+	CMP TMP, TE
 	BLS _b4tail

-	MOVM.DB.W (R(FROM)), [R0-R7]
-	MOVM.DB.W [R0-R7], (R(TE))
+	MOVM.DB.W (FROM), [R0-R7]
+	MOVM.DB.W [R0-R7], (TE)
 	B _b32loop

 _b4tail: /* do remaining words if possible */
-	MOVW savedts-4(SP), R(TS)
-	ADD $3, R(TS), R(TMP)
+	MOVW savedts-4(SP), TS
+	ADD $3, TS, TMP
 _b4loop:
-	CMP R(TMP), R(TE)
+	CMP TMP, TE
 	BLS _b1tail

-	MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */
-	MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */
+	MOVW.W -4(FROM), TMP1 /* pre-indexed */
+	MOVW.W TMP1, -4(TE) /* pre-indexed */
 	B _b4loop

 _b1tail: /* remaining bytes */
-	CMP R(TE), R(TS)
+	CMP TE, TS
 	BEQ _return

-	MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
-	MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
+	MOVBU.W -1(FROM), TMP /* pre-indexed */
+	MOVBU.W TMP, -1(TE) /* pre-indexed */
 	B _b1tail

 _forward:
-	CMP $4, R(N) /* need at least 4 bytes to copy */
+	CMP $4, N /* need at least 4 bytes to copy */
 	BLT _f1tail

 _f4align: /* align destination on 4 */
-	AND.S $3, R(TS), R(TMP)
+	AND.S $3, TS, TMP
 	BEQ _f4aligned

-	MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
-	MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
+	MOVBU.P 1(FROM), TMP /* implicit write back */
+	MOVBU.P TMP, 1(TS) /* implicit write back */
 	B _f4align

 _f4aligned: /* is source now aligned? */
-	AND.S $3, R(FROM), R(TMP)
+	AND.S $3, FROM, TMP
 	BNE _funaligned

-	SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */
-	MOVW R(TE), savedte-4(SP)
+	SUB $31, TE, TMP /* do 32-byte chunks if possible */
+	MOVW TE, savedte-4(SP)
 _f32loop:
-	CMP R(TMP), R(TS)
+	CMP TMP, TS
 	BHS _f4tail

-	MOVM.IA.W (R(FROM)), [R1-R8]
-	MOVM.IA.W [R1-R8], (R(TS))
+	MOVM.IA.W (FROM), [R1-R8]
+	MOVM.IA.W [R1-R8], (TS)
 	B _f32loop

 _f4tail:
-	MOVW savedte-4(SP), R(TE)
-	SUB $3, R(TE), R(TMP) /* do remaining words if possible */
+	MOVW savedte-4(SP), TE
+	SUB $3, TE, TMP /* do remaining words if possible */
 _f4loop:
-	CMP R(TMP), R(TS)
+	CMP TMP, TS
 	BHS _f1tail

-	MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */
-	MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */
+	MOVW.P 4(FROM), TMP1 /* implicit write back */
+	MOVW.P TMP1, 4(TS) /* implicit write back */
 	B _f4loop

 _f1tail:
-	CMP R(TS), R(TE)
+	CMP TS, TE
 	BEQ _return

-	MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
-	MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
+	MOVBU.P 1(FROM), TMP /* implicit write back */
+	MOVBU.P TMP, 1(TS) /* implicit write back */
 	B _f1tail

@@ -165,97 +165,97 @@ _return:
 	RET

 _bunaligned:
-	CMP $2, R(TMP) /* is R(TMP) < 2 ? */
+	CMP $2, TMP /* is TMP < 2 ? */

-	MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */
-	MOVW.LT $24, R(LSHIFT)
-	MOVW.LT $1, R(OFFSET)
+	MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
+	MOVW.LT $24, LSHIFT
+	MOVW.LT $1, OFFSET

-	MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */
-	MOVW.EQ $16, R(LSHIFT)
-	MOVW.EQ $2, R(OFFSET)
+	MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
+	MOVW.EQ $16, LSHIFT
+	MOVW.EQ $2, OFFSET

-	MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */
-	MOVW.GT $8, R(LSHIFT)
-	MOVW.GT $3, R(OFFSET)
+	MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
+	MOVW.GT $8, LSHIFT
+	MOVW.GT $3, OFFSET

-	ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible */
-	CMP R(TMP), R(TE)
+	ADD $16, TS, TMP /* do 16-byte chunks if possible */
+	CMP TMP, TE
 	BLS _b1tail

-	BIC $3, R(FROM) /* align source */
-	MOVW R(TS), savedts-4(SP)
-	MOVW (R(FROM)), R(BR0) /* prime first block register */
+	BIC $3, FROM /* align source */
+	MOVW TS, savedts-4(SP)
+	MOVW (FROM), BR0 /* prime first block register */

 _bu16loop:
-	CMP R(TMP), R(TE)
+	CMP TMP, TE
 	BLS _bu1tail

-	MOVW R(BR0)<<R(LSHIFT), R(BW3)
-	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
-	ORR R(BR3)>>R(RSHIFT), R(BW3)
+	MOVW BR0<<LSHIFT, BW3
+	MOVM.DB.W (FROM), [BR0-BR3]
+	ORR BR3>>RSHIFT, BW3

-	MOVW R(BR3)<<R(LSHIFT), R(BW2)
-	ORR R(BR2)>>R(RSHIFT), R(BW2)
+	MOVW BR3<<LSHIFT, BW2
+	ORR BR2>>RSHIFT, BW2

-	MOVW R(BR2)<<R(LSHIFT), R(BW1)
-	ORR R(BR1)>>R(RSHIFT), R(BW1)
+	MOVW BR2<<LSHIFT, BW1
+	ORR BR1>>RSHIFT, BW1

-	MOVW R(BR1)<<R(LSHIFT), R(BW0)
-	ORR R(BR0)>>R(RSHIFT), R(BW0)
+	MOVW BR1<<LSHIFT, BW0
+	ORR BR0>>RSHIFT, BW0

-	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
+	MOVM.DB.W [BW0-BW3], (TE)
 	B _bu16loop

 _bu1tail:
-	MOVW savedts-4(SP), R(TS)
-	ADD R(OFFSET), R(FROM)
+	MOVW savedts-4(SP), TS
+	ADD OFFSET, FROM
 	B _b1tail

 _funaligned:
-	CMP $2, R(TMP)
+	CMP $2, TMP

-	MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */
-	MOVW.LT $24, R(LSHIFT)
-	MOVW.LT $3, R(OFFSET)
+	MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
+	MOVW.LT $24, LSHIFT
+	MOVW.LT $3, OFFSET

-	MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */
-	MOVW.EQ $16, R(LSHIFT)
-	MOVW.EQ $2, R(OFFSET)
+	MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
+	MOVW.EQ $16, LSHIFT
+	MOVW.EQ $2, OFFSET

-	MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */
-	MOVW.GT $8, R(LSHIFT)
-	MOVW.GT $1, R(OFFSET)
+	MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
+	MOVW.GT $8, LSHIFT
+	MOVW.GT $1, OFFSET

-	SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible */
-	CMP R(TMP), R(TS)
+	SUB $16, TE, TMP /* do 16-byte chunks if possible */
+	CMP TMP, TS
 	BHS _f1tail

-	BIC $3, R(FROM) /* align source */
-	MOVW R(TE), savedte-4(SP)
-	MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */
+	BIC $3, FROM /* align source */
+	MOVW TE, savedte-4(SP)
+	MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */

 _fu16loop:
-	CMP R(TMP), R(TS)
+	CMP TMP, TS
 	BHS _fu1tail

-	MOVW R(FR3)>>R(RSHIFT), R(FW0)
-	MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
-	ORR R(FR0)<<R(LSHIFT), R(FW0)
+	MOVW FR3>>RSHIFT, FW0
+	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
+	ORR FR0<<LSHIFT, FW0

-	MOVW R(FR0)>>R(RSHIFT), R(FW1)
-	ORR R(FR1)<<R(LSHIFT), R(FW1)
+	MOVW FR0>>RSHIFT, FW1
+	ORR FR1<<LSHIFT, FW1

-	MOVW R(FR1)>>R(RSHIFT), R(FW2)
-	ORR R(FR2)<<R(LSHIFT), R(FW2)
+	MOVW FR1>>RSHIFT, FW2
+	ORR FR2<<LSHIFT, FW2

-	MOVW R(FR2)>>R(RSHIFT), R(FW3)
-	ORR R(FR3)<<R(LSHIFT), R(FW3)
+	MOVW FR2>>RSHIFT, FW3
+	ORR FR3<<LSHIFT, FW3

-	MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
+	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
 	B _fu16loop

 _fu1tail:
-	MOVW savedte-4(SP), R(TE)
-	SUB R(OFFSET), R(FROM)
+	MOVW savedte-4(SP), TE
+	SUB OFFSET, FROM
 	B _f1tail
@@ -77,7 +77,7 @@ DATA bad_abi_msg+0x2c(SB)/1, $0xa
 GLOBL bad_abi_msg(SB), RODATA, $45

 TEXT oabi_syscall<>(SB),NOSPLIT,$-4
-	ADD $1, PC, R4
+	ADD $1, R15, R4 // R15 is hardware PC
 	WORD $0xe12fff14 //BX (R4) // enter thumb mode
 	// TODO(minux): only supports little-endian CPUs
 	WORD $0x4770df01 // swi $1; bx lr
...
@@ -383,7 +383,7 @@ TEXT runtime·usleep(SB),NOSPLIT,$12
 // Use kernel version instead of native armcas in asm_arm.s.
 // See ../sync/atomic/asm_linux_arm.s for details.
 TEXT cas<>(SB),NOSPLIT,$0
-	MOVW $0xffff0fc0, PC
+	MOVW $0xffff0fc0, R15 // R15 is hardware PC.

 TEXT runtime·cas(SB),NOSPLIT,$0
 	MOVW ptr+0(FP), R2
...
@@ -27,8 +27,6 @@
 #include "go_tls.h"
 #include "textflag.h"

-arg=0
-
 /* replaced use of R10 by R11 because the former can be the data segment base register */

 TEXT _mulv(SB), NOSPLIT, $0
@@ -111,70 +109,71 @@ TEXT runtime·_sfloatpanic(SB),NOSPLIT,$-4
 // Reference:
 // Sloss, Andrew et. al; ARM System Developer's Guide: Designing and Optimizing System Software
 // Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
-q = 0 // input d, output q
-r = 1 // input n, output r
-s = 2 // three temporary variables
-M = 3
-a = 11
-// Be careful: R(a) == R11 will be used by the linker for synthesized instructions.
+#define Rq R0 // input d, output q
+#define Rr R1 // input n, output r
+#define Rs R2 // three temporary variables
+#define RM R3
+#define Ra R11
+
+// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
 TEXT udiv<>(SB),NOSPLIT,$-4
-	CLZ R(q), R(s) // find normalizing shift
-	MOVW.S R(q)<<R(s), R(a)
-	MOVW $fast_udiv_tab<>-64(SB), R(M)
-	ADD.NE R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor
-	MOVBU.NE (R(a)), R(a)
+	CLZ Rq, Rs // find normalizing shift
+	MOVW.S Rq<<Rs, Ra
+	MOVW $fast_udiv_tab<>-64(SB), RM
+	ADD.NE Ra>>25, RM, Ra // index by most significant 7 bits of divisor
+	MOVBU.NE (Ra), Ra

-	SUB.S $7, R(s)
-	RSB $0, R(q), R(M) // M = -q
-	MOVW.PL R(a)<<R(s), R(q)
+	SUB.S $7, Rs
+	RSB $0, Rq, RM // M = -q
+	MOVW.PL Ra<<Rs, Rq

 	// 1st Newton iteration
-	MUL.PL R(M), R(q), R(a) // a = -q*d
+	MUL.PL RM, Rq, Ra // a = -q*d
 	BMI udiv_by_large_d
-	MULAWT R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32)
-	TEQ R(M)->1, R(M) // check for d=0 or d=1
+	MULAWT Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
+	TEQ RM->1, RM // check for d=0 or d=1

 	// 2nd Newton iteration
-	MUL.NE R(M), R(q), R(a)
-	MOVW.NE $0, R(s)
-	MULAL.NE R(q), R(a), (R(q),R(s))
+	MUL.NE RM, Rq, Ra
+	MOVW.NE $0, Rs
+	MULAL.NE Rq, Ra, (Rq,Rs)
 	BEQ udiv_by_0_or_1

 	// q now accurate enough for a remainder r, 0<=r<3*d
-	MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32
-	ADD R(M), R(r), R(r) // r = n - d
-	MULA R(M), R(q), R(r), R(r) // r = n - (q+1)*d
+	MULLU Rq, Rr, (Rq,Rs) // q = (r * q) >> 32
+	ADD RM, Rr, Rr // r = n - d
+	MULA RM, Rq, Rr, Rr // r = n - (q+1)*d

 	// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
-	CMN R(M), R(r) // t = r-d
-	SUB.CS R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d
-	ADD.CC $1, R(q)
-	ADD.PL R(M)<<1, R(r)
-	ADD.PL $2, R(q)
+	CMN RM, Rr // t = r-d
+	SUB.CS RM, Rr, Rr // if (t<-d || t>=0) r=r+d
+	ADD.CC $1, Rq
+	ADD.PL RM<<1, Rr
+	ADD.PL $2, Rq
 	RET

 udiv_by_large_d:
 	// at this point we know d>=2^(31-6)=2^25
-	SUB $4, R(a), R(a)
-	RSB $0, R(s), R(s)
-	MOVW R(a)>>R(s), R(q)
-	MULLU R(q), R(r), (R(q),R(s))
-	MULA R(M), R(q), R(r), R(r)
+	SUB $4, Ra, Ra
+	RSB $0, Rs, Rs
+	MOVW Ra>>Rs, Rq
+	MULLU Rq, Rr, (Rq,Rs)
+	MULA RM, Rq, Rr, Rr

 	// q now accurate enough for a remainder r, 0<=r<4*d
-	CMN R(r)>>1, R(M) // if(r/2 >= d)
-	ADD.CS R(M)<<1, R(r)
-	ADD.CS $2, R(q)
-	CMN R(r), R(M)
-	ADD.CS R(M), R(r)
-	ADD.CS $1, R(q)
+	CMN Rr>>1, RM // if(r/2 >= d)
+	ADD.CS RM<<1, Rr
+	ADD.CS $2, Rq
+	CMN Rr, RM
+	ADD.CS RM, Rr
+	ADD.CS $1, Rq
 	RET

 udiv_by_0_or_1:
 	// carry set if d==1, carry clear if d==0
 	BCC udiv_by_0
-	MOVW R(r), R(q)
-	MOVW $0, R(r)
+	MOVW Rr, Rq
+	MOVW $0, Rr
 	RET

 udiv_by_0:
@@ -216,96 +215,96 @@ DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788
 DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384
 GLOBL fast_udiv_tab<>(SB), RODATA, $64

-// The linker will pass numerator in R(TMP), and it also
-// expects the result in R(TMP)
-TMP = 11
+// The linker will pass numerator in RTMP, and it also
+// expects the result in RTMP
+#define RTMP R11

 TEXT _divu(SB), NOSPLIT, $16
-	MOVW R(q), 4(R13)
-	MOVW R(r), 8(R13)
-	MOVW R(s), 12(R13)
-	MOVW R(M), 16(R13)
+	MOVW Rq, 4(R13)
+	MOVW Rr, 8(R13)
+	MOVW Rs, 12(R13)
+	MOVW RM, 16(R13)

-	MOVW R(TMP), R(r) /* numerator */
-	MOVW 0(FP), R(q) /* denominator */
+	MOVW RTMP, Rr /* numerator */
+	MOVW 0(FP), Rq /* denominator */
 	BL udiv<>(SB)
-	MOVW R(q), R(TMP)
-	MOVW 4(R13), R(q)
-	MOVW 8(R13), R(r)
-	MOVW 12(R13), R(s)
-	MOVW 16(R13), R(M)
+	MOVW Rq, RTMP
+	MOVW 4(R13), Rq
+	MOVW 8(R13), Rr
+	MOVW 12(R13), Rs
+	MOVW 16(R13), RM
 	RET

 TEXT _modu(SB), NOSPLIT, $16
-	MOVW R(q), 4(R13)
-	MOVW R(r), 8(R13)
-	MOVW R(s), 12(R13)
-	MOVW R(M), 16(R13)
+	MOVW Rq, 4(R13)
+	MOVW Rr, 8(R13)
+	MOVW Rs, 12(R13)
+	MOVW RM, 16(R13)

-	MOVW R(TMP), R(r) /* numerator */
-	MOVW 0(FP), R(q) /* denominator */
+	MOVW RTMP, Rr /* numerator */
+	MOVW 0(FP), Rq /* denominator */
 	BL udiv<>(SB)
-	MOVW R(r), R(TMP)
-	MOVW 4(R13), R(q)
-	MOVW 8(R13), R(r)
-	MOVW 12(R13), R(s)
-	MOVW 16(R13), R(M)
+	MOVW Rr, RTMP
+	MOVW 4(R13), Rq
+	MOVW 8(R13), Rr
+	MOVW 12(R13), Rs
+	MOVW 16(R13), RM
 	RET

 TEXT _div(SB),NOSPLIT,$16
-	MOVW R(q), 4(R13)
-	MOVW R(r), 8(R13)
-	MOVW R(s), 12(R13)
-	MOVW R(M), 16(R13)
-	MOVW R(TMP), R(r) /* numerator */
-	MOVW 0(FP), R(q) /* denominator */
-	CMP $0, R(r)
+	MOVW Rq, 4(R13)
+	MOVW Rr, 8(R13)
+	MOVW Rs, 12(R13)
+	MOVW RM, 16(R13)
+	MOVW RTMP, Rr /* numerator */
+	MOVW 0(FP), Rq /* denominator */
+	CMP $0, Rr
 	BGE d1
-	RSB $0, R(r), R(r)
-	CMP $0, R(q)
+	RSB $0, Rr, Rr
+	CMP $0, Rq
 	BGE d2
-	RSB $0, R(q), R(q)
+	RSB $0, Rq, Rq
 d0:
 	BL udiv<>(SB) /* none/both neg */
-	MOVW R(q), R(TMP)
+	MOVW Rq, RTMP
 	B out1
 d1:
-	CMP $0, R(q)
+	CMP $0, Rq
 	BGE d0
-	RSB $0, R(q), R(q)
+	RSB $0, Rq, Rq
 d2:
 	BL udiv<>(SB) /* one neg */
-	RSB $0, R(q), R(TMP)
+	RSB $0, Rq, RTMP
 out1:
-	MOVW 4(R13), R(q)
-	MOVW 8(R13), R(r)
-	MOVW 12(R13), R(s)
-	MOVW 16(R13), R(M)
+	MOVW 4(R13), Rq
+	MOVW 8(R13), Rr
+	MOVW 12(R13), Rs
+	MOVW 16(R13), RM
 	RET

 TEXT _mod(SB),NOSPLIT,$16
-	MOVW R(q), 4(R13)
-	MOVW R(r), 8(R13)
-	MOVW R(s), 12(R13)
-	MOVW R(M), 16(R13)
-	MOVW R(TMP), R(r) /* numerator */
-	MOVW 0(FP), R(q) /* denominator */
-	CMP $0, R(q)
-	RSB.LT $0, R(q), R(q)
-	CMP $0, R(r)
+	MOVW Rq, 4(R13)
+	MOVW Rr, 8(R13)
+	MOVW Rs, 12(R13)
+	MOVW RM, 16(R13)
+	MOVW RTMP, Rr /* numerator */
+	MOVW 0(FP), Rq /* denominator */
+	CMP $0, Rq
+	RSB.LT $0, Rq, Rq
+	CMP $0, Rr
 	BGE m1
-	RSB $0, R(r), R(r)
+	RSB $0, Rr, Rr
 	BL udiv<>(SB) /* neg numerator */
-	RSB $0, R(r), R(TMP)
+	RSB $0, Rr, RTMP
 	B out
 m1:
 	BL udiv<>(SB) /* pos numerator */
-	MOVW R(r), R(TMP)
+	MOVW Rr, RTMP
 out:
-	MOVW 4(R13), R(q)
-	MOVW 8(R13), R(r)
-	MOVW 12(R13), R(s)
-	MOVW 16(R13), R(M)
+	MOVW 4(R13), Rq
+	MOVW 8(R13), Rr
+	MOVW 12(R13), Rs
+	MOVW 16(R13), RM
 	RET

 // _mul64by32 and _div64by32 not implemented on arm
...
@@ -24,7 +24,7 @@
 // http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b49c0f24cf6744a3f4fd09289fe7cade349dead5
 //
 TEXT cas<>(SB),NOSPLIT,$0
-	MOVW $0xffff0fc0, PC
+	MOVW $0xffff0fc0, R15

 TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
 	B ·CompareAndSwapUint32(SB)
@@ -95,7 +95,7 @@ TEXT ·SwapUintptr(SB),NOSPLIT,$0
 	B ·SwapUint32(SB)

 TEXT cas64<>(SB),NOSPLIT,$0
-	MOVW $0xffff0f60, PC // __kuser_cmpxchg64: Linux-3.1 and above
+	MOVW $0xffff0f60, R15 // R15 = hardware PC. __kuser_cmpxchg64: Linux-3.1 and above

 TEXT kernelCAS64<>(SB),NOSPLIT,$0-21
 	// int (*__kuser_cmpxchg64_t)(const int64_t *oldval, const int64_t *newval, volatile int64_t *ptr);
@@ -127,17 +127,17 @@ TEXT setupAndCallCAS64<>(SB),NOSPLIT,$-4-21
 	CMP $5, R0
 	MOVW.CS $kernelCAS64<>(SB), R1
 	MOVW.CS R1, armCAS64(SB)
-	MOVW.CS R1, PC
+	MOVW.CS R1, R15 // R15 = hardware PC
 	MOVB runtime·armArch(SB), R0
 	// LDREXD, STREXD only present on ARMv6K or higher
 	CMP $6, R0 // TODO(minux): how to differentiate ARMv6 with ARMv6K?
 	MOVW.CS armCompareAndSwapUint64(SB), R1
 	MOVW.CS R1, armCAS64(SB)
-	MOVW.CS R1, PC
+	MOVW.CS R1, R15
 	// we are out of luck, can only use runtime's emulated 64-bit cas
 	MOVW generalCAS64(SB), R1
 	MOVW R1, armCAS64(SB)
-	MOVW R1, PC
+	MOVW R1, R15

 TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
 	B ·CompareAndSwapUint64(SB)
@@ -145,7 +145,7 @@ TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
 TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4-21
 	MOVW armCAS64(SB), R0
 	CMP $0, R0
-	MOVW.NE R0, PC
+	MOVW.NE R0, R15 // R15 = hardware PC
 	B setupAndCallCAS64<>(SB)

 TEXT ·AddInt64(SB),NOSPLIT,$0
...