Commit 8403d4ea authored by Meng Zhuo, committed by Cherry Zhang

cmd/asm: add V[LD|ST][2-4] vector instructions on arm64

This change adds VLD2, VLD3, VLD4, VST2, VST3, VST4 (multiple structures)
for image or multimedia optimization.

Change-Id: Iae3538ef4434e436e3fb2f19153c58f918f773af
Reviewed-on: https://go-review.googlesource.com/c/go/+/166518
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent be452cea
...@@ -343,6 +343,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 ...@@ -343,6 +343,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
VST1 [V0.S4, V1.S4], (R0) // 00a8004c VST1 [V0.S4, V1.S4], (R0) // 00a8004c
VLD1 (R30), [V15.S2, V16.S2] // cfab400c VLD1 (R30), [V15.S2, V16.S2] // cfab400c
VLD1.P 24(R30), [V3.S2,V4.S2,V5.S2] // c36bdf0c VLD1.P 24(R30), [V3.S2,V4.S2,V5.S2] // c36bdf0c
VLD2 (R29), [V23.H8, V24.H8] // b787404c
VLD2.P 16(R0), [V18.B8, V19.B8] // 1280df0c
VLD2.P (R1)(R2), [V15.S2, V16.S2] // VLD2.P (R1)(R2*1), [V15.S2,V16.S2] // 2f88c20c
VLD3 (R27), [V11.S4, V12.S4, V13.S4] // 6b4b404c
VLD3.P 48(RSP), [V11.S4, V12.S4, V13.S4] // eb4bdf4c
VLD3.P (R30)(R2), [V14.D2, V15.D2, V16.D2] // VLD3.P (R30)(R2*1), [V14.D2,V15.D2,V16.D2] // ce4fc24c
VLD4 (R15), [V10.H4, V11.H4, V12.H4, V13.H4] // ea05400c
VLD4.P 32(R24), [V31.B8, V0.B8, V1.B8, V2.B8] // 1f03df0c
VLD4.P (R13)(R9), [V14.S2, V15.S2, V16.S2, V17.S2] // VLD4.P (R13)(R9*1), [V14.S2,V15.S2,V16.S2,V17.S2] // ae09c90c
VST1.P [V24.S2], 8(R2) // 58789f0c VST1.P [V24.S2], 8(R2) // 58789f0c
VST1 [V29.S2, V30.S2], (R29) // bdab000c VST1 [V29.S2, V30.S2], (R29) // bdab000c
VST1 [V14.H4, V15.H4, V16.H4], (R27) // 6e67000c VST1 [V14.H4, V15.H4, V16.H4], (R27) // 6e67000c
...@@ -352,6 +361,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 ...@@ -352,6 +361,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
VST1.P V4.D[1], 8(R0) // 04849f4d VST1.P V4.D[1], 8(R0) // 04849f4d
VST1.P V4.D[1], (R0)(R1) // VST1.P V4.D[1], (R0)(R1*1) // 0484814d VST1.P V4.D[1], (R0)(R1) // VST1.P V4.D[1], (R0)(R1*1) // 0484814d
VST1 V4.D[1], (R0) // 0484004d VST1 V4.D[1], (R0) // 0484004d
VST2 [V22.H8, V23.H8], (R23) // f686004c
VST2.P [V14.H4, V15.H4], 16(R17) // 2e869f0c
VST2.P [V14.H4, V15.H4], (R3)(R17) // VST2.P [V14.H4,V15.H4], (R3)(R17*1) // 6e84910c
VST3 [V1.D2, V2.D2, V3.D2], (R11) // 614d004c
VST3.P [V18.S4, V19.S4, V20.S4], 48(R25) // 324b9f4c
VST3.P [V19.B8, V20.B8, V21.B8], (R3)(R7) // VST3.P [V19.B8, V20.B8, V21.B8], (R3)(R7*1) // 7340870c
VST4 [V22.D2, V23.D2, V24.D2, V25.D2], (R3) // 760c004c
VST4.P [V14.D2, V15.D2, V16.D2, V17.D2], 64(R15) // ee0d9f4c
VST4.P [V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23) // VST4.P [V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23*1) // 7800970c
FMOVS F20, (R0) // 140000bd FMOVS F20, (R0) // 140000bd
FMOVS.P F20, 4(R0) // 144400bc FMOVS.P F20, 4(R0) // 144400bc
FMOVS.W F20, 4(R0) // 144c00bc FMOVS.W F20, 4(R0) // 144c00bc
......
...@@ -953,10 +953,16 @@ const ( ...@@ -953,10 +953,16 @@ const (
AVEOR AVEOR
AVMOV AVMOV
AVLD1 AVLD1
AVLD2
AVLD3
AVLD4
AVORR AVORR
AVREV32 AVREV32
AVREV64 AVREV64
AVST1 AVST1
AVST2
AVST3
AVST4
AVDUP AVDUP
AVADDV AVADDV
AVMOVI AVMOVI
......
...@@ -460,10 +460,16 @@ var Anames = []string{ ...@@ -460,10 +460,16 @@ var Anames = []string{
"VEOR", "VEOR",
"VMOV", "VMOV",
"VLD1", "VLD1",
"VLD2",
"VLD3",
"VLD4",
"VORR", "VORR",
"VREV32", "VREV32",
"VREV64", "VREV64",
"VST1", "VST1",
"VST2",
"VST3",
"VST4",
"VDUP", "VDUP",
"VADDV", "VADDV",
"VMOVI", "VMOVI",
......
...@@ -780,16 +780,34 @@ var optab = []Optab{ ...@@ -780,16 +780,34 @@ var optab = []Optab{
{ASTLXR, C_REG, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0}, // RegTo2=C_REG {ASTLXR, C_REG, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0}, // RegTo2=C_REG
{ASTXP, C_PAIR, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0}, {ASTXP, C_PAIR, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0},
/* VLD1/VST1 */ /* VLD[1-4]/VST[1-4] */
{AVLD1, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0}, {AVLD1, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST}, {AVLD1, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD1, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST}, {AVLD1, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD2, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD2, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD2, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD3, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD3, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD3, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD4, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD4, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD4, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST}, {AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
{AVLD1, C_ROFF, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST}, {AVLD1, C_ROFF, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, 0}, {AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, 0},
{AVST1, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0}, {AVST1, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST1, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST}, {AVST1, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST1, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST}, {AVST1, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST2, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST2, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST2, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST3, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST3, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST3, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST4, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST4, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST4, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, C_XPOST}, {AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_ROFF, 96, 4, 0, 0, C_XPOST}, {AVST1, C_ELEM, C_NONE, C_NONE, C_ROFF, 96, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, 0}, {AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, 0},
...@@ -2695,7 +2713,13 @@ func buildop(ctxt *obj.Link) { ...@@ -2695,7 +2713,13 @@ func buildop(ctxt *obj.Link) {
AVCNT, AVCNT,
AVMOV, AVMOV,
AVLD1, AVLD1,
AVLD2,
AVLD3,
AVLD4,
AVST1, AVST1,
AVST2,
AVST3,
AVST4,
AVTBL, AVTBL,
AVDUP, AVDUP,
AVMOVI, AVMOVI,
...@@ -2775,14 +2799,14 @@ func (c *ctxt7) checkindex(p *obj.Prog, index, maxindex int) { ...@@ -2775,14 +2799,14 @@ func (c *ctxt7) checkindex(p *obj.Prog, index, maxindex int) {
} }
} }
/* checkoffset checks whether the immediate offset is valid for VLD1.P and VST1.P */ /* checkoffset checks whether the immediate offset is valid for VLD[1-4].P and VST[1-4].P */
func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) { func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
var offset, list, n int64 var offset, list, n, expect int64
switch as { switch as {
case AVLD1: case AVLD1, AVLD2, AVLD3, AVLD4:
offset = p.From.Offset offset = p.From.Offset
list = p.To.Offset list = p.To.Offset
case AVST1: case AVST1, AVST2, AVST3, AVST4:
offset = p.To.Offset offset = p.To.Offset
list = p.From.Offset list = p.From.Offset
default: default:
...@@ -2808,6 +2832,21 @@ func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) { ...@@ -2808,6 +2832,21 @@ func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
if !(q == 0 && offset == n*8) && !(q == 1 && offset == n*16) { if !(q == 0 && offset == n*8) && !(q == 1 && offset == n*16) {
c.ctxt.Diag("invalid post-increment offset: %v", p) c.ctxt.Diag("invalid post-increment offset: %v", p)
} }
switch as {
case AVLD1, AVST1:
return
case AVLD2, AVST2:
expect = 2
case AVLD3, AVST3:
expect = 3
case AVLD4, AVST4:
expect = 4
}
if expect != n {
c.ctxt.Diag("expected %d registers, got %d: %v.", expect, n, p)
}
} }
/* checkShiftAmount checks whether the index shift amount is valid */ /* checkShiftAmount checks whether the index shift amount is valid */
...@@ -4305,14 +4344,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { ...@@ -4305,14 +4344,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
} }
o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31) o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31)
case 81: /* vld1 (Rn), [Vt1.<T>, Vt2.<T>, ...] */ case 81: /* vld[1-4] (Rn), [Vt1.<T>, Vt2.<T>, ...] */
c.checkoffset(p, p.As)
r := int(p.From.Reg) r := int(p.From.Reg)
o1 = 3<<26 | 1<<22 o1 = 3<<26 | 1<<22
if o.scond == C_XPOST { if o.scond == C_XPOST {
o1 |= 1 << 23 o1 |= 1 << 23
if p.From.Index == 0 { if p.From.Index == 0 {
// immediate offset variant // immediate offset variant
c.checkoffset(p, p.As)
o1 |= 0x1f << 16 o1 |= 0x1f << 16
} else { } else {
// register offset variant // register offset variant
...@@ -4323,6 +4362,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { ...@@ -4323,6 +4362,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
} }
} }
o1 |= uint32(p.To.Offset) o1 |= uint32(p.To.Offset)
// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
// add opcode(bit 12-15) for vld1, mask it off if it's not vld1
o1 = c.maskOpvldvst(p, o1)
o1 |= uint32(r&31) << 5 o1 |= uint32(r&31) << 5
case 82: /* vmov Rn, Vd.<T> */ case 82: /* vmov Rn, Vd.<T> */
...@@ -4410,14 +4452,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { ...@@ -4410,14 +4452,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= (Q&1)<<30 | (size&3)<<22 | uint32(rf&31)<<5 | uint32(rt&31) o1 |= (Q&1)<<30 | (size&3)<<22 | uint32(rf&31)<<5 | uint32(rt&31)
case 84: /* vst1 [Vt1.<T>, Vt2.<T>, ...], (Rn) */ case 84: /* vst[1-4] [Vt1.<T>, Vt2.<T>, ...], (Rn) */
c.checkoffset(p, p.As)
r := int(p.To.Reg) r := int(p.To.Reg)
o1 = 3 << 26 o1 = 3 << 26
if o.scond == C_XPOST { if o.scond == C_XPOST {
o1 |= 1 << 23 o1 |= 1 << 23
if p.To.Index == 0 { if p.To.Index == 0 {
// immediate offset variant // immediate offset variant
c.checkoffset(p, p.As)
o1 |= 0x1f << 16 o1 |= 0x1f << 16
} else { } else {
// register offset variant // register offset variant
...@@ -4428,6 +4470,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { ...@@ -4428,6 +4470,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
} }
} }
o1 |= uint32(p.From.Offset) o1 |= uint32(p.From.Offset)
// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
// add opcode(bit 12-15) for vst1, mask it off if it's not vst1
o1 = c.maskOpvldvst(p, o1)
o1 |= uint32(r&31) << 5 o1 |= uint32(r&31) << 5
case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/ case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/
...@@ -6727,6 +6772,24 @@ func (c *ctxt7) opldpstp(p *obj.Prog, o *Optab, vo int32, rbase, rl, rh, ldp uin ...@@ -6727,6 +6772,24 @@ func (c *ctxt7) opldpstp(p *obj.Prog, o *Optab, vo int32, rbase, rl, rh, ldp uin
return ret return ret
} }
// maskOpvldvst returns o1 with its "opcode" field (bits 12-15) adjusted for
// the multiple-structure load/store instruction held in p.
//
// For VLD1/VST1 the word is returned exactly as supplied. For every other
// instruction the field is cleared first and then set to 8 for VLD2/VST2,
// 4 for VLD3/VST3, and left as zero for VLD4/VST4. Any other instruction is
// reported through c.ctxt.Diag, and the word is returned with the field
// cleared.
func (c *ctxt7) maskOpvldvst(p *obj.Prog, o1 uint32) uint32 {
	// Bits 12-15 of the instruction word hold the "opcode" field.
	const opcodeMask = 0xf << 12

	var opcode uint32
	switch p.As {
	case AVLD1, AVST1:
		// The field already present in o1 is kept untouched.
		return o1
	case AVLD2, AVST2:
		opcode = 8
	case AVLD3, AVST3:
		opcode = 4
	case AVLD4, AVST4:
		// The four-register form uses an all-zero field.
		opcode = 0
	default:
		c.ctxt.Diag("unsupported instruction:%v\n", p.As)
	}

	return o1&^opcodeMask | opcode<<12
}
/* /*
* size in log2(bytes) * size in log2(bytes)
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment