Commit 644ddaa8 authored by fanzha02's avatar fanzha02 Committed by Cherry Zhang

cmd/internal/obj/arm64: encode large constants into MOVZ/MOVN and MOVK instructions

Current assembler gets large constants from constant pool, this CL
gets rid of the pool by using MOVZ/MOVN and MOVK to load large
constants.

This CL changes the assembler behavior as follows.

1. go assembly  1, MOVD $0x1111222233334444, R1
                2, MOVD $0x1111ffff1111ffff, R1
   previous version: MOVD 0x9a4, R1 (loads constant from pool).
   optimized version: 1, MOVD $0x4444, R1; MOVK $(0x3333<<16), R1; MOVK $(0x2222<<32), R1;
   MOVK $(0x1111<<48), R1. 2, MOVN $(0xeeee<<16), R1; MOVK $(0x1111<<48), R1.

Add test cases, and below are binary size comparison and bechmark results.

1. Binary size before/after
binary                 size change
pkg/linux_arm64        +25.4KB
pkg/tool/linux_arm64   -2.9KB
go                     -2KB
gofmt                  no change

2. compiler benchmark.
name       old time/op       new time/op       delta
Template         574ms ±21%        577ms ±14%     ~     (p=0.853 n=10+10)
Unicode          327ms ±29%        353ms ±23%     ~     (p=0.360 n=10+8)
GoTypes          1.97s ± 8%        2.04s ±11%     ~     (p=0.143 n=10+10)
Compiler         9.13s ± 9%        9.25s ± 8%     ~     (p=0.684 n=10+10)
SSA              29.2s ± 5%        27.0s ± 4%   -7.40%  (p=0.000 n=10+10)
Flate            402ms ±40%        308ms ± 6%  -23.29%  (p=0.004 n=10+10)
GoParser         470ms ±26%        382ms ±10%  -18.82%  (p=0.000 n=9+10)
Reflect          1.36s ±16%        1.17s ± 7%  -13.92%  (p=0.001 n=9+10)
Tar              561ms ±19%        466ms ±15%  -17.08%  (p=0.000 n=9+10)
XML              745ms ±20%        679ms ±20%     ~     (p=0.123 n=10+10)
StdCmd           35.5s ± 6%        37.2s ± 3%   +4.81%  (p=0.001 n=9+8)

name       old user-time/op  new user-time/op  delta
Template         625ms ±14%        660ms ±18%     ~     (p=0.343 n=10+10)
Unicode          355ms ±10%        373ms ±20%     ~     (p=0.346 n=9+10)
GoTypes          2.39s ± 8%        2.37s ± 5%     ~     (p=0.897 n=10+10)
Compiler         11.1s ± 4%        11.4s ± 2%   +2.63%  (p=0.010 n=10+9)
SSA              35.4s ± 3%        34.9s ± 2%     ~     (p=0.113 n=10+9)
Flate            402ms ±13%        371ms ±30%     ~     (p=0.089 n=10+9)
GoParser         513ms ± 8%        489ms ±24%   -4.76%  (p=0.039 n=9+9)
Reflect          1.52s ±12%        1.41s ± 5%   -7.32%  (p=0.001 n=9+10)
Tar              607ms ±10%        558ms ± 8%   -7.96%  (p=0.009 n=9+10)
XML              828ms ±10%        789ms ±12%     ~     (p=0.059 n=10+10)

name       old text-bytes    new text-bytes    delta
HelloSize        714kB ± 0%        712kB ± 0%   -0.23%  (p=0.000 n=10+10)
CmdGoSize       8.26MB ± 0%       8.25MB ± 0%   -0.14%  (p=0.000 n=10+10)

name       old data-bytes    new data-bytes    delta
HelloSize       10.5kB ± 0%       10.5kB ± 0%     ~     (all equal)
CmdGoSize        258kB ± 0%        258kB ± 0%     ~     (all equal)

name       old bss-bytes     new bss-bytes     delta
HelloSize        125kB ± 0%        125kB ± 0%     ~     (all equal)
CmdGoSize        146kB ± 0%        146kB ± 0%     ~     (all equal)

name       old exe-bytes     new exe-bytes     delta
HelloSize       1.18MB ± 0%       1.18MB ± 0%     ~     (all equal)
CmdGoSize       11.2MB ± 0%       11.2MB ± 0%   -0.13%  (p=0.000 n=10+10)

3. go1 benckmark.
name                   old time/op    new time/op    delta
BinaryTree17              6.60s ±18%     7.36s ±22%    ~     (p=0.222 n=5+5)
Fannkuch11                4.04s ± 0%     4.05s ± 0%    ~     (p=0.421 n=5+5)
FmtFprintfEmpty          91.8ns ±14%    91.2ns ± 9%    ~     (p=0.667 n=5+5)
FmtFprintfString          145ns ± 0%     151ns ± 6%    ~     (p=0.397 n=4+5)
FmtFprintfInt             169ns ± 0%     176ns ± 5%  +4.14%  (p=0.016 n=4+5)
FmtFprintfIntInt          229ns ± 2%     243ns ± 6%    ~     (p=0.143 n=5+5)
FmtFprintfPrefixedInt     343ns ± 0%     350ns ± 3%  +1.92%  (p=0.048 n=5+5)
FmtFprintfFloat           400ns ± 3%     394ns ± 3%    ~     (p=0.063 n=5+5)
FmtManyArgs              1.04µs ± 0%    1.05µs ± 0%  +1.62%  (p=0.029 n=4+4)
GobDecode                13.9ms ± 4%    13.9ms ± 5%    ~     (p=1.000 n=5+5)
GobEncode                10.6ms ± 4%    10.6ms ± 5%    ~     (p=0.421 n=5+5)
Gzip                      567ms ± 1%     563ms ± 4%    ~     (p=0.548 n=5+5)
Gunzip                   60.2ms ± 1%    60.4ms ± 0%    ~     (p=0.056 n=5+5)
HTTPClientServer          114µs ± 4%     108µs ± 7%    ~     (p=0.095 n=5+5)
JSONEncode               18.4ms ± 2%    17.8ms ± 2%  -3.06%  (p=0.016 n=5+5)
JSONDecode                105ms ± 1%     103ms ± 2%    ~     (p=0.056 n=5+5)
Mandelbrot200            5.48ms ± 0%    5.49ms ± 0%    ~     (p=0.841 n=5+5)
GoParse                  6.05ms ± 1%    6.05ms ± 2%    ~     (p=1.000 n=5+5)
RegexpMatchEasy0_32       143ns ± 1%     146ns ± 4%  +2.10%  (p=0.048 n=4+5)
RegexpMatchEasy0_1K       499ns ± 1%     492ns ± 2%    ~     (p=0.079 n=5+5)
RegexpMatchEasy1_32       137ns ± 0%     136ns ± 1%  -0.73%  (p=0.016 n=4+5)
RegexpMatchEasy1_1K       826ns ± 4%     823ns ± 2%    ~     (p=0.841 n=5+5)
RegexpMatchMedium_32      224ns ± 5%     233ns ± 8%    ~     (p=0.119 n=5+5)
RegexpMatchMedium_1K     59.6µs ± 0%    59.3µs ± 1%  -0.66%  (p=0.016 n=4+5)
RegexpMatchHard_32       3.29µs ± 3%    3.26µs ± 1%    ~     (p=0.889 n=5+5)
RegexpMatchHard_1K       98.8µs ± 2%    99.0µs ± 0%    ~     (p=0.690 n=5+5)
Revcomp                   1.02s ± 1%     1.01s ± 1%    ~     (p=0.095 n=5+5)
Template                  135ms ± 5%     131ms ± 1%    ~     (p=0.151 n=5+5)
TimeParse                 591ns ± 0%     593ns ± 0%  +0.20%  (p=0.048 n=5+5)
TimeFormat                655ns ± 2%     607ns ± 0%  -7.42%  (p=0.016 n=5+4)
[Geo mean]               93.5µs         93.8µs       +0.23%

name                   old speed      new speed      delta
GobDecode              55.1MB/s ± 4%  55.1MB/s ± 4%    ~     (p=1.000 n=5+5)
GobEncode              72.4MB/s ± 4%  72.3MB/s ± 5%    ~     (p=0.421 n=5+5)
Gzip                   34.2MB/s ± 1%  34.5MB/s ± 4%    ~     (p=0.548 n=5+5)
Gunzip                  322MB/s ± 1%   321MB/s ± 0%    ~     (p=0.056 n=5+5)
JSONEncode              106MB/s ± 2%   109MB/s ± 2%  +3.16%  (p=0.016 n=5+5)
JSONDecode             18.5MB/s ± 1%  18.8MB/s ± 2%    ~     (p=0.056 n=5+5)
GoParse                9.57MB/s ± 1%  9.57MB/s ± 2%    ~     (p=0.952 n=5+5)
RegexpMatchEasy0_32     223MB/s ± 1%   221MB/s ± 0%  -1.10%  (p=0.029 n=4+4)
RegexpMatchEasy0_1K    2.05GB/s ± 1%  2.08GB/s ± 2%    ~     (p=0.095 n=5+5)
RegexpMatchEasy1_32     232MB/s ± 0%   234MB/s ± 1%  +0.76%  (p=0.016 n=4+5)
RegexpMatchEasy1_1K    1.24GB/s ± 4%  1.24GB/s ± 2%    ~     (p=0.841 n=5+5)
RegexpMatchMedium_32   4.45MB/s ± 5%  4.20MB/s ± 1%  -5.63%  (p=0.000 n=5+4)
RegexpMatchMedium_1K   17.2MB/s ± 0%  17.3MB/s ± 1%  +0.66%  (p=0.016 n=4+5)
RegexpMatchHard_32     9.73MB/s ± 3%  9.83MB/s ± 1%    ~     (p=0.889 n=5+5)
RegexpMatchHard_1K     10.4MB/s ± 2%  10.3MB/s ± 0%    ~     (p=0.635 n=5+5)
Revcomp                 249MB/s ± 1%   252MB/s ± 1%    ~     (p=0.095 n=5+5)
Template               14.4MB/s ± 4%  14.8MB/s ± 1%    ~     (p=0.151 n=5+5)
[Geo mean]             62.1MB/s       62.3MB/s       +0.34%

Fixes #10108

Change-Id: I79038f3c4c2ff874c136053d1a2b1c8a5a9cfac5
Reviewed-on: https://go-review.googlesource.com/c/118796Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent ac277d92
......@@ -195,6 +195,11 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
CMPW $27745, R2 // 3b8c8d525f001b6b
CMNW $0x3fffffc0, R2 // CMNW $1073741760, R2 // fb5f1a325f001b2b
CMPW $0xffff0, R1 // CMPW $1048560, R1 // fb3f1c323f001b6b
CMP $0xffffffffffa0, R3 // CMP $281474976710560, R3 // fb0b80921b00e0f27f001beb
CMP $0xf4240, R1 // CMP $1000000, R1 // 1b4888d2fb01a0f23f001beb
ADD $0x186a0, R2, R5 // ADD $100000, R2, R5 // 45801a91a5604091
SUB $0xe7791f700, R3, R1 // SUB $62135596800, R3, R1 // 1be09ed23bf2aef2db01c0f261001bcb
CMP $3343198598084851058, R3 // 5bae8ed2db8daef23badcdf2bbcce5f27f001beb
ADD $0x3fffffffc000, R5 // ADD $70368744161280, R5 // fb7f72b2a5001b8b
// LTYPE1 imsr ',' spreg ','
// {
......@@ -240,12 +245,21 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
EOR $0xe03fffffffffffff, R20, R22 // EOR $-2287828610704211969, R20, R22 // 96e243d2
TSTW $0x600000006, R1 // TSTW $25769803782, R1 // 3f041f72
TST $0x4900000049, R0 // TST $313532612681, R0 // 3b0980d23b09c0f21f001bea
ORR $0x170000, R2, R1 // ORR $1507328, R2, R1 // fb02a0d241001baa
AND $0xff00ff, R2 // AND $16711935, R2 // fb1f80d2fb1fa0f242001b8a
AND $0xff00ffff, R1 // AND $4278255615, R1 // fbff9fd21be0bff221001b8a
ANDS $0xffff, R2 // ANDS $65535, R2 // 423c40f2
AND $0x7fffffff, R3 // AND $2147483647, R3 // 63784092
ANDS $0x0ffffffff80000000, R2 // ANDS $-2147483648, R2 // 428061f2
AND $0xfffff, R2 // AND $1048575, R2 // 424c4092
ANDW $0xf00fffff, R1 // ANDW $4027580415, R1 // 215c0412
ANDSW $0xff00ffff, R1 // ANDSW $4278255615, R1 // 215c0872
TST $0x11223344, R2 // TST $287454020, R2 // 9b6886d25b24a2f25f001bea
TSTW $0xa000, R3 // TSTW $40960, R3 // 1b0094527f001b6a
BICW $0xa000, R3 // BICW $40960, R3 // 1b00945263003b0a
ORRW $0x1b000, R2, R3 // ORRW $110592, R2, R3 // 1b0096523b00a07243001b2a
TSTW $0x500000, R1 // TSTW $5242880, R1 // 1b0aa0523f001b6a
TSTW $0xff00ff, R1 // TSTW $16711935, R1 // 3f9c0072
AND $8, R0, RSP // 1f007d92
......@@ -256,13 +270,20 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
EON $8, R0, RSP // 1ff87cd2
MOVD $0x3fffffffc000, R0 // MOVD $70368744161280, R0 // e07f72b2
MOVW $1000000, R4 // 04488852e401a072
MOVW $0xaaaa0000, R1 // MOVW $2863267840, R1 // 4155b552
MOVW $0xaaaaffff, R1 // MOVW $2863333375, R1 // a1aaaa12
MOVW $0xaaaa, R1 // MOVW $43690, R1 // 41559552
MOVW $0xffffaaaa, R1 // MOVW $4294945450, R1 // a1aa8a12
MOVW $0xffff0000, R1 // MOVW $4294901760, R1 // e1ffbf52
MOVD $0xffff00000000000, R1 // MOVD $1152903912420802560, R1 // e13f54b2
MOVD $0x1111000000001111, R1 // MOVD $1229764173248860433, R1 // 212282d22122e2f2
MOVD $0x1111ffff1111ffff, R1 // MOVD $1230045644216991743, R1 // c1ddbd922122e2f2
MOVD $0x1111222233334444, R1 // MOVD $1229801703532086340, R1 // 818888d26166a6f24144c4f22122e2f2
MOVD $0xaaaaffff, R1 // MOVD $2863333375, R1 // e1ff9fd24155b5f2
MOVD $0x11110000, R1 // MOVD $286326784, R1 // 2122a2d2
MOVD $0xaaaa0000aaaa1111, R1 // MOVD $-6149102338357718767, R1 // 212282d24155b5f24155f5f2
MOVD $0x1111ffff1111aaaa, R1 // MOVD $1230045644216969898, R1 // a1aa8a922122a2f22122e2f2
MOVD $0, R1 // 010080d2
MOVD $-1, R1 // 01008092
MOVD $0x210000, R0 // MOVD $2162688, R0 // 2004a0d2
......
......@@ -414,6 +414,8 @@ const (
C_BITCON // bitfield and logical immediate masks
C_ADDCON2 // 24-bit constant
C_LCON // 32-bit constant
C_MOVCON2 // a constant that can be loaded with one MOVZ/MOVN and one MOVK
C_MOVCON3 // a constant that can be loaded with one MOVZ/MOVN and two MOVKs
C_VCON // 64-bit constant
C_FCON // floating-point constant
C_VCONADDR // 64-bit memory address
......
......@@ -30,6 +30,8 @@ var cnames7 = []string{
"BITCON",
"ADDCON2",
"LCON",
"MOVCON2",
"MOVCON3",
"VCON",
"FCON",
"VCONADDR",
......
......@@ -198,9 +198,15 @@ var optab = []Optab{
{ACMP, C_BITCON, C_RSP, C_NONE, C_NONE, 62, 8, 0, 0, 0},
{AADD, C_ADDCON2, C_RSP, C_NONE, C_RSP, 48, 8, 0, 0, 0},
{AADD, C_ADDCON2, C_NONE, C_NONE, C_RSP, 48, 8, 0, 0, 0},
{AADD, C_VCON, C_RSP, C_NONE, C_RSP, 13, 8, 0, LFROM, 0},
{AADD, C_VCON, C_NONE, C_NONE, C_RSP, 13, 8, 0, LFROM, 0},
{ACMP, C_VCON, C_REG, C_NONE, C_NONE, 13, 8, 0, LFROM, 0},
{AADD, C_MOVCON2, C_RSP, C_NONE, C_RSP, 13, 12, 0, 0, 0},
{AADD, C_MOVCON2, C_NONE, C_NONE, C_RSP, 13, 12, 0, 0, 0},
{AADD, C_MOVCON3, C_RSP, C_NONE, C_RSP, 13, 16, 0, 0, 0},
{AADD, C_MOVCON3, C_NONE, C_NONE, C_RSP, 13, 16, 0, 0, 0},
{AADD, C_VCON, C_RSP, C_NONE, C_RSP, 13, 20, 0, 0, 0},
{AADD, C_VCON, C_NONE, C_NONE, C_RSP, 13, 20, 0, 0, 0},
{ACMP, C_MOVCON2, C_REG, C_NONE, C_NONE, 13, 12, 0, 0, 0},
{ACMP, C_MOVCON3, C_REG, C_NONE, C_NONE, 13, 16, 0, 0, 0},
{ACMP, C_VCON, C_REG, C_NONE, C_NONE, 13, 20, 0, 0, 0},
{AADD, C_SHIFT, C_REG, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AADD, C_SHIFT, C_NONE, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AMVN, C_SHIFT, C_NONE, C_NONE, C_REG, 3, 4, 0, 0, 0},
......@@ -255,11 +261,21 @@ var optab = []Optab{
{AANDS, C_MOVCON, C_REG, C_NONE, C_REG, 62, 8, 0, 0, 0},
{AANDS, C_MOVCON, C_NONE, C_NONE, C_REG, 62, 8, 0, 0, 0},
{ATST, C_MOVCON, C_REG, C_NONE, C_NONE, 62, 8, 0, 0, 0},
{AAND, C_VCON, C_REG, C_NONE, C_REG, 28, 8, 0, LFROM, 0},
{AAND, C_VCON, C_NONE, C_NONE, C_REG, 28, 8, 0, LFROM, 0},
{AANDS, C_VCON, C_REG, C_NONE, C_REG, 28, 8, 0, LFROM, 0},
{AANDS, C_VCON, C_NONE, C_NONE, C_REG, 28, 8, 0, LFROM, 0},
{ATST, C_VCON, C_REG, C_NONE, C_NONE, 28, 8, 0, LFROM, 0},
{AAND, C_MOVCON2, C_REG, C_NONE, C_REG, 28, 12, 0, 0, 0},
{AAND, C_MOVCON2, C_NONE, C_NONE, C_REG, 28, 12, 0, 0, 0},
{AAND, C_MOVCON3, C_REG, C_NONE, C_REG, 28, 16, 0, 0, 0},
{AAND, C_MOVCON3, C_NONE, C_NONE, C_REG, 28, 16, 0, 0, 0},
{AAND, C_VCON, C_REG, C_NONE, C_REG, 28, 20, 0, 0, 0},
{AAND, C_VCON, C_NONE, C_NONE, C_REG, 28, 20, 0, 0, 0},
{AANDS, C_MOVCON2, C_REG, C_NONE, C_REG, 28, 12, 0, 0, 0},
{AANDS, C_MOVCON2, C_NONE, C_NONE, C_REG, 28, 12, 0, 0, 0},
{AANDS, C_MOVCON3, C_REG, C_NONE, C_REG, 28, 16, 0, 0, 0},
{AANDS, C_MOVCON3, C_NONE, C_NONE, C_REG, 28, 16, 0, 0, 0},
{AANDS, C_VCON, C_REG, C_NONE, C_REG, 28, 20, 0, 0, 0},
{AANDS, C_VCON, C_NONE, C_NONE, C_REG, 28, 20, 0, 0, 0},
{ATST, C_MOVCON2, C_REG, C_NONE, C_NONE, 28, 12, 0, 0, 0},
{ATST, C_MOVCON3, C_REG, C_NONE, C_NONE, 28, 16, 0, 0, 0},
{ATST, C_VCON, C_REG, C_NONE, C_NONE, 28, 20, 0, 0, 0},
{AAND, C_SHIFT, C_REG, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AAND, C_SHIFT, C_NONE, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AANDS, C_SHIFT, C_REG, C_NONE, C_REG, 3, 4, 0, 0, 0},
......@@ -278,8 +294,10 @@ var optab = []Optab{
{AMOVD, C_MOVCON, C_NONE, C_NONE, C_REG, 32, 4, 0, 0, 0},
{AMOVW, C_BITCON, C_NONE, C_NONE, C_REG, 32, 4, 0, 0, 0},
{AMOVD, C_BITCON, C_NONE, C_NONE, C_REG, 32, 4, 0, 0, 0},
{AMOVW, C_LCON, C_NONE, C_NONE, C_REG, 12, 4, 0, LFROM, 0},
{AMOVD, C_VCON, C_NONE, C_NONE, C_REG, 12, 4, 0, LFROM, 0},
{AMOVW, C_MOVCON2, C_NONE, C_NONE, C_REG, 12, 8, 0, 0, 0},
{AMOVD, C_MOVCON2, C_NONE, C_NONE, C_REG, 12, 8, 0, 0, 0},
{AMOVD, C_MOVCON3, C_NONE, C_NONE, C_REG, 12, 12, 0, 0, 0},
{AMOVD, C_VCON, C_NONE, C_NONE, C_REG, 12, 16, 0, 0, 0},
{AMOVK, C_VCON, C_NONE, C_NONE, C_REG, 33, 4, 0, 0, 0},
{AMOVD, C_AACON, C_NONE, C_NONE, C_REG, 4, 4, REGFROM, 0, 0},
......@@ -401,8 +419,8 @@ var optab = []Optab{
{AMOVH, C_REG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
{AMOVW, C_REG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
{AMOVW, C_REG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
{AMOVD, C_REG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
{AMOVD, C_REG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
{AMOVD, C_REG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
{AFMOVS, C_FREG, C_NONE, C_NONE, C_NSAUTO, 20, 4, REGSP, 0, 0},
{AFMOVS, C_FREG, C_NONE, C_NONE, C_NSOREG, 20, 4, 0, 0, 0},
......@@ -411,15 +429,15 @@ var optab = []Optab{
/* scaled 12-bit unsigned displacement load */
{AMOVB, C_UAUTO4K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVB, C_UOREG4K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVB, C_UOREG4K, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVBU, C_UAUTO4K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVBU, C_UOREG4K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVBU, C_UOREG4K, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVH, C_UAUTO8K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVH, C_UOREG8K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVH, C_UOREG8K, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVW, C_UAUTO16K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVW, C_UOREG16K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVW, C_UOREG16K, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVD, C_UAUTO32K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVD, C_UOREG32K, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVD, C_UOREG32K, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AFMOVS, C_UAUTO16K, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
{AFMOVS, C_UOREG16K, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
......@@ -428,15 +446,15 @@ var optab = []Optab{
/* unscaled 9-bit signed displacement load */
{AMOVB, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVB, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVB, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVBU, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVBU, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVBU, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVH, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVH, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVH, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVW, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVW, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVW, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AMOVD, C_NSAUTO, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVD, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, REGSP, 0, 0},
{AMOVD, C_NSOREG, C_NONE, C_NONE, C_REG, 21, 4, 0, 0, 0},
{AFMOVS, C_NSAUTO, C_NONE, C_NONE, C_FREG, 21, 4, REGSP, 0, 0},
{AFMOVS, C_NSOREG, C_NONE, C_NONE, C_FREG, 21, 4, 0, 0, 0},
......@@ -1105,6 +1123,15 @@ func isSTXPop(op obj.As) bool {
return false
}
func isANDop(op obj.As) bool {
switch op {
case AAND, AORR, AEOR, AANDS, ATST,
ABIC, AEON, AORN, ABICS:
return true
}
return false
}
func isANDWop(op obj.As) bool {
switch op {
case AANDW, AORRW, AEORW, AANDSW, ATSTW,
......@@ -1114,6 +1141,14 @@ func isANDWop(op obj.As) bool {
return false
}
func isADDop(op obj.As) bool {
switch op {
case AADD, AADDS, ASUB, ASUBS, ACMN, ACMP:
return true
}
return false
}
func isADDWop(op obj.As) bool {
switch op {
case AADDW, AADDSW, ASUBW, ASUBSW, ACMNW, ACMPW:
......@@ -1445,6 +1480,12 @@ func (c *ctxt7) con32class(a *obj.Addr) int {
if isbitcon(uint64(v)) {
return C_ABCON
}
if movcon(int64(v)) >= 0 {
return C_AMCON
}
if movcon(int64(^v)) >= 0 {
return C_AMCON
}
return C_ADDCON
}
......@@ -1474,6 +1515,29 @@ func (c *ctxt7) con32class(a *obj.Addr) int {
return C_LCON
}
// con64class reclassifies the constant of C_VCON and C_LCON class.
func (c *ctxt7) con64class(a *obj.Addr) int {
zeroCount := 0
negCount := 0
for i := uint(0); i < 4; i++ {
immh := uint32(a.Offset >> (i * 16) & 0xffff)
if immh == 0 {
zeroCount++
} else if immh == 0xffff {
negCount++
}
}
if zeroCount >= 3 || negCount >= 3 {
return C_MOVCON
} else if zeroCount == 2 || negCount == 2 {
return C_MOVCON2
} else if zeroCount == 1 || negCount == 1 {
return C_MOVCON3
} else {
return C_VCON
}
}
func (c *ctxt7) aclass(a *obj.Addr) int {
switch a.Type {
case obj.TYPE_NONE:
......@@ -1698,6 +1762,10 @@ func (c *ctxt7) oplook(p *obj.Prog) *Optab {
a1 = c.con32class(&p.From) + 1
p.From.Class = int8(a1)
}
if ((p.As == AMOVD) || isANDop(p.As) || isADDop(p.As)) && (a0 == C_LCON || a0 == C_VCON) {
a1 = c.con64class(&p.From) + 1
p.From.Class = int8(a1)
}
}
}
......@@ -1793,6 +1861,9 @@ func cmp(a int, b int) bool {
return true
}
case C_MOVCON2:
return cmp(C_LCON, b)
case C_VCON:
return cmp(C_LCON, b)
......@@ -2711,6 +2782,7 @@ func (c *ctxt7) checkShiftAmount(p *obj.Prog, a *obj.Addr) {
}
func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
var os [5]uint32
o1 := uint32(0)
o2 := uint32(0)
o3 := uint32(0)
......@@ -2900,13 +2972,29 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
case 12: /* movT $vcon, reg */
o1 = c.omovlit(p.As, p, &p.From, int(p.To.Reg))
num := c.omovlconst(p.As, p, &p.From, int(p.To.Reg), os[:])
if num == 0 {
c.ctxt.Diag("invalid constant: %v", p)
}
o1 = os[0]
o2 = os[1]
o3 = os[2]
o4 = os[3]
case 13: /* addop $vcon, [R], R (64 bit literal); cmp $lcon,R -> addop $lcon,R, ZR */
o1 = c.omovlit(AMOVD, p, &p.From, REGTMP)
if o1 == 0 {
break
o := uint32(0)
num := uint8(0)
cls := oclass(&p.From)
if isADDWop(p.As) {
if (cls != C_LCON) && (cls != C_ADDCON2) {
c.ctxt.Diag("illegal combination: %v", p)
}
num = c.omovlconst(AMOVW, p, &p.From, REGTMP, os[:])
} else {
num = c.omovlconst(AMOVD, p, &p.From, REGTMP, os[:])
}
if num == 0 {
c.ctxt.Diag("invalid constant: %v", p)
}
rt := int(p.To.Reg)
if p.To.Type == obj.TYPE_NONE {
......@@ -2917,16 +3005,23 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
r = rt
}
if p.To.Type != obj.TYPE_NONE && (p.To.Reg == REGSP || r == REGSP) {
o2 = c.opxrrr(p, p.As, false)
o2 |= REGTMP & 31 << 16
o2 |= LSL0_64
o = c.opxrrr(p, p.As, false)
o |= REGTMP & 31 << 16
o |= LSL0_64
} else {
o2 = c.oprrr(p, p.As)
o2 |= REGTMP & 31 << 16 /* shift is 0 */
o = c.oprrr(p, p.As)
o |= REGTMP & 31 << 16 /* shift is 0 */
}
o2 |= uint32(r&31) << 5
o2 |= uint32(rt & 31)
o |= uint32(r&31) << 5
o |= uint32(rt & 31)
os[num] = o
o1 = os[0]
o2 = os[1]
o3 = os[2]
o4 = os[3]
o5 = os[4]
case 14: /* word */
if c.aclass(&p.To) == C_ADDR {
......@@ -3172,10 +3267,20 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= (uint32(r&31) << 5) | uint32(rt&31)
case 28: /* logop $vcon, [R], R (64 bit literal) */
o1 = c.omovlit(AMOVD, p, &p.From, REGTMP)
o := uint32(0)
num := uint8(0)
cls := oclass(&p.From)
if isANDWop(p.As) {
if (cls != C_LCON) && (cls != C_ADDCON) {
c.ctxt.Diag("illegal combination: %v", p)
}
num = c.omovlconst(AMOVW, p, &p.From, REGTMP, os[:])
} else {
num = c.omovlconst(AMOVD, p, &p.From, REGTMP, os[:])
}
if o1 == 0 {
break
if num == 0 {
c.ctxt.Diag("invalid constant: %v", p)
}
rt := int(p.To.Reg)
if p.To.Type == obj.TYPE_NONE {
......@@ -3185,10 +3290,17 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
if r == 0 {
r = rt
}
o2 = c.oprrr(p, p.As)
o2 |= REGTMP & 31 << 16 /* shift is 0 */
o2 |= uint32(r&31) << 5
o2 |= uint32(rt & 31)
o = c.oprrr(p, p.As)
o |= REGTMP & 31 << 16 /* shift is 0 */
o |= uint32(r&31) << 5
o |= uint32(rt & 31)
os[num] = o
o1 = os[0]
o2 = os[1]
o3 = os[2]
o4 = os[3]
o5 = os[4]
case 29: /* op Rn, Rd */
fc := c.aclass(&p.From)
......@@ -6319,10 +6431,155 @@ func (c *ctxt7) omovconst(as obj.As, p *obj.Prog, a *obj.Addr, rt int) (o1 uint3
}
o1 |= MOVCONST(d, s, rt)
}
return o1
}
// load a 32-bit/64-bit large constant (LCON or VCON) in a.Offset into rt
// put the instruction sequence in os and return the number of instructions.
func (c *ctxt7) omovlconst(as obj.As, p *obj.Prog, a *obj.Addr, rt int, os []uint32) (num uint8) {
switch as {
case AMOVW:
d := uint32(a.Offset)
// use MOVZW and MOVKW to load a constant to rt
os[0] = c.opirr(p, AMOVZW)
os[0] |= MOVCONST(int64(d), 0, rt)
os[1] = c.opirr(p, AMOVKW)
os[1] |= MOVCONST(int64(d), 1, rt)
return 2
case AMOVD:
d := a.Offset
dn := ^d
var immh [4]uint64
var i int
zeroCount := int(0)
negCount := int(0)
for i = 0; i < 4; i++ {
immh[i] = uint64((d >> uint(i*16)) & 0xffff)
if immh[i] == 0 {
zeroCount++
} else if immh[i] == 0xffff {
negCount++
}
}
if zeroCount == 4 || negCount == 4 {
c.ctxt.Diag("the immediate should be MOVCON: %v", p)
}
switch {
case zeroCount == 3:
// one MOVZ
for i = 0; i < 4; i++ {
if immh[i] != 0 {
os[0] = c.opirr(p, AMOVZ)
os[0] |= MOVCONST(d, i, rt)
break
}
}
return 1
case negCount == 3:
// one MOVN
for i = 0; i < 4; i++ {
if immh[i] != 0xffff {
os[0] = c.opirr(p, AMOVN)
os[0] |= MOVCONST(dn, i, rt)
break
}
}
return 1
case zeroCount == 2:
// one MOVZ and one MOVK
for i = 0; i < 4; i++ {
if immh[i] != 0 {
os[0] = c.opirr(p, AMOVZ)
os[0] |= MOVCONST(d, i, rt)
i++
break
}
}
for ; i < 4; i++ {
if immh[i] != 0 {
os[1] = c.opirr(p, AMOVK)
os[1] |= MOVCONST(d, i, rt)
}
}
return 2
case negCount == 2:
// one MOVN and one MOVK
for i = 0; i < 4; i++ {
if immh[i] != 0xffff {
os[0] = c.opirr(p, AMOVN)
os[0] |= MOVCONST(dn, i, rt)
i++
break
}
}
for ; i < 4; i++ {
if immh[i] != 0xffff {
os[1] = c.opirr(p, AMOVK)
os[1] |= MOVCONST(d, i, rt)
}
}
return 2
case zeroCount == 1:
// one MOVZ and two MOVKs
for i = 0; i < 4; i++ {
if immh[i] != 0 {
os[0] = c.opirr(p, AMOVZ)
os[0] |= MOVCONST(d, i, rt)
i++
break
}
}
for j := 1; i < 4; i++ {
if immh[i] != 0 {
os[j] = c.opirr(p, AMOVK)
os[j] |= MOVCONST(d, i, rt)
j++
}
}
return 3
case negCount == 1:
// one MOVN and two MOVKs
for i = 0; i < 4; i++ {
if immh[i] != 0xffff {
os[0] = c.opirr(p, AMOVN)
os[0] |= MOVCONST(dn, i, rt)
i++
break
}
}
for j := 1; i < 4; i++ {
if immh[i] != 0xffff {
os[j] = c.opirr(p, AMOVK)
os[j] |= MOVCONST(d, i, rt)
j++
}
}
return 3
default:
// one MOVZ and 3 MOVKs
os[0] = c.opirr(p, AMOVZ)
os[0] |= MOVCONST(d, 0, rt)
for i = 1; i < 4; i++ {
os[i] = c.opirr(p, AMOVK)
os[i] |= MOVCONST(d, i, rt)
}
return 4
}
default:
return 0
}
}
func (c *ctxt7) opbfm(p *obj.Prog, a obj.As, r int, s int, rf int, rt int) uint32 {
var b uint32
o := c.opirr(p, a)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment