Commit fb0ccc5d authored by Cherry Zhang

cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64

The ARM64 assembler backend only accepts loads and stores with small
or aligned offsets. The compiler can therefore only fold small or
aligned offsets into loads and stores. For locals and args, the
offsets to SP are not known until very late, so the compiler
conservatively declines to fold some of them. However,
in most cases the offset is in fact small or aligned, and could
have been folded into the load or store (but currently is not).

This CL adds support for loads and stores with large or unaligned
offsets. When the offset doesn't fit into the instruction, the
assembler uses two instructions and (for very large offsets) the
constant pool. This way, the compiler doesn't need to be conservative,
and can simply fold the offset.

To make this work, the assembler's optab matching rules need to be
changed. Previously, MOVD accepted C_UAUTO32K, which matches multiples
of 8 between 0 and 32K, and also C_UAUTO16K, which may not be a
multiple of 8 and in that case does not fit into the MOVD instruction;
the assembler reported an error in the latter case. With this change,
MOVD only matches multiples of 8 (or offsets within ±256, which also
fit in the instruction), and uses the large-or-unaligned-offset rule
for offsets that do not fit (without error). The rules for moves of
other sizes are changed similarly.

The classes C_UAUTO64K and C_UOREG64K are removed, as they are never
used.

In shared library mode, a load/store of a global is rewritten to use
the GOT and the temp register, which conflicts with the use of the
temp register for assembling large offsets. So the folding is disabled
for globals in shared library mode.

Reduce cmd/go binary size by 2%.

name                     old time/op    new time/op    delta
BinaryTree17-8              8.67s ± 0%     8.61s ± 0%   -0.60%  (p=0.000 n=9+10)
Fannkuch11-8                6.24s ± 0%     6.19s ± 0%   -0.83%  (p=0.000 n=10+9)
FmtFprintfEmpty-8           116ns ± 0%     116ns ± 0%     ~     (all equal)
FmtFprintfString-8          196ns ± 0%     192ns ± 0%   -1.89%  (p=0.000 n=10+10)
FmtFprintfInt-8             199ns ± 0%     198ns ± 0%   -0.35%  (p=0.001 n=9+10)
FmtFprintfIntInt-8          294ns ± 0%     293ns ± 0%   -0.34%  (p=0.000 n=8+8)
FmtFprintfPrefixedInt-8     318ns ± 1%     318ns ± 1%     ~     (p=1.000 n=10+10)
FmtFprintfFloat-8           537ns ± 0%     531ns ± 0%   -1.17%  (p=0.000 n=9+10)
FmtManyArgs-8              1.19µs ± 1%    1.18µs ± 1%   -1.41%  (p=0.001 n=10+10)
GobDecode-8                17.2ms ± 1%    17.3ms ± 2%     ~     (p=0.165 n=10+10)
GobEncode-8                14.7ms ± 1%    14.7ms ± 2%     ~     (p=0.631 n=10+10)
Gzip-8                      837ms ± 0%     836ms ± 0%   -0.14%  (p=0.006 n=9+10)
Gunzip-8                    141ms ± 0%     139ms ± 0%   -1.24%  (p=0.000 n=9+10)
HTTPClientServer-8          256µs ± 1%     253µs ± 1%   -1.35%  (p=0.000 n=10+10)
JSONEncode-8               40.1ms ± 1%    41.3ms ± 1%   +3.06%  (p=0.000 n=10+9)
JSONDecode-8                157ms ± 1%     156ms ± 1%   -0.83%  (p=0.001 n=9+8)
Mandelbrot200-8            8.94ms ± 0%    8.94ms ± 0%   +0.02%  (p=0.000 n=9+9)
GoParse-8                  8.69ms ± 0%    8.54ms ± 1%   -1.69%  (p=0.000 n=8+10)
RegexpMatchEasy0_32-8       227ns ± 1%     228ns ± 1%   +0.48%  (p=0.016 n=10+9)
RegexpMatchEasy0_1K-8      1.92µs ± 0%    1.63µs ± 0%  -15.08%  (p=0.000 n=10+9)
RegexpMatchEasy1_32-8       256ns ± 0%     251ns ± 0%   -2.19%  (p=0.000 n=10+9)
RegexpMatchEasy1_1K-8      2.38µs ± 0%    2.09µs ± 0%  -12.49%  (p=0.000 n=10+9)
RegexpMatchMedium_32-8      352ns ± 0%     354ns ± 0%   +0.39%  (p=0.002 n=10+9)
RegexpMatchMedium_1K-8      106µs ± 0%     106µs ± 0%   -0.05%  (p=0.005 n=10+9)
RegexpMatchHard_32-8       5.92µs ± 0%    5.89µs ± 0%   -0.40%  (p=0.000 n=9+8)
RegexpMatchHard_1K-8        180µs ± 0%     179µs ± 0%   -0.14%  (p=0.000 n=10+9)
Revcomp-8                   1.20s ± 0%     1.13s ± 0%   -6.29%  (p=0.000 n=9+8)
Template-8                  159ms ± 1%     154ms ± 1%   -3.14%  (p=0.000 n=9+10)
TimeParse-8                 800ns ± 3%     769ns ± 1%   -3.91%  (p=0.000 n=10+10)
TimeFormat-8                826ns ± 2%     817ns ± 2%   -1.04%  (p=0.050 n=10+10)
[Geo mean]                  145µs          143µs        -1.79%

Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985
Reviewed-on: https://go-review.googlesource.com/42172
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: David Chase <drchase@google.com>
parent 5e0bcb38
...@@ -85,6 +85,60 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 ...@@ -85,6 +85,60 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
MOVD $1, R1 MOVD $1, R1
MOVD ZR, (R1) MOVD ZR, (R1)
// small offset fits into instructions
MOVB 1(R1), R2 // 22048039
MOVH 1(R1), R2 // 22108078
MOVH 2(R1), R2 // 22048079
MOVW 1(R1), R2 // 221080b8
MOVW 4(R1), R2 // 220480b9
MOVD 1(R1), R2 // 221040f8
MOVD 8(R1), R2 // 220440f9
FMOVS 1(R1), F2 // 221040bc
FMOVS 4(R1), F2 // 220440bd
FMOVD 1(R1), F2 // 221040fc
FMOVD 8(R1), F2 // 220440fd
MOVB R1, 1(R2) // 41040039
MOVH R1, 1(R2) // 41100078
MOVH R1, 2(R2) // 41040079
MOVW R1, 1(R2) // 411000b8
MOVW R1, 4(R2) // 410400b9
MOVD R1, 1(R2) // 411000f8
MOVD R1, 8(R2) // 410400f9
FMOVS F1, 1(R2) // 411000bc
FMOVS F1, 4(R2) // 410400bd
FMOVD F1, 1(R2) // 411000fc
FMOVD F1, 8(R2) // 410400fd
// large aligned offset, use two instructions
MOVB 0x1001(R1), R2 // MOVB 4097(R1), R2 // 3b04409162078039
MOVH 0x2002(R1), R2 // MOVH 8194(R1), R2 // 3b08409162078079
MOVW 0x4004(R1), R2 // MOVW 16388(R1), R2 // 3b104091620780b9
MOVD 0x8008(R1), R2 // MOVD 32776(R1), R2 // 3b204091620740f9
FMOVS 0x4004(R1), F2 // FMOVS 16388(R1), F2 // 3b104091620740bd
FMOVD 0x8008(R1), F2 // FMOVD 32776(R1), F2 // 3b204091620740fd
MOVB R1, 0x1001(R2) // MOVB R1, 4097(R2) // 5b04409161070039
MOVH R1, 0x2002(R2) // MOVH R1, 8194(R2) // 5b08409161070079
MOVW R1, 0x4004(R2) // MOVW R1, 16388(R2) // 5b104091610700b9
MOVD R1, 0x8008(R2) // MOVD R1, 32776(R2) // 5b204091610700f9
FMOVS F1, 0x4004(R2) // FMOVS F1, 16388(R2) // 5b104091610700bd
FMOVD F1, 0x8008(R2) // FMOVD F1, 32776(R2) // 5b204091610700fd
// very large or unaligned offset uses constant pool
// the encoding cannot be checked as the address of the constant pool is unknown.
// here we only test that they can be assembled.
MOVB 0x44332211(R1), R2 // MOVB 1144201745(R1), R2
MOVH 0x44332211(R1), R2 // MOVH 1144201745(R1), R2
MOVW 0x44332211(R1), R2 // MOVW 1144201745(R1), R2
MOVD 0x44332211(R1), R2 // MOVD 1144201745(R1), R2
FMOVS 0x44332211(R1), F2 // FMOVS 1144201745(R1), F2
FMOVD 0x44332211(R1), F2 // FMOVD 1144201745(R1), F2
MOVB R1, 0x44332211(R2) // MOVB R1, 1144201745(R2)
MOVH R1, 0x44332211(R2) // MOVH R1, 1144201745(R2)
MOVW R1, 0x44332211(R2) // MOVW R1, 1144201745(R2)
MOVD R1, 0x44332211(R2) // MOVD R1, 1144201745(R2)
FMOVS F1, 0x44332211(R2) // FMOVS F1, 1144201745(R2)
FMOVD F1, 0x44332211(R2) // FMOVD F1, 1144201745(R2)
// //
// MOVK // MOVK
// //
......
...@@ -1424,6 +1424,16 @@ var linuxARM64Tests = []*asmTest{ ...@@ -1424,6 +1424,16 @@ var linuxARM64Tests = []*asmTest{
`, `,
[]string{"\tAND\t"}, []string{"\tAND\t"},
}, },
{
// make sure offsets are folded into load and store.
`
func f36(_, a [20]byte) (b [20]byte) {
b = a
return
}
`,
[]string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(RSP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(RSP\\)"},
},
} }
var linuxMIPSTests = []*asmTest{ var linuxMIPSTests = []*asmTest{
......
...@@ -283,20 +283,6 @@ func isAuto(s interface{}) bool { ...@@ -283,20 +283,6 @@ func isAuto(s interface{}) bool {
return ok return ok
} }
func fitsARM64Offset(off, align int64, sym interface{}) bool {
// only small offset (between -256 and 256) or offset that is a multiple of data size
// can be encoded in the instructions
// since this rewriting takes place before stack allocation, the offset to SP is unknown,
// so don't do it for args and locals with unaligned offset
if !is32Bit(off) {
return false
}
if align == 1 {
return true
}
return !isArg(sym) && (off%align == 0 || off < 256 && off > -256 && !isAuto(sym))
}
// isSameSym returns whether sym is the same as the given named symbol // isSameSym returns whether sym is the same as the given named symbol
func isSameSym(sym interface{}, name string) bool { func isSameSym(sym interface{}, name string) bool {
s, ok := sym.(fmt.Stringer) s, ok := sym.(fmt.Stringer)
......
...@@ -289,16 +289,21 @@ const ( ...@@ -289,16 +289,21 @@ const (
C_SBRA // for TYPE_BRANCH C_SBRA // for TYPE_BRANCH
C_LBRA C_LBRA
C_NPAUTO // -512 <= x < 0, 0 mod 8 C_NPAUTO // -512 <= x < 0, 0 mod 8
C_NSAUTO // -256 <= x < 0 C_NSAUTO // -256 <= x < 0
C_PSAUTO // 0 to 255 C_PSAUTO // 0 to 255
C_PPAUTO // 0 to 504, 0 mod 8 C_PPAUTO // 0 to 504, 0 mod 8
C_UAUTO4K // 0 to 4095 C_UAUTO4K_8 // 0 to 4095, 0 mod 8
C_UAUTO8K // 0 to 8190, 0 mod 2 C_UAUTO4K_4 // 0 to 4095, 0 mod 4
C_UAUTO16K // 0 to 16380, 0 mod 4 C_UAUTO4K_2 // 0 to 4095, 0 mod 2
C_UAUTO32K // 0 to 32760, 0 mod 8 C_UAUTO4K // 0 to 4095
C_UAUTO64K // 0 to 65520, 0 mod 16 C_UAUTO8K_8 // 0 to 8190, 0 mod 8
C_LAUTO // any other 32-bit constant C_UAUTO8K_4 // 0 to 8190, 0 mod 4
C_UAUTO8K // 0 to 8190, 0 mod 2
C_UAUTO16K_8 // 0 to 16380, 0 mod 8
C_UAUTO16K // 0 to 16380, 0 mod 4
C_UAUTO32K // 0 to 32760, 0 mod 8
C_LAUTO // any other 32-bit constant
C_SEXT1 // 0 to 4095, direct C_SEXT1 // 0 to 4095, direct
C_SEXT2 // 0 to 8190 C_SEXT2 // 0 to 8190
...@@ -307,17 +312,21 @@ const ( ...@@ -307,17 +312,21 @@ const (
C_SEXT16 // 0 to 65520 C_SEXT16 // 0 to 65520
C_LEXT C_LEXT
// TODO(aram): s/AUTO/INDIR/
C_ZOREG // 0(R) C_ZOREG // 0(R)
C_NPOREG // mirror NPAUTO, etc C_NPOREG // must mirror NPAUTO, etc
C_NSOREG C_NSOREG
C_PSOREG C_PSOREG
C_PPOREG C_PPOREG
C_UOREG4K_8
C_UOREG4K_4
C_UOREG4K_2
C_UOREG4K C_UOREG4K
C_UOREG8K_8
C_UOREG8K_4
C_UOREG8K C_UOREG8K
C_UOREG16K_8
C_UOREG16K C_UOREG16K
C_UOREG32K C_UOREG32K
C_UOREG64K
C_LOREG C_LOREG
C_ADDR // TODO(aram): explain difference from C_VCONADDR C_ADDR // TODO(aram): explain difference from C_VCONADDR
......
...@@ -37,11 +37,16 @@ var cnames7 = []string{ ...@@ -37,11 +37,16 @@ var cnames7 = []string{
"NSAUTO", "NSAUTO",
"PSAUTO", "PSAUTO",
"PPAUTO", "PPAUTO",
"UAUTO4K_8",
"UAUTO4K_4",
"UAUTO4K_2",
"UAUTO4K", "UAUTO4K",
"UAUTO8K_8",
"UAUTO8K_4",
"UAUTO8K", "UAUTO8K",
"UAUTO16K_8",
"UAUTO16K", "UAUTO16K",
"UAUTO32K", "UAUTO32K",
"UAUTO64K",
"LAUTO", "LAUTO",
"SEXT1", "SEXT1",
"SEXT2", "SEXT2",
...@@ -54,11 +59,16 @@ var cnames7 = []string{ ...@@ -54,11 +59,16 @@ var cnames7 = []string{
"NSOREG", "NSOREG",
"PSOREG", "PSOREG",
"PPOREG", "PPOREG",
"UOREG4K_8",
"UOREG4K_4",
"UOREG4K_2",
"UOREG4K", "UOREG4K",
"UOREG8K_8",
"UOREG8K_4",
"UOREG8K", "UOREG8K",
"UOREG16K_8",
"UOREG16K", "UOREG16K",
"UOREG32K", "UOREG32K",
"UOREG64K",
"LOREG", "LOREG",
"ADDR", "ADDR",
"GOTADDR", "GOTADDR",
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment