Commit 74a9bad6 authored by Zhongwei Yao's avatar Zhongwei Yao Committed by Michael Munday

cmd/compile: enable const division for arm64

performance:
benchmark                   old ns/op     new ns/op     delta
BenchmarkDivconstI64-8      8.28          2.70          -67.39%
BenchmarkDivconstU64-8      8.28          4.69          -43.36%
BenchmarkDivconstI32-8      8.28          6.39          -22.83%
BenchmarkDivconstU32-8      8.28          4.43          -46.50%
BenchmarkDivconstI16-8      5.17          5.17          +0.00%
BenchmarkDivconstU16-8      5.33          5.34          +0.19%
BenchmarkDivconstI8-8       3.50          3.50          +0.00%
BenchmarkDivconstU8-8       3.51          3.50          -0.28%

Fixes #15382

Change-Id: Ibce7b28f0586d593b33c4d4ecc5d5e7e7c905d13
Reviewed-on: https://go-review.googlesource.com/22292Reviewed-by: default avatarMichael Munday <munday@ca.ibm.com>
Reviewed-by: default avatarDavid Chase <drchase@google.com>
parent 7538b1db
...@@ -29,6 +29,8 @@ func Main() { ...@@ -29,6 +29,8 @@ func Main() {
gc.Thearch.Betypeinit = betypeinit gc.Thearch.Betypeinit = betypeinit
gc.Thearch.Cgen_hmul = cgen_hmul gc.Thearch.Cgen_hmul = cgen_hmul
gc.Thearch.AddSetCarry = AddSetCarry
gc.Thearch.RightShiftWithCarry = RightShiftWithCarry
gc.Thearch.Cgen_shift = cgen_shift gc.Thearch.Cgen_shift = cgen_shift
gc.Thearch.Clearfat = clearfat gc.Thearch.Clearfat = clearfat
gc.Thearch.Defframe = defframe gc.Thearch.Defframe = defframe
......
...@@ -252,6 +252,53 @@ func dodiv(op gc.Op, nl *gc.Node, nr *gc.Node, res *gc.Node) { ...@@ -252,6 +252,53 @@ func dodiv(op gc.Op, nl *gc.Node, nr *gc.Node, res *gc.Node) {
} }
} }
// RightShiftWithCarry generates a constant unsigned
// right shift with carry.
//
// res = n >> shift // with carry
func RightShiftWithCarry(n *gc.Node, shift uint, res *gc.Node) {
// Extra 1 is for carry bit.
maxshift := uint(n.Type.Width*8 + 1)
if shift == 0 {
gmove(n, res)
} else if shift < maxshift {
// 1. clear rightmost bit of target
var n1 gc.Node
gc.Nodconst(&n1, n.Type, 1)
gins(optoas(gc.ORSH, n.Type), &n1, n)
gins(optoas(gc.OLSH, n.Type), &n1, n)
// 2. add carry flag to target
var n2 gc.Node
gc.Nodconst(&n1, n.Type, 0)
gc.Regalloc(&n2, n.Type, nil)
gins(optoas(gc.OAS, n.Type), &n1, &n2)
gins(arm64.AADC, &n2, n)
// 3. right rotate 1 bit
gc.Nodconst(&n1, n.Type, 1)
gins(arm64.AROR, &n1, n)
// ARM64 backend doesn't eliminate shifts by 0. It is manually checked here.
if shift > 1 {
var n3 gc.Node
gc.Nodconst(&n3, n.Type, int64(shift-1))
cgen_shift(gc.ORSH, true, n, &n3, res)
} else {
gmove(n, res)
}
gc.Regfree(&n2)
} else {
gc.Fatalf("RightShiftWithCarry: shift(%v) is bigger than max size(%v)", shift, maxshift)
}
}
// AddSetCarry generates add and set carry.
//
// res = nl + nr // with carry flag set
func AddSetCarry(nl *gc.Node, nr *gc.Node, res *gc.Node) {
gins(arm64.AADDS, nl, nr)
gmove(nr, res)
}
/* /*
* generate high multiply: * generate high multiply:
* res = (nl*nr) >> width * res = (nl*nr) >> width
......
...@@ -890,18 +890,6 @@ func optoas(op gc.Op, t *gc.Type) obj.As { ...@@ -890,18 +890,6 @@ func optoas(op gc.Op, t *gc.Type) obj.As {
ORSH_ | gc.TINT64: ORSH_ | gc.TINT64:
a = arm64.AASR a = arm64.AASR
// TODO(minux): handle rotates
//case CASE(ORROTC, TINT8):
//case CASE(ORROTC, TUINT8):
//case CASE(ORROTC, TINT16):
//case CASE(ORROTC, TUINT16):
//case CASE(ORROTC, TINT32):
//case CASE(ORROTC, TUINT32):
//case CASE(ORROTC, TINT64):
//case CASE(ORROTC, TUINT64):
// a = 0//??? RLDC??
// break;
case OHMUL_ | gc.TINT64: case OHMUL_ | gc.TINT64:
a = arm64.ASMULH a = arm64.ASMULH
......
...@@ -534,10 +534,13 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int { ...@@ -534,10 +534,13 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
return 0 return 0
case arm64.AADD, /* read p->from, read p->reg, write p->to */ case arm64.AADD, /* read p->from, read p->reg, write p->to */
arm64.AADDS,
arm64.ASUB, arm64.ASUB,
arm64.AADC,
arm64.AAND, arm64.AAND,
arm64.AORR, arm64.AORR,
arm64.AEOR, arm64.AEOR,
arm64.AROR,
arm64.AMUL, arm64.AMUL,
arm64.ASMULL, arm64.ASMULL,
arm64.AUMULL, arm64.AUMULL,
......
...@@ -59,6 +59,9 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{ ...@@ -59,6 +59,9 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{
arm64.ALSR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite}, arm64.ALSR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
arm64.AASR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite}, arm64.AASR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
arm64.ACMP & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead}, arm64.ACMP & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead},
arm64.AADC & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.UseCarry},
arm64.AROR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
arm64.AADDS & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.SetCarry},
// Floating point. // Floating point.
arm64.AFADDD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, arm64.AFADDD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
......
...@@ -2642,9 +2642,9 @@ func cgen_ret(n *Node) { ...@@ -2642,9 +2642,9 @@ func cgen_ret(n *Node) {
// signed and unsigned high multiplication (OHMUL). // signed and unsigned high multiplication (OHMUL).
func hasHMUL64() bool { func hasHMUL64() bool {
switch Ctxt.Arch.Family { switch Ctxt.Arch.Family {
case sys.AMD64, sys.S390X: case sys.AMD64, sys.S390X, sys.ARM64:
return true return true
case sys.ARM, sys.ARM64, sys.I386, sys.MIPS64, sys.PPC64: case sys.ARM, sys.I386, sys.MIPS64, sys.PPC64:
return false return false
} }
Fatalf("unknown architecture") Fatalf("unknown architecture")
...@@ -2664,6 +2664,28 @@ func hasRROTC64() bool { ...@@ -2664,6 +2664,28 @@ func hasRROTC64() bool {
return false return false
} }
func hasRightShiftWithCarry() bool {
switch Ctxt.Arch.Family {
case sys.ARM64:
return true
case sys.AMD64, sys.ARM, sys.I386, sys.MIPS64, sys.PPC64, sys.S390X:
return false
}
Fatalf("unknown architecture")
return false
}
func hasAddSetCarry() bool {
switch Ctxt.Arch.Family {
case sys.ARM64:
return true
case sys.AMD64, sys.ARM, sys.I386, sys.MIPS64, sys.PPC64, sys.S390X:
return false
}
Fatalf("unknown architecture")
return false
}
// generate division according to op, one of: // generate division according to op, one of:
// res = nl / nr // res = nl / nr
// res = nl % nr // res = nl % nr
...@@ -2699,8 +2721,9 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) { ...@@ -2699,8 +2721,9 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
// the MSB. For now this needs the RROTC instruction. // the MSB. For now this needs the RROTC instruction.
// TODO(mundaym): Hacker's Delight 2nd ed. chapter 10 proposes // TODO(mundaym): Hacker's Delight 2nd ed. chapter 10 proposes
// an alternative sequence of instructions for architectures // an alternative sequence of instructions for architectures
// that do not have a shift right with carry instruction. // (TODO: MIPS64, PPC64, S390X) that do not have a shift
if m.Ua != 0 && !hasRROTC64() { // right with carry instruction.
if m.Ua != 0 && !hasRROTC64() && !hasRightShiftWithCarry() {
goto longdiv goto longdiv
} }
if op == OMOD { if op == OMOD {
...@@ -2717,12 +2740,20 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) { ...@@ -2717,12 +2740,20 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
if m.Ua != 0 { if m.Ua != 0 {
// Need to add numerator accounting for overflow. // Need to add numerator accounting for overflow.
if hasAddSetCarry() {
Thearch.AddSetCarry(&n1, &n3, &n3)
} else {
Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3) Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3)
}
if !hasRROTC64() {
Thearch.RightShiftWithCarry(&n3, uint(m.S), &n3)
} else {
Nodconst(&n2, nl.Type, 1) Nodconst(&n2, nl.Type, 1)
Thearch.Gins(Thearch.Optoas(ORROTC, nl.Type), &n2, &n3) Thearch.Gins(Thearch.Optoas(ORROTC, nl.Type), &n2, &n3)
Nodconst(&n2, nl.Type, int64(m.S)-1) Nodconst(&n2, nl.Type, int64(m.S)-1)
Thearch.Gins(Thearch.Optoas(ORSH, nl.Type), &n2, &n3) Thearch.Gins(Thearch.Optoas(ORSH, nl.Type), &n2, &n3)
}
} else { } else {
Nodconst(&n2, nl.Type, int64(m.S)) Nodconst(&n2, nl.Type, int64(m.S))
Thearch.Gins(Thearch.Optoas(ORSH, nl.Type), &n2, &n3) // shift dx Thearch.Gins(Thearch.Optoas(ORSH, nl.Type), &n2, &n3) // shift dx
......
...@@ -386,6 +386,8 @@ type Arch struct { ...@@ -386,6 +386,8 @@ type Arch struct {
Cgen_bmul func(Op, *Node, *Node, *Node) bool Cgen_bmul func(Op, *Node, *Node, *Node) bool
Cgen_float func(*Node, *Node) // optional Cgen_float func(*Node, *Node) // optional
Cgen_hmul func(*Node, *Node, *Node) Cgen_hmul func(*Node, *Node, *Node)
RightShiftWithCarry func(*Node, uint, *Node) // only on systems without RROTC instruction
AddSetCarry func(*Node, *Node, *Node) // only on systems when ADD does not update carry flag
Cgen_shift func(Op, bool, *Node, *Node, *Node) Cgen_shift func(Op, bool, *Node, *Node, *Node)
Clearfat func(*Node) Clearfat func(*Node)
Cmp64 func(*Node, *Node, Op, int, *obj.Prog) // only on 32-bit systems Cmp64 func(*Node, *Node, Op, int, *obj.Prog) // only on 32-bit systems
......
...@@ -3424,7 +3424,7 @@ func walkdiv(n *Node, init *Nodes) *Node { ...@@ -3424,7 +3424,7 @@ func walkdiv(n *Node, init *Nodes) *Node {
// if >= 0, nr is 1<<pow // 1 if nr is negative. // if >= 0, nr is 1<<pow // 1 if nr is negative.
// TODO(minux) // TODO(minux)
if Thearch.LinkArch.InFamily(sys.MIPS64, sys.ARM64, sys.PPC64) { if Thearch.LinkArch.InFamily(sys.MIPS64, sys.PPC64) {
return n return n
} }
...@@ -3485,6 +3485,16 @@ func walkdiv(n *Node, init *Nodes) *Node { ...@@ -3485,6 +3485,16 @@ func walkdiv(n *Node, init *Nodes) *Node {
goto ret goto ret
} }
// TODO(zhongwei) Test shows that TUINT8, TINT8, TUINT16 and TINT16's "quick division" method
// on current arm64 backend is slower than hardware div instruction on ARM64 due to unnecessary
// data movement between registers. It could be enabled when generated code is good enough.
if Thearch.LinkArch.Family == sys.ARM64 {
switch Simtype[nl.Type.Etype] {
case TUINT8, TINT8, TUINT16, TINT16:
return n
}
}
switch Simtype[nl.Type.Etype] { switch Simtype[nl.Type.Etype] {
default: default:
return n return n
......
...@@ -155,6 +155,7 @@ var optab = []Optab{ ...@@ -155,6 +155,7 @@ var optab = []Optab{
{AADC, C_REG, C_REG, C_REG, 1, 4, 0, 0, 0}, {AADC, C_REG, C_REG, C_REG, 1, 4, 0, 0, 0},
{AADC, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0}, {AADC, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
{ANEG, C_REG, C_NONE, C_REG, 25, 4, 0, 0, 0}, {ANEG, C_REG, C_NONE, C_REG, 25, 4, 0, 0, 0},
{ANEG, C_NONE, C_NONE, C_REG, 25, 4, 0, 0, 0},
{ANGC, C_REG, C_NONE, C_REG, 17, 4, 0, 0, 0}, {ANGC, C_REG, C_NONE, C_REG, 17, 4, 0, 0, 0},
{ACMP, C_REG, C_REG, C_NONE, 1, 4, 0, 0, 0}, {ACMP, C_REG, C_REG, C_NONE, 1, 4, 0, 0, 0},
{AADD, C_ADDCON, C_RSP, C_RSP, 2, 4, 0, 0, 0}, {AADD, C_ADDCON, C_RSP, C_RSP, 2, 4, 0, 0, 0},
...@@ -2198,6 +2199,9 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) { ...@@ -2198,6 +2199,9 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
o1 = oprrr(ctxt, p.As) o1 = oprrr(ctxt, p.As)
rf := int(p.From.Reg) rf := int(p.From.Reg)
if rf == C_NONE {
rf = int(p.To.Reg)
}
rt := int(p.To.Reg) rt := int(p.To.Reg)
o1 |= (uint32(rf&31) << 16) | (REGZERO & 31 << 5) | uint32(rt&31) o1 |= (uint32(rf&31) << 16) | (REGZERO & 31 << 5) | uint32(rt&31)
......
...@@ -195,7 +195,6 @@ func dodiv(n, d uint64) (q, r uint64) { ...@@ -195,7 +195,6 @@ func dodiv(n, d uint64) (q, r uint64) {
if GOARCH == "arm" { if GOARCH == "arm" {
// arm doesn't have a division instruction, so // arm doesn't have a division instruction, so
// slowdodiv is the best that we can do. // slowdodiv is the best that we can do.
// TODO: revisit for arm64.
return slowdodiv(n, d) return slowdodiv(n, d)
} }
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package go1
import (
"testing"
)
var i64res int64
func BenchmarkDivconstI64(b *testing.B) {
for i := 0; i < b.N; i++ {
i64res = int64(i) / 7
}
}
var u64res uint64
func BenchmarkDivconstU64(b *testing.B) {
for i := 0; i < b.N; i++ {
u64res = uint64(i) / 7
}
}
var i32res int32
func BenchmarkDivconstI32(b *testing.B) {
for i := 0; i < b.N; i++ {
i32res = int32(i) / 7
}
}
var u32res uint32
func BenchmarkDivconstU32(b *testing.B) {
for i := 0; i < b.N; i++ {
u32res = uint32(i) / 7
}
}
var i16res int16
func BenchmarkDivconstI16(b *testing.B) {
for i := 0; i < b.N; i++ {
i16res = int16(i) / 7
}
}
var u16res uint16
func BenchmarkDivconstU16(b *testing.B) {
for i := 0; i < b.N; i++ {
u16res = uint16(i) / 7
}
}
var i8res int8
func BenchmarkDivconstI8(b *testing.B) {
for i := 0; i < b.N; i++ {
i8res = int8(i) / 7
}
}
var u8res uint8
func BenchmarkDivconstU8(b *testing.B) {
for i := 0; i < b.N; i++ {
u8res = uint8(i) / 7
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment