Commit 203e1888 authored by Richard Musiol's avatar Richard Musiol Committed by Richard Musiol

cmd/internal/obj/wasm: cache SP in a local

We use Wasm global variables extensively for simulating
registers, especially SP. V8 does not handle global variables
efficiently.

This CL reduces global variable accesses by caching the global SP
in a local variable in each function. The local cache is set on
function entry and updated after each call (where the stack could
have moved). Within a function, the SP access will use the local
variable.

Supersedes https://golang.org/cl/173979.

Running on Chrome Version 73.0.3683.103 on darwin/amd64:

name                   old time/op    new time/op     delta
BinaryTree17              15.3s ± 2%      14.5s ± 3%   -5.20%  (p=0.000 n=9+10)
Fannkuch11                8.91s ± 2%      9.48s ± 2%   +6.41%  (p=0.000 n=9+10)
FmtFprintfEmpty           197ns ± 5%      165ns ± 3%  -16.09%  (p=0.000 n=9+8)
FmtFprintfString          354ns ± 8%      325ns ± 7%   -8.33%  (p=0.001 n=10+10)
FmtFprintfInt             400ns ± 4%      368ns ± 6%   -8.01%  (p=0.000 n=10+10)
FmtFprintfIntInt          618ns ± 3%      587ns ± 6%   -4.97%  (p=0.001 n=10+10)
FmtFprintfPrefixedInt     637ns ± 4%      606ns ± 4%   -4.88%  (p=0.000 n=10+10)
FmtFprintfFloat           965ns ± 7%      898ns ± 4%   -6.97%  (p=0.000 n=10+10)
FmtManyArgs              2.34µs ± 1%     2.24µs ± 3%   -4.40%  (p=0.000 n=9+10)
GobDecode                29.8ms ± 3%     28.8ms ± 6%   -3.60%  (p=0.006 n=9+10)
GobEncode                20.5ms ± 8%     17.6ms ± 3%  -14.32%  (p=0.000 n=10+10)
Gzip                      714ms ± 3%      718ms ± 8%     ~     (p=0.971 n=10+10)
Gunzip                    148ms ± 3%      136ms ± 3%   -7.99%  (p=0.000 n=10+9)
HTTPClientServer          219µs ± 3%      215µs ± 4%     ~     (p=0.190 n=10+10)
JSONEncode               35.1ms ± 2%     31.8ms ±13%   -9.52%  (p=0.002 n=10+10)
JSONDecode                220ms ± 3%      207ms ± 5%   -5.87%  (p=0.000 n=10+10)
Mandelbrot200            5.22ms ± 1%     5.11ms ± 4%   -2.11%  (p=0.027 n=8+10)
GoParse                  17.2ms ± 6%     16.1ms ± 5%   -6.63%  (p=0.000 n=10+9)
RegexpMatchEasy0_32       375ns ± 3%      340ns ± 3%   -9.25%  (p=0.000 n=10+10)
RegexpMatchEasy0_1K      2.70µs ± 3%     2.65µs ± 4%     ~     (p=0.118 n=10+10)
RegexpMatchEasy1_32       341ns ± 2%      305ns ± 4%  -10.62%  (p=0.000 n=9+10)
RegexpMatchEasy1_1K      3.20µs ± 3%     2.99µs ± 3%   -6.35%  (p=0.000 n=10+10)
RegexpMatchMedium_32      520ns ± 3%      501ns ± 4%   -3.64%  (p=0.002 n=9+10)
RegexpMatchMedium_1K      145µs ± 7%      128µs ± 3%  -11.57%  (p=0.000 n=9+10)
RegexpMatchHard_32       7.88µs ± 3%     7.01µs ± 5%  -10.97%  (p=0.000 n=10+10)
RegexpMatchHard_1K        237µs ± 5%      207µs ± 4%  -12.71%  (p=0.000 n=9+10)
Revcomp                   2.34s ± 1%      2.31s ± 5%     ~     (p=0.230 n=7+10)
Template                  261ms ± 7%      246ms ± 5%   -5.93%  (p=0.007 n=10+10)
TimeParse                1.47µs ± 3%     1.39µs ± 5%   -5.75%  (p=0.000 n=9+10)
TimeFormat               1.52µs ± 3%     1.43µs ± 4%   -6.42%  (p=0.000 n=8+10)

name                   old speed      new speed       delta
GobDecode              25.7MB/s ± 3%   26.7MB/s ± 5%   +3.77%  (p=0.006 n=9+10)
GobEncode              37.5MB/s ± 8%   43.7MB/s ± 3%  +16.61%  (p=0.000 n=10+10)
Gzip                   27.2MB/s ± 3%   27.0MB/s ± 7%     ~     (p=0.971 n=10+10)
Gunzip                  131MB/s ± 3%    142MB/s ± 5%   +8.07%  (p=0.000 n=10+10)
JSONEncode             55.2MB/s ± 2%   61.2MB/s ±12%  +10.80%  (p=0.002 n=10+10)
JSONDecode             8.84MB/s ± 3%   9.39MB/s ± 5%   +6.28%  (p=0.000 n=10+10)
GoParse                3.37MB/s ± 6%   3.61MB/s ± 5%   +7.09%  (p=0.000 n=10+9)
RegexpMatchEasy0_32    85.3MB/s ± 3%   94.0MB/s ± 3%  +10.20%  (p=0.000 n=10+10)
RegexpMatchEasy0_1K     379MB/s ± 3%    387MB/s ± 4%     ~     (p=0.123 n=10+10)
RegexpMatchEasy1_32    93.9MB/s ± 2%  105.1MB/s ± 4%  +11.96%  (p=0.000 n=9+10)
RegexpMatchEasy1_1K     320MB/s ± 3%    342MB/s ± 3%   +6.79%  (p=0.000 n=10+10)
RegexpMatchMedium_32   1.92MB/s ± 2%   2.00MB/s ± 3%   +3.94%  (p=0.001 n=9+10)
RegexpMatchMedium_1K   7.09MB/s ± 6%   8.01MB/s ± 3%  +13.00%  (p=0.000 n=9+10)
RegexpMatchHard_32     4.06MB/s ± 3%   4.56MB/s ± 5%  +12.38%  (p=0.000 n=10+10)
RegexpMatchHard_1K     4.32MB/s ± 4%   4.96MB/s ± 4%  +14.60%  (p=0.000 n=9+10)
Revcomp                 109MB/s ± 1%    110MB/s ± 5%     ~     (p=0.219 n=7+10)
Template               7.44MB/s ± 8%   7.91MB/s ± 5%   +6.30%  (p=0.007 n=10+10)

Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab
Reviewed-on: https://go-review.googlesource.com/c/go/+/174062
Run-TryBot: Richard Musiol <neelance@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent c2d9eea1
...@@ -774,21 +774,34 @@ func countRegisters(s *obj.LSym) (numI, numF int16) { ...@@ -774,21 +774,34 @@ func countRegisters(s *obj.LSym) (numI, numF int16) {
func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
w := new(bytes.Buffer) w := new(bytes.Buffer)
numI, numF := countRegisters(s) hasLocalSP := false
var r0, f0 int16
// Function starts with declaration of locals: numbers and types. // Function starts with declaration of locals: numbers and types.
// Some functions use a special calling convention.
switch s.Name { switch s.Name {
// memchr and memcmp don't use the normal Go calling convention and need i32 variables. case "wasm_export_run", "runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody":
case "memchr": writeUleb128(w, 0) // number of sets of locals
case "memchr", "memcmp":
writeUleb128(w, 1) // number of sets of locals writeUleb128(w, 1) // number of sets of locals
writeUleb128(w, 3) // number of locals writeUleb128(w, 2) // number of locals
w.WriteByte(0x7F) // i32 w.WriteByte(0x7F) // i32
case "memcmp": case "cmpbody":
writeUleb128(w, 1) // number of sets of locals writeUleb128(w, 1) // number of sets of locals
writeUleb128(w, 2) // number of locals writeUleb128(w, 2) // number of locals
w.WriteByte(0x7F) // i32 w.WriteByte(0x7E) // i64
case "runtime.gcWriteBarrier":
writeUleb128(w, 1) // number of sets of locals
writeUleb128(w, 4) // number of locals
w.WriteByte(0x7E) // i64
default: default:
numTypes := 0 // Normal calling convention: No WebAssembly parameters. First local variable is local SP cache.
hasLocalSP = true
numI, numF := countRegisters(s)
r0 = 1
f0 = 1 + numI
numTypes := 1
if numI > 0 { if numI > 0 {
numTypes++ numTypes++
} }
...@@ -797,6 +810,8 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -797,6 +810,8 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
} }
writeUleb128(w, uint64(numTypes)) writeUleb128(w, uint64(numTypes))
writeUleb128(w, 1) // number of locals (SP)
w.WriteByte(0x7F) // i32
if numI > 0 { if numI > 0 {
writeUleb128(w, uint64(numI)) // number of locals writeUleb128(w, uint64(numI)) // number of locals
w.WriteByte(0x7E) // i64 w.WriteByte(0x7E) // i64
...@@ -807,6 +822,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -807,6 +822,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
} }
} }
if hasLocalSP {
// Copy SP from its global variable into a local variable. Accessing a local variable is more efficient.
updateLocalSP(w)
}
for p := s.Func.Text; p != nil; p = p.Link { for p := s.Func.Text; p != nil; p = p.Link {
switch p.As { switch p.As {
case AGet: case AGet:
...@@ -815,15 +834,18 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -815,15 +834,18 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
} }
reg := p.From.Reg reg := p.From.Reg
switch { switch {
case reg == REG_SP && hasLocalSP:
w.WriteByte(0x20) // local.get
writeUleb128(w, 0) // local SP
case reg >= REG_PC_F && reg <= REG_PAUSE: case reg >= REG_PC_F && reg <= REG_PAUSE:
w.WriteByte(0x23) // global.get w.WriteByte(0x23) // global.get
writeUleb128(w, uint64(reg-REG_PC_F)) writeUleb128(w, uint64(reg-REG_PC_F))
case reg >= REG_R0 && reg <= REG_R15: case reg >= REG_R0 && reg <= REG_R15:
w.WriteByte(0x20) // local.get (i64) w.WriteByte(0x20) // local.get (i64)
writeUleb128(w, uint64(reg-REG_R0)) writeUleb128(w, uint64(r0+(reg-REG_R0)))
case reg >= REG_F0 && reg <= REG_F15: case reg >= REG_F0 && reg <= REG_F15:
w.WriteByte(0x20) // local.get (f64) w.WriteByte(0x20) // local.get (f64)
writeUleb128(w, uint64(numI+(reg-REG_F0))) writeUleb128(w, uint64(f0+(reg-REG_F0)))
default: default:
panic("bad Get: invalid register") panic("bad Get: invalid register")
} }
...@@ -836,6 +858,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -836,6 +858,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
reg := p.To.Reg reg := p.To.Reg
switch { switch {
case reg >= REG_PC_F && reg <= REG_PAUSE: case reg >= REG_PC_F && reg <= REG_PAUSE:
if reg == REG_SP && hasLocalSP {
w.WriteByte(0x22) // local.tee
writeUleb128(w, 0) // local SP
}
w.WriteByte(0x24) // global.set w.WriteByte(0x24) // global.set
writeUleb128(w, uint64(reg-REG_PC_F)) writeUleb128(w, uint64(reg-REG_PC_F))
case reg >= REG_R0 && reg <= REG_F15: case reg >= REG_R0 && reg <= REG_F15:
...@@ -846,9 +872,9 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -846,9 +872,9 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
w.WriteByte(0x21) // local.set w.WriteByte(0x21) // local.set
} }
if reg <= REG_R15 { if reg <= REG_R15 {
writeUleb128(w, uint64(reg-REG_R0)) writeUleb128(w, uint64(r0+(reg-REG_R0)))
} else { } else {
writeUleb128(w, uint64(numI+(reg-REG_F0))) writeUleb128(w, uint64(f0+(reg-REG_F0)))
} }
default: default:
panic("bad Set: invalid register") panic("bad Set: invalid register")
...@@ -863,10 +889,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -863,10 +889,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
switch { switch {
case reg >= REG_R0 && reg <= REG_R15: case reg >= REG_R0 && reg <= REG_R15:
w.WriteByte(0x22) // local.tee (i64) w.WriteByte(0x22) // local.tee (i64)
writeUleb128(w, uint64(reg-REG_R0)) writeUleb128(w, uint64(r0+(reg-REG_R0)))
case reg >= REG_F0 && reg <= REG_F15: case reg >= REG_F0 && reg <= REG_F15:
w.WriteByte(0x22) // local.tee (f64) w.WriteByte(0x22) // local.tee (f64)
writeUleb128(w, uint64(numI+(reg-REG_F0))) writeUleb128(w, uint64(f0+(reg-REG_F0)))
default: default:
panic("bad Tee: invalid register") panic("bad Tee: invalid register")
} }
...@@ -942,6 +968,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -942,6 +968,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
r.Type = objabi.R_WASMIMPORT r.Type = objabi.R_WASMIMPORT
} }
r.Sym = p.To.Sym r.Sym = p.To.Sym
if hasLocalSP {
// The stack may have moved, which changes SP. Update the local SP variable.
updateLocalSP(w)
}
default: default:
panic("bad type for Call") panic("bad type for Call")
...@@ -950,6 +980,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -950,6 +980,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
case ACallIndirect: case ACallIndirect:
writeUleb128(w, uint64(p.To.Offset)) writeUleb128(w, uint64(p.To.Offset))
w.WriteByte(0x00) // reserved value w.WriteByte(0x00) // reserved value
if hasLocalSP {
// The stack may have moved, which changes SP. Update the local SP variable.
updateLocalSP(w)
}
case AI32Const, AI64Const: case AI32Const, AI64Const:
if p.From.Name == obj.NAME_EXTERN { if p.From.Name == obj.NAME_EXTERN {
...@@ -1001,6 +1035,13 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { ...@@ -1001,6 +1035,13 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
s.P = w.Bytes() s.P = w.Bytes()
} }
func updateLocalSP(w *bytes.Buffer) {
w.WriteByte(0x23) // global.get
writeUleb128(w, uint64(REG_SP-REG_PC_F)) // SP
w.WriteByte(0x21) // local.set
writeUleb128(w, 0) // local SP
}
func align(as obj.As) uint64 { func align(as obj.As) uint64 {
switch as { switch as {
case AI32Load8S, AI32Load8U, AI64Load8S, AI64Load8U, AI32Store8, AI64Store8: case AI32Load8S, AI32Load8U, AI64Load8S, AI64Load8U, AI32Store8, AI64Store8:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment