Commit fa316ba4 authored by Rémy Oudompheng's avatar Rémy Oudompheng

cmd/8g: eliminate obviously useless temps before regopt.

This patch introduces a sort of pre-regopt peephole optimization.
When a temporary is introduced that just holds a value for the
duration of the next instruction and is otherwise unused, we
elide it to make the job of regopt easier.

Since x86 has very few registers, this situation happens very
often. The result is large savings in stack variables for
arithmetic-heavy functions.

crypto/aes

benchmark                 old ns/op    new ns/op    delta
BenchmarkEncrypt               1301          392  -69.87%
BenchmarkDecrypt               1309          368  -71.89%
BenchmarkExpand                2913         1036  -64.44%
benchmark                  old MB/s     new MB/s  speedup
BenchmarkEncrypt              12.29        40.74    3.31x
BenchmarkDecrypt              12.21        43.37    3.55x

crypto/md5

benchmark                 old ns/op    new ns/op    delta
BenchmarkHash8Bytes            1761          914  -48.10%
BenchmarkHash1K               16912         5570  -67.06%
BenchmarkHash8K              123895        38286  -69.10%
benchmark                  old MB/s     new MB/s  speedup
BenchmarkHash8Bytes            4.54         8.75    1.93x
BenchmarkHash1K               60.55       183.83    3.04x
BenchmarkHash8K               66.12       213.97    3.24x

bench/go1

benchmark                 old ns/op    new ns/op    delta
BenchmarkBinaryTree17    8364835000   8303154000   -0.74%
BenchmarkFannkuch11      7511723000   6381729000  -15.04%
BenchmarkGobDecode         27764090     27103270   -2.38%
BenchmarkGobEncode         11240880     11184370   -0.50%
BenchmarkGzip            1470224000    856668400  -41.73%
BenchmarkGunzip           240660800    201697300  -16.19%
BenchmarkJSONEncode       155225800    185571900  +19.55%
BenchmarkJSONDecode       243347900    282123000  +15.93%
BenchmarkMandelbrot200     12240970     12201880   -0.32%
BenchmarkParse              8837445      8765210   -0.82%
BenchmarkRevcomp         2556310000   1868566000  -26.90%
BenchmarkTemplate         389298000    379792000   -2.44%
benchmark                  old MB/s     new MB/s  speedup
BenchmarkGobDecode            27.64        28.32    1.02x
BenchmarkGobEncode            68.28        68.63    1.01x
BenchmarkGzip                 13.20        22.65    1.72x
BenchmarkGunzip               80.63        96.21    1.19x
BenchmarkJSONEncode           12.50        10.46    0.84x
BenchmarkJSONDecode            7.97         6.88    0.86x
BenchmarkParse                 6.55         6.61    1.01x
BenchmarkRevcomp              99.43       136.02    1.37x
BenchmarkTemplate              4.98         5.11    1.03x

Fixes #4035.

R=golang-dev, minux.ma, rsc
CC=golang-dev
https://golang.org/cl/6828056
parent 0a47d2ef
......@@ -40,6 +40,7 @@
static int first = 1;
static void fixjmp(Prog*);
static void fixtemp(Prog*);
Reg*
rega(void)
......@@ -135,6 +136,7 @@ regopt(Prog *firstp)
first = 0;
}
fixtemp(firstp);
fixjmp(firstp);
// count instructions
......@@ -1692,3 +1694,122 @@ fixjmp(Prog *firstp)
print("\n");
}
}
static uint32
fnv1(Sym *sym)
{
uint32 h;
char *s;
h = 2166136261U;
for(s=sym->name;*s;s++) {
h = (16777619 * h) ^ (uint32)(uint8)(*s);
}
return h;
}
static uint16
hash32to16(uint32 h)
{
return (h & 0xffff) ^ (h >> 16);
}
/*
* fixtemp eliminates sequences like:
* MOV reg1, mem
* OP mem, reg2
* when mem is a stack variable which is not mentioned
* anywhere else. The instructions are replaced by
* OP reg1, reg2
* this reduces the number of variables that the register optimizer
* sees, which lets it do a better job and makes it less likely to turn
* itself off.
*/
void
fixtemp(Prog *firstp)
{
static uint8 counts[1<<16]; // A hash table to count variable occurences.
int i;
Prog *p, *p2;
uint32 h;
if(debug['R'] && debug['v'])
print("\nfixtemp\n");
// Count variable references. We actually use a hashtable so this
// is only approximate.
for(i=0; i<nelem(counts); i++)
counts[i] = 0;
for(p=firstp; p!=P; p=p->link) {
if(p->from.type == D_AUTO) {
h = hash32to16(fnv1(p->from.sym));
//print("seen %S hash %d\n", p->from.sym, hash32to16(h));
if(counts[h] < 10)
counts[h]++;
}
if(p->to.type == D_AUTO) {
h = hash32to16(fnv1(p->to.sym));
//print("seen %S hash %d\n", p->to.sym, hash32to16(h));
if(counts[h] < 10)
counts[h]++;
}
}
// Eliminate single-write, single-read stack variables.
for(p=firstp; p!=P; p=p->link) {
if(debug['R'] && debug['v'])
print("%P\n", p);
if(p->link == P
|| !RtoB(p->from.type)
|| p->to.type != D_AUTO
|| isfloat[p->to.etype])
continue;
switch(p->as) {
case AMOVB:
if(p->to.width == 1)
break;
case AMOVW:
if(p->to.width == 2)
break;
case AMOVL:
if(p->to.width == 4)
break;
default:
continue;
}
// p is a MOV reg, mem.
// and it is not a float.
p2 = p->link;
h = hash32to16(fnv1(p->to.sym));
if(counts[h] != 2) {
continue;
}
switch(p2->as) {
case ALEAL:
case AFMOVL:
case AFMOVW:
case AFMOVV:
// funny
continue;
}
// p2 is OP mem, reg2
// and OP is not a funny instruction.
if(p2->from.sym == p->to.sym
&& p2->from.offset == p->to.offset
&& p2->from.type == p->to.type) {
if(debug['R'] && debug['v']) {
print(" ===elide== %D\n", &p->to);
print("%P", p2);
}
// p2 is OP mem, reg2.
// change to OP reg, reg2 and
// eliminate the mov.
p2->from = p->from;
*p = *p2;
p->link = p2->link;
if(debug['R'] && debug['v']) {
print(" ===change== %P\n", p);
}
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment