cmd/8g: eliminate obviously useless temps before regopt.

This patch introduces a sort of pre-regopt peephole optimization. When a temporary is introduced that just holds a value for the duration of the next instruction and is otherwise unused, we elide it to make the job of regopt easier. Since x86 has very few registers, this situation happens very often. The result is large savings in stack variables for arithmetic-heavy functions. crypto/aes benchmark old ns/op new ns/op delta BenchmarkEncrypt 1301 392 -69.87% BenchmarkDecrypt 1309 368 -71.89% BenchmarkExpand 2913 1036 -64.44% benchmark old MB/s new MB/s speedup BenchmarkEncrypt 12.29 40.74 3.31x BenchmarkDecrypt 12.21 43.37 3.55x crypto/md5 benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1761 914 -48.10% BenchmarkHash1K 16912 5570 -67.06% BenchmarkHash8K 123895 38286 -69.10% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.54 8.75 1.93x BenchmarkHash1K 60.55 183.83 3.04x BenchmarkHash8K 66.12 213.97 3.24x bench/go1 benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 8364835000 8303154000 -0.74% BenchmarkFannkuch11 7511723000 6381729000 -15.04% BenchmarkGobDecode 27764090 27103270 -2.38% BenchmarkGobEncode 11240880 11184370 -0.50% BenchmarkGzip 1470224000 856668400 -41.73% BenchmarkGunzip 240660800 201697300 -16.19% BenchmarkJSONEncode 155225800 185571900 +19.55% BenchmarkJSONDecode 243347900 282123000 +15.93% BenchmarkMandelbrot200 12240970 12201880 -0.32% BenchmarkParse 8837445 8765210 -0.82% BenchmarkRevcomp 2556310000 1868566000 -26.90% BenchmarkTemplate 389298000 379792000 -2.44% benchmark old MB/s new MB/s speedup BenchmarkGobDecode 27.64 28.32 1.02x BenchmarkGobEncode 68.28 68.63 1.01x BenchmarkGzip 13.20 22.65 1.72x BenchmarkGunzip 80.63 96.21 1.19x BenchmarkJSONEncode 12.50 10.46 0.84x BenchmarkJSONDecode 7.97 6.88 0.86x BenchmarkParse 6.55 6.61 1.01x BenchmarkRevcomp 99.43 136.02 1.37x BenchmarkTemplate 4.98 5.11 1.03x Fixes #4035. R=golang-dev, minux.ma, rsc CC=golang-dev https://golang.org/cl/6828056

cmd/8g: eliminate obviously useless temps before regopt.
This patch introduces a sort of pre-regopt peephole optimization. When a temporary is introduced that just holds a value for the duration of the next instruction and is otherwise unused, we elide it to make the job of regopt easier. Since x86 has very few registers, this situation happens very often. The result is large savings in stack variables for arithmetic-heavy functions. crypto/aes benchmark old ns/op new ns/op delta BenchmarkEncrypt 1301 392 -69.87% BenchmarkDecrypt 1309 368 -71.89% BenchmarkExpand 2913 1036 -64.44% benchmark old MB/s new MB/s speedup BenchmarkEncrypt 12.29 40.74 3.31x BenchmarkDecrypt 12.21 43.37 3.55x crypto/md5 benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1761 914 -48.10% BenchmarkHash1K 16912 5570 -67.06% BenchmarkHash8K 123895 38286 -69.10% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.54 8.75 1.93x BenchmarkHash1K 60.55 183.83 3.04x BenchmarkHash8K 66.12 213.97 3.24x bench/go1 benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 8364835000 8303154000 -0.74% BenchmarkFannkuch11 7511723000 6381729000 -15.04% BenchmarkGobDecode 27764090 27103270 -2.38% BenchmarkGobEncode 11240880 11184370 -0.50% BenchmarkGzip 1470224000 856668400 -41.73% BenchmarkGunzip 240660800 201697300 -16.19% BenchmarkJSONEncode 155225800 185571900 +19.55% BenchmarkJSONDecode 243347900 282123000 +15.93% BenchmarkMandelbrot200 12240970 12201880 -0.32% BenchmarkParse 8837445 8765210 -0.82% BenchmarkRevcomp 2556310000 1868566000 -26.90% BenchmarkTemplate 389298000 379792000 -2.44% benchmark old MB/s new MB/s speedup BenchmarkGobDecode 27.64 28.32 1.02x BenchmarkGobEncode 68.28 68.63 1.01x BenchmarkGzip 13.20 22.65 1.72x BenchmarkGunzip 80.63 96.21 1.19x BenchmarkJSONEncode 12.50 10.46 0.84x BenchmarkJSONDecode 7.97 6.88 0.86x BenchmarkParse 6.55 6.61 1.01x BenchmarkRevcomp 99.43 136.02 1.37x BenchmarkTemplate 4.98 5.11 1.03x Fixes #4035. R=golang-dev, minux.ma, rsc CC=golang-dev https://golang.org/cl/6828056
fa316ba4 · Rémy Oudompheng · 0a47d2ef · fa316ba4
Commit fa316ba4 authored Nov 13, 2012 by Rémy Oudompheng
Hide whitespace changes
Inline Side-by-side

Showing with 121 additions and 0 deletions

src/cmd/8g/reg.c src/cmd/8g/reg.c +121 -0

No files found.
--- a/src/cmd/8g/reg.c
+++ b/src/cmd/8g/reg.c
@@ -40,6 +40,7 @@
 static	int	first	= 1;

 static	void	fixjmp(Prog*);
+static	void	fixtemp(Prog*);

 Reg*
 rega(void)
@@ -135,6 +136,7 @@ regopt(Prog *firstp)
 		first = 0;
 	}
 	
+	fixtemp(firstp);
 	fixjmp(firstp);

 	// count instructions
@@ -1692,3 +1694,122 @@ fixjmp(Prog *firstp)
 		print("\n");
 	}
 }
+
+static uint32
+fnv1(Sym *sym)
+{
+	uint32 h;
+	char *s;
+
+	h = 2166136261U;
+	for(s=sym->name;*s;s++) {
+		h = (16777619 * h) ^ (uint32)(uint8)(*s);
+	}
+	return h;
+}
+
+static uint16
+hash32to16(uint32 h)
+{
+	return (h & 0xffff) ^ (h >> 16);
+}
+
+/*
+ * fixtemp eliminates sequences like:
+ *   MOV reg1, mem
+ *   OP mem, reg2
+ * when mem is a stack variable which is not mentioned
+ * anywhere else. The instructions are replaced by
+ *   OP reg1, reg2
+ * this reduces the number of variables that the register optimizer
+ * sees, which lets it do a better job and makes it less likely to turn
+ * itself off.
+ */
+void
+fixtemp(Prog *firstp)
+{
+	static uint8 counts[1<<16]; // A hash table to count variable occurences.
+	int i;
+	Prog *p, *p2;
+	uint32 h;
+
+	if(debug['R'] && debug['v'])
+		print("\nfixtemp\n");
+
+	// Count variable references. We actually use a hashtable so this
+	// is only approximate.
+	for(i=0; i<nelem(counts); i++)
+		counts[i] = 0;
+	for(p=firstp; p!=P; p=p->link) {
+		if(p->from.type == D_AUTO) {
+			h = hash32to16(fnv1(p->from.sym));
+			//print("seen %S hash %d\n", p->from.sym, hash32to16(h));
+			if(counts[h] < 10)
+				counts[h]++;
+		}
+		if(p->to.type == D_AUTO) {
+			h = hash32to16(fnv1(p->to.sym));
+			//print("seen %S hash %d\n", p->to.sym, hash32to16(h));
+			if(counts[h] < 10)
+				counts[h]++;
+		}
+	}
+
+	// Eliminate single-write, single-read stack variables.
+	for(p=firstp; p!=P; p=p->link) {
+		if(debug['R'] && debug['v'])
+			print("%P\n", p);
+		if(p->link == P
+			|| !RtoB(p->from.type)
+			|| p->to.type != D_AUTO
+			|| isfloat[p->to.etype])
+			continue;
+		switch(p->as) {
+		case AMOVB:
+			if(p->to.width == 1)
+				break;
+		case AMOVW:
+			if(p->to.width == 2)
+				break;
+		case AMOVL:
+			if(p->to.width == 4)
+				break;
+		default:
+			continue;
+		}
+		// p is a MOV reg, mem.
+		// and it is not a float.
+		p2 = p->link;
+		h = hash32to16(fnv1(p->to.sym));
+		if(counts[h] != 2) {
+			continue;
+		}
+		switch(p2->as) {
+		case ALEAL:
+		case AFMOVL: 
+		case AFMOVW:
+		case AFMOVV:
+			// funny
+			continue;
+		}
+		// p2 is OP mem, reg2
+		// and OP is not a funny instruction.
+		if(p2->from.sym == p->to.sym
+			&& p2->from.offset == p->to.offset
+			&& p2->from.type == p->to.type) {
+			if(debug['R'] && debug['v']) {
+				print(" ===elide== %D\n", &p->to);
+				print("%P", p2);
+			}
+			// p2 is OP mem, reg2.
+			// change to OP reg, reg2 and
+			// eliminate the mov.
+			p2->from = p->from;
+			*p = *p2;
+			p->link = p2->link;
+			if(debug['R'] && debug['v']) {
+				print(" ===change== %P\n", p);
+			}
+		}
+	}
+}