cmd/compile: add signed divisibility rules

"Division by invariant integers using multiplication" paper by Granlund and Montgomery contains a method for directly computing divisibility (x%c == 0 for c constant) by means of the modular inverse. The method is further elaborated in "Hacker's Delight" by Warren Section 10-17 This general rule can compute divisibilty by one multiplication, and add and a compare for odd divisors and an additional rotate for even divisors. To apply the divisibility rule, we must take into account the rules to rewrite x%c = x-((x/c)*c) and (x/c) for c constant on the first optimization pass "opt". This complicates the matching as we want to match only in the cases where the result of (x/c) is not also needed. So, we must match on the expanded form of (x/c) in the expression x == c*(x/c) in the "late opt" pass after common subexpresion elimination. Note, that if there is an intermediate opt pass introduced in the future we could simplify these rules by delaying the magic division rewrite to "late opt" and matching directly on (x/c) in the intermediate opt pass. On amd64, the divisibility check is 30-45% faster. name old time/op new time/op delta` DivisiblePow2constI64-4 0.83ns ± 1% 0.82ns ± 0% ~ (p=0.079 n=5+4) DivisibleconstI64-4 2.68ns ± 1% 1.87ns ± 0% -30.33% (p=0.000 n=5+4) DivisibleWDivconstI64-4 2.69ns ± 1% 2.71ns ± 3% ~ (p=1.000 n=5+5) DivisiblePow2constI32-4 1.15ns ± 1% 1.15ns ± 0% ~ (p=0.238 n=5+4) DivisibleconstI32-4 2.24ns ± 1% 1.20ns ± 0% -46.48% (p=0.016 n=5+4) DivisibleWDivconstI32-4 2.27ns ± 1% 2.27ns ± 1% ~ (p=0.683 n=5+5) DivisiblePow2constI16-4 0.81ns ± 1% 0.82ns ± 1% ~ (p=0.135 n=5+5) DivisibleconstI16-4 2.11ns ± 2% 1.20ns ± 1% -42.99% (p=0.008 n=5+5) DivisibleWDivconstI16-4 2.23ns ± 0% 2.27ns ± 2% +1.79% (p=0.029 n=4+4) DivisiblePow2constI8-4 0.81ns ± 1% 0.81ns ± 1% ~ (p=0.286 n=5+5) DivisibleconstI8-4 2.13ns ± 3% 1.19ns ± 1% -43.84% (p=0.008 n=5+5) DivisibleWDivconstI8-4 2.23ns ± 1% 2.25ns ± 1% ~ (p=0.183 n=5+5) Fixes #30282 Fixes #15806 Change-Id: Id20d78263a4fdfe0509229ae4dfa2fede83fc1d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/173998 Run-TryBot: Brian Kessler <brian.m.kessler@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>

cmd/compile: add signed divisibility rules
"Division by invariant integers using multiplication" paper by Granlund and Montgomery contains a method for directly computing divisibility (x%c == 0 for c constant) by means of the modular inverse. The method is further elaborated in "Hacker's Delight" by Warren Section 10-17 This general rule can compute divisibilty by one multiplication, and add and a compare for odd divisors and an additional rotate for even divisors. To apply the divisibility rule, we must take into account the rules to rewrite x%c = x-((x/c)*c) and (x/c) for c constant on the first optimization pass "opt". This complicates the matching as we want to match only in the cases where the result of (x/c) is not also needed. So, we must match on the expanded form of (x/c) in the expression x == c*(x/c) in the "late opt" pass after common subexpresion elimination. Note, that if there is an intermediate opt pass introduced in the future we could simplify these rules by delaying the magic division rewrite to "late opt" and matching directly on (x/c) in the intermediate opt pass. On amd64, the divisibility check is 30-45% faster. name old time/op new time/op delta` DivisiblePow2constI64-4 0.83ns ± 1% 0.82ns ± 0% ~ (p=0.079 n=5+4) DivisibleconstI64-4 2.68ns ± 1% 1.87ns ± 0% -30.33% (p=0.000 n=5+4) DivisibleWDivconstI64-4 2.69ns ± 1% 2.71ns ± 3% ~ (p=1.000 n=5+5) DivisiblePow2constI32-4 1.15ns ± 1% 1.15ns ± 0% ~ (p=0.238 n=5+4) DivisibleconstI32-4 2.24ns ± 1% 1.20ns ± 0% -46.48% (p=0.016 n=5+4) DivisibleWDivconstI32-4 2.27ns ± 1% 2.27ns ± 1% ~ (p=0.683 n=5+5) DivisiblePow2constI16-4 0.81ns ± 1% 0.82ns ± 1% ~ (p=0.135 n=5+5) DivisibleconstI16-4 2.11ns ± 2% 1.20ns ± 1% -42.99% (p=0.008 n=5+5) DivisibleWDivconstI16-4 2.23ns ± 0% 2.27ns ± 2% +1.79% (p=0.029 n=4+4) DivisiblePow2constI8-4 0.81ns ± 1% 0.81ns ± 1% ~ (p=0.286 n=5+5) DivisibleconstI8-4 2.13ns ± 3% 1.19ns ± 1% -43.84% (p=0.008 n=5+5) DivisibleWDivconstI8-4 2.23ns ± 1% 2.25ns ± 1% ~ (p=0.183 n=5+5) Fixes #30282 Fixes #15806 Change-Id: Id20d78263a4fdfe0509229ae4dfa2fede83fc1d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/173998 Run-TryBot: Brian Kessler <brian.m.kessler@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
4d9dd358 · Brian Kessler · Brad Fitzpatrick · e7d08b6f · 4d9dd358 · 4d9dd358
Commit 4d9dd358 authored Apr 05, 2019 by Brian Kessler Committed by Brad Fitzpatrick Apr 30, 2019
6 changed files
--- a/src/cmd/compile/internal/gc/testdata/arith_test.go
+++ b/src/cmd/compile/internal/gc/testdata/arith_test.go
@@ -1283,24 +1283,65 @@ func div19_uint64(n uint64) bool {
 	return n%19 == 0
 }

+//go:noinline
+func div6_int8(n int8) bool {
+	return n%6 == 0
+}
+
+//go:noinline
+func div6_int16(n int16) bool {
+	return n%6 == 0
+}
+
+//go:noinline
+func div6_int32(n int32) bool {
+	return n%6 == 0
+}
+
+//go:noinline
+func div6_int64(n int64) bool {
+	return n%6 == 0
+}
+
+//go:noinline
+func div19_int8(n int8) bool {
+	return n%19 == 0
+}
+
+//go:noinline
+func div19_int16(n int16) bool {
+	return n%19 == 0
+}
+
+//go:noinline
+func div19_int32(n int32) bool {
+	return n%19 == 0
+}
+
+//go:noinline
+func div19_int64(n int64) bool {
+	return n%19 == 0
+}
+
 // testDivisibility confirms that rewrite rules x%c ==0 for c constant are correct.
 func testDivisibility(t *testing.T) {
+	// unsigned tests
 	// test an even and an odd divisor
-	var six, nineteen uint64 = 6, 19
+	var sixU, nineteenU uint64 = 6, 19
 	// test all inputs for uint8, uint16
 	for i := uint64(0); i <= math.MaxUint16; i++ {
 		if i <= math.MaxUint8 {
-			if want, got := uint8(i)%uint8(six) == 0, div6_uint8(uint8(i)); got != want {
+			if want, got := uint8(i)%uint8(sixU) == 0, div6_uint8(uint8(i)); got != want {
 				t.Errorf("div6_uint8(%d) = %v want %v", i, got, want)
 			}
-			if want, got := uint8(i)%uint8(nineteen) == 0, div19_uint8(uint8(i)); got != want {
+			if want, got := uint8(i)%uint8(nineteenU) == 0, div19_uint8(uint8(i)); got != want {
 				t.Errorf("div6_uint19(%d) = %v want %v", i, got, want)
 			}
 		}
-		if want, got := uint16(i)%uint16(six) == 0, div6_uint16(uint16(i)); got != want {
+		if want, got := uint16(i)%uint16(sixU) == 0, div6_uint16(uint16(i)); got != want {
 			t.Errorf("div6_uint16(%d) = %v want %v", i, got, want)
 		}
-		if want, got := uint16(i)%uint16(nineteen) == 0, div19_uint16(uint16(i)); got != want {
+		if want, got := uint16(i)%uint16(nineteenU) == 0, div19_uint16(uint16(i)); got != want {
 			t.Errorf("div19_uint16(%d) = %v want %v", i, got, want)
 		}
 	}
@@ -1308,35 +1349,106 @@ func testDivisibility(t *testing.T) {
 	// spot check inputs for uint32 and uint64
 	xu := []uint64{
 		0, 1, 2, 3, 4, 5,
-		six, 2 * six, 3 * six, 5 * six, 12345 * six,
-		six + 1, 2*six - 5, 3*six + 3, 5*six + 4, 12345*six - 2,
-		nineteen, 2 * nineteen, 3 * nineteen, 5 * nineteen, 12345 * nineteen,
-		nineteen + 1, 2*nineteen - 5, 3*nineteen + 3, 5*nineteen + 4, 12345*nineteen - 2,
+		sixU, 2 * sixU, 3 * sixU, 5 * sixU, 12345 * sixU,
+		sixU + 1, 2*sixU - 5, 3*sixU + 3, 5*sixU + 4, 12345*sixU - 2,
+		nineteenU, 2 * nineteenU, 3 * nineteenU, 5 * nineteenU, 12345 * nineteenU,
+		nineteenU + 1, 2*nineteenU - 5, 3*nineteenU + 3, 5*nineteenU + 4, 12345*nineteenU - 2,
 		maxU32, maxU32 - 1, maxU32 - 2, maxU32 - 3, maxU32 - 4,
-		maxU32, maxU32 - 5, maxU32 - 6, maxU32 - 7, maxU32 - 8,
-		maxU32, maxU32 - 9, maxU32 - 10, maxU32 - 11, maxU32 - 12,
-		maxU32, maxU32 - 13, maxU32 - 14, maxU32 - 15, maxU32 - 16,
-		maxU32, maxU32 - 17, maxU32 - 18, maxU32 - 19, maxU32 - 20,
+		maxU32 - 5, maxU32 - 6, maxU32 - 7, maxU32 - 8,
+		maxU32 - 9, maxU32 - 10, maxU32 - 11, maxU32 - 12,
+		maxU32 - 13, maxU32 - 14, maxU32 - 15, maxU32 - 16,
+		maxU32 - 17, maxU32 - 18, maxU32 - 19, maxU32 - 20,
 		maxU64, maxU64 - 1, maxU64 - 2, maxU64 - 3, maxU64 - 4,
-		maxU64, maxU64 - 5, maxU64 - 6, maxU64 - 7, maxU64 - 8,
-		maxU64, maxU64 - 9, maxU64 - 10, maxU64 - 11, maxU64 - 12,
-		maxU64, maxU64 - 13, maxU64 - 14, maxU64 - 15, maxU64 - 16,
-		maxU64, maxU64 - 17, maxU64 - 18, maxU64 - 19, maxU64 - 20,
+		maxU64 - 5, maxU64 - 6, maxU64 - 7, maxU64 - 8,
+		maxU64 - 9, maxU64 - 10, maxU64 - 11, maxU64 - 12,
+		maxU64 - 13, maxU64 - 14, maxU64 - 15, maxU64 - 16,
+		maxU64 - 17, maxU64 - 18, maxU64 - 19, maxU64 - 20,
 	}
 	for _, x := range xu {
 		if x <= maxU32 {
-			if want, got := uint32(x)%uint32(six) == 0, div6_uint32(uint32(x)); got != want {
+			if want, got := uint32(x)%uint32(sixU) == 0, div6_uint32(uint32(x)); got != want {
 				t.Errorf("div6_uint32(%d) = %v want %v", x, got, want)
 			}
-			if want, got := uint32(x)%uint32(nineteen) == 0, div19_uint32(uint32(x)); got != want {
+			if want, got := uint32(x)%uint32(nineteenU) == 0, div19_uint32(uint32(x)); got != want {
 				t.Errorf("div19_uint32(%d) = %v want %v", x, got, want)
 			}
 		}
-		if want, got := x%six == 0, div6_uint64(x); got != want {
+		if want, got := x%sixU == 0, div6_uint64(x); got != want {
 			t.Errorf("div6_uint64(%d) = %v want %v", x, got, want)
 		}
-		if want, got := x%nineteen == 0, div19_uint64(x); got != want {
+		if want, got := x%nineteenU == 0, div19_uint64(x); got != want {
 			t.Errorf("div19_uint64(%d) = %v want %v", x, got, want)
 		}
 	}
+
+	// signed tests
+	// test an even and an odd divisor
+	var sixS, nineteenS int64 = 6, 19
+	// test all inputs for int8, int16
+	for i := int64(math.MinInt16); i <= math.MaxInt16; i++ {
+		if math.MinInt8 <= i && i <= math.MaxInt8 {
+			if want, got := int8(i)%int8(sixS) == 0, div6_int8(int8(i)); got != want {
+				t.Errorf("div6_int8(%d) = %v want %v", i, got, want)
+			}
+			if want, got := int8(i)%int8(nineteenS) == 0, div19_int8(int8(i)); got != want {
+				t.Errorf("div6_int19(%d) = %v want %v", i, got, want)
+			}
+		}
+		if want, got := int16(i)%int16(sixS) == 0, div6_int16(int16(i)); got != want {
+			t.Errorf("div6_int16(%d) = %v want %v", i, got, want)
+		}
+		if want, got := int16(i)%int16(nineteenS) == 0, div19_int16(int16(i)); got != want {
+			t.Errorf("div19_int16(%d) = %v want %v", i, got, want)
+		}
+	}
+	var minI32, maxI32, minI64, maxI64 int64 = math.MinInt32, math.MaxInt32, math.MinInt64, math.MaxInt64
+	// spot check inputs for int32 and int64
+	xs := []int64{
+		0, 1, 2, 3, 4, 5,
+		-1, -2, -3, -4, -5,
+		sixS, 2 * sixS, 3 * sixS, 5 * sixS, 12345 * sixS,
+		sixS + 1, 2*sixS - 5, 3*sixS + 3, 5*sixS + 4, 12345*sixS - 2,
+		-sixS, -2 * sixS, -3 * sixS, -5 * sixS, -12345 * sixS,
+		-sixS + 1, -2*sixS - 5, -3*sixS + 3, -5*sixS + 4, -12345*sixS - 2,
+		nineteenS, 2 * nineteenS, 3 * nineteenS, 5 * nineteenS, 12345 * nineteenS,
+		nineteenS + 1, 2*nineteenS - 5, 3*nineteenS + 3, 5*nineteenS + 4, 12345*nineteenS - 2,
+		-nineteenS, -2 * nineteenS, -3 * nineteenS, -5 * nineteenS, -12345 * nineteenS,
+		-nineteenS + 1, -2*nineteenS - 5, -3*nineteenS + 3, -5*nineteenS + 4, -12345*nineteenS - 2,
+		minI32, minI32 + 1, minI32 + 2, minI32 + 3, minI32 + 4,
+		minI32 + 5, minI32 + 6, minI32 + 7, minI32 + 8,
+		minI32 + 9, minI32 + 10, minI32 + 11, minI32 + 12,
+		minI32 + 13, minI32 + 14, minI32 + 15, minI32 + 16,
+		minI32 + 17, minI32 + 18, minI32 + 19, minI32 + 20,
+		maxI32, maxI32 - 1, maxI32 - 2, maxI32 - 3, maxI32 - 4,
+		maxI32 - 5, maxI32 - 6, maxI32 - 7, maxI32 - 8,
+		maxI32 - 9, maxI32 - 10, maxI32 - 11, maxI32 - 12,
+		maxI32 - 13, maxI32 - 14, maxI32 - 15, maxI32 - 16,
+		maxI32 - 17, maxI32 - 18, maxI32 - 19, maxI32 - 20,
+		minI64, minI64 + 1, minI64 + 2, minI64 + 3, minI64 + 4,
+		minI64 + 5, minI64 + 6, minI64 + 7, minI64 + 8,
+		minI64 + 9, minI64 + 10, minI64 + 11, minI64 + 12,
+		minI64 + 13, minI64 + 14, minI64 + 15, minI64 + 16,
+		minI64 + 17, minI64 + 18, minI64 + 19, minI64 + 20,
+		maxI64, maxI64 - 1, maxI64 - 2, maxI64 - 3, maxI64 - 4,
+		maxI64 - 5, maxI64 - 6, maxI64 - 7, maxI64 - 8,
+		maxI64 - 9, maxI64 - 10, maxI64 - 11, maxI64 - 12,
+		maxI64 - 13, maxI64 - 14, maxI64 - 15, maxI64 - 16,
+		maxI64 - 17, maxI64 - 18, maxI64 - 19, maxI64 - 20,
+	}
+	for _, x := range xs {
+		if minI32 <= x && x <= maxI32 {
+			if want, got := int32(x)%int32(sixS) == 0, div6_int32(int32(x)); got != want {
+				t.Errorf("div6_int32(%d) = %v want %v", x, got, want)
+			}
+			if want, got := int32(x)%int32(nineteenS) == 0, div19_int32(int32(x)); got != want {
+				t.Errorf("div19_int32(%d) = %v want %v", x, got, want)
+			}
+		}
+		if want, got := x%sixS == 0, div6_int64(x); got != want {
+			t.Errorf("div6_int64(%d) = %v want %v", x, got, want)
+		}
+		if want, got := x%nineteenS == 0, div19_int64(x); got != want {
+			t.Errorf("div19_int64(%d) = %v want %v", x, got, want)
+		}
+	}
 }
--- a/src/cmd/compile/internal/ssa/gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@@ -1174,6 +1174,10 @@
 	(Eq32 (Mod32u <typ.UInt32> (ZeroExt8to32 <typ.UInt32> x) (Const32 <typ.UInt32> [c&0xff])) (Const32 <typ.UInt32> [0]))
 (Eq16 (Mod16u x (Const16  [c])) (Const16 [0])) && x.Op != OpConst16 && udivisibleOK(16,c) && !hasSmallRotate(config) ->
 	(Eq32 (Mod32u <typ.UInt32> (ZeroExt16to32 <typ.UInt32> x) (Const32 <typ.UInt32> [c&0xffff])) (Const32 <typ.UInt32> [0]))
+(Eq8 (Mod8 x (Const8  [c])) (Const8 [0])) && x.Op != OpConst8 && sdivisibleOK(8,c) && !hasSmallRotate(config) ->
+	(Eq32 (Mod32 <typ.Int32> (SignExt8to32 <typ.Int32> x) (Const32 <typ.Int32> [c])) (Const32 <typ.Int32> [0]))
+(Eq16 (Mod16 x (Const16  [c])) (Const16 [0])) && x.Op != OpConst16 && sdivisibleOK(16,c) && !hasSmallRotate(config) ->
+	(Eq32 (Mod32 <typ.Int32> (SignExt16to32 <typ.Int32> x) (Const32 <typ.Int32> [c])) (Const32 <typ.Int32> [0]))

 // Divisibility checks x%c == 0 convert to multiply and rotate.
 // Note, x%c == 0 is rewritten as x == c*(x/c) during the opt pass
@@ -1184,6 +1188,7 @@
 // Note that if there were an intermediate opt pass, this rule could be applied
 // directly on the Div op and magic division rewrites could be delayed to late opt.

+// Unsigned divisibility checks convert to multiply and rotate.
 (Eq8 x (Mul8 (Const8 [c])
  (Trunc32to8
    (Rsh32Ux64
@@ -1489,8 +1494,209 @@
 			(Const64 <typ.UInt64> [int64(udivisible(64,c).max)])
 		)

+// Signed divisibility checks convert to multiply, add and rotate.
+(Eq8 x (Mul8 (Const8 [c])
+  (Sub8
+    (Rsh32x64
+      mul:(Mul32
+        (Const32 [m])
+        (SignExt8to32 x))
+      (Const64 [s]))
+    (Rsh32x64
+      (SignExt8to32 x)
+      (Const64 [31])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(smagic(8,c).m) && s == 8+smagic(8,c).s
+	&& x.Op != OpConst8 && sdivisibleOK(8,c)
+ -> (Leq8U
+			(RotateLeft8 <typ.UInt8>
+				(Add8 <typ.UInt8>
+					(Mul8 <typ.UInt8>
+						(Const8 <typ.UInt8> [int64(int8(sdivisible(8,c).m))])
+						x)
+					(Const8 <typ.UInt8> [int64(int8(sdivisible(8,c).a))])
+				)
+				(Const8 <typ.UInt8> [int64(8-sdivisible(8,c).k)])
+			)
+			(Const8 <typ.UInt8> [int64(int8(sdivisible(8,c).max))])
+		)
+
+(Eq16 x (Mul16 (Const16 [c])
+  (Sub16
+    (Rsh32x64
+      mul:(Mul32
+        (Const32 [m])
+        (SignExt16to32 x))
+      (Const64 [s]))
+    (Rsh32x64
+      (SignExt16to32 x)
+      (Const64 [31])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(smagic(16,c).m) && s == 16+smagic(16,c).s
+	&& x.Op != OpConst16 && sdivisibleOK(16,c)
+ -> (Leq16U
+			(RotateLeft16 <typ.UInt16>
+				(Add16 <typ.UInt16>
+					(Mul16 <typ.UInt16>
+						(Const16 <typ.UInt16> [int64(int16(sdivisible(16,c).m))])
+						x)
+					(Const16 <typ.UInt16> [int64(int16(sdivisible(16,c).a))])
+				)
+				(Const16 <typ.UInt16> [int64(16-sdivisible(16,c).k)])
+			)
+			(Const16 <typ.UInt16> [int64(int16(sdivisible(16,c).max))])
+		)
+
+(Eq32 x (Mul32 (Const32 [c])
+  (Sub32
+    (Rsh64x64
+      mul:(Mul64
+        (Const64 [m])
+        (SignExt32to64 x))
+      (Const64 [s]))
+    (Rsh64x64
+      (SignExt32to64 x)
+      (Const64 [63])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(smagic(32,c).m) && s == 32+smagic(32,c).s
+	&& x.Op != OpConst32 && sdivisibleOK(32,c)
+ -> (Leq32U
+			(RotateLeft32 <typ.UInt32>
+				(Add32 <typ.UInt32>
+					(Mul32 <typ.UInt32>
+						(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).m))])
+						x)
+					(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).a))])
+				)
+				(Const32 <typ.UInt32> [int64(32-sdivisible(32,c).k)])
+			)
+			(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).max))])
+		)
+
+(Eq32 x (Mul32 (Const32 [c])
+  (Sub32
+    (Rsh32x64
+      mul:(Hmul32
+        (Const32 [m])
+        x)
+      (Const64 [s]))
+    (Rsh32x64
+      x
+      (Const64 [31])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(int32(smagic(32,c).m/2)) && s == smagic(32,c).s-1
+	&& x.Op != OpConst32 && sdivisibleOK(32,c)
+ -> (Leq32U
+			(RotateLeft32 <typ.UInt32>
+				(Add32 <typ.UInt32>
+					(Mul32 <typ.UInt32>
+						(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).m))])
+						x)
+					(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).a))])
+				)
+				(Const32 <typ.UInt32> [int64(32-sdivisible(32,c).k)])
+			)
+			(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).max))])
+		)
+
+(Eq32 x (Mul32 (Const32 [c])
+  (Sub32
+    (Rsh32x64
+      (Add32
+        mul:(Hmul32
+          (Const32 [m])
+          x)
+        x)
+      (Const64 [s]))
+    (Rsh32x64
+      x
+      (Const64 [31])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(int32(smagic(32,c).m)) && s == smagic(32,c).s
+	&& x.Op != OpConst32 && sdivisibleOK(32,c)
+ -> (Leq32U
+			(RotateLeft32 <typ.UInt32>
+				(Add32 <typ.UInt32>
+					(Mul32 <typ.UInt32>
+						(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).m))])
+						x)
+					(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).a))])
+				)
+				(Const32 <typ.UInt32> [int64(32-sdivisible(32,c).k)])
+			)
+			(Const32 <typ.UInt32> [int64(int32(sdivisible(32,c).max))])
+		)
+
+(Eq64 x (Mul64 (Const64 [c])
+  (Sub64
+    (Rsh64x64
+      mul:(Hmul64
+        (Const64 [m])
+        x)
+      (Const64 [s]))
+    (Rsh64x64
+      x
+      (Const64 [63])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(smagic(64,c).m/2) && s == smagic(64,c).s-1
+	&& x.Op != OpConst64 && sdivisibleOK(64,c)
+ -> (Leq64U
+			(RotateLeft64 <typ.UInt64>
+				(Add64 <typ.UInt64>
+					(Mul64 <typ.UInt64>
+						(Const64 <typ.UInt64> [int64(sdivisible(64,c).m)])
+						x)
+					(Const64 <typ.UInt64> [int64(sdivisible(64,c).a)])
+				)
+				(Const64 <typ.UInt64> [int64(64-sdivisible(64,c).k)])
+			)
+			(Const64 <typ.UInt64> [int64(sdivisible(64,c).max)])
+		)
+
+(Eq64 x (Mul64 (Const64 [c])
+  (Sub64
+    (Rsh64x64
+      (Add64
+        mul:(Hmul64
+          (Const64 [m])
+          x)
+        x)
+      (Const64 [s]))
+    (Rsh64x64
+      x
+      (Const64 [63])))
+	)
+)
+  && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+  && m == int64(smagic(64,c).m) && s == smagic(64,c).s
+	&& x.Op != OpConst64 && sdivisibleOK(64,c)
+ -> (Leq64U
+			(RotateLeft64 <typ.UInt64>
+				(Add64 <typ.UInt64>
+					(Mul64 <typ.UInt64>
+						(Const64 <typ.UInt64> [int64(sdivisible(64,c).m)])
+						x)
+					(Const64 <typ.UInt64> [int64(sdivisible(64,c).a)])
+				)
+				(Const64 <typ.UInt64> [int64(64-sdivisible(64,c).k)])
+			)
+			(Const64 <typ.UInt64> [int64(sdivisible(64,c).max)])
+		)
+
 // Divisibility check for signed integers for power of two constant are simple mask.
-// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c *(n/c)
+// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c*(n/c)
 // where n/c contains fixup code to handle signed n.
 (Eq8 n (Lsh8x64
  (Rsh8x64

--- a/src/cmd/compile/internal/ssa/magic.go
+++ b/src/cmd/compile/internal/ssa/magic.go
@@ -195,7 +195,7 @@ func smagic(n uint, c int64) smagicData {
 // by using the modular inverse with respect to the word size 2^n.
 //
 // Given c, compute m such that (c * m) mod 2^n == 1
-// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*cinv mod 2^n
+// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
 //
 // x can range from 0, c, 2c, 3c, ... ⎣(2^n - 1)/c⎦ * c the maximum multiple
 // Thus, x*m mod 2^n is 0, 1, 2, 3, ... ⎣(2^n - 1)/c⎦
@@ -285,3 +285,97 @@ func udivisible(n uint, c int64) udivisibleData {
 		max: max,
 	}
 }
+
+// For signed integers, a similar method follows.
+//
+// Given c > 1 and odd, compute m such that (c * m) mod 2^n == 1
+// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
+//
+// x can range from ⎡-2^(n-1)/c⎤ * c, ... -c, 0, c, ...  ⎣(2^(n-1) - 1)/c⎦ * c
+// Thus, x*m mod 2^n is ⎡-2^(n-1)/c⎤, ... -2, -1, 0, 1, 2, ... ⎣(2^(n-1) - 1)/c⎦
+//
+// So, x is a multiple of c if and only if:
+// ⎡-2^(n-1)/c⎤ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
+//
+// Since c > 1 and odd, this can be simplified by
+// ⎡-2^(n-1)/c⎤ == ⎡(-2^(n-1) + 1)/c⎤ == -⎣(2^(n-1) - 1)/c⎦
+//
+// -⎣(2^(n-1) - 1)/c⎦ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
+//
+// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
+// We can test whether x is divisible by both d0 and 2^k.
+//
+// Let m be such that (d0 * m) mod 2^n == 1.
+// Let q = x*m mod 2^n. Then c divides x if:
+//
+// -⎣(2^(n-1) - 1)/d0⎦ <= q <= ⎣(2^(n-1) - 1)/d0⎦ and q ends in at least k 0-bits
+//
+// To transform this to a single comparison, we use the following theorem (ZRS in Hacker's Delight).
+//
+// For a >= 0 the following conditions are equivalent:
+// 1) -a <= x <= a and x ends in at least k 0-bits
+// 2) RotRight(x+a', k) <= ⎣2a'/2^k⎦
+//
+// Where a' = a & -2^k (a with its right k bits set to zero)
+//
+// To see that 1 & 2 are equivalent, note that -a <= x <= a is equivalent to
+// -a' <= x <= a' if and only if x ends in at least k 0-bits.  Adding -a' to each side gives,
+// 0 <= x + a' <= 2a' and x + a' ends in at least k 0-bits if and only if x does since a' has
+// k 0-bits by definition.  We can use theorem ZRU above with x -> x + a' and a -> 2a' giving 1) == 2).
+//
+// Let m be such that (d0 * m) mod 2^n == 1.
+// Let q = x*m mod 2^n.
+// Let a' = ⎣(2^(n-1) - 1)/d0⎦ & -2^k
+//
+// Then the divisibility test is:
+//
+// RotRight(q+a', k) <= ⎣2a'/2^k⎦
+//
+// Note that the calculation is performed using unsigned integers.
+// Since a' can have n-1 bits, 2a' may have n bits and there is no risk of overflow.
+
+// sdivisibleOK reports whether we should strength reduce a n-bit dividisibilty check by c.
+func sdivisibleOK(n uint, c int64) bool {
+	if c < 0 {
+		// Doesn't work for negative c.
+		return false
+	}
+	// Doesn't work for 0.
+	// Don't use it for powers of 2.
+	return c&(c-1) != 0
+}
+
+type sdivisibleData struct {
+	k   int64  // trailingZeros(c)
+	m   uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
+	a   uint64 // ⎣(2^(n-1) - 1)/ (c>>k)⎦ & -(1<<k) additive constant
+	max uint64 // ⎣(2 a) / (1<<k)⎦ max value to for divisibility
+}
+
+func sdivisible(n uint, c int64) sdivisibleData {
+	d := uint64(c)
+	k := bits.TrailingZeros64(d)
+	d0 := d >> uint(k) // the odd portion of the divisor
+
+	mask := ^uint64(0) >> (64 - n)
+
+	// Calculate the multiplicative inverse via Newton's method.
+	// Quadratic convergence doubles the number of correct bits per iteration.
+	m := d0            // initial guess correct to 3-bits d0*d0 mod 8 == 1
+	m = m * (2 - m*d0) // 6-bits
+	m = m * (2 - m*d0) // 12-bits
+	m = m * (2 - m*d0) // 24-bits
+	m = m * (2 - m*d0) // 48-bits
+	m = m * (2 - m*d0) // 96-bits >= 64-bits
+	m = m & mask
+
+	a := ((mask >> 1) / d0) & -(1 << uint(k))
+	max := (2 * a) >> uint(k)
+
+	return sdivisibleData{
+		k:   int64(k),
+		m:   m,
+		a:   a,
+		max: max,
+	}
+}
--- a/src/cmd/compile/internal/ssa/magic_test.go
+++ b/src/cmd/compile/internal/ssa/magic_test.go
@@ -184,7 +184,7 @@ func TestMagicSigned(t *testing.T) {
 				-c - 1, -c, -c + 1, c - 1, c, c + 1,
 				-2*c - 1, -2 * c, -2*c + 1, 2*c - 1, 2 * c, 2*c + 1,
 				-mul - 1, -mul, -mul + 1, mul - 1, mul, mul + 1,
-				int64(1)<<n - 1, -int64(1)<<n + 1,
+				int64(1)<<(n-1) - 1, -int64(1) << (n - 1),
 			} {
 				X := new(big.Int).SetInt64(x)
 				if X.Cmp(Min) < 0 || X.Cmp(Max) > 0 {
@@ -303,3 +303,108 @@ func TestDivisibleUnsigned(t *testing.T) {
 		}
 	}
 }
+
+func testDivisibleExhaustive(t *testing.T, n uint) {
+	minI := -int64(1) << (n - 1)
+	maxI := int64(1) << (n - 1)
+	for c := int64(1); c < maxI; c++ {
+		if !sdivisibleOK(n, int64(c)) {
+			continue
+		}
+		k := sdivisible(n, int64(c)).k
+		m := sdivisible(n, int64(c)).m
+		a := sdivisible(n, int64(c)).a
+		max := sdivisible(n, int64(c)).max
+		mask := ^uint64(0) >> (64 - n)
+		for i := minI; i < maxI; i++ {
+			want := i%c == 0
+			mul := (uint64(i)*m + a) & mask
+			rot := (mul>>uint(k) | mul<<(n-uint(k))) & mask
+			got := rot <= max
+			if want != got {
+				t.Errorf("signed divisible wrong for %d %% %d == 0: got %v, want %v (k=%d,m=%d,a=%d,max=%d)\n", i, c, got, want, k, m, a, max)
+			}
+		}
+	}
+}
+
+func TestDivisibleExhaustive8(t *testing.T) {
+	testDivisibleExhaustive(t, 8)
+}
+
+func TestDivisibleExhaustive16(t *testing.T) {
+	if testing.Short() {
+		t.Skip("slow test; skipping")
+	}
+	testDivisibleExhaustive(t, 16)
+}
+
+func TestDivisibleSigned(t *testing.T) {
+	One := new(big.Int).SetInt64(1)
+	for _, n := range [...]uint{8, 16, 32, 64} {
+		TwoNMinusOne := new(big.Int).Lsh(One, n-1)
+		Max := new(big.Int).Sub(TwoNMinusOne, One)
+		Min := new(big.Int).Neg(TwoNMinusOne)
+		for _, c := range [...]int64{
+			3,
+			5,
+			6,
+			7,
+			9,
+			10,
+			11,
+			12,
+			13,
+			14,
+			15,
+			17,
+			1<<7 - 1,
+			1<<7 + 1,
+			1<<15 - 1,
+			1<<15 + 1,
+			1<<31 - 1,
+			1<<31 + 1,
+			1<<63 - 1,
+		} {
+			if c>>(n-1) != 0 {
+				continue // not appropriate for the given n.
+			}
+			if !sdivisibleOK(n, int64(c)) {
+				t.Errorf("expected n=%d c=%d to pass\n", n, c)
+			}
+			k := sdivisible(n, int64(c)).k
+			m := sdivisible(n, int64(c)).m
+			a := sdivisible(n, int64(c)).a
+			max := sdivisible(n, int64(c)).max
+			mask := ^uint64(0) >> (64 - n)
+
+			C := new(big.Int).SetInt64(c)
+
+			// Find largest multiple of c.
+			Mul := new(big.Int).Div(Max, C)
+			Mul.Mul(Mul, C)
+			mul := Mul.Int64()
+
+			// Try some input values, mostly around multiples of c.
+			for _, x := range [...]int64{
+				-1, 1,
+				-c - 1, -c, -c + 1, c - 1, c, c + 1,
+				-2*c - 1, -2 * c, -2*c + 1, 2*c - 1, 2 * c, 2*c + 1,
+				-mul - 1, -mul, -mul + 1, mul - 1, mul, mul + 1,
+				int64(1)<<(n-1) - 1, -int64(1) << (n - 1),
+			} {
+				X := new(big.Int).SetInt64(x)
+				if X.Cmp(Min) < 0 || X.Cmp(Max) > 0 {
+					continue
+				}
+				want := x%c == 0
+				mul := (uint64(x)*m + a) & mask
+				rot := (mul>>uint(k) | mul<<(n-uint(k))) & mask
+				got := rot <= max
+				if want != got {
+					t.Errorf("signed divisible wrong for %d %% %d == 0: got %v, want %v (k=%d,m=%d,a=%d,max=%d)\n", x, c, got, want, k, m, a, max)
+				}
+			}
+		}
+	}
+}
--- a/src/cmd/compile/internal/ssa/rewritegeneric.go
+++ b/src/cmd/compile/internal/ssa/rewritegeneric.go
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -216,14 +216,14 @@ func ConstMods(n1 uint, n2 int) (uint, int) {
 }

 // Check that divisibility checks x%c==0 are converted to MULs and rotates
-func Divisible(n uint) (even, odd bool) {
+func Divisible(n1 uint, n2 int) (bool, bool, bool, bool) {
 	// amd64:"MOVQ\t[$]-6148914691236517205","IMULQ","ROLQ\t[$]63",-"DIVQ"
 	// 386:"IMUL3L\t[$]-1431655765","ROLL\t[$]31",-"DIVQ"
 	// arm64:"MOVD\t[$]-6148914691236517205","MUL","ROR",-"DIV"
 	// arm:"MUL","CMP\t[$]715827882",-".*udiv"
 	// ppc64:"MULLD","ROTL\t[$]63"
 	// ppc64le:"MULLD","ROTL\t[$]63"
-	even = n%6 == 0
+	evenU := n1%6 == 0

 	// amd64:"MOVQ\t[$]-8737931403336103397","IMULQ",-"ROLQ",-"DIVQ"
 	// 386:"IMUL3L\t[$]678152731",-"ROLL",-"DIVQ"
@@ -231,8 +231,25 @@ func Divisible(n uint) (even, odd bool) {
 	// arm:"MUL","CMP\t[$]226050910",-".*udiv"
 	// ppc64:"MULLD",-"ROTL"
 	// ppc64le:"MULLD",-"ROTL"
-	odd = n%19 == 0
-	return
+	oddU := n1%19 == 0
+
+	// amd64:"IMULQ","ADD","ROLQ\t[$]63",-"DIVQ"
+	// 386:"IMUL3L\t[$]-1431655765","ADDL\t[$]715827882","ROLL\t[$]31",-"DIVQ"
+	// arm64:"MUL","ADD\t[$]3074457345618258602","ROR",-"DIV"
+	// arm:"MUL","ADD\t[$]715827882",-".*udiv"
+	// ppc64:"MULLD","ADD","ROTL\t[$]63"
+	// ppc64le:"MULLD","ADD","ROTL\t[$]63"
+	evenS := n2%6 == 0
+
+	// amd64:"IMULQ","ADD",-"ROLQ",-"DIVQ"
+	// 386:"IMUL3L\t[$]678152731","ADDL\t[$]113025455",-"ROLL",-"DIVQ"
+	// arm64:"MUL","ADD\t[$]485440633518672410",-"ROR",-"DIV"
+	// arm:"MUL","ADD\t[$]113025455",-".*udiv"
+	// ppc64:"MULLD","ADD",-"ROTL"
+	// ppc64le:"MULLD","ADD",-"ROTL"
+	oddS := n2%19 == 0
+
+	return evenU, oddU, evenS, oddS
 }

 // Check that fix-up code is not generated for divisions where it has been proven that