math/big: rewrite pure Go implementations to use math/bits

While we're here, delete addWW_g and subWW_g, per the TODO. They are now obsolete. Benchmarks on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta AddVV/1-8 5.24ns ± 2% 5.12ns ± 1% -2.11% (p=0.000 n=82+87) AddVV/2-8 6.44ns ± 1% 6.33ns ± 2% -1.82% (p=0.000 n=77+82) AddVV/3-8 7.89ns ± 8% 6.97ns ± 4% -11.71% (p=0.000 n=100+96) AddVV/4-8 8.60ns ± 0% 7.72ns ± 4% -10.24% (p=0.000 n=90+96) AddVV/5-8 10.3ns ± 4% 8.5ns ± 1% -17.02% (p=0.000 n=96+91) AddVV/10-8 16.2ns ± 5% 12.8ns ± 1% -21.11% (p=0.000 n=97+86) AddVV/100-8 148ns ± 1% 117ns ± 5% -21.07% (p=0.000 n=66+98) AddVV/1000-8 1.41µs ± 4% 1.13µs ± 3% -19.90% (p=0.000 n=97+97) AddVV/10000-8 14.2µs ± 5% 11.2µs ± 1% -20.82% (p=0.000 n=99+84) AddVV/100000-8 142µs ± 4% 113µs ± 4% -20.40% (p=0.000 n=91+92) SubVV/1-8 5.29ns ± 1% 5.11ns ± 0% -3.30% (p=0.000 n=87+88) SubVV/2-8 6.36ns ± 4% 6.33ns ± 2% -0.56% (p=0.002 n=98+73) SubVV/3-8 7.58ns ± 5% 6.98ns ± 4% -8.01% (p=0.000 n=97+91) SubVV/4-8 8.61ns ± 3% 7.98ns ± 2% -7.31% (p=0.000 n=95+83) SubVV/5-8 10.6ns ± 2% 8.5ns ± 1% -19.56% (p=0.000 n=79+89) SubVV/10-8 16.3ns ± 4% 12.7ns ± 1% -21.97% (p=0.000 n=98+82) SubVV/100-8 124ns ± 1% 118ns ± 1% -4.83% (p=0.000 n=85+81) SubVV/1000-8 1.14µs ± 5% 1.12µs ± 2% -1.17% (p=0.000 n=97+81) SubVV/10000-8 11.6µs ±10% 11.2µs ± 1% -3.39% (p=0.000 n=100+84) SubVV/100000-8 114µs ± 6% 114µs ± 5% ~ (p=0.396 n=83+94) AddVW/1-8 4.04ns ± 4% 4.34ns ± 4% +7.57% (p=0.000 n=96+98) AddVW/2-8 4.34ns ± 5% 4.40ns ± 5% +1.40% (p=0.000 n=99+98) AddVW/3-8 5.43ns ± 0% 5.54ns ± 2% +1.97% (p=0.000 n=85+94) AddVW/4-8 6.23ns ± 1% 6.18ns ± 2% -0.66% (p=0.000 n=77+78) AddVW/5-8 6.78ns ± 2% 6.90ns ± 4% +1.77% (p=0.000 n=80+99) AddVW/10-8 10.5ns ± 4% 9.9ns ± 1% -5.77% (p=0.000 n=97+69) AddVW/100-8 114ns ± 3% 91ns ± 0% -20.38% (p=0.000 n=98+77) AddVW/1000-8 1.12µs ± 1% 0.87µs ± 1% -22.80% (p=0.000 n=82+68) AddVW/10000-8 11.2µs ± 2% 8.5µs ± 5% -23.85% (p=0.000 n=85+100) AddVW/100000-8 112µs ± 2% 85µs ± 5% -24.22% (p=0.000 n=71+96) SubVW/1-8 4.09ns ± 2% 4.18ns ± 4% +2.32% (p=0.000 n=78+96) SubVW/2-8 4.59ns ± 5% 4.52ns ± 7% -1.54% (p=0.000 n=98+94) SubVW/3-8 5.41ns ±10% 5.55ns ± 1% +2.48% (p=0.000 n=100+89) SubVW/4-8 6.51ns ± 2% 6.19ns ± 0% -4.85% (p=0.000 n=97+81) SubVW/5-8 7.25ns ± 3% 6.90ns ± 4% -4.93% (p=0.000 n=97+96) SubVW/10-8 10.6ns ± 4% 9.8ns ± 2% -7.32% (p=0.000 n=95+96) SubVW/100-8 90.4ns ± 0% 90.8ns ± 0% +0.43% (p=0.000 n=83+78) SubVW/1000-8 853ns ± 4% 857ns ± 2% +0.42% (p=0.000 n=100+98) SubVW/10000-8 8.52µs ± 4% 8.53µs ± 2% ~ (p=0.061 n=99+97) SubVW/100000-8 84.8µs ± 5% 84.2µs ± 2% -0.78% (p=0.000 n=99+93) AddMulVVW/1-8 8.73ns ± 0% 5.33ns ± 3% -38.91% (p=0.000 n=91+96) AddMulVVW/2-8 14.8ns ± 3% 6.5ns ± 2% -56.33% (p=0.000 n=100+79) AddMulVVW/3-8 18.6ns ± 2% 7.8ns ± 5% -57.84% (p=0.000 n=89+96) AddMulVVW/4-8 24.0ns ± 2% 9.8ns ± 0% -59.09% (p=0.000 n=95+67) AddMulVVW/5-8 29.0ns ± 2% 11.5ns ± 5% -60.44% (p=0.000 n=90+97) AddMulVVW/10-8 54.1ns ± 0% 18.8ns ± 1% -65.37% (p=0.000 n=82+84) AddMulVVW/100-8 508ns ± 2% 165ns ± 4% -67.62% (p=0.000 n=72+98) AddMulVVW/1000-8 4.96µs ± 3% 1.55µs ± 1% -68.86% (p=0.000 n=99+91) AddMulVVW/10000-8 50.0µs ± 4% 15.5µs ± 4% -68.95% (p=0.000 n=97+97) AddMulVVW/100000-8 491µs ± 1% 156µs ± 8% -68.22% (p=0.000 n=79+95) Change-Id: I4c6ae0b4065f371aea8103f6a85d9e9274bf01d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/164965 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Robert Griesemer <gri@golang.org>

math/big: rewrite pure Go implementations to use math/bits
While we're here, delete addWW_g and subWW_g, per the TODO. They are now obsolete. Benchmarks on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta AddVV/1-8 5.24ns ± 2% 5.12ns ± 1% -2.11% (p=0.000 n=82+87) AddVV/2-8 6.44ns ± 1% 6.33ns ± 2% -1.82% (p=0.000 n=77+82) AddVV/3-8 7.89ns ± 8% 6.97ns ± 4% -11.71% (p=0.000 n=100+96) AddVV/4-8 8.60ns ± 0% 7.72ns ± 4% -10.24% (p=0.000 n=90+96) AddVV/5-8 10.3ns ± 4% 8.5ns ± 1% -17.02% (p=0.000 n=96+91) AddVV/10-8 16.2ns ± 5% 12.8ns ± 1% -21.11% (p=0.000 n=97+86) AddVV/100-8 148ns ± 1% 117ns ± 5% -21.07% (p=0.000 n=66+98) AddVV/1000-8 1.41µs ± 4% 1.13µs ± 3% -19.90% (p=0.000 n=97+97) AddVV/10000-8 14.2µs ± 5% 11.2µs ± 1% -20.82% (p=0.000 n=99+84) AddVV/100000-8 142µs ± 4% 113µs ± 4% -20.40% (p=0.000 n=91+92) SubVV/1-8 5.29ns ± 1% 5.11ns ± 0% -3.30% (p=0.000 n=87+88) SubVV/2-8 6.36ns ± 4% 6.33ns ± 2% -0.56% (p=0.002 n=98+73) SubVV/3-8 7.58ns ± 5% 6.98ns ± 4% -8.01% (p=0.000 n=97+91) SubVV/4-8 8.61ns ± 3% 7.98ns ± 2% -7.31% (p=0.000 n=95+83) SubVV/5-8 10.6ns ± 2% 8.5ns ± 1% -19.56% (p=0.000 n=79+89) SubVV/10-8 16.3ns ± 4% 12.7ns ± 1% -21.97% (p=0.000 n=98+82) SubVV/100-8 124ns ± 1% 118ns ± 1% -4.83% (p=0.000 n=85+81) SubVV/1000-8 1.14µs ± 5% 1.12µs ± 2% -1.17% (p=0.000 n=97+81) SubVV/10000-8 11.6µs ±10% 11.2µs ± 1% -3.39% (p=0.000 n=100+84) SubVV/100000-8 114µs ± 6% 114µs ± 5% ~ (p=0.396 n=83+94) AddVW/1-8 4.04ns ± 4% 4.34ns ± 4% +7.57% (p=0.000 n=96+98) AddVW/2-8 4.34ns ± 5% 4.40ns ± 5% +1.40% (p=0.000 n=99+98) AddVW/3-8 5.43ns ± 0% 5.54ns ± 2% +1.97% (p=0.000 n=85+94) AddVW/4-8 6.23ns ± 1% 6.18ns ± 2% -0.66% (p=0.000 n=77+78) AddVW/5-8 6.78ns ± 2% 6.90ns ± 4% +1.77% (p=0.000 n=80+99) AddVW/10-8 10.5ns ± 4% 9.9ns ± 1% -5.77% (p=0.000 n=97+69) AddVW/100-8 114ns ± 3% 91ns ± 0% -20.38% (p=0.000 n=98+77) AddVW/1000-8 1.12µs ± 1% 0.87µs ± 1% -22.80% (p=0.000 n=82+68) AddVW/10000-8 11.2µs ± 2% 8.5µs ± 5% -23.85% (p=0.000 n=85+100) AddVW/100000-8 112µs ± 2% 85µs ± 5% -24.22% (p=0.000 n=71+96) SubVW/1-8 4.09ns ± 2% 4.18ns ± 4% +2.32% (p=0.000 n=78+96) SubVW/2-8 4.59ns ± 5% 4.52ns ± 7% -1.54% (p=0.000 n=98+94) SubVW/3-8 5.41ns ±10% 5.55ns ± 1% +2.48% (p=0.000 n=100+89) SubVW/4-8 6.51ns ± 2% 6.19ns ± 0% -4.85% (p=0.000 n=97+81) SubVW/5-8 7.25ns ± 3% 6.90ns ± 4% -4.93% (p=0.000 n=97+96) SubVW/10-8 10.6ns ± 4% 9.8ns ± 2% -7.32% (p=0.000 n=95+96) SubVW/100-8 90.4ns ± 0% 90.8ns ± 0% +0.43% (p=0.000 n=83+78) SubVW/1000-8 853ns ± 4% 857ns ± 2% +0.42% (p=0.000 n=100+98) SubVW/10000-8 8.52µs ± 4% 8.53µs ± 2% ~ (p=0.061 n=99+97) SubVW/100000-8 84.8µs ± 5% 84.2µs ± 2% -0.78% (p=0.000 n=99+93) AddMulVVW/1-8 8.73ns ± 0% 5.33ns ± 3% -38.91% (p=0.000 n=91+96) AddMulVVW/2-8 14.8ns ± 3% 6.5ns ± 2% -56.33% (p=0.000 n=100+79) AddMulVVW/3-8 18.6ns ± 2% 7.8ns ± 5% -57.84% (p=0.000 n=89+96) AddMulVVW/4-8 24.0ns ± 2% 9.8ns ± 0% -59.09% (p=0.000 n=95+67) AddMulVVW/5-8 29.0ns ± 2% 11.5ns ± 5% -60.44% (p=0.000 n=90+97) AddMulVVW/10-8 54.1ns ± 0% 18.8ns ± 1% -65.37% (p=0.000 n=82+84) AddMulVVW/100-8 508ns ± 2% 165ns ± 4% -67.62% (p=0.000 n=72+98) AddMulVVW/1000-8 4.96µs ± 3% 1.55µs ± 1% -68.86% (p=0.000 n=99+91) AddMulVVW/10000-8 50.0µs ± 4% 15.5µs ± 4% -68.95% (p=0.000 n=97+97) AddMulVVW/100000-8 491µs ± 1% 156µs ± 8% -68.22% (p=0.000 n=79+95) Change-Id: I4c6ae0b4065f371aea8103f6a85d9e9274bf01d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/164965 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Robert Griesemer <gri@golang.org>
d5edbcac · Josh Bleecher Snyder · c7465929 · d5edbcac · d5edbcac
Commit d5edbcac authored Mar 02, 2019 by Josh Bleecher Snyder
Show whitespace changes
Inline Side-by-side

Showing with 26 additions and 178 deletions

src/math/big/arith.go src/math/big/arith.go +26 -136

src/math/big/arith_test.go src/math/big/arith_test.go +0 -42

No files found.
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@@ -19,10 +19,6 @@ const (
 	_W = bits.UintSize // word size in bits
 	_B = 1 << _W       // digit base
 	_M = _B - 1        // digit mask
-
-	_W2 = _W / 2   // half word size in bits
-	_B2 = 1 << _W2 // half digit base
-	_M2 = _B2 - 1  // half digit mask
 )

 // ----------------------------------------------------------------------------
@@ -30,50 +26,18 @@ const (
 //
 // These operations are used by the vector operations below.

-// z1<<_W + z0 = x+y+c, with c == 0 or 1
-func addWW_g(x, y, c Word) (z1, z0 Word) {
-	yc := y + c
-	z0 = x + yc
-	if z0 < x || yc < y {
-		z1 = 1
-	}
-	return
-}
-
-// z1<<_W + z0 = x-y-c, with c == 0 or 1
-func subWW_g(x, y, c Word) (z1, z0 Word) {
-	yc := y + c
-	z0 = x - yc
-	if z0 > x || yc < y {
-		z1 = 1
-	}
-	return
-}
-
 // z1<<_W + z0 = x*y
-// Adapted from Warren, Hacker's Delight, p. 132.
 func mulWW_g(x, y Word) (z1, z0 Word) {
-	x0 := x & _M2
-	x1 := x >> _W2
-	y0 := y & _M2
-	y1 := y >> _W2
-	w0 := x0 * y0
-	t := x1*y0 + w0>>_W2
-	w1 := t & _M2
-	w2 := t >> _W2
-	w1 += x0 * y1
-	z1 = x1*y1 + w2 + w1>>_W2
-	z0 = x * y
-	return
+	hi, lo := bits.Mul(uint(x), uint(y))
+	return Word(hi), Word(lo)
 }

 // z1<<_W + z0 = x*y + c
 func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
-	z1, zz0 := mulWW_g(x, y)
-	if z0 = zz0 + c; z0 < zz0 {
-		z1++
-	}
-	return
+	hi, lo := bits.Mul(uint(x), uint(y))
+	var cc uint
+	lo, cc = bits.Add(lo, uint(c), 0)
+	return Word(hi + cc), Word(lo)
 }

 // nlz returns the number of leading zeros in x.
@@ -83,122 +47,48 @@ func nlz(x Word) uint {
 }

 // q = (u1<<_W + u0 - r)/v
-// Adapted from Warren, Hacker's Delight, p. 152.
 func divWW_g(u1, u0, v Word) (q, r Word) {
-	if u1 >= v {
-		return 1<<_W - 1, 1<<_W - 1
-	}
-
-	s := nlz(v)
-	v <<= s
-
-	vn1 := v >> _W2
-	vn0 := v & _M2
-	un32 := u1<<s | u0>>(_W-s)
-	un10 := u0 << s
-	un1 := un10 >> _W2
-	un0 := un10 & _M2
-	q1 := un32 / vn1
-	rhat := un32 - q1*vn1
-
-	for q1 >= _B2 || q1*vn0 > _B2*rhat+un1 {
-		q1--
-		rhat += vn1
-		if rhat >= _B2 {
-			break
-		}
-	}
-
-	un21 := un32*_B2 + un1 - q1*v
-	q0 := un21 / vn1
-	rhat = un21 - q0*vn1
-
-	for q0 >= _B2 || q0*vn0 > _B2*rhat+un0 {
-		q0--
-		rhat += vn1
-		if rhat >= _B2 {
-			break
-		}
-	}
-
-	return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
+	qq, rr := bits.Div(uint(u1), uint(u0), uint(v))
+	return Word(qq), Word(rr)
 }

-// Keep for performance debugging.
-// Using addWW_g is likely slower.
-const use_addWW_g = false
-
 // The resulting carry c is either 0 or 1.
 func addVV_g(z, x, y []Word) (c Word) {
-	if use_addWW_g {
-		for i := range z {
-			c, z[i] = addWW_g(x[i], y[i], c)
-		}
-		return
-	}
-
-	for i, xi := range x[:len(z)] {
-		yi := y[i]
-		zi := xi + yi + c
-		z[i] = zi
-		// see "Hacker's Delight", section 2-12 (overflow detection)
-		c = (xi&yi | (xi|yi)&^zi) >> (_W - 1)
+	for i := range x[:len(z)] {
+		zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c))
+		z[i] = Word(zi)
+		c = Word(cc)
 	}
 	return
 }

 // The resulting carry c is either 0 or 1.
 func subVV_g(z, x, y []Word) (c Word) {
-	if use_addWW_g {
-		for i := range z {
-			c, z[i] = subWW_g(x[i], y[i], c)
-		}
-		return
-	}
-
-	for i, xi := range x[:len(z)] {
-		yi := y[i]
-		zi := xi - yi - c
-		z[i] = zi
-		// see "Hacker's Delight", section 2-12 (overflow detection)
-		c = (yi&^xi | (yi|^xi)&zi) >> (_W - 1)
+	for i := range x[:len(z)] {
+		zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
+		z[i] = Word(zi)
+		c = Word(cc)
 	}
 	return
 }

 // The resulting carry c is either 0 or 1.
 func addVW_g(z, x []Word, y Word) (c Word) {
-	if use_addWW_g {
 	c = y
-		for i := range z {
-			c, z[i] = addWW_g(x[i], c, 0)
-		}
-		return
-	}
-
-	c = y
-	for i, xi := range x[:len(z)] {
-		zi := xi + c
-		z[i] = zi
-		c = xi &^ zi >> (_W - 1)
+	for i := range x[:len(z)] {
+		zi, cc := bits.Add(uint(x[i]), uint(c), 0)
+		z[i] = Word(zi)
+		c = Word(cc)
 	}
 	return
 }

 func subVW_g(z, x []Word, y Word) (c Word) {
-	if use_addWW_g {
-		c = y
-		for i := range z {
-			c, z[i] = subWW_g(x[i], c, 0)
-		}
-		return
-	}
-
 	c = y
-	for i, xi := range x[:len(z)] {
-		zi := xi - c
-		z[i] = zi
-		c = (zi &^ xi) >> (_W - 1)
+	for i := range x[:len(z)] {
+		zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
+		z[i] = Word(zi)
+		c = Word(cc)
 	}
 	return
 }
@@ -255,11 +145,11 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
 	return
 }

-// TODO(gri) Remove use of addWW_g here and then we can remove addWW_g and subWW_g.
 func addMulVVW_g(z, x []Word, y Word) (c Word) {
 	for i := range z {
 		z1, z0 := mulAddWWW_g(x[i], y, z[i])
-		c, z[i] = addWW_g(z0, c, 0)
+		lo, cc := bits.Add(uint(z0), uint(c), 0)
+		c, z[i] = Word(cc), Word(lo)
 		c += z1
 	}
 	return

--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@@ -14,48 +14,6 @@ import (

 var isRaceBuilder = strings.HasSuffix(testenv.Builder(), "-race")

-type funWW func(x, y, c Word) (z1, z0 Word)
-type argWW struct {
-	x, y, c, z1, z0 Word
-}
-
-var sumWW = []argWW{
-	{0, 0, 0, 0, 0},
-	{0, 1, 0, 0, 1},
-	{0, 0, 1, 0, 1},
-	{0, 1, 1, 0, 2},
-	{12345, 67890, 0, 0, 80235},
-	{12345, 67890, 1, 0, 80236},
-	{_M, 1, 0, 1, 0},
-	{_M, 0, 1, 1, 0},
-	{_M, 1, 1, 1, 1},
-	{_M, _M, 0, 1, _M - 1},
-	{_M, _M, 1, 1, _M},
-}
-
-func testFunWW(t *testing.T, msg string, f funWW, a argWW) {
-	z1, z0 := f(a.x, a.y, a.c)
-	if z1 != a.z1 || z0 != a.z0 {
-		t.Errorf("%s%+v\n\tgot z1:z0 = %#x:%#x; want %#x:%#x", msg, a, z1, z0, a.z1, a.z0)
-	}
-}
-
-func TestFunWW(t *testing.T) {
-	for _, a := range sumWW {
-		arg := a
-		testFunWW(t, "addWW_g", addWW_g, arg)
-
-		arg = argWW{a.y, a.x, a.c, a.z1, a.z0}
-		testFunWW(t, "addWW_g symmetric", addWW_g, arg)
-
-		arg = argWW{a.z0, a.x, a.c, a.z1, a.y}
-		testFunWW(t, "subWW_g", subWW_g, arg)
-
-		arg = argWW{a.z0, a.y, a.c, a.z1, a.x}
-		testFunWW(t, "subWW_g symmetric", subWW_g, arg)
-	}
-}
-
 type funVV func(z, x, y []Word) (c Word)
 type argVV struct {
 	z, x, y nat