Commit a1453781 authored by Robert Griesemer's avatar Robert Griesemer

math/big: fix rounding to smallest denormal for Float.Float32/64

Converting a big.Float value x to a float32/64 value did not correctly
round x up to the smallest denormal float32/64 if x was smaller than the
smallest denormal float32/64, but larger than 0.5 of a smallest denormal
float32/64.

Handle this case explicitly and simplify some code in the turn.

For #14651.

Change-Id: I025e24bf8f0e671581a7de0abf7c1cd7e6403a6c
Reviewed-on: https://go-review.googlesource.com/20816
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarAlan Donovan <adonovan@google.com>
parent 478b594d
...@@ -874,21 +874,43 @@ func (x *Float) Float32() (float32, Accuracy) { ...@@ -874,21 +874,43 @@ func (x *Float) Float32() (float32, Accuracy) {
emax = bias // 127 largest unbiased exponent (normal) emax = bias // 127 largest unbiased exponent (normal)
) )
// Float mantissa m is 0.5 <= m < 1.0; compute exponent for float32 mantissa. // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
p := mbits + 1 // precision of normal float
// If the exponent is too small, we may have a denormal number // Compute precision p for float32 mantissa.
// in which case we have fewer mantissa bits available: recompute // If the exponent is too small, we have a denormal number before
// precision. // rounding and fewer than p mantissa bits of precision available
// (the exponent remains fixed but the mantissa gets shifted right).
p := mbits + 1 // precision of normal float
if e < emin { if e < emin {
// recompute precision
p = mbits + 1 - emin + int(e) p = mbits + 1 - emin + int(e)
// Make sure we have at least 1 bit so that we don't // If p == 0, the mantissa of x is shifted so much to the right
// lose numbers rounded up to the smallest denormal. // that its msb falls immediately to the right of the float32
if p < 1 { // mantissa space. In other words, if the smallest denormal is
p = 1 // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
// If m == 0.5, it is rounded down to even, i.e., 0.0.
// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
// underflow to ±0
if x.neg {
var z float32
return -z, Above
}
return 0.0, Below
}
// otherwise, round up
// We handle p == 0 explicitly because it's easy and because
// Float.round doesn't support rounding to 0 bits of precision.
if p == 0 {
if x.neg {
return -math.SmallestNonzeroFloat32, Below
}
return math.SmallestNonzeroFloat32, Above
} }
} }
// p > 0
// round // round
var r Float var r Float
...@@ -898,12 +920,8 @@ func (x *Float) Float32() (float32, Accuracy) { ...@@ -898,12 +920,8 @@ func (x *Float) Float32() (float32, Accuracy) {
// Rounding may have caused r to overflow to ±Inf // Rounding may have caused r to overflow to ±Inf
// (rounding never causes underflows to 0). // (rounding never causes underflows to 0).
if r.form == inf { // If the exponent is too large, also overflow to ±Inf.
e = emax + 1 // cause overflow below if r.form == inf || e > emax {
}
// If the exponent is too large, overflow to ±Inf.
if e > emax {
// overflow // overflow
if x.neg { if x.neg {
return float32(math.Inf(-1)), Below return float32(math.Inf(-1)), Below
...@@ -921,17 +939,10 @@ func (x *Float) Float32() (float32, Accuracy) { ...@@ -921,17 +939,10 @@ func (x *Float) Float32() (float32, Accuracy) {
// Rounding may have caused a denormal number to // Rounding may have caused a denormal number to
// become normal. Check again. // become normal. Check again.
if e < emin { if e < emin {
// denormal number // denormal number: recompute precision
if e < dmin { // Since rounding may have at best increased precision
// underflow to ±0 // and we have eliminated p <= 0 early, we know p > 0.
if x.neg { // bexp == 0 for denormals
var z float32
return -z, Above
}
return 0.0, Below
}
// bexp = 0
// recompute precision
p = mbits + 1 - emin + int(e) p = mbits + 1 - emin + int(e)
mant = msb32(r.mant) >> uint(fbits-p) mant = msb32(r.mant) >> uint(fbits-p)
} else { } else {
...@@ -983,21 +994,43 @@ func (x *Float) Float64() (float64, Accuracy) { ...@@ -983,21 +994,43 @@ func (x *Float) Float64() (float64, Accuracy) {
emax = bias // 1023 largest unbiased exponent (normal) emax = bias // 1023 largest unbiased exponent (normal)
) )
// Float mantissa m is 0.5 <= m < 1.0; compute exponent for float64 mantissa. // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
p := mbits + 1 // precision of normal float
// If the exponent is too small, we may have a denormal number // Compute precision p for float64 mantissa.
// in which case we have fewer mantissa bits available: recompute // If the exponent is too small, we have a denormal number before
// precision. // rounding and fewer than p mantissa bits of precision available
// (the exponent remains fixed but the mantissa gets shifted right).
p := mbits + 1 // precision of normal float
if e < emin { if e < emin {
// recompute precision
p = mbits + 1 - emin + int(e) p = mbits + 1 - emin + int(e)
// Make sure we have at least 1 bit so that we don't // If p == 0, the mantissa of x is shifted so much to the right
// lose numbers rounded up to the smallest denormal. // that its msb falls immediately to the right of the float64
if p < 1 { // mantissa space. In other words, if the smallest denormal is
p = 1 // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
// If m == 0.5, it is rounded down to even, i.e., 0.0.
// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
// underflow to ±0
if x.neg {
var z float64
return -z, Above
}
return 0.0, Below
}
// otherwise, round up
// We handle p == 0 explicitly because it's easy and because
// Float.round doesn't support rounding to 0 bits of precision.
if p == 0 {
if x.neg {
return -math.SmallestNonzeroFloat64, Below
}
return math.SmallestNonzeroFloat64, Above
} }
} }
// p > 0
// round // round
var r Float var r Float
...@@ -1007,17 +1040,13 @@ func (x *Float) Float64() (float64, Accuracy) { ...@@ -1007,17 +1040,13 @@ func (x *Float) Float64() (float64, Accuracy) {
// Rounding may have caused r to overflow to ±Inf // Rounding may have caused r to overflow to ±Inf
// (rounding never causes underflows to 0). // (rounding never causes underflows to 0).
if r.form == inf { // If the exponent is too large, also overflow to ±Inf.
e = emax + 1 // cause overflow below if r.form == inf || e > emax {
}
// If the exponent is too large, overflow to ±Inf.
if e > emax {
// overflow // overflow
if x.neg { if x.neg {
return math.Inf(-1), Below return float64(math.Inf(-1)), Below
} }
return math.Inf(+1), Above return float64(math.Inf(+1)), Above
} }
// e <= emax // e <= emax
...@@ -1030,17 +1059,10 @@ func (x *Float) Float64() (float64, Accuracy) { ...@@ -1030,17 +1059,10 @@ func (x *Float) Float64() (float64, Accuracy) {
// Rounding may have caused a denormal number to // Rounding may have caused a denormal number to
// become normal. Check again. // become normal. Check again.
if e < emin { if e < emin {
// denormal number // denormal number: recompute precision
if e < dmin { // Since rounding may have at best increased precision
// underflow to ±0 // and we have eliminated p <= 0 early, we know p > 0.
if x.neg { // bexp == 0 for denormals
var z float64
return -z, Above
}
return 0.0, Below
}
// bexp = 0
// recompute precision
p = mbits + 1 - emin + int(e) p = mbits + 1 - emin + int(e)
mant = msb64(r.mant) >> uint(fbits-p) mant = msb64(r.mant) >> uint(fbits-p)
} else { } else {
......
...@@ -829,7 +829,7 @@ func TestFloatFloat32(t *testing.T) { ...@@ -829,7 +829,7 @@ func TestFloatFloat32(t *testing.T) {
}{ }{
{"0", 0, Exact}, {"0", 0, Exact},
// underflow // underflow to zero
{"1e-1000", 0, Below}, {"1e-1000", 0, Below},
{"0x0.000002p-127", 0, Below}, {"0x0.000002p-127", 0, Below},
{"0x.0000010p-126", 0, Below}, {"0x.0000010p-126", 0, Below},
...@@ -843,25 +843,39 @@ func TestFloatFloat32(t *testing.T) { ...@@ -843,25 +843,39 @@ func TestFloatFloat32(t *testing.T) {
{"1p-149", math.SmallestNonzeroFloat32, Exact}, {"1p-149", math.SmallestNonzeroFloat32, Exact},
{"0x.fffffep-126", math.Float32frombits(0x7fffff), Exact}, // largest denormal {"0x.fffffep-126", math.Float32frombits(0x7fffff), Exact}, // largest denormal
// special cases (see issue 14553) // special denormal cases (see issues 14553, 14651)
{"0x0.bp-149", math.Float32frombits(0x000000000), Below}, // ToNearestEven rounds down (to even) {"0x0.0000001p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero
{"0x0.cp-149", math.Float32frombits(0x000000001), Above}, {"0x0.0000008p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero
{"0x0.0000010p-126", math.Float32frombits(0x00000000), Below}, // rounded down to even
{"0x1.0p-149", math.Float32frombits(0x000000001), Exact}, {"0x0.0000011p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal
{"0x0.0000018p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal
{"0x1.0000000p-149", math.Float32frombits(0x00000001), Exact}, // smallest denormal
{"0x0.0000020p-126", math.Float32frombits(0x00000001), Exact}, // smallest denormal
{"0x0.fffffe0p-126", math.Float32frombits(0x007fffff), Exact}, // largest denormal
{"0x1.0000000p-126", math.Float32frombits(0x00800000), Exact}, // smallest normal
{"0x0.8p-149", math.Float32frombits(0x000000000), Below}, // rounded down to even
{"0x0.9p-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
{"0x0.ap-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
{"0x0.bp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
{"0x0.cp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
{"0x1.0p-149", math.Float32frombits(0x000000001), Exact}, // smallest denormal
{"0x1.7p-149", math.Float32frombits(0x000000001), Below}, {"0x1.7p-149", math.Float32frombits(0x000000001), Below},
{"0x1.8p-149", math.Float32frombits(0x000000002), Above}, {"0x1.8p-149", math.Float32frombits(0x000000002), Above},
{"0x1.9p-149", math.Float32frombits(0x000000002), Above}, {"0x1.9p-149", math.Float32frombits(0x000000002), Above},
{"0x2.0p-149", math.Float32frombits(0x000000002), Exact}, {"0x2.0p-149", math.Float32frombits(0x000000002), Exact},
{"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // ToNearestEven rounds down (to even) {"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // rounded down to even
{"0x2.9p-149", math.Float32frombits(0x000000003), Above}, {"0x2.9p-149", math.Float32frombits(0x000000003), Above},
{"0x3.0p-149", math.Float32frombits(0x000000003), Exact}, {"0x3.0p-149", math.Float32frombits(0x000000003), Exact},
{"0x3.7p-149", math.Float32frombits(0x000000003), Below}, {"0x3.7p-149", math.Float32frombits(0x000000003), Below},
{"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // ToNearestEven rounds up (to even) {"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // rounded up to even
{"0x4.0p-149", math.Float32frombits(0x000000004), Exact}, {"0x4.0p-149", math.Float32frombits(0x000000004), Exact},
{"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // ToNearestEven rounds down (to even) {"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // rounded down to even
{"0x4.9p-149", math.Float32frombits(0x000000005), Above}, {"0x4.9p-149", math.Float32frombits(0x000000005), Above},
// specific case from issue 14553 // specific case from issue 14553
...@@ -907,7 +921,7 @@ func TestFloatFloat32(t *testing.T) { ...@@ -907,7 +921,7 @@ func TestFloatFloat32(t *testing.T) {
x := makeFloat(tx) x := makeFloat(tx)
out, acc := x.Float32() out, acc := x.Float32()
if !alike32(out, tout) || acc != tacc { if !alike32(out, tout) || acc != tacc {
t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc) t.Errorf("%s: got %g (%#08x, %s); want %g (%#08x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc)
} }
// test that x.SetFloat64(float64(f)).Float32() == f // test that x.SetFloat64(float64(f)).Float32() == f
...@@ -929,21 +943,30 @@ func TestFloatFloat64(t *testing.T) { ...@@ -929,21 +943,30 @@ func TestFloatFloat64(t *testing.T) {
}{ }{
{"0", 0, Exact}, {"0", 0, Exact},
// underflow // underflow to zero
{"1e-1000", 0, Below}, {"1e-1000", 0, Below},
{"0x0.0000000000001p-1023", 0, Below}, {"0x0.0000000000001p-1023", 0, Below},
{"0x0.00000000000008p-1022", 0, Below}, {"0x0.00000000000008p-1022", 0, Below},
// denormals // denormals
{"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal {"0x0.00000000000010p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
{"0x.8p-1073", math.SmallestNonzeroFloat64, Exact}, {"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
{"1p-1074", math.SmallestNonzeroFloat64, Exact}, {"1p-1074", math.SmallestNonzeroFloat64, Exact},
{"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal {"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal
// special cases (see issue 14553) // special denormal cases (see issues 14553, 14651)
{"0x0.bp-1074", math.Float64frombits(0x00000000000000000), Below}, // ToNearestEven rounds down (to even) {"0x0.00000000000001p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero
{"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above}, {"0x0.00000000000004p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero
{"0x0.00000000000008p-1022", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even
{"0x0.00000000000009p-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x0.0000000000000ap-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x0.8p-1074", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even
{"0x0.9p-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x0.ap-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x0.bp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
{"0x1.0p-1074", math.Float64frombits(0x00000000000000001), Exact}, {"0x1.0p-1074", math.Float64frombits(0x00000000000000001), Exact},
{"0x1.7p-1074", math.Float64frombits(0x00000000000000001), Below}, {"0x1.7p-1074", math.Float64frombits(0x00000000000000001), Below},
...@@ -951,15 +974,15 @@ func TestFloatFloat64(t *testing.T) { ...@@ -951,15 +974,15 @@ func TestFloatFloat64(t *testing.T) {
{"0x1.9p-1074", math.Float64frombits(0x00000000000000002), Above}, {"0x1.9p-1074", math.Float64frombits(0x00000000000000002), Above},
{"0x2.0p-1074", math.Float64frombits(0x00000000000000002), Exact}, {"0x2.0p-1074", math.Float64frombits(0x00000000000000002), Exact},
{"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // ToNearestEven rounds down (to even) {"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // rounded down to even
{"0x2.9p-1074", math.Float64frombits(0x00000000000000003), Above}, {"0x2.9p-1074", math.Float64frombits(0x00000000000000003), Above},
{"0x3.0p-1074", math.Float64frombits(0x00000000000000003), Exact}, {"0x3.0p-1074", math.Float64frombits(0x00000000000000003), Exact},
{"0x3.7p-1074", math.Float64frombits(0x00000000000000003), Below}, {"0x3.7p-1074", math.Float64frombits(0x00000000000000003), Below},
{"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // ToNearestEven rounds up (to even) {"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // rounded up to even
{"0x4.0p-1074", math.Float64frombits(0x00000000000000004), Exact}, {"0x4.0p-1074", math.Float64frombits(0x00000000000000004), Exact},
{"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // ToNearestEven rounds down (to even) {"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // rounded down to even
{"0x4.9p-1074", math.Float64frombits(0x00000000000000005), Above}, {"0x4.9p-1074", math.Float64frombits(0x00000000000000005), Above},
// normals // normals
...@@ -1005,7 +1028,7 @@ func TestFloatFloat64(t *testing.T) { ...@@ -1005,7 +1028,7 @@ func TestFloatFloat64(t *testing.T) {
x := makeFloat(tx) x := makeFloat(tx)
out, acc := x.Float64() out, acc := x.Float64()
if !alike64(out, tout) || acc != tacc { if !alike64(out, tout) || acc != tacc {
t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc) t.Errorf("%s: got %g (%#016x, %s); want %g (%#016x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc)
} }
// test that x.SetFloat64(f).Float64() == f // test that x.SetFloat64(f).Float64() == f
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment