ref: dbe60482e84a28a55ae6d9f5a7c9728e00f690d3
parent: 77d584ee227d829e842efdc938dffa322665b22c
author: S. Gilles <sgilles@math.umd.edu>
date: Wed Mar 21 05:41:33 EDT 2018
Fix fma64 for subnormal results arising from normal inputs, start fma32
--- a/lib/math/fpmath-fma-impl.myr
+++ b/lib/math/fpmath-fma-impl.myr
@@ -9,9 +9,92 @@
const exp_mask64 : uint64 = 0x7ff << 52
pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
+ var xd : flt64 = flt64fromflt32(x)
+ var yd : flt64 = flt64fromflt32(y)
+ var zd : flt64 = flt64fromflt32(z)
+ var prod : flt64 = xd * yd
+ var r : flt64 = prod + zd
+ var r32 : flt32 = flt32fromflt64(r)
+ var rn, re, rs
+ (rn, re, rs) = std.flt64explode(r)
+
+ if re == 1023 || r - prod == zd || rs & 0x1fffffff != 0x10000000
+ -> r32
+ ;;
+
+ /*
+ This is a non-infinity, non-NaN, non-exact result, and
+ the bits we cut off in truncating to r32 are exactly a
+ halfway case, so rounding might be difficult.
+ */
-> 0.0
}
+const flt64fromflt32 = {f : flt32
+ var n, e, s
+ (n, e, s) = std.flt32explode(f)
+ var xs : uint64 = (s : uint64)
+ var xe : int64 = (e : int64)
+
+ if e == 127
+ -> std.flt64assem(n, 1023, xs)
+ elif e == -127
+ /*
+ All subnormals in single precision (except 0.0s)
+ can be upgraded to double precision, since the
+ exponent range is so much wider.
+ */
+ var first1 = find_first1_64(xs, 52)
+ if first1 < 0
+ -> std.flt64assem(n, -1023, 0)
+ ;;
+ xs = xs << (52 - (first1 : uint64))
+ xe = -127 + (52 - first1)
+ -> std.flt64assem(n, xe, xs)
+ ;;
+
+ -> std.flt64assem(n, xe, xs)
+}
+
+const flt32fromflt64 = {f : flt64
+ var n, e, s
+ (n, e, s) = std.flt64explode(f)
+ var ts : uint32
+ var te : int32 = (e : int32)
+
+ if e >= 127
+ if e == 1023 && s != 0
+ /* NaN */
+ -> std.flt32assem(n, 127, 1)
+ else
+ /* infinity */
+ -> std.flt32assem(n, 127, 0)
+ ;;
+ ;;
+
+ if e >= -126
+ /* normal */
+ ts = ((s >> (52 - 23)) : uint32)
+ if need_round_away(0, s, 52 - 23)
+ ts++
+ ;;
+ -> std.flt32assem(n, te, ts)
+ ;;
+
+ /* subnormal already, will have to go to 0 */
+ if e == -1023
+ -> std.flt32assem(n, -127, 0)
+ ;;
+
+ /* subnormal (at least, it will be) */
+ te = -127
+ ts = (shr(s, (52 - 23) + (-1022 - e)) : uint32)
+ if need_round_away(0, s, (52 - 23) + (-1022 - e))
+ ts++
+ ;;
+ -> std.flt32assem(n, te, ts)
+}
+
pkglocal const fma64 = {x : flt64, y : flt64, z : flt64
var xn : bool, yn : bool, zn : bool
var xe : int64, ye : int64, ze : int64
@@ -155,10 +238,10 @@
var res_s = 0
if res_firstbit_e <= -1023
/* Subnormal case */
- if res_lastbit_e + 128 < -1022
+ if res_lastbit_e + 128 < 12 - 1022
res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
- elif res_lastbit_e + 64 < -1022
+ elif res_lastbit_e + 64 < 12 - 1022
res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
else
@@ -337,7 +420,7 @@
}
/* Find the first 1 bit in a bitstring */
-const find_first1_64 = {b : uint64, start : int64
+const find_first1_64 : (b : uint64, start : int64 -> int64) = {b : uint64, start : int64
for var j = start; j >= 0; --j
var m = shl(1, j)
if b & m != 0
@@ -359,9 +442,9 @@
/*
For [ h ][ l ], where bitpos_last is the position of the last
- bit that was included in the 53-bit wide result (l's last bit
- has position 0), decide whether rounding up/away is needed. This
- is true if
+ bit that was included in the truncated result (l's last bit has
+ position 0), decide whether rounding up/away is needed. This is
+ true if
- following bitpos_last is a 1, then a non-zero sequence, or
--- a/lib/math/test/fpmath-fma-impl.myr
+++ b/lib/math/test/fpmath-fma-impl.myr
@@ -17,7 +17,8 @@
var inputs : (uint64, uint64, uint64, uint64)[:] = [
/*
These are mostly obtained by running fpmath-consensus
- with seed 1234. Each covers a different corner case.
+ with seed 1234. Each (mostly) covers a different
+ corner case.
*/
(0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
(0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
@@ -33,6 +34,11 @@
(0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
(0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
(0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
+ (0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
+ (0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
+ (0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
+ (0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
+ (0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
][:]
for (x, y, z, r) : inputs