ref: 0f4c2a5b39ba47f84d9b3efda76920fd688defa8
parent: 96907bd2e4deb4adbdcb668060a9376b929e9805
author: S. Gilles <sgilles@math.umd.edu>
date: Thu Apr 19 06:43:14 EDT 2018
Subnormalize special-case floats in fltXYassem. If the exponent is the lowest representable one, then the returned float will actually be subnormal. In this case, we do not have the implied leading 1, so we need to downshift the significand so that the leading 1 doesn't get lost. We end up losing the least significant bit of the significand, but that's unavoidable.
--- a/lib/std/fltbits.myr
+++ b/lib/std/fltbits.myr
@@ -62,6 +62,23 @@
const flt64assem = {sign, exp, mant
var s, m, e
+ if exp <= -Dblbias && (mant & (1ul << 52) != 0)
+ var roundup = false
+ var shift : uint64 = ((1 - Dblbias - exp) : uint64)
+ var firstcut = mant & (1 << shift)
+ var restcut = mant & ((1 << shift) - 1)
+ var lastkept = mant & (1 << (shift + 1))
+ roundup = firstcut != 0 && (lastkept != 0 || restcut != 0)
+ mant >>= shift
+ exp = -Dblbias
+ if roundup
+ mant++
+ if (mant & (1ul << 52) != 0)
+ exp++
+ ;;
+ ;;
+ ;;
+
s = (sign : uint64)
e = (exp + Dblbias : uint64) & 0x7ff
m = (mant : uint64) & ((1ul<<52) - 1)
@@ -70,6 +87,23 @@
const flt32assem = {sign, exp, mant
var s, m, e
+
+ if exp <= -Fltbias && (mant & (1 << 23) != 0)
+ var roundup = false
+ var shift : uint32 = ((1 - Fltbias - exp) : uint32)
+ var firstcut = mant & (1 << shift)
+ var restcut = mant & ((1 << shift) - 1)
+ var lastkept = mant & (1 << (shift + 1))
+ roundup = firstcut != 0 && (lastkept != 0 || restcut != 0)
+ mant >>= shift
+ exp = -Fltbias
+ if roundup
+ mant++
+ if (mant & (1 << 23) != 0)
+ exp++
+ ;;
+ ;;
+ ;;
s = (sign : uint32)
e = (exp + Fltbias : uint32) & 0xff
--- a/lib/std/test/fltbits.myr
+++ b/lib/std/test/fltbits.myr
@@ -97,6 +97,7 @@
(1.0, 0x3f800000),
(0.0000123, 0x374e5c19),
(-993.83, 0xc478751f),
+ (0.000000000000000000000000000000000000006054601, 0x0041edc4),
][:]
var uprime = std.flt32bits(f)
testr.check(c, u == uprime, "flt32bits wrong for {}: 0x{x} != 0x{x}", f, u, uprime)
@@ -116,7 +117,7 @@
}
const exploderound32 = {c
- for f : [1.0, 0.00001, 123.45, 1111111111111111.2, -1.9, -0.0001, std.flt32nan()][:]
+ for f : [1.0, 0.00001, 123.45, 1111111111111111.2, -1.9, -0.0001, 0.000000000000000000000000000000000000006054601, std.flt32nan()][:]
var n, e, s
(n, e, s) = std.flt32explode(f)
var g = std.flt32assem(n, e, s)
@@ -135,6 +136,7 @@
(false, 45, (1 << 23) | 0x23),
(true, -12, (1 << 23) | 0x3a2),
(true, -126, (1 << 23) | 0x3a1),
+ (false, -127, 4320708),
][:]
var m, f, t
(m, f, t) = std.flt32explode(std.flt32assem(n, e, s))