ref: 96cb72770eb9c9bf3259ebb4647ef88092438808
parent: 755433e998180ba049bc22975b8ecffa23bf0357
author: S. Gilles <sgilles@math.umd.edu>
date: Tue Feb 27 15:19:32 EST 2018
Implement a few simple floating-point functions Proof of concept. Trunc/floor/ceil are probably the simplest FP functions that require bit-poking.
--- a/lib/bld.sub
+++ b/lib/bld.sub
@@ -8,6 +8,7 @@
inifile
iter
json
+ math
regex
std
sys
--- /dev/null
+++ b/lib/math/bld.sub
@@ -1,0 +1,8 @@
+lib math =
+ fpmath.myr
+
+ # trunc
+ fpmath-trunc-impl.myr
+
+ lib ../std:std
+;;
--- /dev/null
+++ b/lib/math/fpmath-trunc-impl.myr
@@ -1,0 +1,160 @@
+use std
+
+pkg math =
+ pkglocal const trunc32 : (f : flt32 -> flt32)
+ pkglocal const floor32 : (f : flt32 -> flt32)
+ pkglocal const ceil32 : (f : flt32 -> flt32)
+ pkglocal const trunc64 : (f : flt64 -> flt64)
+ pkglocal const floor64 : (f : flt64 -> flt64)
+ pkglocal const ceil64 : (f : flt64 -> flt64)
+;;
+
+pkglocal const trunc32 = {f : flt32
+ var u : uint32 = std.flt32bits(f)
+ var e : uint32 = (((u >> 23) & 0xff) : uint32) - 127
+ var e_lt_zero : uint32 = ((e >> 31) : uint32) & 0x1
+ var e_ge_zero : uint32 = 1 - e_lt_zero
+ var e_ge_23 : uint32 = 1 - ((e - 23) >> 31)
+ /*
+ The significand 1 . m1 m2 ... m23 needs to be
+ truncated, which corresponds to zeroing all mi
+ bits where i is beyond the exponent e (they are
+ the actual sub-integer portion).
+ */
+ var m : uint32 = ~(((1 << 23) - 1) >> e)
+ m |= (-1 : uint32) * e_ge_23
+ var v_ge_zero : uint32 = (u & m) * e_ge_zero
+
+ /*
+ On the other hand, if the exponent is < 0, "23
+ - e" is garbage, and we should just return +/-
+ zero
+ */
+ var v_lt_zero : uint32 = (u & (1 << 31)) * e_lt_zero
+
+ /* Try to save a branch */
+ var v : uint32 = v_ge_zero + v_lt_zero
+ -> std.flt32frombits(v)
+}
+
+pkglocal const floor32 = {f : flt32
+ var u : uint32 = std.flt32bits(f)
+ var e : int32 = (((u >> 23) & 0xff) : int32) - 127
+ var shift_e : uint32 = (e : uint32)
+
+ /* Many special cases */
+ if e >= 23 || u == 0x80000000
+ -> f
+ elif (e < 0) && (u & (1 << 31) != 0)
+ -> -1.0
+ elif e < 0
+ -> 0.0
+ ;;
+
+ if u & (1 << 31) != 0
+ var fractional_mask : uint32 = (((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 23) >> shift_e)
+ ;;
+ -> std.flt32frombits(v)
+ ;;
+
+ var m : uint32 = ~(((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & m
+ -> std.flt32frombits(v)
+}
+
+pkglocal const ceil32 = {f;
+ var u : uint32 = std.flt32bits(f)
+ var e : int32 = (((u >> 23) & 0xff) : int32) - 127
+ var shift_e : uint32 = (e : uint32)
+ if e >= 23 || u == 0x0
+ -> f
+ elif (e < 0) && (u & (1 << 31) == 0)
+ -> 1.0
+ elif e < 0
+ -> -0.0
+ ;;
+
+ if u & (1 << 31) == 0
+ var fractional_mask : uint32 = (((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 23) >> shift_e)
+ ;;
+ -> std.flt32frombits(v)
+ ;;
+
+ var m : uint32 = ~(((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & m
+ -> std.flt32frombits(v)
+}
+
+pkglocal const trunc64 = {f : flt64
+ var u : uint64 = std.flt64bits(f)
+ var e : uint64 = (((u >> 52) & 0x7ff) : uint64) - 1023
+ var e_lt_zero : uint64 = ((e >> 63) : uint64) & 0x1
+ var e_ge_zero : uint64 = 1 - e_lt_zero
+ var e_ge_52 : uint64 = 1 - ((e - 52) >> 63)
+ var m : uint64 = ~(((1 << 52) - 1) >> e)
+ m |= (-1 : uint64) * e_ge_52
+ var v_ge_zero : uint64 = (u & m) * e_ge_zero
+ var v_lt_zero : uint64 = (u & (1 << 63)) * e_lt_zero
+ var v : uint64 = v_ge_zero + v_lt_zero
+ -> std.flt64frombits(v)
+}
+
+pkglocal const floor64 = {f : flt64
+ var u : uint64 = std.flt64bits(f)
+ var e : int64 = (((u >> 52) & 0x7ff) : int64) - 1023
+ var shift_e : uint64 = (e : uint64)
+
+ if e >= 52 || u == 0x8000000000000000ul
+ -> f
+ elif (e < 0) && (u & (1 << 63) != 0)
+ -> -1.0
+ elif e < 0
+ -> 0.0
+ ;;
+
+ if u & (1 << 63) != 0
+ var fractional_mask : uint64 = (((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 52) >> shift_e)
+ ;;
+ -> std.flt64frombits(v)
+ ;;
+
+ var m : uint64 = ~(((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & m
+ -> std.flt64frombits(v)
+}
+
+pkglocal const ceil64 = {f;
+ var u : uint64 = std.flt64bits(f)
+ var e : int64 = (((u >> 52) & 0x7ff) : int64) - 1023
+ var shift_e : uint64 = (e : uint64)
+
+ if e >= 52 || u == 0x0ul
+ -> f
+ elif (e < 0) && (u & (1 << 63) == 0)
+ -> 1.0
+ elif e < 0
+ -> -0.0
+ ;;
+
+ if u & (1 << 63) == 0
+ var fractional_mask : uint64 = (((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 52) >> shift_e)
+ ;;
+ -> std.flt64frombits(v)
+ ;;
+
+ var m : uint64 = ~(((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & m
+ -> std.flt64frombits(v)
+}
--- /dev/null
+++ b/lib/math/fpmath.myr
@@ -1,0 +1,36 @@
+use std
+
+pkg math =
+ trait fpmath @f =
+
+ /* fpmath-trunc-impl */
+ trunc : (f : @f -> @f)
+ ceil : (f : @f -> @f)
+ floor : (f : @f -> @f)
+
+ /* compute (s, t) with s = round-nearest(a+b), s + t = a + b */
+// fast2sum : (a : @f, b : @f -> (@f, @f))
+ ;;
+
+ impl fpmath flt32
+ impl fpmath flt64
+;;
+
+impl fpmath flt32 =
+ trunc = {f; -> trunc32(f)}
+ floor = {f; -> floor32(f)}
+ ceil = {f; -> ceil32(f)}
+;;
+
+impl fpmath flt64 =
+ trunc = {f; -> trunc64(f)}
+ floor = {f; -> floor64(f)}
+ ceil = {f; -> ceil64(f)}
+;;
+
+extern const trunc32 : (f : flt32 -> flt32)
+extern const floor32 : (f : flt32 -> flt32)
+extern const ceil32 : (f : flt32 -> flt32)
+extern const trunc64 : (f : flt64 -> flt64)
+extern const floor64 : (f : flt64 -> flt64)
+extern const ceil64 : (f : flt64 -> flt64)
--- /dev/null
+++ b/lib/math/test/fpmath-sum-impl.myr
@@ -1,0 +1,163 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name = "trunc-01", .fn = trunc01],
+ [.name = "trunc-02", .fn = trunc02],
+ [.name = "floor-01", .fn = floor01],
+ [.name = "floor-02", .fn = floor02],
+ [.name = "ceil-01", .fn = ceil01],
+ [.name = "ceil-02", .fn = ceil02],
+ [.name = "fast2sum-01", .fn = fast2sum01],
+ ][:])
+}
+
+impl std.equatable flt32 =
+ eq = {a : flt32, b : flt32; -> std.flt32bits(a) == std.flt32bits(b)}
+;;
+
+impl std.equatable flt64 =
+ eq = {a : flt64, b : flt64; -> std.flt64bits(a) == std.flt64bits(b)}
+;;
+
+const trunc01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const trunc02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const floor01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const floor02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const ceil01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const ceil02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const fast2sum01 = {c
+ var flt32s : (flt32, flt32, flt32, flt32)[:] = [
+ (1.0, 1.0, 2.0, 0.0),
+ (10664524000000000000.0, 1.11842, 10664524000000000000.0, 1.11842),
+ (1.11843, 10664524000000000000.0, 10664524000000000000.0, 1.11843),
+ (-21897.1324, 17323.22, -4573.912, 0.0),
+ ][:]
+
+ for (a, b, s1, t1) : flt32s
+ var s2, t2
+ (s2, t2) = math.fast2sum(a, b)
+ testr.eq(c, s1, s2)
+ testr.eq(c, t1, t2)
+ ;;
+
+ var flt64s : (flt64, flt64, flt64, flt64)[:] = [
+ (1.0, 1.0, 2.0, 0.0),
+ (-21897.1324, 17323.22, -4573.912399999997, 0.0),
+ (std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884),
+ std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884)),
+ (std.flt64frombits(0x6a8c3190e27a1884), std.flt64frombits(0x78591b0672a81284),
+ std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884)),
+ (std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x7858273672ca19a0),
+ std.flt64frombits(0x7868a11e72b91612), 0.0),
+ ][:]
+
+ for (a, b, s1, t1) : flt64s
+ var s2, t2
+ (s2, t2) = math.fast2sum(a, b)
+ testr.eq(c, s1, s2)
+ testr.eq(c, std.flt64bits(t1), std.flt64bits(t2))
+ ;;
+}
--- /dev/null
+++ b/lib/math/test/fpmath-trunc-impl.myr
@@ -1,0 +1,128 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name = "trunc-01", .fn = trunc01],
+ [.name = "trunc-02", .fn = trunc02],
+ [.name = "floor-01", .fn = floor01],
+ [.name = "floor-02", .fn = floor02],
+ [.name = "ceil-01", .fn = ceil01],
+ [.name = "ceil-02", .fn = ceil02],
+ ][:])
+}
+
+impl std.equatable flt32 =
+ eq = {a : flt32, b : flt32; -> std.flt32bits(a) == std.flt32bits(b)}
+;;
+
+impl std.equatable flt64 =
+ eq = {a : flt64, b : flt64; -> std.flt64bits(a) == std.flt64bits(b)}
+;;
+
+const trunc01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const trunc02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const floor01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const floor02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const ceil01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const ceil02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}