ref: 9dd2a40974da19018eeb33338cfcc486f4eeca8d
parent: ac2d8aa21e1e6ae45573b268151fda57cada110d
author: S. Gilles <sgilles@math.umd.edu>
date: Thu May 2 03:20:05 EDT 2019
Replace costly sum with slightly faster sums in log-overkill. This still doesn't make the function fast, but it's less embarassingly slow now.
--- a/lib/math/log-overkill.myr
+++ b/lib/math/log-overkill.myr
@@ -252,13 +252,19 @@
var t1, t2, t3
(t1, t2, t3) = C_plus[k]
-> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
- elif k < 54
+ elif k < 36
var t1 = std.flt64assem(false, -k, 1 << 53) /* x [ = 2^-k ] */
var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /* x^3 / 3 */
var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
var t5 = std.flt64assem(false, -5*k - 3, one_fifth_sig) /* x^5 / 5 */
- -> triple_compensated_sum([t1, t2, t3, t4, t5][:])
+ -> fast_fivesum(t1, t2, t3, t4, t5)
+ elif k < 54
+ var t1 = std.flt64assem(false, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
+ -> fast_foursum(t1, t2, t3, t4)
else
var t1 = std.flt64assem(false, -k, 1 << 53) /* x [ = 2^-k ] */
var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
@@ -274,13 +280,19 @@
var t1, t2, t3
(t1, t2, t3) = C_minus[k]
-> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
- elif k < 54
+ elif k < 36
var t1 = std.flt64assem(true, -k, 1 << 53) /* x [ = 2^-k ] */
var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
var t3 = std.flt64assem(true, -3*k - 2, one_third_sig) /* x^3 / 3 */
var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
var t5 = std.flt64assem(true, -5*k - 3, one_fifth_sig) /* x^5 / 5 */
- -> triple_compensated_sum([t1, t2, t3, t4, t5][:])
+ -> fast_fivesum(t1, t2, t3, t4, t5)
+ elif k < 54
+ var t1 = std.flt64assem(true, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(true, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
+ -> fast_foursum(t1, t2, t3, t4)
else
var t1 = std.flt64assem(true, -k, 1 << 53) /* x [ = 2^-k ] */
var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
@@ -299,4 +311,33 @@
(s1, s2) = slow2sum(t2, s3)
-> (t1, s1, s2 + s4)
+}
+
+/*
+ Specifically for use in get_C_{plus,minus}, in which we know the
+ magnitude orders of the ais.
+ */
+const fast_foursum = {a1 : flt64, a2 : flt64, a3 : flt64, a4 : flt64
+ (a3, a4) = fast2sum(a3, a4)
+ (a2, a3) = fast2sum(a2, a3)
+ (a1, a2) = fast2sum(a1, a2)
+
+ (a3, a4) = slow2sum(a3, a4)
+ (a2, a3) = slow2sum(a2, a3)
+
+ -> (a1, a2, a3 + a4)
+}
+
+const fast_fivesum = {a1 : flt64, a2 : flt64, a3 : flt64, a4 : flt64, a5 : flt64
+ (a4, a5) = fast2sum(a4, a5)
+ (a3, a4) = fast2sum(a3, a4)
+ (a2, a3) = fast2sum(a2, a3)
+ (a1, a2) = fast2sum(a1, a2)
+
+ (a4, a5) = slow2sum(a4, a5)
+ (a3, a4) = slow2sum(a3, a4)
+ (a2, a3) = slow2sum(a2, a3)
+
+ (a4, a5) = slow2sum(a4, a5)
+ -> (a1, a2, a3 + a4)
}