ref: d850c3b7f47e58556c160f9d03ea20aa52452020
parent: 9404d16a4263a87559af64bfb18c91ccebaa601d
author: rodri <rgl@antares-labs.eu>
date: Fri Nov 24 10:39:06 EST 2023
add more avx instructions and a bench9 benchmark file.
--- /dev/null
+++ b/bench/main.c
@@ -1,0 +1,134 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include <geometry.h>
+#include "b.h"
+
+double dppd(Point2, Point2);
+double dppda(Point2, Point2);
+double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
+Point3 xvec3(Point3, Point3);
+
+static void
+bdotvec2(int fd)	/* time the three 2D dot product routines and print the group to fd */
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point2 a, b;
+	int i;
+
+	benchinitgr(&g, "2d dot product");
+	b0 = benchadd(&g, "dotvec2");
+	b1 = benchadd(&g, "dotvec2_simd");
+	b2 = benchadd(&g, "dotvec2_avx");
+	/* b2 takes part in the exit test as well; NOTE(review): assumes n counts the samples still owed, confirm in b.h */
+	while(b0->n > 0 || b1->n > 0 || b2->n > 0){
+		a = Vec2(truerand()*frand(), truerand()*frand());	/* new random operands every round */
+		b = Vec2(truerand()*frand(), truerand()*frand());
+
+		benchin(b0);	/* baseline routine */
+		for(i = 0; i < 1e6; i++)
+			dotvec2(a, b);
+		benchout(b0);
+
+		benchin(b1);	/* SIMD routine */
+		for(i = 0; i < 1e6; i++)
+			dppd(a, b);
+		benchout(b1);
+
+		benchin(b2);	/* AVX routine (VDPPD, see dppd.s) */
+		for(i = 0; i < 1e6; i++)
+			dppda(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+bdotvec3(int fd)	/* time the three 3D dot product routines and print the group to fd */
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point3 a, b;
+	int i;
+
+	benchinitgr(&g, "3d dot product");
+	b0 = benchadd(&g, "dotvec3");
+	b1 = benchadd(&g, "dotvec3_simd");
+	b2 = benchadd(&g, "dotvec3_avx");
+	/* b2 takes part in the exit test as well; NOTE(review): assumes n counts the samples still owed, confirm in b.h */
+	while(b0->n > 0 || b1->n > 0 || b2->n > 0){
+		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());	/* new random operands every round */
+		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(b0);	/* baseline routine */
+		for(i = 0; i < 1e6; i++)
+			dotvec3(a, b);
+		benchout(b0);
+
+		benchin(b1);	/* SIMD routine */
+		for(i = 0; i < 1e6; i++)
+			dppd3(a, b);
+		benchout(b1);
+
+		benchin(b2);	/* AVX routine (VDPPD plus FMA, see dppd.s) */
+		for(i = 0; i < 1e6; i++)
+			dppd3a(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+bcrossvec3(int fd)	/* time crossvec3 against xvec3 and print the group to fd */
+{
+	Bgr grp;
+	B *tref, *tsimd;
+	Point3 u, v;
+	int k;
+
+	benchinitgr(&grp, "3d cross product");
+	tref = benchadd(&grp, "crossvec3");
+	tsimd = benchadd(&grp, "crossvec3_simd");
+	/* keep going for as long as either timer still reports n > 0 */
+	while(tref->n > 0 || tsimd->n > 0){
+		u = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		v = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(tref);	/* baseline routine */
+		for(k = 0; k < 1e6; k++)
+			crossvec3(u, v);
+		benchout(tref);
+
+		benchin(tsimd);	/* SIMD routine */
+		for(k = 0; k < 1e6; k++)
+			xvec3(u, v);
+		benchout(tsimd);
+	}
+
+	benchprintgr(&grp, fd);
+	benchfreegr(&grp);
+}
+
+void
+threadmain(int argc, char **argv) /* entry point: runs every benchmark group, results go to fd 1 */
+{
+ ARGBEGIN{ /* no options are defined */
+ }ARGEND
+
+ if(benchwire(0) != 0) /* NOTE(review): presumably wires the process to a CPU, confirm in b.h; failure is only reported */
+ fprint(2, "failed to wire: %r\n");
+
+ bdotvec2(1);
+ bseparator(1);
+ bdotvec3(1);
+ bseparator(1);
+ bcrossvec3(1);
+
+ threadexitsall(nil);
+}
--- a/dppd.s
+++ b/dppd.s
@@ -12,6 +12,13 @@
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
+TEXT dppda(SB), 1, $0 /* double dppda(Point2 a, Point2 b): 2D dot product through VDPPD */
+ MOVQ SP, AX /* presumably because the macros emit only ModRM+disp8 (no SIB byte), so SP cannot be the base */
+ VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0: first 16 bytes of a */
+ VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1: first 16 bytes of b */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0: low lane of X0 = sum of both lane products */
+ RET
+
TEXT dppd3(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
@@ -23,6 +30,16 @@
MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */
MOVHPD(56, rAX, rX1) /* MOVHPD b+48(FP), X1 */
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ RET
+
+TEXT dppd3a(SB), 1, $0 /* double dppd3a(Point3 a, Point3 b): VDPPD on the first two doubles, then an FMA for the third */
+ MOVQ SP, AX /* presumably because the macros emit only ModRM+disp8 (no SIB byte), so SP cannot be the base */
+ VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0: first 16 bytes of a */
+ VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1: first 16 bytes of b */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0: low lane of X0 = sum of both lane products */
+ MOVSD a+16(FP), X1 /* third double of a */
+ MOVSD b+48(FP), X2 /* third double of b */
+ VFMADD231SD(rX1, rX2, rX0) /* VFMADD231SD X1, X2, X0: X0 = X1*X2 + X0 */
RET
TEXT Pt2b(SB), 1, $0
--- a/main.c
+++ b/main.c
@@ -5,7 +5,9 @@
uvlong nanosec(void);
double min(double, double);
double dppd(Point2, Point2);
+double dppda(Point2, Point2);
double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
Point2 Pt2b(double, double, double);
Point3 xvec3(Point3, Point3);
double hsubpd(double, double);
@@ -41,6 +43,8 @@
t1 = nanosec();
print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("\n");
+
p0 = Pt2b(a, 1, 1);
p1 = Pt2b(b, 3, 1);
t0 = nanosec();
@@ -51,7 +55,13 @@
r = dotvec2(p0, p1);
t1 = nanosec();
print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+ t0 = nanosec();
+ r = dppda(p0, p1);
+ t1 = nanosec();
+ print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+ print("\n");
+
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
t0 = nanosec();
@@ -62,12 +72,20 @@
r = dotvec3(p0t, p1t);
t1 = nanosec();
print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+ t0 = nanosec();
+ r = dppd3a(p0t, p1t);
+ t1 = nanosec();
+ print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+ print("\n");
+
t0 = nanosec();
r = hsubpd(a, b);
t1 = nanosec();
print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("\n");
+
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
t0 = nanosec();
@@ -78,6 +96,8 @@
pr = crossvec3(p0t, p1t);
t1 = nanosec();
print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+
+ print("\n");
t0 = nanosec();
r = fma(a, b, 21);
--- a/sse.h
+++ b/sse.h
@@ -41,6 +41,8 @@
BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p))
#define VOP(o, m, ro, rm) BYTE $(o); \
BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define VOPi(o, m, ro, rm, i) VOP((o), (m), (ro), (rm)); \
+ BYTE $(i) /* VOP plus one trailing byte: an imm8 (VDPPD) or a disp8 (VMOVUPD_128mr) */
/* MOVLPD */
//opcode = 660F12
@@ -64,6 +66,15 @@
//modrm = 11 000 001 [X1 → X0]
//imm8 = 0011 0001
#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
+
+/* VMOVUPD (128 bit load from off(s); off is emitted as a single disp8 byte) and VMOVAPD (128 bit, register to register) */
+#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x28, 0x3, (d), (s))
+/* VDPPD (128 bit), register operands only: s0 goes in VEX.vvvv, s1 in ModRM.rm, d in ModRM.reg; imm8 is fixed at 0x31 */
+#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
+ VOPi(0x41, 0x3, (d), (s1), 0x31)
/* VFMADD231SD (128 bit) */
#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \