ref: 0c51b567258d7b826e65976c7a72b081e30c2ccd
parent: 675aa84403f98776a7d463e1cc5f9bd41cdbab92
author: rodri <rgl@antares-labs.eu>
date: Sat Nov 25 07:05:33 EST 2023
add 3d point sum.
--- a/avx.h
+++ b/avx.h
@@ -122,3 +122,7 @@
/* VFMADD231PD (256 bit) */
#define VFMADD231PD_256(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_256,VEX_p_66); \
VOP(0xB8, 0x3, (d), (s1))
+
+/* VINSERTF128 (256 bit): 66 prefix, 0F3A opcode map, opcode 0x18; i is handed to VOPi as the trailing immediate */
+#define VINSERTF128(i, s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_256,VEX_p_66); \
+ VOPi(0x18, 0x3, (d), (s1), (i))
--- a/bench/main.c
+++ b/bench/main.c
@@ -14,6 +14,7 @@
double hsubpd(double, double);
double fma(double, double, double);
Point2 addpt2_avx(Point2, Point2);
+Point3 addpt3_avx(Point3, Point3);
double
fmin(double a, double b)
@@ -258,6 +259,37 @@
benchfreegr(&g);
}
+static void
+baddpt3(int fd)	/* benchmark group "3d point sum": addpt3 vs addpt3_avx; fd is handed to benchprintgr */
+{
+ Bgr g;
+ B *b0, *b1;	/* b0: the "addpt3" entry, b1: the "addpt3_avx" entry */
+ Point3 a, b;
+ int i;
+
+ benchinitgr(&g, "3d point sum");
+ b0 = benchadd(&g, "addpt3");
+ b1 = benchadd(&g, "addpt3_avx");
+
+ while(b0->n > 0 || b1->n > 0){	/* keep going while either entry still has n > 0 */
+ a = Pt3(truerand()*frand(), truerand()*frand(), truerand()*frand(), truerand()*frand());	/* new random operands every round */
+ b = Pt3(truerand()*frand(), truerand()*frand(), truerand()*frand(), truerand()*frand());	/* both variants get the same a and b */
+
+ benchin(b0);
+ for(i = 0; i < 1e6; i++)	/* 1e6 calls between benchin and benchout */
+ addpt3(a, b);	/* result is discarded */
+ benchout(b0);
+
+ benchin(b1);
+ for(i = 0; i < 1e6; i++)	/* same call count as the addpt3 loop */
+ addpt3_avx(a, b);	/* result is discarded */
+ benchout(b1);
+ }
+
+ benchprintgr(&g, fd);
+ benchfreegr(&g);
+}
+
void
threadmain(int argc, char **argv)
{
@@ -280,6 +312,8 @@
bfma(1);
bseparator(1);
baddpt2(1);
+ bseparator(1);
+ baddpt3(1);
threadexitsall(nil);
}
--- a/dppd.s
+++ b/dppd.s
@@ -99,11 +99,21 @@
VFMADD231SD(rX1, rX2, rX0)
RET
+/* TODO: write only 24 bytes; the 256-bit VMOVDQU_256rm to (AX) below writes 32 */
TEXT addpt2_avx(SB), 1, $0
MOVQ SP, AX
ADDQ $8, AX
VMOVDQU_256mr(8, rAX, rX0)
VMOVDQU_256mr(32, rAX, rX1)
+ VADDPD_256rr(rX1, rX0, rX0)
+ VMOVDQU_256rm(rX0, rAX)	/* the 32-byte write the TODO above refers to */
+ RET
+
+TEXT addpt3_avx(SB), 1, $0	/* declared in main.c as Point3 addpt3_avx(Point3, Point3) */
+ MOVQ SP, AX
+ ADDQ $8, AX	/* AX = SP+8; every access below is relative to it */
+ VMOVDQU_256mr(8, rAX, rX0)	/* 256 bits at 8(AX); presumably the first Point3 argument (TODO confirm) */
+ VMOVDQU_256mr(40, rAX, rX1)	/* 256 bits at 40(AX), 32 bytes on; presumably the second argument (TODO confirm) */
VADDPD_256rr(rX1, rX0, rX0)
VMOVDQU_256rm(rX0, rAX)
RET
--- a/main.c
+++ b/main.c
@@ -12,6 +12,7 @@
double hsubpd(double, double);
double fma(double, double, double);
Point2 addpt2_avx(Point2, Point2);
+Point3 addpt3_avx(Point3, Point3);
double
fmin(double a, double b)
@@ -95,6 +96,15 @@
print("addpt2(%v, %v) = %v\n", p0, p1, pr);
pr = addpt2_avx(p0, p1);
print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr);
+
+ print("\n");
+
+ p0t = Pt3(a, 1, 1, b);
+ p1t = Pt3(b, 3, 1, a);
+ prt = addpt3(p0t, p1t);
+ print("addpt3(%V, %V) = %V\n", p0t, p1t, prt);
+ prt = addpt3_avx(p0t, p1t);
+ print("addpt3_avx(%V, %V) = %V\n", p0t, p1t, prt);
exits(nil);
}