shithub: amd64-simd

Download patch

ref: 2b3135c57863d52f012d241b54b8da6ea2072a8d
parent: 499e44ebfde8c649d48d4c05093a8e1819be5349
author: rodri <rgl@antares-labs.eu>
date: Fri Mar 14 13:37:23 EDT 2025

add more benchmarks. extend the mkfile. use mallocalign(2).

--- a/bench/main.c
+++ b/bench/main.c
@@ -5,6 +5,7 @@
 #include "../bench9/b.h"
 
 double min(double, double);
+
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
@@ -11,32 +12,24 @@
 double dotvec2_sse_a(Point2*, Point2*);
 double dotvec2_sse4_a(Point2*, Point2*);
 double dotvec2_avx_a(Point2*, Point2*);
+
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
 double dotvec3_sse4_a(Point3*, Point3*);
+double dotvec3_avx_a(Point3*, Point3*);
+
 Point2 Pt2b(double, double, double);
+
 Point3 crossvec3_sse(Point3, Point3);
+
 double hsubpd(double, double);
+
 double fma(double, double, double);
+
 Point2 addpt2_sse(Point2, Point2);
 Point2 addpt2_avx(Point2, Point2);
 Point3 addpt3_avx(Point3, Point3);
 
-void *
-amalloc(ulong n, ulong a)
-{
-	void *p;
-
-	assert(a > 1 && (a&1) == 0);
-
-	a--;
-	p = malloc(n+a);
-	if(p == nil)
-		sysfatal("malloc: %r");
-	p = (void*)(((uintptr)p + a)&~a);
-	return p;
-}
-
 double
 fmin(double a, double b)
 {
@@ -49,6 +42,18 @@
 	return a + b*c;
 }
 
+double
+dotvec2_p(Point2 *a, Point2 *b)	/* plain C dot product of two Point2s passed by pointer */
+{
+	return a->x*b->x + a->y*b->y;	/* reads only the x and y fields; timed as "dotvec2_p" in bdotvec2 */
+}
+
+double
+dotvec3_p(Point3 *a, Point3 *b)	/* plain C dot product of two Point3s passed by pointer */
+{
+	return a->x*b->x + a->y*b->y + a->z*b->z;	/* reads only x, y and z; timed as "dotvec3_p" in bdotvec3 */
+}
+
 static void
 bmin(int fd)
 {
@@ -84,7 +89,7 @@
 bdotvec2(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
+	B *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
 	Point2 a, b;
 	Point2 *aa, *bb;
 	int i;
@@ -97,12 +102,13 @@
 	b4 = benchadd(&g, "dotvec2_sse_a");
 	b5 = benchadd(&g, "dotvec2_sse4_a");
 	b6 = benchadd(&g, "dotvec2_avx_a");
+	b7 = benchadd(&g, "dotvec2_p");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec2(truerand()*frand(), truerand()*frand());
 		b = Vec2(truerand()*frand(), truerand()*frand());
-		aa = amalloc(sizeof(Point2), 16);
-		bb = amalloc(sizeof(Point2), 16);
+		aa = mallocalign(sizeof(Point2), 16, 0, 0);
+		bb = mallocalign(sizeof(Point2), 16, 0, 0);
 		*aa = a;
 		*bb = b;
 
@@ -140,6 +146,11 @@
 		for(i = 0; i < 1e6; i++)
 			dotvec2_avx_a(aa, bb);
 		benchout(b6);
+
+		benchin(b7);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_p(aa, bb);
+		benchout(b7);
 	}
 
 	benchprintgr(&g, fd);
@@ -150,7 +161,7 @@
 bdotvec3(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2, *b3;
+	B *b0, *b1, *b2, *b3, *b4, *b5;
 	Point3 a, b;
 	Point3 *aa, *bb;
 	int i;
@@ -160,12 +171,14 @@
 	b1 = benchadd(&g, "dotvec3_sse4");
 	b2 = benchadd(&g, "dotvec3_avx");
 	b3 = benchadd(&g, "dotvec3_sse4_a");
+	b4 = benchadd(&g, "dotvec3_avx_a");
+	b5 = benchadd(&g, "dotvec3_p");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
 		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
-		aa = amalloc(sizeof(Point3), 16);
-		bb = amalloc(sizeof(Point3), 16);
+		aa = mallocalign(sizeof(Point3), 16, 0, 0);
+		bb = mallocalign(sizeof(Point3), 16, 0, 0);
 		*aa = a;
 		*bb = b;
 
@@ -188,6 +201,16 @@
 		for(i = 0; i < 1e6; i++)
 			dotvec3_sse4_a(aa, bb);
 		benchout(b3);
+
+		benchin(b4);
+		for(i = 0; i < 1e6; i++)
+			dotvec3_avx_a(aa, bb);
+		benchout(b4);
+
+		benchin(b5);
+		for(i = 0; i < 1e6; i++)
+			dotvec3_p(aa, bb);
+		benchout(b5);
 	}
 
 	benchprintgr(&g, fd);
--- a/dppd.s
+++ b/dppd.s
@@ -96,6 +96,17 @@
 	ADDSD X1, X0
 	RET
 
+TEXT dotvec3_avx_a(SB), 1, $0		/* C side: double dotvec3_avx_a(Point3*, Point3*) */
+	MOVQ b+8(FP), DX		/* DX = second pointer argument (b) */
+	VMOVAPD_128mr(0, rDX, rX0)	/* presumably X0 = 16 bytes at 0(DX), aligned load; macro is defined outside this hunk */
+	VMOVAPD_128mr(0, rBP, rX1)	/* same from 0(BP); BP is assumed to hold the first pointer argument - confirm */
+	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0: both lane products summed into the low lane */
+	MOVSD 16(DX), X1		/* third double of *b (offset 16), presumably z */
+	MOVSD 16(BP), X2		/* third double at 16(BP), presumably a->z */
+	VFMADD231SD(rX1, rX2, rX0)	/* presumably X0 += X1*X2 (231 form) - confirm macro operand order */
+	VZEROUPPER			/* zero the upper halves of the YMM registers before returning */
+	RET
+
 TEXT Pt2b(SB), 1, $0
 	MOVQ BP, DI
 	MOVSD x+8(FP), X0
--- a/main.c
+++ b/main.c
@@ -3,6 +3,7 @@
 #include <geometry.h>
 
 double min(double, double);
+
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
@@ -9,34 +10,28 @@
 double dotvec2_sse_a(Point2*, Point2*);
 double dotvec2_sse4_a(Point2*, Point2*);
 double dotvec2_avx_a(Point2*, Point2*);
+
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
 double dotvec3_sse4_a(Point3*, Point3*);
+double dotvec3_avx_a(Point3*, Point3*);
+
 Point2 Pt2b(double, double, double);
+
 Point3 crossvec3_sse(Point3, Point3);
+
 double hsubpd(double, double);
+
 double fma(double, double, double);
+
 Point2 addpt2_sse(Point2, Point2);
 Point2 addpt2_avx(Point2, Point2);
 Point3 addpt3_avx(Point3, Point3);
+
 void addsub_sse(double*,double*);
+
 double round(double);
 
-void *
-amalloc(ulong n, ulong a)
-{
-	void *p;
-
-	assert(a > 1 && (a&1) == 0);
-
-	a--;
-	p = malloc(n+a);
-	if(p == nil)
-		sysfatal("malloc: %r");
-	p = (void*)(((uintptr)p + a)&~a);
-	return p;
-}
-
 void
 addsub(double *a, double *b)
 {
@@ -56,6 +51,12 @@
 	return a + b*c;
 }
 
+double
+dotvec2_p(Point2 *a, Point2 *b)	/* plain C dot product of two Point2s passed by pointer */
+{
+	return a->x*b->x + a->y*b->y;	/* reads only the x and y fields */
+}
+
 void
 main(int argc, char *argv[])
 {
@@ -73,13 +74,13 @@
 	a = strtod(argv[0], nil);
 	b = strtod(argv[1], nil);
 
-	ap0 = amalloc(sizeof(Point2), 16);
-	ap1 = amalloc(sizeof(Point2), 16);
-	apr = amalloc(sizeof(Point2), 16);
+	ap0 = mallocalign(sizeof(Point2), 16, 0, 0);
+	ap1 = mallocalign(sizeof(Point2), 16, 0, 0);
+	apr = mallocalign(sizeof(Point2), 16, 0, 0);
 
-	ap0t = amalloc(sizeof(Point3), 16);
-	ap1t = amalloc(sizeof(Point3), 16);
-	aprt = amalloc(sizeof(Point3), 16);
+	ap0t = mallocalign(sizeof(Point3), 16, 0, 0);
+	ap1t = mallocalign(sizeof(Point3), 16, 0, 0);
+	aprt = mallocalign(sizeof(Point3), 16, 0, 0);
 
 	r = 0;
 	r = fmin(a, b);
@@ -96,6 +97,9 @@
 	r = dotvec2(p0, p1);
 	print("dotvec2(%v, %v) = %g\n", p0, p1, r);
 	r = 0;
+	r = dotvec2_p(&p0, &p1);
+	print("dotvec2_p(%v, %v) = %g\n", p0, p1, r);
+	r = 0;
 	r = dotvec2_sse(p0, p1);
 	print("dotvec2_sse(%v, %v) = %g\n", p0, p1, r);
 	r = 0;
@@ -140,6 +144,9 @@
 	r = 0;
 	r = dotvec3_sse4_a(ap0t, ap1t);
 	print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
+	r = 0;
+	r = dotvec3_avx_a(ap0t, ap1t);
+	print("dotvec3_avx_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
 
 	print("\n");
 
--- a/mkfile
+++ b/mkfile
@@ -17,3 +17,9 @@
 pulldeps:VQ:
 	git/clone git://shithub.us/sigrid/bench9 || \
 	git/clone https://git.sr.ht/~ft/bench9
+
+verify:VQ: $O.out
+	$O.out 13 31	# run the built test program with the two numeric arguments 13 and 31
+
+bench:VQ:
+	@{cd bench; mk && $O.out}	# in a subshell: build inside bench/ and run the result only if mk succeeds