ref: 2b3135c57863d52f012d241b54b8da6ea2072a8d
parent: 499e44ebfde8c649d48d4c05093a8e1819be5349
author: rodri <rgl@antares-labs.eu>
date: Fri Mar 14 13:37:23 EDT 2025
add more benchmarks. extend the mkfile. use mallocalign(2).
--- a/bench/main.c
+++ b/bench/main.c
@@ -5,6 +5,7 @@
#include "../bench9/b.h"
double min(double, double);
+
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
@@ -11,32 +12,24 @@
double dotvec2_sse_a(Point2*, Point2*);
double dotvec2_sse4_a(Point2*, Point2*);
double dotvec2_avx_a(Point2*, Point2*);
+
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
double dotvec3_sse4_a(Point3*, Point3*);
+double dotvec3_avx_a(Point3*, Point3*);
+
Point2 Pt2b(double, double, double);
+
Point3 crossvec3_sse(Point3, Point3);
+
double hsubpd(double, double);
+
double fma(double, double, double);
+
Point2 addpt2_sse(Point2, Point2);
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
-void *
-amalloc(ulong n, ulong a)
-{
- void *p;
-
- assert(a > 1 && (a&1) == 0);
-
- a--;
- p = malloc(n+a);
- if(p == nil)
- sysfatal("malloc: %r");
- p = (void*)(((uintptr)p + a)&~a);
- return p;
-}
-
double
fmin(double a, double b)
{
@@ -49,6 +42,18 @@
return a + b*c;
}
+double /* dot product of two 2D points passed by pointer: a.x*b.x + a.y*b.y */
+dotvec2_p(Point2 *a, Point2 *b) /* plain C version; bdotvec2 times it as "dotvec2_p" next to the SSE/AVX routines */
+{
+ return a->x*b->x + a->y*b->y; /* reads only x and y; neither operand is written */
+}
+
+double /* dot product of two 3D points passed by pointer: a.x*b.x + a.y*b.y + a.z*b.z */
+dotvec3_p(Point3 *a, Point3 *b) /* plain C version; bdotvec3 times it as "dotvec3_p" next to the SSE4/AVX routines */
+{
+ return a->x*b->x + a->y*b->y + a->z*b->z; /* reads only x, y and z; neither operand is written */
+}
+
static void
bmin(int fd)
{
@@ -84,7 +89,7 @@
bdotvec2(int fd)
{
Bgr g;
- B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
+ B *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
Point2 a, b;
Point2 *aa, *bb;
int i;
@@ -97,12 +102,13 @@
b4 = benchadd(&g, "dotvec2_sse_a");
b5 = benchadd(&g, "dotvec2_sse4_a");
b6 = benchadd(&g, "dotvec2_avx_a");
+ b7 = benchadd(&g, "dotvec2_p");
while(b0->n > 0 || b1->n > 0){
a = Vec2(truerand()*frand(), truerand()*frand());
b = Vec2(truerand()*frand(), truerand()*frand());
- aa = amalloc(sizeof(Point2), 16);
- bb = amalloc(sizeof(Point2), 16);
+ aa = mallocalign(sizeof(Point2), 16, 0, 0);
+ bb = mallocalign(sizeof(Point2), 16, 0, 0);
*aa = a;
*bb = b;
@@ -140,6 +146,11 @@
for(i = 0; i < 1e6; i++)
dotvec2_avx_a(aa, bb);
benchout(b6);
+
+ benchin(b7);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_p(aa, bb);
+ benchout(b7);
}
benchprintgr(&g, fd);
@@ -150,7 +161,7 @@
bdotvec3(int fd)
{
Bgr g;
- B *b0, *b1, *b2, *b3;
+ B *b0, *b1, *b2, *b3, *b4, *b5;
Point3 a, b;
Point3 *aa, *bb;
int i;
@@ -160,12 +171,14 @@
b1 = benchadd(&g, "dotvec3_sse4");
b2 = benchadd(&g, "dotvec3_avx");
b3 = benchadd(&g, "dotvec3_sse4_a");
+ b4 = benchadd(&g, "dotvec3_avx_a");
+ b5 = benchadd(&g, "dotvec3_p");
while(b0->n > 0 || b1->n > 0){
a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
- aa = amalloc(sizeof(Point3), 16);
- bb = amalloc(sizeof(Point3), 16);
+ aa = mallocalign(sizeof(Point3), 16, 0, 0);
+ bb = mallocalign(sizeof(Point3), 16, 0, 0);
*aa = a;
*bb = b;
@@ -188,6 +201,16 @@
for(i = 0; i < 1e6; i++)
dotvec3_sse4_a(aa, bb);
benchout(b3);
+
+ benchin(b4);
+ for(i = 0; i < 1e6; i++)
+ dotvec3_avx_a(aa, bb);
+ benchout(b4);
+
+ benchin(b5);
+ for(i = 0; i < 1e6; i++)
+ dotvec3_p(aa, bb);
+ benchout(b5);
}
benchprintgr(&g, fd);
--- a/dppd.s
+++ b/dppd.s
@@ -96,6 +96,17 @@
ADDSD X1, X0
RET
+TEXT dotvec3_avx_a(SB), 1, $0 /* C prototype: double dotvec3_avx_a(Point3*, Point3*); callers in this commit pass 16-byte aligned points from mallocalign */
+ MOVQ b+8(FP), DX /* DX = b, the second pointer argument */
+ VMOVAPD_128mr(0, rDX, rX0) /* presumably an aligned 128-bit load of 0(DX), the first two doubles of b, into X0 (macro defined elsewhere) */
+ VMOVAPD_128mr(0, rBP, rX1) /* same load from 0(BP); BP is assumed to hold the first argument a -- confirm against the calling convention */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ MOVSD 16(DX), X1 /* X1 = double at offset 16 of b (presumably z; Point3 layout is not shown here) */
+ MOVSD 16(BP), X2 /* X2 = double at offset 16 of a */
+ VFMADD231SD(rX1, rX2, rX0) /* presumably X0 += X1*X2, adding the third component to the two-lane dot product -- confirm macro operand order */
+ VZEROUPPER /* clear the upper halves of the YMM registers before returning to non-AVX code */
+ RET /* result is left in X0 (assumed to be the floating-point return register) */
+
TEXT Pt2b(SB), 1, $0
MOVQ BP, DI
MOVSD x+8(FP), X0
--- a/main.c
+++ b/main.c
@@ -3,6 +3,7 @@
#include <geometry.h>
double min(double, double);
+
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
@@ -9,34 +10,28 @@
double dotvec2_sse_a(Point2*, Point2*);
double dotvec2_sse4_a(Point2*, Point2*);
double dotvec2_avx_a(Point2*, Point2*);
+
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
double dotvec3_sse4_a(Point3*, Point3*);
+double dotvec3_avx_a(Point3*, Point3*);
+
Point2 Pt2b(double, double, double);
+
Point3 crossvec3_sse(Point3, Point3);
+
double hsubpd(double, double);
+
double fma(double, double, double);
+
Point2 addpt2_sse(Point2, Point2);
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
+
void addsub_sse(double*,double*);
+
double round(double);
-void *
-amalloc(ulong n, ulong a)
-{
- void *p;
-
- assert(a > 1 && (a&1) == 0);
-
- a--;
- p = malloc(n+a);
- if(p == nil)
- sysfatal("malloc: %r");
- p = (void*)(((uintptr)p + a)&~a);
- return p;
-}
-
void
addsub(double *a, double *b)
{
@@ -56,6 +51,12 @@
return a + b*c;
}
+double /* dot product of two 2D points passed by pointer: a.x*b.x + a.y*b.y */
+dotvec2_p(Point2 *a, Point2 *b) /* plain C version; main prints its result alongside dotvec2 and the SSE/AVX routines */
+{
+ return a->x*b->x + a->y*b->y; /* reads only x and y; neither operand is written */
+}
+
void
main(int argc, char *argv[])
{
@@ -73,13 +74,13 @@
a = strtod(argv[0], nil);
b = strtod(argv[1], nil);
- ap0 = amalloc(sizeof(Point2), 16);
- ap1 = amalloc(sizeof(Point2), 16);
- apr = amalloc(sizeof(Point2), 16);
+ ap0 = mallocalign(sizeof(Point2), 16, 0, 0);
+ ap1 = mallocalign(sizeof(Point2), 16, 0, 0);
+ apr = mallocalign(sizeof(Point2), 16, 0, 0);
- ap0t = amalloc(sizeof(Point3), 16);
- ap1t = amalloc(sizeof(Point3), 16);
- aprt = amalloc(sizeof(Point3), 16);
+ ap0t = mallocalign(sizeof(Point3), 16, 0, 0);
+ ap1t = mallocalign(sizeof(Point3), 16, 0, 0);
+ aprt = mallocalign(sizeof(Point3), 16, 0, 0);
r = 0;
r = fmin(a, b);
@@ -96,6 +97,9 @@
r = dotvec2(p0, p1);
print("dotvec2(%v, %v) = %g\n", p0, p1, r);
r = 0;
+ r = dotvec2_p(&p0, &p1);
+ print("dotvec2_p(%v, %v) = %g\n", p0, p1, r);
+ r = 0;
r = dotvec2_sse(p0, p1);
print("dotvec2_sse(%v, %v) = %g\n", p0, p1, r);
r = 0;
@@ -140,6 +144,9 @@
r = 0;
r = dotvec3_sse4_a(ap0t, ap1t);
print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
+ r = 0;
+ r = dotvec3_avx_a(ap0t, ap1t);
+ print("dotvec3_avx_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
print("\n");
--- a/mkfile
+++ b/mkfile
@@ -17,3 +17,9 @@
pulldeps:VQ:
git/clone git://shithub.us/sigrid/bench9 || \
git/clone https://git.sr.ht/~ft/bench9
+
+verify:VQ: $O.out
+ $O.out 13 31 # run the built test program with the sample operands 13 and 31
+
+bench:VQ:
+ @{cd bench; mk && $O.out} # in a subshell, enter bench, build it, and run the result only if the build succeeded