ref: 092bcb0cb43b4a1ca351a3085c512bf6afa89989
parent: ed1a42eb5035b0ba8d0dab7c17a6372995b403a2
author: rodri <rgl@antares-labs.eu>
date: Mon Nov 27 04:57:54 EST 2023
idem.
--- a/avx.h
+++ b/avx.h
@@ -26,14 +26,14 @@
#define VZEROALL VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_NO); BYTE $0x77
/* VMOVUPD */
-#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
- VOPi(0x10, 0x1, (d), (s), (off))
-#define VMOVUPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
- VOP(0x10, 0x3, (d), (s))
-#define VMOVUPD_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
- VOPi(0x10, 0x1, (d), (s), (off))
-#define VMOVUPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
- VOP(0x10, 0x3, (d), (s))
+#define VMOVUPD_128mr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x10, 0x0, (d), (s))
+#define VMOVUPD_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x11, 0x0, (s), (d))
+#define VMOVUPD_256mr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+ VOP(0x10, 0x0, (d), (s))
+#define VMOVUPD_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+ VOP(0x11, 0x0, (s), (d))
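The rewritten forms drop the disp8 parameter; judging by the 0x0/0x1/0x3 values handed to VOP/VOPi, that argument is presumably the ModRM mod field (0x1 = [base+disp8] in the old *mr forms, 0x3 = register direct in the old *rr forms, 0x0 = [base] with no displacement in the new ones), so callers now compute each effective address into a register before invoking the macro, as dppd.s does below with ADDQ. Generic ModRM packing, for reference only (illustrative C, not part of avx.h):

unsigned char
modrm(unsigned mod, unsigned reg, unsigned rm)
{
	return (mod&3)<<6 | (reg&7)<<3 | (rm&7);
}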
/* VMOVAPD */
#define VMOVAPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
--- a/dppd.s
+++ b/dppd.s
@@ -6,40 +6,41 @@
GLOBL one(SB), $8
TEXT dotvec2_sse4(SB), 1, $0
- MOVQ SP, AX
- MOVDQU_mr(8, rAX, rX0) /* MOVDQU a+0(FP), X0 */
- MOVDQU_mr(32, rAX, rX1) /* MOVDQU b+24(FP), X1 */
+ MOVUPD a+0(FP), X0
+ MOVUPD b+24(FP), X1
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
TEXT dotvec2_avx(SB), 1, $0
MOVQ SP, AX
- VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
- VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */
+	ADDQ $8, AX
+	VMOVUPD_128mr(rAX, rX0)		/* VMOVUPD a+0(FP), X0 */
+	ADDQ $24, AX
+	VMOVUPD_128mr(rAX, rX1)		/* VMOVUPD b+24(FP), X1 */
VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ VZEROUPPER
RET
TEXT dotvec3_sse4(SB), 1, $0
- MOVQ SP, AX
- MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
- MOVLPD(40, rAX, rX1) /* MOVLPD b+32(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
+ MOVUPD a+0(FP), X0
+ MOVUPD b+32(FP), X1
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
- MOVSD one(SB), X1
- MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */
- MOVHPD(56, rAX, rX1) /* MOVHPD b+48(FP), X1 */
- DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+	MOVSD a+16(FP), X1	/* X1 = a.z */
+	MULSD b+48(FP), X1	/* X1 = a.z*b.z */
+	ADDSD X1, X0
RET
TEXT dotvec3_avx(SB), 1, $0
MOVQ SP, AX
- VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
- VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */
+	ADDQ $8, AX
+	VMOVUPD_128mr(rAX, rX0)		/* VMOVUPD a+0(FP), X0 */
+	ADDQ $32, AX
+	VMOVUPD_128mr(rAX, rX1)		/* VMOVUPD b+32(FP), X1 */
VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
MOVSD a+16(FP), X1
MOVSD b+48(FP), X2
VFMADD231SD(rX1, rX2, rX0)
+ VZEROUPPER
RET
TEXT Pt2b(SB), 1, $0
@@ -53,21 +54,18 @@
RET
TEXT hsubpd(SB), 1, $0
- MOVQ SP, AX
- MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD b+8(FP), X0 */
+ MOVLPD a+0(FP), X0
+ MOVHPD b+8(FP), X0
HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */
RET
TEXT crossvec3_sse(SB), 1, $0
- MOVQ SP, AX
- ADDQ $8, AX
- MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */
- MOVHPD(8, rAX, rX0) /* MOVHPD a+0(FP), X0 */
- MOVLPD(16, rAX, rX1) /* MOVLPD a+8(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
- MOVLPD(56, rAX, rX2) /* MOVLPD b+48(FP), X2 */
- MOVHPD(24, rAX, rX2) /* MOVHPD a+16(FP), X2 */
+ MOVLPD b+40(FP), X0
+ MOVHPD a+8(FP), X0 /* X0 := [a.x][b.x] */
+ MOVLPD a+16(FP), X1
+ MOVHPD b+48(FP), X1 /* X1 := [b.y][a.y] */
+ MOVLPD b+56(FP), X2
+ MOVHPD a+24(FP), X2 /* X2 := [a.z][b.z] */
MOVAPD X1, X3
MULPD X2, X3
HSUBPD(rX3, rX3) /* x */
@@ -99,21 +97,38 @@
VFMADD231SD(rX1, rX2, rX0)
RET
+TEXT addpt2_sse(SB), 1, $0
+	MOVUPD a+8(FP), X0	/* X0 := [a.y][a.x] */
+	MOVUPD b+32(FP), X1	/* X1 := [b.y][b.x] */
+	ADDPD X1, X0
+	MOVSD a+24(FP), X2
+	ADDSD b+48(FP), X2	/* X2 := a.w + b.w */
+	MOVQ BP, DI
+	MOVUPD X0, (DI)		/* result x, y */
+	MOVSD X2, 16(DI)	/* result w */
+ RET
+
/* TODO: write only 24 bytes */
TEXT addpt2_avx(SB), 1, $0
MOVQ SP, AX
- ADDQ $8, AX
- VMOVDQU_256mr(8, rAX, rX0)
- VMOVDQU_256mr(32, rAX, rX1)
+	ADDQ $16, AX
+	VMOVUPD_256mr(rAX, rX0)		/* 32 bytes from a+8(FP) */
+	ADDQ $24, AX
+	VMOVUPD_256mr(rAX, rX1)		/* 32 bytes from b+32(FP) */
VADDPD_256rr(rX1, rX0, rX0)
- VMOVDQU_256rm(rX0, rAX)
+ MOVQ BP, DI
+ VMOVUPD_256rm(rX0, rDI)
+ VZEROUPPER
RET
TEXT addpt3_avx(SB), 1, $0
MOVQ SP, AX
- ADDQ $8, AX
- VMOVDQU_256mr(8, rAX, rX0)
- VMOVDQU_256mr(40, rAX, rX1)
+	ADDQ $16, AX
+	VMOVUPD_256mr(rAX, rX0)		/* 32 bytes from a+8(FP) */
+	ADDQ $32, AX
+	VMOVUPD_256mr(rAX, rX1)		/* 32 bytes from b+40(FP) */
VADDPD_256rr(rX1, rX0, rX0)
- VMOVDQU_256rm(rX0, rAX)
+ MOVQ BP, DI
+ VMOVUPD_256rm(rX0, rDI)
+ VZEROUPPER
RET
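For reference, scalar C sketches of what the rewritten routines are expected to return. These are illustrative only, not code from this tree; they assume the libgeometry Point2/Point3 layouts implied by the offsets above (x, y, w and x, y, z, w) and the Vec3 constructor already used in main.c.

double
dotvec2_ref(Point2 a, Point2 b)
{
	/* DPPD $0x31: multiply both lanes, write the sum to the low lane */
	return a.x*b.x + a.y*b.y;
}

double
dotvec3_ref(Point3 a, Point3 b)
{
	/* DPPD on x,y plus a scalar multiply-add for z */
	return a.x*b.x + a.y*b.y + a.z*b.z;
}

Point3
crossvec3_ref(Point3 a, Point3 b)
{
	return Vec3(a.y*b.z - a.z*b.y,
		a.z*b.x - a.x*b.z,
		a.x*b.y - a.y*b.x);
}

Point2
addpt2_ref(Point2 a, Point2 b)
{
	a.x += b.x;
	a.y += b.y;
	a.w += b.w;	/* addpt2_sse adds the third double too (assumed to be w) */
	return a;
}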
--- a/main.c
+++ b/main.c
@@ -11,6 +11,7 @@
Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
double fma(double, double, double);
+Point2 addpt2_sse(Point2, Point2);
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
@@ -40,8 +41,10 @@
a = strtod(argv[0], nil);
b = strtod(argv[1], nil);
+ r = 0;
r = fmin(a, b);
print("fmin(%g, %g) = %g\n", a, b, r);
+ r = 0;
r = min(a, b);
print("min(%g, %g) = %g\n", a, b, r);
@@ -49,10 +52,13 @@
p0 = Pt2b(a, 1, 1);
p1 = Pt2b(b, 3, 1);
- r = dotvec2_sse4(p0, p1);
- print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
+ r = 0;
r = dotvec2(p0, p1);
print("dotvec2(%v, %v) = %g\n", p0, p1, r);
+ r = 0;
+ r = dotvec2_sse4(p0, p1);
+ print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
+ r = 0;
r = dotvec2_avx(p0, p1);
print("dotvec2_avx(%v, %v) = %g\n", p0, p1, r);
@@ -60,15 +66,19 @@
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
- r = dotvec3_sse4(p0t, p1t);
- print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r);
+ r = 0;
r = dotvec3(p0t, p1t);
print("dotvec3(%V, %V) = %g\n", p0t, p1t, r);
+ r = 0;
+ r = dotvec3_sse4(p0t, p1t);
+ print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r);
+ r = 0;
r = dotvec3_avx(p0t, p1t);
print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r);
print("\n");
+ r = 0;
r = hsubpd(a, b);
print("hsubpd(%g, %g) = %g\n", a, b, r);
@@ -76,15 +86,19 @@
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
- prt = crossvec3_sse(p0t, p1t);
- print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt);
+ prt = Vec3(0,0,0);
prt = crossvec3(p0t, p1t);
print("crossvec3(%V, %V) = %V\n", p0t, p1t, prt);
+ prt = Vec3(0,0,0);
+ prt = crossvec3_sse(p0t, p1t);
+ print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt);
print("\n");
+ r = 0;
r = madd(a, b, 21);
print("madd(%g, %g, 21) = %g\n", a, b, r);
+ r = 0;
r = fma(a, b, 21);
print("fma(%g, %g, 21) = %g\n", a, b, r);
@@ -92,8 +106,13 @@
p0 = Pt2b(a, 1, 1);
p1 = Pt2b(b, 3, 1);
+ pr = Vec2(0,0);
pr = addpt2(p0, p1);
print("addpt2(%v, %v) = %v\n", p0, p1, pr);
+ pr = Vec2(0,0);
+ pr = addpt2_sse(p0, p1);
+ print("addpt2_sse(%v, %v) = %v\n", p0, p1, pr);
+ pr = Vec2(0,0);
pr = addpt2_avx(p0, p1);
print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr);
@@ -101,8 +120,10 @@
p0t = Pt3(a, 1, 1, b);
p1t = Pt3(b, 3, 1, a);
+ prt = Vec3(0,0,0);
prt = addpt3(p0t, p1t);
print("addpt3(%V, %V) = %V\n", p0t, p1t, prt);
+ prt = Vec3(0,0,0);
prt = addpt3_avx(p0t, p1t);
print("addpt3_avx(%V, %V) = %V\n", p0t, p1t, prt);
--- a/sse.h
+++ b/sse.h
@@ -23,13 +23,13 @@
//opcode = 660F12
//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
//disp8 = 8 / 32
-#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
+//#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
/* MOVHPD */
//opcode = 660F16
//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
//disp8 = 16 / 40
-#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
+//#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
/* HSUBPD */
//opcode = 660F7D = 01100110 00001111 01111101
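The HSUBPD documented above is also what crossvec3_sse now leans on: each cross-product lane is a low-minus-high of a packed product. A one-lane C sketch (illustrative only), matching the /* x */ step in dppd.s where X1 := [b.y][a.y] and X2 := [a.z][b.z]:

double
crossx_lane(Point3 a, Point3 b)
{
	double lo = a.y*b.z;	/* low lane of X1*X2 */
	double hi = a.z*b.y;	/* high lane of X1*X2 */
	return lo - hi;		/* HSUBPD X3, X3: low result lane = low - high */
}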