shithub: amd64-simd

ref: cc3307440e698d58843a5273519f4988c01937f1
parent: 7cf4634e668730749aa8b7fa9ff16cf4234958fa
author: rodri <rgl@antares-labs.eu>
date: Fri Nov 24 17:13:49 EST 2023

add more avx instructions and place VZEROUPPERs.

avx.h now covers VZEROALL, VMOVUPD, VMOVAPD, VADDPD, VSUBPD, VHADDPD
and VHSUBPD in 128- and 256-bit forms, plus 256-bit variants of
VFMADD231SD/PD. VZEROUPPER is issued before returning from routines
that ran VEX-encoded code: it zeroes bits 255:128 of every YMM
register, which avoids the AVX-SSE transition penalty when callers go
on to execute legacy SSE instructions.

--- a/avx.h
+++ b/avx.h
@@ -22,19 +22,83 @@
 /* VZEROUPPER */
 #define VZEROUPPER	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_NO); BYTE $0x77
 
-/* VMOVAPD */
+/* VZEROALL */
+#define VZEROALL	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_NO); BYTE $0x77
+
+/* VMOVUPD */
 #define VMOVUPD_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);	\
 				VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVUPD_128rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);		\
+			VOP(0x10, 0x3, (d), (s))
+#define VMOVUPD_256mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);	\
+				VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVUPD_256rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);		\
+			VOP(0x10, 0x3, (d), (s))
+
+/* VMOVAPD */
+#define VMOVAPD_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);	\
+				VOPi(0x28, 0x1, (d), (s), (off))
 #define VMOVAPD_128rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);		\
 			VOP(0x28, 0x3, (d), (s))
+#define VMOVAPD_256mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);	\
+				VOPi(0x28, 0x1, (d), (s), (off))
+#define VMOVAPD_256rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);		\
+			VOP(0x28, 0x3, (d), (s))
+
+/* VADDPD */
+#define VADDPD_128mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOPi(0x58, 0x1, (d), (s1), (off))
+#define VADDPD_128rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);		\
+				VOP(0x58, 0x3, (d), (s1))
+#define VADDPD_256mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOPi(0x58, 0x1, (d), (s1), (off))
+#define VADDPD_256rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);		\
+				VOP(0x58, 0x3, (d), (s1))
+
+/* VSUBPD */
+#define VSUBPD_128mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOPi(0x5C, 0x1, (d), (s1), (off))
+#define VSUBPD_128rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);		\
+				VOP(0x5C, 0x3, (d), (s1))
+#define VSUBPD_256mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOPi(0x5C, 0x1, (d), (s1), (off))
+#define VSUBPD_256rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);		\
+				VOP(0x5C, 0x3, (d), (s1))
+
+/* VHADDPD */
+#define VHADDPD_128mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOPi(0x7C, 0x1, (d), (s1), (off))
+#define VHADDPD_128rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOP(0x7C, 0x3, (d), (s1))
+#define VHADDPD_256mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOPi(0x7C, 0x1, (d), (s1), (off))
+#define VHADDPD_256rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOP(0x7C, 0x3, (d), (s1))
+
+/* VHSUBPD */
+#define VHSUBPD_128mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOPi(0x7D, 0x1, (d), (s1), (off))
+#define VHSUBPD_128rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
+				VOP(0x7D, 0x3, (d), (s1))
+#define VHSUBPD_256mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOPi(0x7D, 0x1, (d), (s1), (off))
+#define VHSUBPD_256rr(s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66);	\
+				VOP(0x7D, 0x3, (d), (s1))
+
 /* VDPPD */
 #define VDPPD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66);	\
 			VOPi(0x41, 0x3, (d), (s1), 0x31)
 
 /* VFMADD231SD (128 bit) */
-#define VFMADD231SD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+#define VFMADD231SD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);		\
 			VOP(0xB9, 0x3, (d), (s1))
+/* VFMADD231SD (256 bit) */
+#define VFMADD231SD_256(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_256,VEX_p_66);	\
+			VOP(0xB9, 0x3, (d), (s1))
 
 /* VFMADD231PD (128 bit) */
-#define VFMADD231PD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+#define VFMADD231PD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);		\
+			VOP(0xB8, 0x3, (d), (s1))
+/* VFMADD231PD (256 bit) */
+#define VFMADD231PD_256(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_256,VEX_p_66);	\
 			VOP(0xB8, 0x3, (d), (s1))
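
The three-operand macros put the destination last, Plan 9 style:
VADDPD_128rr(s0, s1, d) computes d = s0 + s1, and the _mr forms take a
byte offset plus a base register for the memory operand. A minimal
sketch of how they compose (the routine name and argument layout are
hypothetical; rAX and rX0 are the operand constants dppd.s below
already uses):

TEXT vadd2(SB), 1, $0
	MOVQ SP, AX
	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
	VADDPD_128mr(24, rX0, rAX, rX0)	/* VADDPD b+16(FP), X0, X0 */
	VZEROUPPER			/* zeroes only bits 255:128; the sum in X0 survives */
	RET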
--- a/dppd.s
+++ b/dppd.s
@@ -19,6 +19,7 @@
 	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(32, rAX, rX1)	/* VMOVUPD b+24(FP), X1 */
 	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER
 	RET
 
 TEXT dppd3(SB), 1, $0
@@ -42,6 +43,7 @@
 	MOVSD a+16(FP), X1
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
+	VZEROUPPER
 	RET
 
 TEXT Pt2b(SB), 1, $0
@@ -89,9 +91,15 @@
 	MOVSD X0, 24(DI)
 	RET
 
+TEXT xvec3a(SB), 1, $0
+	MOVQ SP, AX
+	ADDQ $8, AX	/* AX = address of the first stack argument */
+	RET
+
 TEXT fma(SB), 1, $0
 	MOVSD a+0(FP), X0
 	MOVSD b+8(FP), X1
 	MOVSD c+16(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
+	VZEROUPPER
 	RET
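
The same pattern extends to the 256-bit forms. A hypothetical sketch
(the argument layout is assumed, not part of this patch) of a packed
fused multiply-add over four doubles, reusing the XMM operand
constants since VEX.L selects the YMM width:

TEXT fma4(SB), 1, $0
	MOVQ SP, AX
	VMOVUPD_256mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), Y0 */
	VMOVUPD_256mr(40, rAX, rX1)	/* VMOVUPD b+32(FP), Y1 */
	VMOVUPD_256mr(72, rAX, rX2)	/* VMOVUPD c+64(FP), Y2 */
	VFMADD231PD_256(rX1, rX2, rX0)	/* Y0 += Y1*Y2 */
	VZEROUPPER			/* also clears Y0[255:128]; store results before this in real code */
	RET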