ref: b267163f103132497b17ae3fee5249e362c75b2e
parent: b9adbcf37cf478268ad4647d1028a9dbd0332797
author: Sindre Aamås <saamas@cisco.com>
date: Mon Jan 18 15:43:28 EST 2016
[Encoder] Add an AVX2 4x4 DCT implementation ~2.52x faster on Haswell as compared to the SSE2 version.
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -90,6 +90,7 @@
****************************************************************************/
void WelsDctT4_mmx (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+// AVX2 counterpart of WelsDctFourT4_sse2 (same signature); implemented in codec/encoder/core/x86/dct.asm.
+void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
/****************************************************************************
* HDM and Quant functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -522,6 +522,9 @@
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
+ if (uiCpuFlag & WELS_CPU_AVX2) {
+ pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
+ }
//#endif//MACOS
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -42,12 +42,22 @@
%include "asm_inc.asm"
-SECTION .rodata align=16
+SECTION .rodata align=32
;***********************************************************************
; Constant
;***********************************************************************
+align 32
+; 32-byte (one ymm register wide) constant tables used by the AVX2 DCT code.
+; The 32-byte alignment is required because wels_rev64w_256 is loaded with vmovdqa.
+
+; 16 words alternating +1, -1: sign pattern applied with vpsignw in AVX2_DCT_HORIZONTAL.
+wels_p1m1p1m1w_256:
+ times 8 dw 1, -1
+; 16 words alternating 1, 2: vpmullw multiplier that doubles every odd-indexed word.
+wels_p1p2p1p2w_256:
+ times 8 dw 1, 2
+; vpshufb control that reverses the order of the four 16-bit words inside each
+; 64-bit group; the 16-byte pattern is repeated for both 128-bit lanes.
+wels_rev64w_256:
+ times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
+; 16 words repeating +1, -1, -1, +1: vpmullw sign pattern for each group of four words.
+wels_p1m1m1p1w_256:
+ times 4 dw 1, -1, -1, 1
+
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
@@ -62,7 +72,6 @@
dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29
-
;***********************************************************************
; MMX functions
;***********************************************************************
@@ -501,4 +510,102 @@
movdqa [r0+16], xmm2
POP_XMM
+ ret
+
+;***********************************************************************
+; AVX2 functions
+;***********************************************************************
+
+; Load 8 pixels from each of two rows (row at [ptr] and row at [ptr + 4*stride])
+; of both pixel planes, widen the bytes to words and compute pPixel1 - pPixel2.
+; Result layout in y%1: low 128-bit lane = row at [ptr], high lane = row at [ptr + 4*stride].
+; Register arguments (%1, %6, %7, %8) are given without the x/y size prefix
+; (e.g. mm0) so the macro can address both the xmm and the ymm view.
+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 zero=%6 clobber=%7,%8
+%macro AVX2_LoadDiff16P 8
+ vmovq x%1, [%2 ] ; 8 bytes of the first row of pPixel1
+ vpbroadcastq y%7, [%2 + 4 * %3] ; 8 bytes of the row 4 strides below, in every qword
+ vpblendd y%1, y%1, y%7, 11110000b ; keep dwords 0-3, take dwords 4-7 (high lane) from the broadcast
+ vpunpcklbw y%1, y%1, y%6 ; interleave low 8 bytes of each lane with %6 (zero) -> 8 words per lane
+ vmovq x%7, [%4 ] ; same sequence for pPixel2, built in %7 with %8 as scratch
+ vpbroadcastq y%8, [%4 + 4 * %5]
+ vpblendd y%7, y%7, y%8, 11110000b
+ vpunpcklbw y%7, y%7, y%6
+ vpsubw y%1, y%1, y%7 ; word-wise difference pPixel1 - pPixel2
+%endmacro
+
+; Store four ymm registers (16 words each) as 64 words at pDct.
+; For each 64-bit group position the four data registers are written one after
+; another, giving four consecutive 32-byte regions:
+;   [pDct+0x00..0x1f] low  qword of the low  lane of %2,%3,%4,%5
+;   [pDct+0x20..0x3f] high qword of the low  lane of %2,%3,%4,%5
+;   [pDct+0x40..0x5f] low  qword of the high lane of %2,%3,%4,%5
+;   [pDct+0x60..0x7f] high qword of the high lane of %2,%3,%4,%5
+; The vmovdqa stores require pDct to be 16-byte aligned.
+; Register arguments are given without the x/y size prefix (e.g. mm0).
+; pDct=%1 data=%2,%3,%4,%5 clobber=%6
+%macro AVX2_Store4x16P 6
+ vpunpcklqdq y%6, y%2, y%3 ; per lane: [low qword of %2, low qword of %3]
+ vmovdqa [%1+0x00], x%6
+ vextracti128 [%1+0x40], y%6, 1
+ vpunpckhqdq y%6, y%2, y%3 ; per lane: [high qword of %2, high qword of %3]
+ vmovdqa [%1+0x20], x%6
+ vextracti128 [%1+0x60], y%6, 1
+ vpunpcklqdq y%6, y%4, y%5 ; per lane: [low qword of %4, low qword of %5]
+ vmovdqa [%1+0x10], x%6
+ vextracti128 [%1+0x50], y%6, 1
+ vpunpckhqdq y%6, y%4, y%5 ; per lane: [high qword of %4, high qword of %5]
+ vmovdqa [%1+0x30], x%6
+ vextracti128 [%1+0x70], y%6, 1
+%endmacro
+
+; 4-pt DCT
+; Works element-wise on packed words: x0..x3 are the four input registers, so
+; every word position is transformed independently of its neighbours.
+; Arguments are full register names (e.g. ymm0). On exit %5 still holds s3.
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_DCT 5
+ vpsubw %5, %1, %4 ; s3 = x0 - x3
+ vpaddw %1, %1, %4 ; s0 = x0 + x3
+ vpsubw %4, %2, %3 ; s2 = x1 - x2
+ vpaddw %2, %2, %3 ; s1 = x1 + x2
+ vpsubw %3, %1, %2 ; y2 = s0 - s1
+ vpaddw %1, %1, %2 ; y0 = s0 + s1
+ vpsllw %2, %5, 1 ; 2 * s3
+ vpaddw %2, %2, %4 ; y1 = 2 * s3 + s2
+ vpsllw %4, %4, 1 ; 2 * s2
+ vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2
+%endmacro
+
+; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
+; Each group of four consecutive words x[0..3] is replaced by its transform y[0..3].
+; %2 is a register that already holds wels_rev64w_256; the sign and multiplier
+; tables are read directly from memory. Arguments are full register names.
+; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
+%macro AVX2_DCT_HORIZONTAL 3
+ vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x[0],-x[1],x[2],-x[3], ...]
+ vpshufb %1, %1, %2 ; [x[3],x[2],x[1],x[0], ...]
+ vpaddw %1, %1, %3 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+ vpmullw %3, %1, [wels_p1m1m1p1w_256] ; [s[0],-s[1],-s[2],s[3], ...]
+ vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
+ vpmullw %1, %1, [wels_p1p2p1p2w_256] ; [s[2],2*s[3],s[0],2*s[1], ...]
+ vpaddw %1, %1, %3 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;
+; Reads 8 rows of 8 pixels from pPixel1 and pPixel2, forms the word-wise
+; difference pPixel1 - pPixel2 and applies a 4-pt DCT down the rows and along
+; each group of four words, i.e. four 4x4 transforms. 64 coefficients
+; (128 bytes) are written to pDct, which must be 16-byte aligned (vmovdqa).
+;
+; Registers as used below: r0 = pDct, r1 = pPixel1, r2 = iStride1,
+; r3 = pPixel2, r4 = iStride2 (presumably set up by LOAD_5_PARA from
+; asm_inc.asm - macro body not visible here). ymm0-ymm3 = data rows,
+; ymm4/ymm5 = scratch, ymm6 = zero, later the wels_rev64w_256 mask.
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_avx2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7 ; ymm0-ymm6 are used; NOTE(review): presumably saves xmm6 where the ABI requires it - confirm in asm_inc.asm
+ SIGN_EXTENSION r2, r2d ; strides are int32_t and are used in address arithmetic
+ SIGN_EXTENSION r4, r4d
+
+ vpxor ymm6, ymm6, ymm6 ; zero register for the byte->word widening in AVX2_LoadDiff16P
+
+ ;Load 4x16
+ ; ymmN: low lane = difference row N, high lane = difference row N+4 (N = 0..3)
+ AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2 ; advance both planes by one row
+ add r3, r4
+ AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2
+ add r3, r4
+ AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2
+ add r3, r4
+ AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
+
+ AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5 ; 4-pt DCT across the four row registers (vertical pass)
+ vmovdqa ymm6, [wels_rev64w_256] ; the zero register is no longer needed; reuse ymm6 for the shuffle mask
+ AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5 ; 4-pt DCT inside each group of four words (horizontal pass)
+ AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
+ AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
+ AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
+
+ AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
+ vzeroupper ; clear the upper ymm halves before returning to code that may use legacy SSE
+
+ POP_XMM
+ LOAD_5_PARA_POP
ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -198,6 +198,10 @@
TestDctFourT4 (WelsDctFourT4_sse2);
}
+// Runs the shared TestDctFourT4 check against the AVX2 implementation.
+// NOTE(review): the encoder only selects WelsDctFourT4_avx2 when WELS_CPU_AVX2
+// is set in the CPU flags, but this test calls it unconditionally. On a machine
+// without AVX2 this would presumably fault on the first AVX2 instruction -
+// consider skipping the test unless the detected CPU features include AVX2.
+TEST (EncodeMbAuxTest, WelsDctFourT4_avx2) {
+ TestDctFourT4 (WelsDctFourT4_avx2);
+}
+
TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iDctC, 16);