ref: 3088d96978667117f07891f9e5749c218de7f2d2
parent: b267163f103132497b17ae3fee5249e362c75b2e
author: Sindre Aamås <saamas@cisco.com>
date: Mon Jan 18 15:46:20 EST 2016
[Encoder] Add an AVX2 4x4 IDCT implementation, ~2.03x faster on Haswell than the SSE2 version.
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -605,8 +605,8 @@
packuswb %1,%1
%endmacro
-
-
-
-
-
+%macro WELS_DW32_VEX 1 ; sets every 16-bit word of the given vector register to 32, without a memory load
+ vpcmpeqw %1, %1, %1 ; a register always equals itself: every word becomes 0xFFFF
+ vpsrlw %1, %1, 15 ; logical shift leaves only the top bit: every word = 1
+ vpsllw %1, %1, 5 ; 1 << 5 = 32 in every word
+%endmacro
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -68,6 +68,7 @@
void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
int16_t* pDctDc);
+void WelsIDctFourT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); // AVX2 variant; same parameter list as WelsIDctFourT4Rec_sse2
#endif//X86_ASM
#ifdef HAVE_NEON
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -269,6 +269,9 @@
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2;
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
}
+ if (uiCpuFlag & WELS_CPU_AVX2) { // placed after the SSE2 assignment so the AVX2 routine replaces it when the flag is set
+ pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
+ }
#endif//X86_ASM
#if defined(HAVE_NEON)
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -57,6 +57,8 @@
times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
wels_p1m1m1p1w_256:
times 4 dw 1, -1, -1, 1
+wels_p1p1m1m1w_256: ; 16 words (256 bits) repeating [+1,+1,-1,-1]; per-word sign pattern used by AVX2_IDCT_HORIZONTAL
+ times 4 dw 1, 1, -1, -1
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
@@ -529,6 +531,48 @@
vpsubw y%1, y%1, y%7
%endmacro
+; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 zero=%8 clobber=%9,%10
+%macro AVX2_StoreDiff32P 10 ; stores sat_u8(((data + dw32) >> 6) + pred) as 8 bytes per row for rows 0, 1, 4, 5 relative to pRec/pPred on entry; overwrites both data registers and leaves pRec and pPred advanced by one stride
+ vpaddw y%3, y%3, y%7 ; first data register: add the rounding term
+ vpsraw y%3, y%3, 6 ; arithmetic >> 6
+ vmovq x%9, [%5 ] ; 8 prediction bytes of row 0; rest of the register is zeroed
+ vpbroadcastq y%10, [%5 + 4 * %6] ; 8 prediction bytes of row 4 replicated into every qword
+ add %5, %6 ; pPred now points at row 1
+ vpblendd y%9, y%9, y%10, 11110000b ; low 128 bits keep row 0, high 128 bits take row 4
+ vpunpcklbw y%9, y%9, y%8 ; interleave with the zero register: low 8 bytes of each lane become 8 words
+ vpaddsw y%3, y%3, y%9 ; residual + prediction, signed saturating
+ vpaddw y%4, y%4, y%7 ; second data register: same steps for rows 1 and 5
+ vpsraw y%4, y%4, 6 ; arithmetic >> 6
+ vmovq x%9, [%5 ] ; prediction row 1
+ vpbroadcastq y%10, [%5 + 4 * %6] ; prediction row 5
+ vpblendd y%9, y%9, y%10, 11110000b ; low lane = row 1, high lane = row 5
+ vpunpcklbw y%9, y%9, y%8 ; bytes -> words
+ vpaddsw y%4, y%4, y%9 ; residual + prediction, signed saturating
+ vpackuswb y%3, y%3, y%4 ; clamp words to unsigned bytes; per lane: low qword from first register, high qword from second
+ vextracti128 x%4, y%3, 1 ; pull out the high lane (rows 4 and 5)
+ vmovlps [%1 ], x%3 ; row 0
+ vmovlps [%1 + 4 * %2], x%4 ; row 4
+ add %1, %2 ; pRec now points at row 1
+ vmovhps [%1 ], x%3 ; row 1
+ vmovhps [%1 + 4 * %2], x%4 ; row 5
+%endmacro
+
+; out=%1,%2,%3,%4 pDct=%5 clobber=%6
+%macro AVX2_Load4x16P 6 ; loads 128 bytes at pDct into four ymm registers; pDct must be 16-byte aligned (vmovdqa)
+ vmovdqa x%2, [%5+0x00] ; low lane <- bytes 0x00..0x0f
+ vinserti128 y%2, y%2, [%5+0x40], 1 ; high lane <- bytes 0x40..0x4f (source register spelled out explicitly)
+ vmovdqa x%6, [%5+0x20] ; low lane <- bytes 0x20..0x2f
+ vinserti128 y%6, y%6, [%5+0x60], 1 ; high lane <- bytes 0x60..0x6f
+ vpunpcklqdq y%1, y%2, y%6 ; out 1: low qwords of the chunks at 0x00,0x20 | 0x40,0x60
+ vpunpckhqdq y%2, y%2, y%6 ; out 2: high qwords of the same chunks
+ vmovdqa x%4, [%5+0x10] ; low lane <- bytes 0x10..0x1f
+ vinserti128 y%4, y%4, [%5+0x50], 1 ; high lane <- bytes 0x50..0x5f
+ vmovdqa x%6, [%5+0x30] ; low lane <- bytes 0x30..0x3f
+ vinserti128 y%6, y%6, [%5+0x70], 1 ; high lane <- bytes 0x70..0x7f
+ vpunpcklqdq y%3, y%4, y%6 ; out 3: low qwords of the chunks at 0x10,0x30 | 0x50,0x70
+ vpunpckhqdq y%4, y%4, y%6 ; out 4: high qwords of the same chunks
+%endmacro
+
; pDct=%1 data=%1,%2,%3,%4 clobber=%5
%macro AVX2_Store4x16P 6
vpunpcklqdq y%6, y%2, y%3
@@ -560,6 +604,21 @@
vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2
%endmacro
+; 4-pt IDCT butterfly applied independently to each 16-bit word: the four registers hold x0..x3 on entry and y0..y3 on exit
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_IDCT 5
+ vpsraw %5, %2, 1 ; x1 >> 1 (arithmetic), kept in the scratch register
+ vpsubw %5, %5, %4 ; t3 = (x1 >> 1) - x3
+ vpsraw %4, %4, 1 ; x3 >> 1 (arithmetic); the unshifted x3 is not needed after t3
+ vpaddw %4, %2, %4 ; t2 = x1 + (x3 >> 1)
+ vpaddw %2, %1, %3 ; t0 = x0 + x2
+ vpsubw %3, %1, %3 ; t1 = x0 - x2
+ vpaddw %1, %2, %4 ; y0 = t0 + t2
+ vpsubw %4, %2, %4 ; y3 = t0 - t2
+ vpaddw %2, %3, %5 ; y1 = t1 + t3
+ vpsubw %3, %3, %5 ; y2 = t1 - t3
+%endmacro
+
; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
%macro AVX2_DCT_HORIZONTAL 3
@@ -572,6 +631,20 @@
vpaddw %1, %1, %3 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
%endmacro
+; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register (each group of 4 consecutive words is one transform).
+; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
+%macro AVX2_IDCT_HORIZONTAL 3
+ vpsraw %3, %1, 1 ; [x[0]>>1,x[1]>>1,x[2]>>1,x[3]>>1, ...]
+ vpblendw %3, %1, %3, 10101010b ; [x[0],x[1]>>1,x[2],x[3]>>1, ...]
+ vpshufd %1, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
+ vpsignw %1, %1, [wels_p1m1m1p1w_256] ; [x[2],-x[3],-x[0],x[1], ...]
+ vpaddw %1, %3, %1 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
+ vpshufb %3, %1, %2 ; [s[3],s[2],s[1],s[0], ...]
+ vpsignw %1, %1, [wels_p1p1m1m1w_256] ; [s[0],s[1],-s[2],-s[3], ...] (pure sign flip, so no multiply is needed)
+ vpsignw %3, %3, [wels_p1m1m1p1w_256] ; [s[3],-s[2],-s[1],s[0], ...]
+ vpaddw %1, %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
+%endmacro
+
;***********************************************************************
; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
;***********************************************************************
@@ -604,6 +677,36 @@
AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
+ vzeroupper
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_avx2
+ %assign push_num 0
+ LOAD_5_PARA ; as used below: r0=pRec r1=iStride r2=pPred r3=iPredStride r4=pDct
+ PUSH_XMM 8 ; macro defined outside this view; ymm0-ymm7 are the vector registers used below
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for use in addressing - confirm in asm_inc.asm
+ SIGN_EXTENSION r3, r3d ; same for the prediction stride
+
+ AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5 ; ymm0..ymm3 <- the 128 bytes of coefficients at pDct
+ vmovdqa ymm6, [wels_rev64w_256] ; word-reversal shuffle mask consumed by AVX2_IDCT_HORIZONTAL
+ AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5 ; horizontal pass, one register at a time
+ AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
+ AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
+ AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
+ AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5 ; second pass, across the four registers
+
+ vpxor ymm6, ymm6, ymm6 ; ymm6 = 0, the zero operand for the store macro
+ WELS_DW32_VEX ymm7 ; ymm7 = 32 in every word, the rounding term before >> 6
+ AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4 ; writes rows 0, 1, 4, 5; leaves r0 and r2 one row further
+ add r2, r3 ; pPred -> row 2
+ add r0, r1 ; pRec -> row 2
+ AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4 ; writes rows 2, 3, 6, 7
vzeroupper
POP_XMM
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -354,6 +354,12 @@
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_sse2) {
TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_sse2);
}
+// Runs the AVX2 IDCT only when the host CPU reports AVX2; calling it on other CPUs would raise an illegal-instruction fault.
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_avx2) {
+  int32_t iCpuCores = 0;
+  if (WelsCPUFeatureDetect (&iCpuCores) & WELS_CPU_AVX2)
+    TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_avx2);
+}
TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_sse2) {
int32_t iCpuCores = 0;
uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);