shithub: openh264

--- a/codec/encoder/core/inc/decode_mb_aux.h

+++ b/codec/encoder/core/inc/decode_mb_aux.h

@@ -69,6 +69,7 @@

 void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);

 void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,

                                int16_t* pDctDc);

+void WelsIDctT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); // AVX2 single 4x4 inverse DCT + add-prediction reconstruction; installed as pfIDctT4 when WELS_CPU_AVX2 is reported

 void WelsIDctFourT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);

 #endif//X86_ASM

--- a/codec/encoder/core/inc/encode_mb_aux.h

+++ b/codec/encoder/core/inc/encode_mb_aux.h

@@ -91,6 +91,7 @@

 void WelsDctT4_mmx (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

 void WelsDctT4_sse2 (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

 void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

+void WelsDctT4_avx2 (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); // AVX2 single 4x4 forward DCT of the difference pPixel1 - pPixel2; installed as pfDctT4 when WELS_CPU_AVX2 is reported

 void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

 /****************************************************************************

--- a/codec/encoder/core/src/decode_mb_aux.cpp

+++ b/codec/encoder/core/src/decode_mb_aux.cpp

@@ -271,6 +271,7 @@

     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_sse2;

   if (uiCpuFlag & WELS_CPU_AVX2) {

+    pFuncList->pfIDctT4     = WelsIDctT4Rec_avx2;

     pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;

 #endif//X86_ASM

--- a/codec/encoder/core/src/encode_mb_aux.cpp

+++ b/codec/encoder/core/src/encode_mb_aux.cpp

@@ -524,6 +524,7 @@

     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;

   if (uiCpuFlag & WELS_CPU_AVX2) {

+    pFuncList->pfDctT4                  = WelsDctT4_avx2;

     pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;

--- a/codec/encoder/core/x86/dct.asm

+++ b/codec/encoder/core/x86/dct.asm

@@ -63,6 +63,19 @@

     times 4 dw 1, 2, -1, -2

 wels_p1p1m1m1w_256:

     times 4 dw 1, 1, -1, -1

+wels_8xp1w_8xm1w:                       ; 16 words: 8 x (+1) then 8 x (-1); vpsignw operand in AVX2_IDCT_4x4P (negates the upper 128-bit lane)

+    times 8 dw  1

+    times 8 dw -1

+wels_4xp1w_4xm1w_256:                   ; 16 words: (+1 x4, -1 x4) repeated twice; sign pattern used by AVX2_DCT_4x4P and AVX2_IDCT_4x4P

+    times 4 dw  1

+    times 4 dw -1

+    times 4 dw  1

+    times 4 dw -1

+wels_4xp1w_4xp2w_4xm1w_4xm2w:           ; 16 words: +1 x4, +2 x4, -1 x4, -2 x4; vpmullw multiplier in AVX2_DCT_4x4P

+    times 4 dw  1

+    times 4 dw  2

+    times 4 dw -1

+    times 4 dw -2

 align 16

 wels_p1m1p1m1w_128:

@@ -780,6 +793,54 @@

     vextracti128  [%1+0x70], y%6, 1

 %endmacro

+%macro AVX2_Load4x4P 2                  ; arg 1 = destination register name suffix (expanded as a ymm register), arg 2 = source address

+    vmovdqu       y%1, [%2]             ; unaligned 32-byte load: 16 words, i.e. one 4x4 block of 16-bit values

+%endmacro

+%macro AVX2_Store4x4P 2                 ; arg 1 = destination address, arg 2 = source register name suffix (expanded as a ymm register)

+    vmovdqu       [%1], y%2             ; unaligned 32-byte store: 16 words, i.e. one 4x4 block of 16-bit values

+%endmacro

+; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit. Side effect: the pPixel register is advanced by one stride.

+; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6

+%macro AVX2_Loadzx4x4P 6

+    vmovd         x%1, [%2         ]                ; row 0 -> dword 0, remaining dwords zeroed

+    add           %2, %3                             ; pPixel now points at row 1 (the pointer register is modified)

+    vpbroadcastd  x%5, [%2 + 2 * %3]                ; row 3 replicated into every dword of the low lane

+    vpblendd      x%1, x%1, x%5, 1010b              ; dwords 1 and 3 from row 3: low lane = [row0,row3,0,row3]

+    vpbroadcastd  y%5, [%2         ]                ; row 1 replicated into every dword

+    vpbroadcastd  y%6, [%2 +     %3]                ; row 2 replicated into every dword

+    vpblendd      y%5, y%5, y%6, 10101010b          ; odd dwords from row 2: [row1,row2,row1,row2, ...]

+    vpblendd      y%1, y%1, y%5, 11110000b          ; low lane [row0,row3,0,row3], high lane [row1,row2,row1,row2]

+    vpshufb       y%1, y%1, %4                      ; byte shuffle by the mask argument; per the header this zero-extends to words (mask contents are defined outside this hunk)

+%endmacro

+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9 (pPixel1 and pPixel2 are each advanced by one stride)

+%macro AVX2_LoadDiff4x4P 9

+    AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8         ; out = 4x4 block at pPixel1 as 16 words (uses clobber args 7 and 8 as scratch)

+    AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9         ; clobber arg 7 = 4x4 block at pPixel2 as 16 words (uses args 8 and 9 as scratch)

+    vpsubw        y%1, y%1, y%7                     ; out = per-word difference: pPixel1 block - pPixel2 block

+%endmacro

+; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10 (data is destroyed; pRec and pPred are each advanced by one stride)

+%macro AVX2_StoreDiff4x4P 10

+    vpaddw         y%3, y%3, y%6                    ; add the rounding constant passed as the dw32 argument

+    vpsraw         y%3, y%3, 6                      ; arithmetic shift right by 6 of each word

+    AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10       ; prediction block as 16 words, same row-to-lane layout as the data

+    vpaddsw        y%3, y%3, y%8                    ; residual + prediction with signed word saturation

+    vpackuswb      y%3, y%3, y%3                    ; pack words to bytes with unsigned saturation (per 128-bit lane)

+    vbroadcasti128 y%8, [wels_shufb0231_128]        ; byte shuffle mask in both lanes (table is defined outside this hunk)

+    vpshufb        y%3, y%3, y%8                    ; reorder bytes back into row order; presumably undoes the load-time shuffle - confirm against the mask tables

+    vextracti128   x%8, y%3, 1                      ; upper lane (rows 1 and 2) into its own xmm register

+    vmovd          [%1         ], x%3               ; row 0: dword 0 of the low lane

+    add            %1, %2                            ; pRec now points at row 1 (the pointer register is modified)

+    vmovd          [%1         ], x%8               ; row 1: dword 0 of the upper lane

+    vpsrlq         x%8, x%8, 32                     ; bring dword 1 of the upper lane down to dword 0

+    vmovd          [%1     + %2], x%8               ; row 2

+    vpsrlq         x%3, x%3, 32                     ; bring dword 1 of the low lane down to dword 0

+    vmovd          [%1 + 2 * %2], x%3               ; row 3

+%endmacro

 ; 4-pt DCT

 ; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5

 %macro AVX2_DCT 5

@@ -836,6 +897,32 @@

     vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]

 %endmacro

+; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register.

+; Uses scrambled input to save a negation.

+; out: [y0,y1,y2,y3]=%1  in: [x0,x3,x1,x2]=%1  clobber=%2  (each xi/yi is a group of 4 words; both args are ymm registers)

+%macro AVX2_DCT_4x4P 2

+    vpsignw       %2, %1, [wels_4xp1w_4xm1w_256]         ; [x0,-x3,x1,-x2]

+    vpshufd       %1, %1, 4eh                            ; [x3,x0,x2,x1] (swap the 64-bit halves within each 128-bit lane)

+    vpaddw        %1, %1, %2                             ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]

+    vpmullw       %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]

+    vpermq        %1, %1, 4eh                            ; [s[2],s[3],s[0],s[1]] (swap the two 128-bit lanes)

+    vpaddw        %1, %1, %2                             ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]

+%endmacro

+; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register.

+; Output is scrambled to save a negation.

+; out: [y0,y3,y1,y2]=%1  in: [x0,x1,x2,x3]=%1  clobber=%2  (each xi/yi is a group of 4 words; both args are ymm registers)

+%macro AVX2_IDCT_4x4P 2

+    vpsraw        %2, %1, 1                              ; [x0>>1,x1>>1,x2>>1,x3>>1]

+    vpblendw      %2, %1, %2, 11110000b                  ; [x0,x1>>1,x2,x3>>1] (upper 4 words of each lane come from the shifted copy)

+    vpsignw       %1, %1, [wels_8xp1w_8xm1w]             ; [x0,x1,-x2,-x3]

+    vpermq        %2, %2, 4eh                            ; [x2,x3>>1,x0,x1>>1] (swap the two 128-bit lanes)

+    vpaddw        %1, %2, %1                             ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]

+    vpshufd       %2, %1, 4eh                            ; [s[1],s[0],s[3],s[2]] (swap the 64-bit halves within each 128-bit lane)

+    vpmullw       %1, %1, [wels_4xp1w_4xm1w_256]         ; [s[0],-s[1],s[2],-s[3], ...] (multiply by +1/-1 to negate alternate groups)

+    vpaddw        %1, %1, %2                             ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]

+%endmacro

 ;***********************************************************************

 ; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)

 ;***********************************************************************

@@ -898,6 +985,51 @@

     add r2, r3

     add r0, r1

     AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4

+    vzeroupper

+    POP_XMM

+    LOAD_5_PARA_POP

+    ret

+;***********************************************************************

+; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)

+;***********************************************************************

+WELS_EXTERN WelsDctT4_avx2                              ; one 4x4 block: residual pPixel1 - pPixel2, vertical then horizontal 4-pt DCT pass, 16 words written to pDct

+    %assign push_num 0

+    LOAD_5_PARA                                          ; project macro defined elsewhere; presumably maps the 5 arguments onto r0..r4 - confirm

+    PUSH_XMM 5                                           ; project macro defined elsewhere; ymm0..ymm4 are the vector registers used below

+    SIGN_EXTENSION r2, r2d                               ; project macro; presumably widens the int32_t iStride1 to pointer width - confirm

+    SIGN_EXTENSION r4, r4d                               ; same for iStride2

+    vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]     ; load-time shuffle/zero-extend mask in both lanes

+    AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4 ; ymm0 = 16 words of pPixel1 - pPixel2; r1 and r3 are advanced by one stride

+    AVX2_DCT_4x4P ymm0, ymm2                             ; vertical pass; ymm2 is scratch

+    vbroadcasti128 ymm1, [wels_shufb2301_128]            ; shuffle mask consumed by the horizontal pass

+    AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2                 ; horizontal pass (macro defined outside this hunk)

+    AVX2_Store4x4P r0, mm0                               ; write the 16 coefficients (32 bytes) to pDct

+    vzeroupper                                           ; clear the upper ymm halves before returning to non-VEX code

+    POP_XMM

+    LOAD_5_PARA_POP

+    ret

+;***********************************************************************

+; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);

+;***********************************************************************

+WELS_EXTERN WelsIDctT4Rec_avx2                          ; one 4x4 block: inverse transform of pDct, add prediction from pPred, store 4 rows of 4 bytes to pRec

+    %assign push_num 0

+    LOAD_5_PARA                                          ; project macro defined elsewhere; presumably maps the 5 arguments onto r0..r4 - confirm

+    PUSH_XMM 6                                           ; project macro defined elsewhere; ymm0..ymm5 are the vector registers used below

+    SIGN_EXTENSION r1, r1d                               ; project macro; presumably widens the int32_t iStride to pointer width - confirm

+    SIGN_EXTENSION r3, r3d                               ; same for iPredStride

+    AVX2_Load4x4P mm0, r4                                ; ymm0 = 16 coefficients (32 bytes) from pDct

+    vbroadcasti128 ymm4, [wels_shufb2301_128]            ; shuffle mask consumed by the horizontal pass

+    AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1                ; horizontal pass (macro defined outside this hunk)

+    AVX2_IDCT_4x4P ymm0, ymm1                            ; vertical pass; result is in scrambled group order [y0,y3,y1,y2]

+    vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]     ; load-time shuffle/zero-extend mask for the prediction pixels

+    vbroadcasti128 ymm5, [wels_dw32_128]                 ; rounding constant added before the >>6 in AVX2_StoreDiff4x4P (table defined outside this hunk)

+    AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3 ; round, shift, add prediction, saturate to bytes, store to pRec; r0 and r2 are advanced by one stride

     vzeroupper

     POP_XMM

--- a/test/encoder/EncUT_DecodeMbAux.cpp

+++ b/test/encoder/EncUT_DecodeMbAux.cpp

@@ -239,6 +239,10 @@

 TEST (DecodeMbAuxTest, WelsIDctT4Rec_sse2) {

   TestIDctT4Rec<int16_t> (WelsIDctT4Rec_sse2);

+TEST (DecodeMbAuxTest, WelsIDctT4Rec_avx2) { // runs the shared TestIDctT4Rec harness against the AVX2 implementation

+  if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only call the AVX2 routine when the CPU reports AVX2; otherwise the test body is empty and the test passes without checking anything

+    TestIDctT4Rec<int16_t> (WelsIDctT4Rec_avx2);

+}

 #endif

 template<typename clip_t>

 void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {

--- a/test/encoder/EncUT_EncoderMbAux.cpp

+++ b/test/encoder/EncUT_EncoderMbAux.cpp

@@ -208,6 +208,11 @@

   TestDctFourT4 (WelsDctFourT4_sse2);

+TEST (EncodeMbAuxTest, WelsDctT4_avx2) { // runs the shared TestDctT4 harness against the AVX2 implementation

+  if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only call the AVX2 routine when the CPU reports AVX2; otherwise the test body is empty and the test passes without checking anything

+    TestDctT4 (WelsDctT4_avx2);

+}

 TEST (EncodeMbAuxTest, WelsDctFourT4_avx2) {

   if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)

     TestDctFourT4 (WelsDctFourT4_avx2);

--

⑨