shithub: openh264

Download patch

ref: c8c74903f83931816ccd7b5297d38ee7b93431b9
parent: f90960983cb365c3a772ded1e3e5c2f67157e73d
author: Sindre Aamås <saamas@cisco.com>
date: Wed Jan 27 08:23:52 EST 2016

[Encoder] Add single-block AVX2 4x4 DCT/IDCT routines

We do four blocks at a time when possible, but need to handle
single blocks at a time for intra prediction.

~3.15x speedup over MMX for the DCT on Haswell.
~2.94x speedup over MMX for the IDCT on Haswell.

Returns diminish with increasing vector length because a larger
proportion of the time is spent on load/store/shuffling.

--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -69,6 +69,7 @@
 void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
                                int16_t* pDctDc);
+void WelsIDctT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 void WelsIDctFourT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 #endif//X86_ASM
 
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -91,6 +91,7 @@
 void WelsDctT4_mmx (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 void WelsDctT4_sse2 (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+void WelsDctT4_avx2 (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 
 /****************************************************************************
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -271,6 +271,7 @@
     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_sse2;
   }
   if (uiCpuFlag & WELS_CPU_AVX2) {
+    pFuncList->pfIDctT4     = WelsIDctT4Rec_avx2;
     pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
   }
 #endif//X86_ASM
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -524,6 +524,7 @@
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
   }
   if (uiCpuFlag & WELS_CPU_AVX2) {
+    pFuncList->pfDctT4                  = WelsDctT4_avx2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;
   }
 
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -63,6 +63,19 @@
     times 4 dw 1, 2, -1, -2
 wels_p1p1m1m1w_256:
     times 4 dw 1, 1, -1, -1
+wels_8xp1w_8xm1w:                       ; 16 words: +1 x8, -1 x8 (vpsignw operand in AVX2_IDCT_4x4P)
+    times 8 dw  1
+    times 8 dw -1
+wels_4xp1w_4xm1w_256:                   ; 16 words: +1 x4, -1 x4, twice (used by AVX2_DCT_4x4P and AVX2_IDCT_4x4P)
+    times 4 dw  1
+    times 4 dw -1
+    times 4 dw  1
+    times 4 dw -1
+wels_4xp1w_4xp2w_4xm1w_4xm2w:           ; 16 words: +1 x4, +2 x4, -1 x4, -2 x4 (vpmullw operand in AVX2_DCT_4x4P)
+    times 4 dw  1
+    times 4 dw  2
+    times 4 dw -1
+    times 4 dw -2
 
 align 16
 wels_p1m1p1m1w_128:
@@ -780,6 +793,54 @@
     vextracti128  [%1+0x70], y%6, 1
 %endmacro
 
+%macro AVX2_Load4x4P 2
+    vmovdqu       y%1, [%2]                    ; unaligned 32-byte load (16 words) into the ymm register
+%endmacro
+
+%macro AVX2_Store4x4P 2
+    vmovdqu       [%1], y%2                    ; unaligned 32-byte store (16 words) from the ymm register
+%endmacro
+
+; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit. Side effect: pPixel is advanced by iStride.
+; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6
+%macro AVX2_Loadzx4x4P 6
+    vmovd         x%1, [%2         ]            ; line 0 -> dword 0, remaining bits zeroed
+    add           %2, %3                        ; pointer now addresses line 1 (not restored afterwards)
+    vpbroadcastd  x%5, [%2 + 2 * %3]            ; line 3 replicated into every dword
+    vpblendd      x%1, x%1, x%5, 1010b          ; low lane dwords = [line0,line3,0,line3]
+    vpbroadcastd  y%5, [%2         ]            ; line 1 replicated into every dword
+    vpbroadcastd  y%6, [%2 +     %3]            ; line 2 replicated into every dword
+    vpblendd      y%5, y%5, y%6, 10101010b      ; dwords alternate [line1,line2,...]
+    vpblendd      y%1, y%1, y%5, 11110000b      ; keep low lane, high lane = [line1,line2,line1,line2]
+    vpshufb       y%1, y%1, %4                  ; byte shuffle + zero extend per the supplied mask (mask data defined elsewhere)
+%endmacro
+
+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9 (both pixel pointers are advanced by one stride)
+%macro AVX2_LoadDiff4x4P 9
+    AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8
+    AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9
+    vpsubw        y%1, y%1, y%7                 ; out = first block minus second block, as 16-bit words
+%endmacro
+
+; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10 (pRec and pPred are each advanced by one stride; data is destroyed)
+%macro AVX2_StoreDiff4x4P 10
+    vpaddw         y%3, y%3, y%6                ; add the dw32 rounding term
+    vpsraw         y%3, y%3, 6                  ; arithmetic shift right by 6
+    AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
+    vpaddsw        y%3, y%3, y%8                ; add prediction with signed saturation
+    vpackuswb      y%3, y%3, y%3                ; pack words to bytes with unsigned saturation
+    vbroadcasti128 y%8, [wels_shufb0231_128]    ; same byte-shuffle mask in both lanes
+    vpshufb        y%3, y%3, y%8                ; reorder bytes per mask (presumably undoes the load-time scramble; mask data defined elsewhere)
+    vextracti128   x%8, y%3, 1                  ; copy out the high lane
+    vmovd          [%1         ], x%3           ; low-lane dword 0 -> line 0
+    add            %1, %2                       ; destination pointer now addresses line 1
+    vmovd          [%1         ], x%8           ; high-lane dword 0 -> line 1
+    vpsrlq         x%8, x%8, 32                 ; bring high-lane dword 1 down
+    vmovd          [%1     + %2], x%8           ; high-lane dword 1 -> line 2
+    vpsrlq         x%3, x%3, 32                 ; bring low-lane dword 1 down
+    vmovd          [%1 + 2 * %2], x%3           ; low-lane dword 1 -> line 3
+%endmacro
+
 ; 4-pt DCT
 ; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
 %macro AVX2_DCT 5
@@ -836,6 +897,32 @@
     vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
 %endmacro
 
+; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register (each xN/yN/s[N] below is one row of 4 words).
+; Uses scrambled input to save a negation.
+; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
+%macro AVX2_DCT_4x4P 2
+    vpsignw       %2, %1, [wels_4xp1w_4xm1w_256]         ; [x0,-x3,x1,-x2]
+    vpshufd       %1, %1, 4eh                            ; swap 64-bit halves of each lane: [x3,x0,x2,x1]
+    vpaddw        %1, %1, %2                             ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+    vpmullw       %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
+    vpermq        %1, %1, 4eh                            ; swap 128-bit lanes: [s[2],s[3],s[0],s[1]]
+    vpaddw        %1, %1, %2                             ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
+%endmacro
+
+; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register (each xN/yN/s[N] below is one row of 4 words).
+; Output is scrambled to save a negation.
+; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 clobber=%2
+%macro AVX2_IDCT_4x4P 2
+    vpsraw        %2, %1, 1                              ; [x0>>1,x1>>1,x2>>1,x3>>1]
+    vpblendw      %2, %1, %2, 11110000b                  ; take shifted words for the upper half of each lane: [x0,x1>>1,x2,x3>>1]
+    vpsignw       %1, %1, [wels_8xp1w_8xm1w]             ; negate the high lane: [x0,x1,-x2,-x3]
+    vpermq        %2, %2, 4eh                            ; swap 128-bit lanes: [x2,x3>>1,x0,x1>>1]
+    vpaddw        %1, %2, %1                             ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
+    vpshufd       %2, %1, 4eh                            ; swap 64-bit halves of each lane: [s[1],s[0],s[3],s[2]]
+    vpmullw       %1, %1, [wels_4xp1w_4xm1w_256]         ; [s[0],-s[1],s[2],-s[3]]
+    vpaddw        %1, %1, %2                             ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
+%endmacro
+
 ;***********************************************************************
 ; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
 ;***********************************************************************
@@ -898,6 +985,51 @@
     add r2, r3
     add r0, r1
     AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4
+    vzeroupper
+
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
+
+;***********************************************************************
+; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;***********************************************************************
+WELS_EXTERN WelsDctT4_avx2
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 5                                  ; vector registers 0-4 are used below
+    SIGN_EXTENSION r2, r2d                      ; iStride1 is a 32-bit int used in address arithmetic
+    SIGN_EXTENSION r4, r4d                      ; iStride2 likewise
+
+    vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128] ; load-time shuffle mask, same in both lanes
+    AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4 ; ymm0 = pPixel1 block - pPixel2 block, as words
+    AVX2_DCT_4x4P ymm0, ymm2                    ; vertical pass
+    vbroadcasti128 ymm1, [wels_shufb2301_128]   ; shuffle mask consumed by the horizontal pass
+    AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2        ; horizontal pass
+    AVX2_Store4x4P r0, mm0                      ; write the 16 coefficients to pDct
+    vzeroupper                                  ; clear upper ymm halves before returning
+
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
+
+;***********************************************************************
+; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_avx2
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+
+    AVX2_Load4x4P mm0, r4
+    vbroadcasti128 ymm4, [wels_shufb2301_128]
+    AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
+    AVX2_IDCT_4x4P ymm0, ymm1
+    vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
+    vbroadcasti128 ymm5, [wels_dw32_128]
+    AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
     vzeroupper
 
     POP_XMM
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -239,6 +239,10 @@
 TEST (DecodeMbAuxTest, WelsIDctT4Rec_sse2) {
   TestIDctT4Rec<int16_t> (WelsIDctT4Rec_sse2);
 }
+TEST (DecodeMbAuxTest, WelsIDctT4Rec_avx2) {
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // run only when the AVX2 flag is reported; otherwise the test body is empty
+    TestIDctT4Rec<int16_t> (WelsIDctT4Rec_avx2);
+}
 #endif
 template<typename clip_t>
 void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -208,6 +208,11 @@
   TestDctFourT4 (WelsDctFourT4_sse2);
 }
 
+TEST (EncodeMbAuxTest, WelsDctT4_avx2) {
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // run only when the AVX2 flag is reported; otherwise the test body is empty
+    TestDctT4 (WelsDctT4_avx2);
+}
+
 TEST (EncodeMbAuxTest, WelsDctFourT4_avx2) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
     TestDctFourT4 (WelsDctFourT4_avx2);