shithub: openh264

Download patch

ref: b267163f103132497b17ae3fee5249e362c75b2e
parent: b9adbcf37cf478268ad4647d1028a9dbd0332797
author: Sindre Aamås <saamas@cisco.com>
date: Mon Jan 18 15:43:28 EST 2016

[Encoder] Add an AVX2 4x4 DCT implementation

~2.52x faster on Haswell as compared to the SSE2 version.

--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -90,6 +90,7 @@
  ****************************************************************************/
 void WelsDctT4_mmx (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 
 /****************************************************************************
  * HDM and Quant functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -522,6 +522,9 @@
   if (uiCpuFlag & WELS_CPU_SSSE3) {
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
   }
+  if (uiCpuFlag & WELS_CPU_AVX2) {
+    pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;
+  }
 
 //#endif//MACOS
 
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -42,12 +42,22 @@
 
 %include "asm_inc.asm"
 
-SECTION .rodata align=16
+SECTION .rodata align=32
 
 ;***********************************************************************
 ; Constant
 ;***********************************************************************
 
+align 32
+wels_p1m1p1m1w_256:
+    times 8 dw 1, -1
+wels_p1p2p1p2w_256:
+    times 8 dw 1, 2
+wels_rev64w_256:
+    times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
+wels_p1m1m1p1w_256:
+    times 4 dw 1, -1, -1, 1
+
 align 16
 SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
             dw  10, 13, 10, 13, 13, 16, 13, 16,
@@ -62,7 +72,6 @@
             dw  18, 23, 18, 23, 23, 29, 23, 29,
             dw  18, 23, 18, 23, 23, 29, 23, 29
 
-
 ;***********************************************************************
 ; MMX functions
 ;***********************************************************************
@@ -501,4 +510,102 @@
     movdqa  [r0+16],   xmm2
 
     POP_XMM
+    ret
+
+;***********************************************************************
+; AVX2 functions
+;***********************************************************************
+
+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 zero=%6 clobber=%7,%8
+%macro AVX2_LoadDiff16P 8
+    vmovq         x%1, [%2         ]
+    vpbroadcastq  y%7, [%2 + 4 * %3]
+    vpblendd      y%1, y%1, y%7, 11110000b
+    vpunpcklbw    y%1, y%1, y%6
+    vmovq         x%7, [%4         ]
+    vpbroadcastq  y%8, [%4 + 4 * %5]
+    vpblendd      y%7, y%7, y%8, 11110000b
+    vpunpcklbw    y%7, y%7, y%6
+    vpsubw        y%1, y%1, y%7
+%endmacro
+
+; pDct=%1 data=%1,%2,%3,%4 clobber=%5
+%macro AVX2_Store4x16P 6
+    vpunpcklqdq   y%6, y%2,  y%3
+    vmovdqa       [%1+0x00], x%6
+    vextracti128  [%1+0x40], y%6, 1
+    vpunpckhqdq   y%6, y%2,  y%3
+    vmovdqa       [%1+0x20], x%6
+    vextracti128  [%1+0x60], y%6, 1
+    vpunpcklqdq   y%6, y%4,  y%5
+    vmovdqa       [%1+0x10], x%6
+    vextracti128  [%1+0x50], y%6, 1
+    vpunpckhqdq   y%6, y%4,  y%5
+    vmovdqa       [%1+0x30], x%6
+    vextracti128  [%1+0x70], y%6, 1
+%endmacro
+
+; 4-pt DCT
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_DCT 5
+    vpsubw        %5, %1, %4  ; s3 = x0 - x3
+    vpaddw        %1, %1, %4  ; s0 = x0 + x3
+    vpsubw        %4, %2, %3  ; s2 = x1 - x2
+    vpaddw        %2, %2, %3  ; s1 = x1 + x2
+    vpsubw        %3, %1, %2  ; y2 = s0 - s1
+    vpaddw        %1, %1, %2  ; y0 = s0 + s1
+    vpsllw        %2, %5, 1
+    vpaddw        %2, %2, %4  ; y1 = 2 * s3 + s2
+    vpsllw        %4, %4, 1
+    vpsubw        %4, %5, %4  ; y3 = s3 - 2 * s2
+%endmacro
+
+; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
+; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
+%macro AVX2_DCT_HORIZONTAL 3
+    vpsignw       %3, %1, [wels_p1m1p1m1w_256]  ; [x[0],-x[1],x[2],-x[3], ...]
+    vpshufb       %1, %1, %2                    ; [x[3],x[2],x[1],x[0], ...]
+    vpaddw        %1, %1, %3                    ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+    vpmullw       %3, %1, [wels_p1m1m1p1w_256]  ; [s[0],-s[1],-s[2],s[3], ...]
+    vpshufd       %1, %1, 0b1h                  ; [s[2],s[3],s[0],s[1], ...]
+    vpmullw       %1, %1, [wels_p1p2p1p2w_256]  ; [s[2],2*s[3],s[0],2*s[1], ...]
+    vpaddw        %1, %1, %3                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_avx2
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r4, r4d
+
+    vpxor ymm6, ymm6, ymm6
+
+    ;Load 4x16
+    AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
+    add r1, r2
+    add r3, r4
+    AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
+    add r1, r2
+    add r3, r4
+    AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
+    add r1, r2
+    add r3, r4
+    AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
+
+    AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
+    vmovdqa ymm6, [wels_rev64w_256]
+    AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
+    AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
+    AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
+    AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
+
+    AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
+    vzeroupper
+
+    POP_XMM
+    LOAD_5_PARA_POP
     ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -198,6 +198,10 @@
   TestDctFourT4 (WelsDctFourT4_sse2);
 }
 
+TEST (EncodeMbAuxTest, WelsDctFourT4_avx2) {
+  TestDctFourT4 (WelsDctFourT4_avx2);
+}
+
 TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
   CMemoryAlign cMemoryAlign (0);
   ALLOC_MEMORY (int16_t, iDctC, 16);