shithub: libvpx

Download patch

ref: 9412785b0241e5ada129b96b74827881b3fb0da5
parent: ff3baaef94cc8fb4548f95463ac9dc8bd662c436
parent: b466ad5efc0cee97b5e3e31f7aeb72685beed45e
author: Jingning Han <jingning@google.com>
date: Fri May 9 05:58:39 EDT 2014

Merge changes I3edd4b95,I4514f974,Ie7fa4386

* changes:
  Turn on unit tests for SSSE3 8x8 forward and inverse 2D-DCT
  Change eob threshold for partial inverse 8x8 2D-DCT to 12
  SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero

--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -340,4 +340,11 @@
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
 #endif
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+#endif
 }  // namespace
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -132,8 +132,8 @@
                    &vp9_idct16x16_1_add_c,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_c,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_c,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_c,
                    TX_8X8, 1),
@@ -154,8 +154,8 @@
                    &vp9_idct16x16_1_add_neon,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_neon,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_neon,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_neon,
                    TX_8X8, 1),
@@ -181,8 +181,8 @@
                    &vp9_idct16x16_1_add_sse2,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_sse2,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_sse2,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_sse2,
                    TX_8X8, 1),
@@ -189,5 +189,14 @@
         make_tuple(&vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_sse2,
                    TX_4X4, 1)));
+#endif
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_12_add_ssse3,
+                   TX_8X8, 12)));
 #endif
 }  // namespace
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
@@ -9,7 +9,7 @@
 ;
 
     EXPORT  |vp9_idct8x8_64_add_neon|
-    EXPORT  |vp9_idct8x8_10_add_neon|
+    EXPORT  |vp9_idct8x8_12_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -310,13 +310,13 @@
     bx              lr
     ENDP  ; |vp9_idct8x8_64_add_neon|
 
-;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct8x8_10_add_neon| PROC
+|vp9_idct8x8_12_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_idct8x8_10_add_neon|
+    ENDP  ; |vp9_idct8x8_12_add_neon|
 
     END
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -617,7 +617,7 @@
   }
 }
 
-void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
   DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
   int16_t *outptr = out;
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -421,7 +421,7 @@
   }
 }
 
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -1348,8 +1348,8 @@
   if (eob == 1)
     // DC only DCT coefficient
     vp9_idct8x8_1_add(input, dest, stride);
-  else if (eob <= 10)
-    vp9_idct8x8_10_add(input, dest, stride);
+  else if (eob <= 12)
+    vp9_idct8x8_12_add(input, dest, stride);
   else
     vp9_idct8x8_64_add(input, dest, stride);
 }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -312,8 +312,8 @@
 add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
 specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
 
-add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
+add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
 specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -995,7 +995,7 @@
   RECON_AND_STORE(dest, in[7]);
 }
 
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
--- a/vp9/common/x86/vp9_idct_ssse3.asm
+++ b/vp9/common/x86/vp9_idct_ssse3.asm
@@ -28,6 +28,29 @@
 TRANSFORM_COEFFS    3196, 16069
 TRANSFORM_COEFFS   13623,  9102
 
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540
+PAIR_PP_COEFFS      6392, 32138
+PAIR_MP_COEFFS     18204, 27246
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
 SECTION .text
 
 %if ARCH_X86_64
@@ -128,6 +151,7 @@
 %endmacro
 
 INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
 cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova     m8, [pd_8192]
   mova    m11, [pw_16]
@@ -159,4 +183,118 @@
   ADD_STORE_8P_2X  6, 7, 9, 10, 12
 
   RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
 %endif