shithub: libvpx

--- a/test/dct16x16_test.cc

+++ b/test/dct16x16_test.cc

@@ -21,7 +21,7 @@

 extern "C" {

 #include "vp9/common/vp9_entropy.h"

 #include "./vp9_rtcd.h"

-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);

+void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *output, int pitch);

 #include "vpx/vpx_integer.h"

@@ -496,7 +496,7 @@

 INSTANTIATE_TEST_CASE_P(

     C, Trans16x16DCT,

     ::testing::Values(

-        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));

+        make_tuple(&vp9_short_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));

 INSTANTIATE_TEST_CASE_P(

     C, Trans16x16HT,

     ::testing::Values(

@@ -510,7 +510,7 @@

     SSE2, Trans16x16DCT,

     ::testing::Values(

         make_tuple(&vp9_short_fdct16x16_sse2,

-                   &vp9_short_idct16x16_add_sse2, 0)));

+                   &vp9_idct16x16_256_add_sse2, 0)));

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans16x16HT,

     ::testing::Values(

--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c

+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c

@@ -11,19 +11,19 @@

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_common.h"

-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,

+extern void vp9_idct16x16_256_add_neon_pass1(int16_t *input,

                                                int16_t *output,

                                                int output_stride);

-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,

+extern void vp9_idct16x16_256_add_neon_pass2(int16_t *src,

                                                int16_t *output,

                                                int16_t *pass1Output,

                                                int16_t skip_adding,

                                                uint8_t *dest,

                                                int dest_stride);

-extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input,

+extern void vp9_idct16x16_10_add_neon_pass1(int16_t *input,

                                                int16_t *output,

                                                int output_stride);

-extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,

+extern void vp9_idct16x16_10_add_neon_pass2(int16_t *src,

                                                int16_t *output,

                                                int16_t *pass1Output,

                                                int16_t skip_adding,

@@ -34,7 +34,7 @@

 extern void vp9_push_neon(int64_t *store);

 extern void vp9_pop_neon(int64_t *store);

-void vp9_short_idct16x16_add_neon(int16_t *input,

+void vp9_idct16x16_256_add_neon(int16_t *input,

                                   uint8_t *dest, int dest_stride) {

   int64_t store_reg[8];

   int16_t pass1_output[16*16] = {0};

@@ -46,12 +46,12 @@

   /* Parallel idct on the upper 8 rows */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7

   // which will be saved into row_idct_output.

-  vp9_short_idct16x16_add_neon_pass2(input+1,

+  vp9_idct16x16_256_add_neon_pass2(input+1,

                                      row_idct_output,

                                      pass1_output,

                                      0,

@@ -61,12 +61,12 @@

   /* Parallel idct on the lower 8 rows */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7

   // which will be saved into row_idct_output.

-  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,

+  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,

                                      row_idct_output+8,

                                      pass1_output,

                                      0,

@@ -76,12 +76,12 @@

   /* Parallel idct on the left 8 columns */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7.

   // Then add the result to the destination data.

-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,

+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,

                                      row_idct_output,

                                      pass1_output,

                                      1,

@@ -91,12 +91,12 @@

   /* Parallel idct on the right 8 columns */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7.

   // Then add the result to the destination data.

-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,

+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,

                                      row_idct_output+8,

                                      pass1_output,

                                      1,

@@ -109,7 +109,7 @@

   return;

-void vp9_short_idct16x16_10_add_neon(int16_t *input,

+void vp9_idct16x16_10_add_neon(int16_t *input,

                                   uint8_t *dest, int dest_stride) {

   int64_t store_reg[8];

   int16_t pass1_output[16*16] = {0};

@@ -121,12 +121,12 @@

   /* Parallel idct on the upper 8 rows */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

+  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7

   // which will be saved into row_idct_output.

-  vp9_short_idct16x16_10_add_neon_pass2(input+1,

+  vp9_idct16x16_10_add_neon_pass2(input+1,

                                         row_idct_output,

                                         pass1_output,

                                         0,

@@ -138,12 +138,12 @@

   /* Parallel idct on the left 8 columns */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7.

   // Then add the result to the destination data.

-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,

+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,

                                      row_idct_output,

                                      pass1_output,

                                      1,

@@ -153,12 +153,12 @@

   /* Parallel idct on the right 8 columns */

   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

   // stage 6 result in pass1_output.

-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

   // with result in pass1(pass1_output) to calculate final result in stage 7.

   // Then add the result to the destination data.

-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,

+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,

                                      row_idct_output+8,

                                      pass1_output,

                                      1,

--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm

+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm

@@ -8,7 +8,7 @@

-    EXPORT  |vp9_short_idct16x16_1_add_neon|

+    EXPORT  |vp9_idct16x16_1_add_neon|

     ARM

     REQUIRE8

     PRESERVE8

@@ -15,7 +15,7 @@

     AREA ||.text||, CODE, READONLY, ALIGN=2

-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,

+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,

 ;                                    int dest_stride)

 ; r0  int16_t input

@@ -22,7 +22,7 @@

 ; r1  uint8_t *dest

 ; r2  int dest_stride)

-|vp9_short_idct16x16_1_add_neon| PROC

+|vp9_idct16x16_1_add_neon| PROC

     ldrsh            r0, [r0]

     ; generate cospi_16_64 = 11585

@@ -193,6 +193,6 @@

     vst1.64          {d31}, [r12], r2

     bx               lr

-    ENDP             ; |vp9_short_idct16x16_1_add_neon|

+    ENDP             ; |vp9_idct16x16_1_add_neon|

     END

--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm

+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm

@@ -8,10 +8,10 @@

 ;  be found in the AUTHORS file in the root of the source tree.

-    EXPORT  |vp9_short_idct16x16_add_neon_pass1|

-    EXPORT  |vp9_short_idct16x16_add_neon_pass2|

-    EXPORT  |vp9_short_idct16x16_10_add_neon_pass1|

-    EXPORT  |vp9_short_idct16x16_10_add_neon_pass2|

+    EXPORT  |vp9_idct16x16_256_add_neon_pass1|

+    EXPORT  |vp9_idct16x16_256_add_neon_pass2|

+    EXPORT  |vp9_idct16x16_10_add_neon_pass1|

+    EXPORT  |vp9_idct16x16_10_add_neon_pass2|

     ARM

     REQUIRE8

     PRESERVE8

@@ -36,7 +36,7 @@

     MEND

     AREA    Block, CODE, READONLY ; name this block of code

-;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,

+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,

 ;                                          int16_t *output, int output_stride)

 ; r0  int16_t input

@@ -46,7 +46,7 @@

 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output

 ; will be stored back into q8-q15 registers. This function will touch q0-q7

 ; registers and use them as buffer during calculation.

-|vp9_short_idct16x16_add_neon_pass1| PROC

+|vp9_idct16x16_256_add_neon_pass1| PROC

     ; TODO(hkuang): Find a better way to load the elements.

     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15

@@ -273,9 +273,9 @@

     vst1.64         {d31}, [r1], r2

     bx              lr

-    ENDP  ; |vp9_short_idct16x16_add_neon_pass1|

+    ENDP  ; |vp9_idct16x16_256_add_neon_pass1|

-;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,

+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,

 ;                                        int16_t *output,

 ;                                        int16_t *pass1Output,

 ;                                        int16_t skip_adding,

@@ -292,7 +292,7 @@

 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output

 ; will be stored back into q8-q15 registers. This function will touch q0-q7

 ; registers and use them as buffer during calculation.

-|vp9_short_idct16x16_add_neon_pass2| PROC

+|vp9_idct16x16_256_add_neon_pass2| PROC

     push            {r3-r9}

     ; TODO(hkuang): Find a better way to load the elements.

@@ -784,9 +784,9 @@

 end_idct16x16_pass2

     pop             {r3-r9}

     bx              lr

-    ENDP  ; |vp9_short_idct16x16_add_neon_pass2|

+    ENDP  ; |vp9_idct16x16_256_add_neon_pass2|

-;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input,

+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,

 ;                                             int16_t *output, int output_stride)

 ; r0  int16_t input

@@ -796,7 +796,7 @@

 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output

 ; will be stored back into q8-q15 registers. This function will touch q0-q7

 ; registers and use them as buffer during calculation.

-|vp9_short_idct16x16_10_add_neon_pass1| PROC

+|vp9_idct16x16_10_add_neon_pass1| PROC

     ; TODO(hkuang): Find a better way to load the elements.

     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15

@@ -905,9 +905,9 @@

     vst1.64         {d31}, [r1], r2

     bx              lr

-    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass1|

+    ENDP  ; |vp9_idct16x16_10_add_neon_pass1|

-;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,

+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,

 ;                                           int16_t *output,

 ;                                           int16_t *pass1Output,

 ;                                           int16_t skip_adding,

@@ -924,7 +924,7 @@

 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output

 ; will be stored back into q8-q15 registers. This function will touch q0-q7

 ; registers and use them as buffer during calculation.

-|vp9_short_idct16x16_10_add_neon_pass2| PROC

+|vp9_idct16x16_10_add_neon_pass2| PROC

     push            {r3-r9}

     ; TODO(hkuang): Find a better way to load the elements.

@@ -1175,5 +1175,5 @@

 end_idct10_16x16_pass2

     pop             {r3-r9}

     bx              lr

-    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass2|

+    ENDP  ; |vp9_idct16x16_10_add_neon_pass2|

     END

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -611,7 +611,7 @@

   output[15] = step2[0] - step2[15];

-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

+void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int16_t out[16 * 16];

   int16_t *outptr = out;

   int i, j;

@@ -838,7 +838,7 @@

                                   + dest[j * dest_stride + i]);  }

-void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest,

+void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest,

                                   int dest_stride) {

   int16_t out[16 * 16] = { 0 };

   int16_t *outptr = out;

@@ -864,7 +864,7 @@

-void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,

+void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest,

                                  int dest_stride) {

   int i, j;

   int a1;

@@ -1333,17 +1333,17 @@

-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) {

+void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) {

   /* The calculation can be simplified if there are not many non-zero dct

    * coefficients. Use eobs to separate different cases. */

   if (eob) {

     if (eob == 1)

       /* DC only DCT coefficient. */

-      vp9_short_idct16x16_1_add(input, dest, stride);

+      vp9_idct16x16_1_add(input, dest, stride);

     else if (eob <= 10)

-      vp9_short_idct16x16_10_add(input, dest, stride);

+      vp9_idct16x16_10_add(input, dest, stride);

     else

-      vp9_short_idct16x16_add(input, dest, stride);

+      vp9_idct16x16_256_add(input, dest, stride);

@@ -1379,7 +1379,7 @@

 void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,

                          int stride, int eob) {

   if (tx_type == DCT_DCT) {

-    vp9_idct_add_16x16(input, dest, stride, eob);

+    vp9_idct16x16_add(input, dest, stride, eob);

   } else {

     if (eob > 0) {

       vp9_short_iht16x16_add(input, dest, stride, tx_type);

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -91,7 +91,7 @@

 void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob);

-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob);

+void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -282,14 +282,14 @@

 prototype void vp9_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_idct8x8_10_add sse2 neon

-prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_idct16x16_1_add sse2 neon

+prototype void vp9_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_idct16x16_1_add sse2 neon

-prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_idct16x16_add sse2 neon

+prototype void vp9_idct16x16_256_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_idct16x16_256_add sse2 neon

-prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_idct16x16_10_add sse2 neon

+prototype void vp9_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_idct16x16_10_add sse2 neon

 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct32x32_add sse2 neon

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -1263,7 +1263,7 @@

                            stp2_10, stp2_13, stp2_11, stp2_12) \

-void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const __m128i final_rounding = _mm_set1_epi16(1<<5);

   const __m128i zero = _mm_setzero_si128();

@@ -1470,7 +1470,7 @@

-void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   __m128i dc_value;

   const __m128i zero = _mm_setzero_si128();

   int a, i;

@@ -2456,7 +2456,7 @@

   write_buffer_8x16(dest, in1, stride);

-void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,

+void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,

                                      int stride) {

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const __m128i final_rounding = _mm_set1_epi16(1<<5);

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -454,7 +454,7 @@

       vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);

       break;

     case TX_16X16:

-      vp9_idct_add_16x16(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);

+      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);

       break;

     case TX_8X8:

       vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);

--

⑨