shithub: libvpx

--- a/test/lpf_8_test.cc

+++ b/test/lpf_8_test.cc

@@ -523,8 +523,10 @@

     ::testing::Values(

         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_sse2>,

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_sse2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_sse2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>,

                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_sse2>,

@@ -538,9 +540,10 @@

 INSTANTIATE_TEST_CASE_P(

     AVX2, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,

-                   2)));

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_avx2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_avx2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1)));

 #endif

 #if HAVE_SSE2

@@ -597,10 +600,10 @@

 #if HAVE_NEON_ASM

 // Using #if inside the macro is unsupported on MSVS but the tests are not

 // currently built for MSVS with ARM and NEON.

-        make_tuple(&vpx_lpf_horizontal_16_neon,

-                   &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_neon,

-                   &vpx_lpf_horizontal_16_c, 8, 2),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_neon>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_neon>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_neon>,

                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>,

@@ -638,10 +641,10 @@

                    &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_dspr2,

-                   &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_dspr2,

-                   &vpx_lpf_horizontal_16_c, 8, 2),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_dspr2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_dspr2>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_dspr2>,

                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_dspr2>,

@@ -672,8 +675,10 @@

                    &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_msa>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_msa>,

+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>,

                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_msa>,

--- a/vp10/common/loopfilter.c

+++ b/vp10/common/loopfilter.c

@@ -512,12 +512,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 2);

+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                     lfi->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1);

+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                    lfi->hev_thr);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -512,12 +512,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 2);

+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                     lfi->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1);

+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                    lfi->hev_thr);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

--- a/vpx_dsp/arm/loopfilter_mb_neon.asm

+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm

@@ -8,27 +8,28 @@

 ;  be found in the AUTHORS file in the root of the source tree.

-    EXPORT  |vpx_lpf_horizontal_16_neon|

+    EXPORT  |vpx_lpf_horizontal_edge_8_neon|

+    EXPORT  |vpx_lpf_horizontal_edge_16_neon|

     EXPORT  |vpx_lpf_vertical_16_neon|

ARM

     AREA ||.text||, CODE, READONLY, ALIGN=2

-; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,

-;                                 const uint8_t *blimit,

-;                                 const uint8_t *limit,

-;                                 const uint8_t *thresh

-;                                 int count)

+; void mb_lpf_horizontal_edge(uint8_t *s, int p,

+;                             const uint8_t *blimit,

+;                             const uint8_t *limit,

+;                             const uint8_t *thresh,

+;                             int count)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-|vpx_lpf_horizontal_16_neon| PROC

+; r12   int count

+|mb_lpf_horizontal_edge| PROC

     push        {r4-r8, lr}

     vpush       {d8-d15}

     ldr         r4, [sp, #88]              ; load thresh

-    ldr         r12, [sp, #92]             ; load count

 h_count

     vld1.8      {d16[]}, [r2]              ; load *blimit

@@ -115,7 +116,35 @@

     vpop        {d8-d15}

     pop         {r4-r8, pc}

-    ENDP        ; |vpx_lpf_horizontal_16_neon|

+    ENDP        ; |mb_lpf_horizontal_edge|

+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,

+;                                     const uint8_t *blimit,

+;                                     const uint8_t *limit,

+;                                     const uint8_t *thresh)

+; r0    uint8_t *s,

+; r1    int pitch,

+; r2    const uint8_t *blimit,

+; r3    const uint8_t *limit,

+; sp    const uint8_t *thresh

+|vpx_lpf_horizontal_edge_8_neon| PROC

+    mov r12, #1

+    b mb_lpf_horizontal_edge

+    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|

+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,

+;                                      const uint8_t *blimit,

+;                                      const uint8_t *limit,

+;                                      const uint8_t *thresh)

+; r0    uint8_t *s,

+; r1    int pitch,

+; r2    const uint8_t *blimit,

+; r3    const uint8_t *limit,

+; sp    const uint8_t *thresh

+|vpx_lpf_horizontal_edge_16_neon| PROC

+    mov r12, #2

+    b mb_lpf_horizontal_edge

+    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|

 ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,

 ;                               const uint8_t *blimit,

--- a/vpx_dsp/loopfilter.c

+++ b/vpx_dsp/loopfilter.c

@@ -289,9 +289,9 @@

-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,

-                             const uint8_t *limit, const uint8_t *thresh,

-                             int count) {

+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,

+                                     const uint8_t *limit,

+                                     const uint8_t *thresh, int count) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

@@ -313,6 +313,16 @@

              s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);

     ++s;

+}

+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,

+                                 const uint8_t *limit, const uint8_t *thresh) {

+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);

+}

+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,

+                                  const uint8_t *limit, const uint8_t *thresh) {

+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);

 static void mb_lpf_vertical_edge_w(uint8_t *s, int p,

--- a/vpx_dsp/mips/loopfilter_16_msa.c

+++ b/vpx_dsp/mips/loopfilter_16_msa.c

@@ -423,11 +423,11 @@

-void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,

-                               const uint8_t *b_limit_ptr,

-                               const uint8_t *limit_ptr,

-                               const uint8_t *thresh_ptr,

-                               int32_t count) {

+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,

+                                   const uint8_t *b_limit_ptr,

+                                   const uint8_t *limit_ptr,

+                                   const uint8_t *thresh_ptr,

+                                   int32_t count) {

   if (1 == count) {

     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;

     uint64_t dword0, dword1;

@@ -646,6 +646,20 @@

     vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,

                                    thresh_ptr, count);

+}

+void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,

+                                   const uint8_t *b_limit_ptr,

+                                   const uint8_t *limit_ptr,

+                                   const uint8_t *thresh_ptr) {

+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);

+}

+void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,

+                                    const uint8_t *b_limit_ptr,

+                                    const uint8_t *limit_ptr,

+                                    const uint8_t *thresh_ptr) {

+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);

 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,

--- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

@@ -19,12 +19,12 @@

 #include "vpx_mem/vpx_mem.h"

 #if HAVE_DSPR2

-void vpx_lpf_horizontal_16_dspr2(unsigned char *s,

-                                 int pitch,

-                                 const uint8_t *blimit,

-                                 const uint8_t *limit,

-                                 const uint8_t *thresh,

-                                 int count) {

+static void mb_lpf_horizontal_edge(unsigned char *s,

+                                   int pitch,

+                                   const uint8_t *blimit,

+                                   const uint8_t *limit,

+                                   const uint8_t *thresh,

+                                   int count) {

   uint32_t  mask;

   uint32_t  hev, flat, flat2;

   uint8_t   i;

@@ -790,5 +790,19 @@

     s = s + 4;

+}

+void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,

+                                     const uint8_t *blimit,

+                                     const uint8_t *limit,

+                                     const uint8_t *thresh) {

+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);

+}

+void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,

+                                      const uint8_t *blimit,

+                                      const uint8_t *limit,

+                                      const uint8_t *thresh) {

+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);

 #endif  // #if HAVE_DSPR2

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -548,9 +548,13 @@

 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;

-add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

-specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;

-$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;

+add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

+specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;

+$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;

+add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

+specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;

+$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;

 add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;

--- a/vpx_dsp/x86/loopfilter_avx2.c

+++ b/vpx_dsp/x86/loopfilter_avx2.c

@@ -13,9 +13,10 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_ports/mem.h"

-static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,

+                                    const unsigned char *_blimit,

+                                    const unsigned char *_limit,

+                                    const unsigned char *_thresh) {

     __m128i mask, hev, flat, flat2;

     const __m128i zero = _mm_set1_epi16(0);

     const __m128i one = _mm_set1_epi8(1);

@@ -400,9 +401,10 @@

   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128

};

-static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,

+                                     const unsigned char *_blimit,

+                                     const unsigned char *_limit,

+                                     const unsigned char *_thresh) {

     __m128i mask, hev, flat, flat2;

     const __m128i zero = _mm_set1_epi16(0);

     const __m128i one = _mm_set1_epi8(1);

@@ -974,13 +976,4 @@

         q6 = _mm_or_si128(flat2_q6, q6);

         _mm_storeu_si128((__m128i *) (s + 6 * p), q6);

-}

-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh, int count) {

-    if (count == 1)

-        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);

-    else

-        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);

--- a/vpx_dsp/x86/loopfilter_sse2.c

+++ b/vpx_dsp/x86/loopfilter_sse2.c

@@ -18,11 +18,10 @@

   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));

-static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,

-                                            int p,

-                                            const unsigned char *_blimit,

-                                            const unsigned char *_limit,

-                                            const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,

+                                    const unsigned char *_blimit,

+                                    const unsigned char *_limit,

+                                    const unsigned char *_thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

@@ -383,11 +382,10 @@

   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);

-static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,

-                                             int p,

-                                             const unsigned char *_blimit,

-                                             const unsigned char *_limit,

-                                             const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,

+                                     const unsigned char *_blimit,

+                                     const unsigned char *_limit,

+                                     const unsigned char *_thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

@@ -716,17 +714,6 @@

-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.

-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,

-                                const unsigned char *_blimit,

-                                const unsigned char *_limit,

-                                const unsigned char *_thresh, int count) {

-  if (count == 1)

-    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);

-  else

-    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);

-}

 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,

                                const unsigned char *_blimit,

                                const unsigned char *_limit,

@@ -1554,7 +1541,7 @@

   transpose(src, p, dst, 8, 2);

   // Loop filtering

-  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

+  vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

   src[0] = t_dst;

   src[1] = t_dst + 8 * 8;

@@ -1575,8 +1562,7 @@

   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

   // Loop filtering

-  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,

-                                   thresh);

+  vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

   // Transpose back

   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

--

⑨