shithub: libvpx

--- a/test/test_intra_pred_speed.cc

+++ b/test/test_intra_pred_speed.cc

@@ -362,9 +362,12 @@

 #endif  // HAVE_SSSE3

 #if HAVE_NEON

-INTRA_PRED_TEST(NEON, TestIntraPred32, NULL, NULL, NULL, NULL,

-                vp9_v_predictor_32x32_neon, vp9_h_predictor_32x32_neon, NULL,

-                NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon)

+INTRA_PRED_TEST(NEON, TestIntraPred32, vp9_dc_predictor_32x32_neon,

+                vp9_dc_left_predictor_32x32_neon,

+                vp9_dc_top_predictor_32x32_neon,

+                vp9_dc_128_predictor_32x32_neon, vp9_v_predictor_32x32_neon,

+                vp9_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,

+                vp9_tm_predictor_32x32_neon)  // first four slots: the 32x32 NEON DC predictors

 #endif  // HAVE_NEON

 #if HAVE_MSA

--- a/vp9/common/arm/neon/vp9_reconintra_neon.c

+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c

@@ -161,6 +161,89 @@

   dc_16x16(dst, stride, NULL, NULL, 0, 0);

+//------------------------------------------------------------------------------

+// DC 32x32: stores one DC byte to all 32 bytes of each of 32 rows at 'dst'.

+// 'do_above' and 'do_left' facilitate branch removal when inlined.

+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,

+                            const uint8_t *above, const uint8_t *left,

+                            int do_above, int do_left) {

+  uint16x8_t sum_top;  // assigned only when do_above is set

+  uint16x8_t sum_left;  // assigned only when do_left is set

+  uint8x8_t dc0;  // DC value; lane 0 is broadcast below

+  if (do_above) {  // sum the 32 bytes at 'above'

+    const uint8x16_t A0 = vld1q_u8(above);  // top row, bytes 0..15

+    const uint8x16_t A1 = vld1q_u8(above + 16);  // top row, bytes 16..31

+    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top

+    const uint16x8_t p1 = vpaddlq_u8(A1);

+    const uint16x8_t p2 = vaddq_u16(p0, p1);

+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));

+    const uint16x4_t p4 = vpadd_u16(p3, p3);

+    const uint16x4_t p5 = vpadd_u16(p4, p4);  // every lane = sum of 32 bytes

+    sum_top = vcombine_u16(p5, p5);

+  }

+  if (do_left) {  // sum the 32 bytes at 'left'

+    const uint8x16_t L0 = vld1q_u8(left);  // left edge, bytes 0..15

+    const uint8x16_t L1 = vld1q_u8(left + 16);  // left edge, bytes 16..31

+    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left

+    const uint16x8_t p1 = vpaddlq_u8(L1);

+    const uint16x8_t p2 = vaddq_u16(p0, p1);

+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));

+    const uint16x4_t p4 = vpadd_u16(p3, p3);

+    const uint16x4_t p5 = vpadd_u16(p4, p4);  // every lane = sum of 32 bytes

+    sum_left = vcombine_u16(p5, p5);

+  }

+  if (do_above && do_left) {  // 64 samples contribute

+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);

+    dc0 = vrshrn_n_u16(sum, 6);  // rounded (sum + 32) >> 6

+  } else if (do_above) {  // 32 samples contribute

+    dc0 = vrshrn_n_u16(sum_top, 5);  // rounded (sum + 16) >> 5

+  } else if (do_left) {  // 32 samples contribute

+    dc0 = vrshrn_n_u16(sum_left, 5);  // rounded (sum + 16) >> 5

+  } else {

+    dc0 = vdup_n_u8(0x80);  // neither edge used: constant 128

+  }

+  {

+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);  // lane 0 in all 16 bytes

+    int i;

+    for (i = 0; i < 32; ++i) {

+      vst1q_u8(dst + i * stride, dc);  // bytes 0..15 of row i

+      vst1q_u8(dst + i * stride + 16, dc);  // bytes 16..31 of row i

+    }

+  }

+}

+void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,

+                                 const uint8_t *above, const uint8_t *left) {

+  dc_32x32(dst, stride, above, left, 1, 1);  // DC from 'above' and 'left'

+}

+void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,

+                                      const uint8_t *above,

+                                      const uint8_t *left) {

+  (void)above;  // unused: only 'left' feeds the DC value

+  dc_32x32(dst, stride, NULL, left, 0, 1);  // DC from 'left' only

+}

+void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,

+                                     const uint8_t *above,

+                                     const uint8_t *left) {

+  (void)left;  // unused: only 'above' feeds the DC value

+  dc_32x32(dst, stride, above, NULL, 1, 0);  // DC from 'above' only

+}

+void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,

+                                     const uint8_t *above,

+                                     const uint8_t *left) {

+  (void)above;  // unused: no edge is read

+  (void)left;  // unused: no edge is read

+  dc_32x32(dst, stride, NULL, NULL, 0, 0);  // fills the block with 0x80

+}

 #if !HAVE_NEON_ASM

 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -201,16 +201,16 @@

 specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";

 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

-specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";

+specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc";  # NOTE(review): 'neon' presumably selects vp9_dc_predictor_32x32_neon -- confirm

 add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

-specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";

+specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";  # NOTE(review): 'neon' presumably selects vp9_dc_top_predictor_32x32_neon -- confirm

 add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

-specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";

+specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";  # NOTE(review): 'neon' presumably selects vp9_dc_left_predictor_32x32_neon -- confirm

 add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

-specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";

+specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";  # NOTE(review): 'neon' presumably selects vp9_dc_128_predictor_32x32_neon -- confirm

 # Loopfilter

--

⑨