shithub: libvpx

Download patch

ref: 011fdec1e69c462767844877f6e3651b338215ad
parent: 4ddae8f5240199067c9b7071695660ccd35bf3d9
parent: 85c1ee434d51994844125a18bf6b98fd0510b827
author: Linfeng Zhang <linfengz@google.com>
date: Tue Nov 15 18:30:48 EST 2016

Merge "Add high bitdepth intra prediction NEON optimization (mode tm)"

--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -481,27 +481,35 @@
     vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
-    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL)
+    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
     vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
-    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(
-    NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
-    vpx_highbd_dc_left_predictor_16x16_neon,
-    vpx_highbd_dc_top_predictor_16x16_neon,
-    vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
-    vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
-    vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(
-    NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
-    vpx_highbd_dc_left_predictor_32x32_neon,
-    vpx_highbd_dc_top_predictor_32x32_neon,
-    vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
-    vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
-    vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL)
+    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
+                       vpx_highbd_dc_predictor_16x16_neon,
+                       vpx_highbd_dc_left_predictor_16x16_neon,
+                       vpx_highbd_dc_top_predictor_16x16_neon,
+                       vpx_highbd_dc_128_predictor_16x16_neon,
+                       vpx_highbd_v_predictor_16x16_neon,
+                       vpx_highbd_h_predictor_16x16_neon,
+                       vpx_highbd_d45_predictor_16x16_neon,
+                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
+                       vpx_highbd_dc_predictor_32x32_neon,
+                       vpx_highbd_dc_left_predictor_32x32_neon,
+                       vpx_highbd_dc_top_predictor_32x32_neon,
+                       vpx_highbd_dc_128_predictor_32x32_neon,
+                       vpx_highbd_v_predictor_32x32_neon,
+                       vpx_highbd_h_predictor_32x32_neon,
+                       vpx_highbd_d45_predictor_32x32_neon,
+                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -472,6 +472,14 @@
                              &vpx_highbd_h_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
                              &vpx_highbd_h_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
                              &vpx_highbd_v_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
@@ -540,6 +548,14 @@
                              &vpx_highbd_h_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
                              &vpx_highbd_h_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
                              &vpx_highbd_v_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
@@ -608,6 +624,14 @@
                              &vpx_highbd_h_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
                              &vpx_highbd_h_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
                              &vpx_highbd_v_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -895,3 +895,184 @@
     h_store_32(&dst, stride, row);
   }
 }
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+  const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
+  const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);
+  int16x8_t sum;
+  uint16x8_t row;
+
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+  sum = vaddq_s16(sum, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+  dst += stride;
+
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+  sum = vaddq_s16(sum, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+}
+
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+                               const int16x8_t left_dup, const int16x8_t sub,
+                               const int16x8_t max) {
+  uint16x8_t row;
+  int16x8_t sum = vaddq_s16(left_dup, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1q_u16(*dst, row);
+  *dst += stride;
+}
+
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+  const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);
+  int16x4_t left_s16d;
+  int16x8_t left_dup;
+  int i;
+
+  left_s16d = vget_low_s16(left_s16);
+
+  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+    left_dup = vdupq_lane_s16(left_s16d, 0);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 1);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 2);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 3);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+  }
+}
+
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t max) {
+  uint16x8_t row0, row1;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  sum0 = vminq_s16(sum0, max);
+  sum1 = vminq_s16(sum1, max);
+  row0 = vqshluq_n_s16(sum0, 0);
+  row1 = vqshluq_n_s16(sum1, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += stride - 8;
+}
+
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);
+  int16x8_t left_dup;
+  int i, j;
+
+  for (j = 0; j < 2; j++, left += 8) {
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+    }
+  }
+}
+
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t sub2,
+                                const int16x8_t sub3, const int16x8_t max) {
+  uint16x8_t row0, row1, row2, row3;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+  int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+  sum0 = vminq_s16(sum0, max);
+  sum1 = vminq_s16(sum1, max);
+  sum2 = vminq_s16(sum2, max);
+  sum3 = vminq_s16(sum3, max);
+  row0 = vqshluq_n_s16(sum0, 0);
+  row1 = vqshluq_n_s16(sum1, 0);
+  row2 = vqshluq_n_s16(sum2, 0);
+  row3 = vqshluq_n_s16(sum3, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += 8;
+  vst1q_u16(*dst, row2);
+  *dst += 8;
+  vst1q_u16(*dst, row3);
+  *dst += stride - 24;
+}
+
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);
+  const int16x8_t sub2 = vsubq_s16(above2, top_left);
+  const int16x8_t sub3 = vsubq_s16(above3, top_left);
+  int16x8_t left_dup;
+  int i, j;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+    }
+  }
+}
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -233,7 +233,7 @@
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
@@ -274,7 +274,7 @@
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
@@ -315,7 +315,7 @@
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
@@ -356,7 +356,7 @@
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;