shithub: libvpx

Download patch

ref: 037d67f684683ffad22e38ab9a6381ccfedd813f
parent: 719fe0bc5f8af9cd27314e96a6eead2710666050
author: Angie Chiang <angiebird@google.com>
date: Fri Jul 12 12:15:55 EDT 2019

Use sdx8f in exhaustive_mesh_search_single_step

This speed up non_greedy_mv by 4%

Change-Id: I9288c88db56ea4201a7ec4493ca5c567d76af0f1

--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1513,6 +1513,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+// TODO(angiebird): make sdx8f available for highbitdepth if needed
 #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
   cpi->fn_ptr[BT].sdf = SDF;                             \
   cpi->fn_ptr[BT].sdaf = SDAF;                           \
@@ -1519,7 +1520,8 @@
   cpi->fn_ptr[BT].vf = VF;                               \
   cpi->fn_ptr[BT].svf = SVF;                             \
   cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
+  cpi->fn_ptr[BT].sdx8f = NULL;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
@@ -2418,62 +2420,67 @@
   CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
+  cpi->fn_ptr[BT].sdf = SDF;                             \
+  cpi->fn_ptr[BT].sdaf = SDAF;                           \
+  cpi->fn_ptr[BT].vf = VF;                               \
+  cpi->fn_ptr[BT].svf = SVF;                             \
+  cpi->fn_ptr[BT].svaf = SVAF;                           \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
+  cpi->fn_ptr[BT].sdx8f = SDX8F;
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
-  cpi->fn_ptr[BT].sdf = SDF;                      \
-  cpi->fn_ptr[BT].sdaf = SDAF;                    \
-  cpi->fn_ptr[BT].vf = VF;                        \
-  cpi->fn_ptr[BT].svf = SVF;                      \
-  cpi->fn_ptr[BT].svaf = SVAF;                    \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
+  // TODO(angiebird): make sdx8f available for every block size
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
       vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
-      vpx_sad32x16x4d)
+      vpx_sad32x16x4d, NULL)
 
   BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
       vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
-      vpx_sad16x32x4d)
+      vpx_sad16x32x4d, NULL)
 
   BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
       vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
-      vpx_sad64x32x4d)
+      vpx_sad64x32x4d, NULL)
 
   BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
       vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
-      vpx_sad32x64x4d)
+      vpx_sad32x64x4d, NULL)
 
   BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
       vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
-      vpx_sad32x32x4d)
+      vpx_sad32x32x4d, NULL)
 
   BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
       vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
-      vpx_sad64x64x4d)
+      vpx_sad64x64x4d, NULL)
 
   BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
       vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
-      vpx_sad16x16x4d)
+      vpx_sad16x16x4d, vpx_sad16x16x8)
 
   BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
       vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
-      vpx_sad16x8x4d)
+      vpx_sad16x8x4d, vpx_sad16x8x8)
 
   BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
       vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
-      vpx_sad8x16x4d)
+      vpx_sad8x16x4d, vpx_sad8x16x8)
 
   BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
-      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
+      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+      vpx_sad8x8x8)
 
   BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
-      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
+      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+      NULL)
 
   BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
-      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
+      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+      NULL)
 
   BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
-      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
+      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+      vpx_sad4x4x8)
 
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1979,18 +1979,16 @@
   end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
   end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
   for (r = start_row; r <= end_row; r += 1) {
-    for (c = start_col; c <= end_col; c += 4) {
-      // 4 sads in a single call if we are checking every location
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = { r, c + i };
-          addrs[i] = get_buf_from_mv(pre, &mv);
-        }
-        fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+    c = start_col;
+    // sdx8f may not be available some block size
+    if (fn_ptr->sdx8f) {
+      while (c + 7 <= end_col) {
+        unsigned int sads[8];
+        const MV mv = { r, c };
+        const uint8_t *buf = get_buf_from_mv(pre, &mv);
+        fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
 
-        for (i = 0; i < 4; ++i) {
+        for (i = 0; i < 8; ++i) {
           int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
           if (sad < best_sad) {
             const MV mv = { r, c + i };
@@ -2002,23 +2000,45 @@
             }
           }
         }
-      } else {
-        for (i = 0; i <= end_col - c; ++i) {
+        c += 8;
+      }
+    }
+    while (c + 3 <= end_col) {
+      unsigned int sads[4];
+      const uint8_t *addrs[4];
+      for (i = 0; i < 4; ++i) {
+        const MV mv = { r, c + i };
+        addrs[i] = get_buf_from_mv(pre, &mv);
+      }
+      fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+
+      for (i = 0; i < 4; ++i) {
+        int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
+        if (sad < best_sad) {
           const MV mv = { r, c + i };
-          int64_t sad =
-              (int64_t)fn_ptr->sdf(src->buf, src->stride,
-                                   get_buf_from_mv(pre, &mv), pre->stride)
-              << LOG2_PRECISION;
+          sad +=
+              lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
           if (sad < best_sad) {
-            sad += lambda *
-                   vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       }
+      c += 4;
+    }
+    while (c <= end_col) {
+      const MV mv = { r, c };
+      int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+                                         get_buf_from_mv(pre, &mv), pre->stride)
+                    << LOG2_PRECISION;
+      if (sad < best_sad) {
+        sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+      c += 1;
     }
   }
   return best_sad;
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -76,6 +76,7 @@
   vpx_subpixvariance_fn_t svf;
   vpx_subp_avg_variance_fn_t svaf;
   vpx_sad_multi_d_fn_t sdx4df;
+  vpx_sad_multi_fn_t sdx8f;
 } vp9_variance_fn_ptr_t;
 #endif  // CONFIG_VP9