shithub: libvpx

Download patch

ref: fc1679863b5d86a4f3f0dcb14ef37dae7c8cb94a
parent: 1db646f0de18bc4b43fb901025a9e3cedb6a8157
author: Yunqing Wang <yunqingwang@google.com>
date: Fri Oct 12 08:25:36 EDT 2018

Optimize apply_temporal_filter function

This patch optimizes the apply_temporal_filter function. The squared
difference (diff^2) for each pixel in the 16x16 block is now calculated once
beforehand, so that it is not recalculated multiple times while evaluating a
pixel's neighbors. This speeds up the function.

Change-Id: Ibdb8b041f317fd6df198950e2acf9cfcde26860d

--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -119,9 +119,14 @@
   unsigned int i, j, k, m;
   int modifier;
   const int rounding = (1 << strength) >> 1;
-  const int uv_block_width = block_width >> ss_x;
-  const int uv_block_height = block_height >> ss_y;
+  const unsigned int uv_block_width = block_width >> ss_x;
+  const unsigned int uv_block_height = block_height >> ss_y;
+  DECLARE_ALIGNED(16, uint16_t, y_diff_sse[256]);
+  DECLARE_ALIGNED(16, uint16_t, u_diff_sse[256]);
+  DECLARE_ALIGNED(16, uint16_t, v_diff_sse[256]);
 
+  int idx = 0, idy;
+
   assert(strength >= 0);
   assert(strength <= 6);
 
@@ -128,20 +133,43 @@
   assert(filter_weight >= 0);
   assert(filter_weight <= 2);
 
+  memset(y_diff_sse, 0, 256 * sizeof(uint16_t));
+  memset(u_diff_sse, 0, 256 * sizeof(uint16_t));
+  memset(v_diff_sse, 0, 256 * sizeof(uint16_t));
+
+  // Calculate diff^2 for each pixel of the 16x16 block.
+  // TODO(yunqing): the following code needs to be optimized.
+  for (i = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
+      const int16_t diff =
+          y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j];
+      y_diff_sse[idx++] = diff * diff;
+    }
+  }
+  idx = 0;
+  for (i = 0; i < uv_block_height; i++) {
+    for (j = 0; j < uv_block_width; j++) {
+      const int16_t diffu =
+          u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j];
+      const int16_t diffv =
+          v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j];
+      u_diff_sse[idx] = diffu * diffu;
+      v_diff_sse[idx] = diffv * diffv;
+      idx++;
+    }
+  }
+
   for (i = 0, k = 0, m = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++) {
       const int pixel_value = y_pred[i * y_buf_stride + j];
 
       // non-local mean approach
-      int diff_sse[9] = { 0 };
-      int idx, idy;
       int y_index = 0;
 
       const int uv_r = i >> ss_y;
       const int uv_c = j >> ss_x;
+      modifier = 0;
 
-      int diff;
-
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
           const int row = (int)i + idy;
@@ -149,9 +177,7 @@
 
           if (row >= 0 && row < (int)block_height && col >= 0 &&
               col < (int)block_width) {
-            const int diff = y_frame1[row * (int)y_stride + col] -
-                             y_pred[row * (int)block_width + col];
-            diff_sse[y_index] = diff * diff;
+            modifier += y_diff_sse[row * (int)block_width + col];
             ++y_index;
           }
         }
@@ -159,17 +185,9 @@
 
       assert(y_index > 0);
 
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+      modifier += u_diff_sse[uv_r * uv_block_width + uv_c];
+      modifier += v_diff_sse[uv_r * uv_block_width + uv_c];
 
-      diff = u_frame1[uv_r * uv_stride + uv_c] -
-             u_pred[uv_r * uv_buf_stride + uv_c];
-      modifier += diff * diff;
-
-      diff = v_frame1[uv_r * uv_stride + uv_c] -
-             v_pred[uv_r * uv_buf_stride + uv_c];
-      modifier += diff * diff;
-
       y_index += 2;
 
       modifier =
@@ -186,9 +204,6 @@
         const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c];
 
         // non-local mean approach
-        int u_diff_sse[9] = { 0 };
-        int v_diff_sse[9] = { 0 };
-        int idx, idy;
         int cr_index = 0;
         int u_mod = 0, v_mod = 0;
         int y_diff = 0;
@@ -198,16 +213,10 @@
             const int row = uv_r + idy;
             const int col = uv_c + idx;
 
-            if (row >= 0 && row < uv_block_height && col >= 0 &&
-                col < uv_block_width) {
-              int diff = u_frame1[row * uv_stride + col] -
-                         u_pred[row * uv_buf_stride + col];
-              u_diff_sse[cr_index] = diff * diff;
-
-              diff = v_frame1[row * uv_stride + col] -
-                     v_pred[row * uv_buf_stride + col];
-              v_diff_sse[cr_index] = diff * diff;
-
+            if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
+                col < (int)uv_block_width) {
+              u_mod += u_diff_sse[row * uv_block_width + col];
+              v_mod += v_diff_sse[row * uv_block_width + col];
               ++cr_index;
             }
           }
@@ -215,18 +224,11 @@
 
         assert(cr_index > 0);
 
-        for (idx = 0; idx < 9; ++idx) {
-          u_mod += u_diff_sse[idx];
-          v_mod += v_diff_sse[idx];
-        }
-
         for (idy = 0; idy < 1 + ss_y; ++idy) {
           for (idx = 0; idx < 1 + ss_x; ++idx) {
             const int row = (uv_r << ss_y) + idy;
             const int col = (uv_c << ss_x) + idx;
-            const int diff = y_frame1[row * (int)y_stride + col] -
-                             y_pred[row * (int)block_width + col];
-            y_diff += diff * diff;
+            y_diff += y_diff_sse[row * (int)block_width + col];
             ++cr_index;
           }
         }