shithub: libvpx

Download patch

ref: b29612fbbea5d55726003a3a133d7b66bbb7226b
parent: 94806e029d774afb002e80a36007725fd29d40b2
author: JackyChen <jackychen@google.com>
date: Sun May 1 12:15:29 EDT 2016

vp9: Refactor vp9_denoiser_NxM_sse2.

The denoiser is ~1.5% faster at speeds 6 to 8.

Change-Id: I7b350f3c50cce6773d9c4eded4c0c1b722d0a5fc

--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -147,8 +147,9 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
 
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+  for (r = 0; r < b_height; ++r) {
     memcpy(sig_buffer[r], sig, width);
     memcpy(sig_buffer[r] + width, sig + sig_stride, width);
     memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -188,8 +189,8 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
               sig_buffer[r], mc_running_buffer[r], running_buffer[r],
               k_0, k_delta, acc_diff);
@@ -235,18 +236,21 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
 
-  for (c = 0; c < 4; ++c) {
-    for (r = 0; r < 4; ++r) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
       acc_diff[c][r] = _mm_setzero_si128();
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
           sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
-          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
@@ -253,20 +257,16 @@
       running_avg_y += 16;
     }
 
-    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
       }
     }
 
     // Update pointers for next iteration.
-    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-    mc_running_avg_y = mc_running_avg_y -
-                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                       mc_avg_y_stride;
-    running_avg_y = running_avg_y -
-                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                    avg_y_stride;
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
   }
 
   {
@@ -278,15 +278,15 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
-        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
         sum_diff = 0;
-        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
                 sig, mc_running_avg_y, running_avg_y, k_0,
-                k_delta, acc_diff[c>>4][r>>4]);
+                k_delta, acc_diff[c][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
@@ -293,18 +293,14 @@
             running_avg_y += 16;
           }
 
-          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
             }
           }
-          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-          mc_running_avg_y = mc_running_avg_y -
-                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                             mc_avg_y_stride;
-          running_avg_y = running_avg_y -
-                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                          avg_y_stride;
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
         }
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;