shithub: libvpx

--- a/vp8/common/rtcd_defs.pl

+++ b/vp8/common/rtcd_defs.pl

@@ -552,6 +552,9 @@

 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {

     add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";

     specialize qw/vp8_denoiser_filter sse2 neon/;

+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";

+    specialize qw/vp8_denoiser_filter_uv sse2/;

 # End of encoder only functions

--- a/vp8/encoder/denoising.c

+++ b/vp8/encoder/denoising.c

@@ -191,6 +191,148 @@

     return FILTER_BLOCK;

+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,

+                             int mc_avg_uv_stride,

+                             unsigned char *running_avg_uv,

+                             int avg_uv_stride,

+                             unsigned char *sig,

+                             int sig_stride,

+                             unsigned int motion_magnitude,

+                             int increase_denoising) {  /*

+     * Plain-C temporal denoiser for one 8x8 chroma block.

+     * Reads the motion-compensated running average (mc_running_avg_uv) and

+     * the source block (sig) and writes the filtered pixels to running_avg_uv.

+     * Returns COPY_BLOCK without filtering when the sum of the source pixels

+     * is within SUM_DIFF_FROM_AVG_THRESH_UV of 128 * 8 * 8, and also when the

+     * accumulated adjustment stays above the sum_diff threshold (in that case

+     * running_avg_uv has already been written). Otherwise the filtered block

+     * and sig are handed to vp8_copy_mem8x8 and FILTER_BLOCK is returned.

+     */

+    unsigned char *running_avg_uv_start = running_avg_uv;  // kept for the final copy

+    unsigned char *sig_start = sig;  // kept for the final copy

+    int sum_diff_thresh;

+    int r, c;

+    int sum_diff = 0;  // signed sum of all per-pixel adjustments

+    int sum_block = 0;  // sum of the 64 source pixels

+    int adj_val[3] = {3, 4, 6};  // adjustment for |diff| in 4..7, 8..15, >= 16

+    int shift_inc1 = 0;  // widens the "use mc pixel" range below

+    int shift_inc2 = 1;  // added to every adj_val entry for small motion

+    /* If motion_magnitude is small, making the denoiser more aggressive by

+     * increasing the adjustment for each level. Add another increment for

+     * blocks that are labeled for increase denoising. */

+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {

+      if (increase_denoising) {

+        shift_inc1 = 1;

+        shift_inc2 = 2;

+      }

+      adj_val[0] += shift_inc2;

+      adj_val[1] += shift_inc2;

+      adj_val[2] += shift_inc2;

+    }

+    // Avoid denoising the color signal if it is close to the average level.

+    for (r = 0; r < 8; ++r) {

+      for (c = 0; c < 8; ++c) {

+        sum_block += sig[c];

+      }

+      sig += sig_stride;

+    }

+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {

+      return COPY_BLOCK;

+    }

+    sig -= sig_stride * 8;  // rewind to the first row of the block

+    for (r = 0; r < 8; ++r) {

+      for (c = 0; c < 8; ++c) {

+        int diff = 0;

+        int adjustment = 0;

+        int absdiff = 0;

+        diff = mc_running_avg_uv[c] - sig[c];

+        absdiff = abs(diff);

+        // When |diff| <= 3 + shift_inc1, use the pixel value from the

+        // motion-compensated running average unchanged.

+        if (absdiff <= 3 + shift_inc1) {

+          running_avg_uv[c] = mc_running_avg_uv[c];

+          sum_diff += diff;

+        } else {

+          if (absdiff >= 4 && absdiff <= 7)

+            adjustment = adj_val[0];

+          else if (absdiff >= 8 && absdiff <= 15)

+            adjustment = adj_val[1];

+          else

+            adjustment = adj_val[2];

+          if (diff > 0) {  // move sig up toward the running average, clamped at 255

+            if ((sig[c] + adjustment) > 255)

+              running_avg_uv[c] = 255;

+            else

+              running_avg_uv[c] = sig[c] + adjustment;

+            sum_diff += adjustment;

+          } else {  // move sig down toward the running average, clamped at 0

+            if ((sig[c] - adjustment) < 0)

+              running_avg_uv[c] = 0;

+            else

+              running_avg_uv[c] = sig[c] - adjustment;

+            sum_diff -= adjustment;

+          }

+        }

+      }

+      /* Update pointers for next iteration. */

+      sig += sig_stride;

+      mc_running_avg_uv += mc_avg_uv_stride;

+      running_avg_uv += avg_uv_stride;

+    }

+    sum_diff_thresh= SUM_DIFF_THRESHOLD_UV;

+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;

+    if (abs(sum_diff) > sum_diff_thresh) {

+      // Before returning to copy the block (i.e., apply no denoising), check

+      // if we can still apply some (weaker) temporal filtering to this block,

+      // that would otherwise not be denoised at all. Simplest is to apply

+      // an additional adjustment to running_avg_uv to bring it closer to sig.

+      // The adjustment is capped by a maximum delta, and chosen such that

+      // in most cases the resulting sum_diff will be within the

+      // acceptable range given by sum_diff_thresh.

+      // The delta is set by the excess of absolute pixel diff over threshold.

+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;

+      // Only apply the adjustment for max delta up to 3.

+      if (delta < 4) {

+        sig -= sig_stride * 8;  // rewind all three pointers to the first row

+        mc_running_avg_uv -= mc_avg_uv_stride * 8;

+        running_avg_uv -= avg_uv_stride * 8;

+        for (r = 0; r < 8; ++r) {

+          for (c = 0; c < 8; ++c) {

+            int diff = mc_running_avg_uv[c] - sig[c];

+            int adjustment = abs(diff);  // capped at delta just below

+            if (adjustment > delta)

+              adjustment = delta;

+            if (diff > 0) {

+              // Bring denoised signal down.

+              if (running_avg_uv[c] - adjustment < 0)

+                running_avg_uv[c] = 0;

+              else

+                running_avg_uv[c] = running_avg_uv[c] - adjustment;

+              sum_diff -= adjustment;

+            } else if (diff < 0) {

+              // Bring denoised signal up.

+              if (running_avg_uv[c] + adjustment > 255)

+                running_avg_uv[c] = 255;

+              else

+                running_avg_uv[c] = running_avg_uv[c] + adjustment;

+              sum_diff += adjustment;

+            }

+          }

+          // TODO(marpan): Check here if abs(sum_diff) has gone below the

+          // threshold sum_diff_thresh, and if so, we can exit the row loop.

+          sig += sig_stride;

+          mc_running_avg_uv += mc_avg_uv_stride;

+          running_avg_uv += avg_uv_stride;

+        }

+        if (abs(sum_diff) > sum_diff_thresh)  // still too far from sig after the weaker pass

+          return COPY_BLOCK;

+      } else {

+        return COPY_BLOCK;

+      }

+    }

+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,

+                    sig_stride);  /* NOTE(review): presumably copies the

+     * filtered 8x8 block over sig (source first, destination second);

+     * confirm against the vp8_copy_mem8x8 prototype. */

+    return FILTER_BLOCK;

+}

 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,

                           int num_mb_rows, int num_mb_cols)

@@ -260,6 +402,8 @@

     unsigned int motion_magnitude2;

     unsigned int sse_thresh;

     int sse_diff_thresh = 0;

+    // Denoise the UV channel.

+    int apply_color_denoise = 0;

     // Spatial loop filter: only applied selectively based on

     // temporal filter state of block relative to top/left neighbors.

     int apply_spatial_loop_filter = 1;

@@ -267,6 +411,8 @@

     MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;

     enum vp8_denoiser_decision decision = FILTER_BLOCK;

+    enum vp8_denoiser_decision decision_u = FILTER_BLOCK;

+    enum vp8_denoiser_decision decision_v = FILTER_BLOCK;

     if (zero_frame)

@@ -376,11 +522,37 @@

         /* Filter. */

         decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,

-                                         running_avg_y, avg_y_stride,

-                                         x->thismb, 16, motion_magnitude2,

-                                         x->increase_denoising);

+                                       running_avg_y, avg_y_stride,

+                                       x->thismb, 16, motion_magnitude2,

+                                       x->increase_denoising);

         denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?

             kFilterNonZeroMV : kFilterZeroMV;

+        // Only denoise UV for zero motion, and if y channel was denoised.

+        if (apply_color_denoise &&

+            motion_magnitude2 == 0 &&

+            decision == FILTER_BLOCK) {

+          unsigned char *mc_running_avg_u =

+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;

+          unsigned char *running_avg_u =

+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;

+          unsigned char *mc_running_avg_v =

+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;

+          unsigned char *running_avg_v =

+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;

+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;

+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;

+          int signal_stride = x->block[16].src_stride;

+          decision_u =

+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,

+                                      running_avg_u, avg_uv_stride,

+                                      x->block[16].src + *x->block[16].base_src,

+                                      signal_stride, motion_magnitude2, 0);

+          decision_v =

+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,

+                                      running_avg_v, avg_uv_stride,

+                                      x->block[20].src + *x->block[20].base_src,

+                                      signal_stride, motion_magnitude2, 0);

+        }

     if (decision == COPY_BLOCK)

@@ -393,7 +565,21 @@

                 denoiser->yv12_running_avg[INTRA_FRAME].y_stride);

         denoiser->denoise_state[block_index] = kNoFilter;

-    // Option to selectively deblock the denoised signal.

+    if (apply_color_denoise) {

+      if (decision_u == COPY_BLOCK) {

+        vp8_copy_mem8x8(

+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,

+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,

+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);

+      }

+      if (decision_v == COPY_BLOCK) {

+        vp8_copy_mem8x8(

+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,

+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,

+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);

+      }

+    }

+    // Option to selectively deblock the denoised signal, for y channel only.

     if (apply_spatial_loop_filter) {

       loop_filter_info lfi;

       int apply_filter_col = 0;

--- a/vp8/encoder/denoising.h

+++ b/vp8/encoder/denoising.h

@@ -22,6 +22,11 @@

 #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)

 #define MOTION_MAGNITUDE_THRESHOLD (8*3)

+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5): default limit on |sum_diff| for an 8x8 UV block

+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)  // limit on |sum_diff| used when increase_denoising is set

+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)  // UV block is not filtered if its pixel sum is within this of 128 * 8 * 8

+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)  // at or below this motion magnitude the UV adjustments are increased

 enum vp8_denoiser_decision

   COPY_BLOCK,

--- a/vp8/encoder/x86/denoising_sse2.c

+++ b/vp8/encoder/x86/denoising_sse2.c

@@ -17,11 +17,24 @@

 #include <emmintrin.h>

 #include "vpx_ports/emmintrin_compat.h"

-union sum_union {

-    __m128i v;

-    signed char e[16];

-};

+/* Horizontally add the 16 signed 8-bit lanes of acc_diff (the accumulated

+ * per-pixel adjustments) and return the absolute value of that sum. */

+static inline unsigned int abs_sum_diff_16x1(__m128i acc_diff) {

+  const __m128i k_1 = _mm_set1_epi16(1);

+  /* Interleaving the register with itself places each byte in the high half

+   * of a 16-bit lane; the arithmetic shift by 8 then sign-extends it. */

+  const __m128i acc_diff_lo = _mm_srai_epi16(

+      _mm_unpacklo_epi8(acc_diff, acc_diff), 8);

+  const __m128i acc_diff_hi = _mm_srai_epi16(

+      _mm_unpackhi_epi8(acc_diff, acc_diff), 8);

+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);

+  /* Multiply-add by 1 sums adjacent 16-bit lanes into four 32-bit lanes,

+   * which the two shift-and-add steps below reduce into the lowest lane. */

+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);

+  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,

+                                          _mm_srli_si128(hg_fe_dc_ba, 8));

+  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,

+                                         _mm_srli_si128(hgfe_dcba, 4));

+  /* The extracted sum is signed. Keep it in an int so abs() receives the

+   * type it is declared for, and convert only the non-negative result. */

+  const int sum_diff = _mm_cvtsi128_si32(hgfedcba);

+  return (unsigned int)abs(sum_diff);

+}

 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

                              int mc_avg_y_stride,

                              unsigned char *running_avg_y, int avg_y_stride,

@@ -103,16 +116,10 @@

         /* Compute the sum of all pixel differences of this MB. */

-        union sum_union s;

-        int sum_diff = 0;

-        s.v = acc_diff;

-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]

-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]

-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];

+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);

         sum_diff_thresh = SUM_DIFF_THRESHOLD;

         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;

-        if (abs(sum_diff) > sum_diff_thresh) {

+        if (abs_sum_diff > sum_diff_thresh) {

           // Before returning to copy the block (i.e., apply no denoising),

           // checK if we can still apply some (weaker) temporal filtering to

           // this block, that would otherwise not be denoised at all. Simplest

@@ -123,7 +130,7 @@

           // The delta is set by the excess of absolute pixel diff over the

           // threshold.

-          int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;

+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;

           // Only apply the adjustment for max delta up to 3.

           if (delta < 4) {

             const __m128i k_delta = _mm_set1_epi8(delta);

@@ -162,16 +169,9 @@

              mc_running_avg_y += mc_avg_y_stride;

              running_avg_y += avg_y_stride;

-            {

-              // Update the sum of all pixel differences of this MB.

-              union sum_union s;

-              s.v = acc_diff;

-              sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]

-                       + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]

-                       + s.e[12] + s.e[13] + s.e[14] + s.e[15];

-              if (abs(sum_diff) > sum_diff_thresh) {

-                return COPY_BLOCK;

-              }

+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);

+            if (abs_sum_diff > sum_diff_thresh) {

+              return COPY_BLOCK;

           } else {

             return COPY_BLOCK;

@@ -180,5 +180,200 @@

     vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);

+    return FILTER_BLOCK;

+}

+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,

+                             int mc_avg_stride,

+                             unsigned char *running_avg, int avg_stride,

+                             unsigned char *sig, int sig_stride,

+                             unsigned int motion_magnitude,

+                             int increase_denoising) {  /*

+     * SSE2 temporal denoiser for one 8x8 chroma block.

+     * Reads the motion-compensated running average (mc_running_avg) and the

+     * source block (sig) and writes the filtered pixels to running_avg,

+     * processing two 8-pixel rows per 128-bit register.

+     * Returns COPY_BLOCK without filtering when the sum of the source pixels

+     * is within SUM_DIFF_FROM_AVG_THRESH_UV of 128 * 8 * 8, and also when the

+     * accumulated adjustment stays above the sum_diff threshold (in that case

+     * running_avg has already been written). Otherwise the filtered block and

+     * sig are handed to vp8_copy_mem8x8 and FILTER_BLOCK is returned.

+     */

+    unsigned char *running_avg_start = running_avg;  // kept for the final copy

+    unsigned char *sig_start = sig;  // kept for the final copy

+    int sum_diff_thresh;

+    int r;

+    int shift_inc  = (increase_denoising &&

+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;

+    __m128i acc_diff = _mm_setzero_si128();  // per-lane signed sum of adjustments

+    const __m128i k_0 = _mm_setzero_si128();

+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);  // below this |diff| the mc pixel is used as is

+    const __m128i k_8 = _mm_set1_epi8(8);

+    const __m128i k_16 = _mm_set1_epi8(16);

+    /* Modify each level's adjustment according to motion_magnitude. */

+    const __m128i l3 = _mm_set1_epi8(

+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?

+                        7 + shift_inc : 6);

+    /* Difference between level 3 and level 2 is 2. */

+    const __m128i l32 = _mm_set1_epi8(2);

+    /* Difference between level 2 and level 1 is 1. */

+    const __m128i l21 = _mm_set1_epi8(1);

+    {

+      const __m128i k_1 = _mm_set1_epi16(1);

+      __m128i vec_sum_block = _mm_setzero_si128();  // eight 16-bit column sums

+      // Avoid denoising the color signal if it is close to the average level.

+      for (r = 0; r < 8; ++r) {

+        const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));

+        const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);  // zero-extend bytes to 16 bits

+        vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);

+        sig += sig_stride;

+      }

+      sig -= sig_stride * 8;  // rewind to the first row of the block

+      {

+        const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);

+        const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,

+                                                _mm_srli_si128(hg_fe_dc_ba, 8));

+        const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,

+                                               _mm_srli_si128(hgfe_dcba, 4));

+        const int sum_block = _mm_cvtsi128_si32(hgfedcba);  // sum of the 64 source pixels

+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {

+          return COPY_BLOCK;

+        }

+      }

+    }

+    for (r = 0; r < 4; ++r) {  // two 8-pixel rows per iteration

+        /* Calculate differences. The low 8 bytes hold the current row and

+         * the high 8 bytes hold the next row. */

+        const __m128i v_sig_low = _mm_castpd_si128(

+            _mm_load_sd((double *)(&sig[0])));

+        const __m128i v_sig = _mm_castpd_si128(

+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),

+                         (double *)(&sig[sig_stride])));

+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(

+            _mm_load_sd((double *)(&mc_running_avg[0])));

+        const __m128i v_mc_running_avg = _mm_castpd_si128(

+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),

+                         (double *)(&mc_running_avg[mc_avg_stride])));

+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);

+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);

+        /* Obtain the sign. FF where the mc pixel is not above sig, i.e. the

+         * diff is negative (or zero, where the adjustment is zero anyway). */

+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);

+        /* Clamp absolute difference to 16 to be used to get mask. Doing this

+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */

+        const __m128i clamped_absdiff = _mm_min_epu8(

+                                        _mm_or_si128(pdiff, ndiff), k_16);

+        /* Get masks for l2 l1 and l0 adjustments */

+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);

+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);

+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);

+        /* Get adjustments for l2, l1, and l0 */

+        __m128i adj2 = _mm_and_si128(mask2, l32);

+        const __m128i adj1 = _mm_and_si128(mask1, l21);

+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);

+        __m128i adj,  padj, nadj;

+        __m128i v_running_avg;

+        /* Combine the adjustments and get absolute adjustments. */

+        adj2 = _mm_add_epi8(adj2, adj1);

+        adj = _mm_sub_epi8(l3, adj2);

+        adj = _mm_andnot_si128(mask0, adj);

+        adj = _mm_or_si128(adj, adj0);

+        /* Restore the sign and get positive and negative adjustments. */

+        padj = _mm_andnot_si128(diff_sign, adj);

+        nadj = _mm_and_si128(diff_sign, adj);

+        /* Calculate filtered value. */

+        v_running_avg = _mm_adds_epu8(v_sig, padj);

+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);

+        _mm_storel_pd((double *)&running_avg[0],

+                      _mm_castsi128_pd(v_running_avg));

+        _mm_storeh_pd((double *)&running_avg[avg_stride],

+                      _mm_castsi128_pd(v_running_avg));

+        /* Each adjustment is at most 8 (l3 with shift_inc set) and a lane

+         * accumulates only 4 of them here, so each element in acc_diff can

+         * fit in signed char.

+         */

+        acc_diff = _mm_adds_epi8(acc_diff, padj);

+        acc_diff = _mm_subs_epi8(acc_diff, nadj);

+        /* Update pointers for next iteration. */

+        sig += sig_stride * 2;

+        mc_running_avg += mc_avg_stride * 2;

+        running_avg += avg_stride * 2;

+    }

+    {

+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);

+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;

+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;

+        if (abs_sum_diff > sum_diff_thresh) {

+          // Before returning to copy the block (i.e., apply no denoising),

+          // check if we can still apply some (weaker) temporal filtering to

+          // this block, that would otherwise not be denoised at all. Simplest

+          // is to apply an additional adjustment to running_avg to bring it

+          // closer to sig. The adjustment is capped by a maximum delta, and

+          // chosen such that in most cases the resulting sum_diff will be

+          // within the acceptable range given by sum_diff_thresh.

+          // The delta is set by the excess of absolute pixel diff over the

+          // threshold.

+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;

+          // Only apply the adjustment for max delta up to 3.

+          if (delta < 4) {

+            const __m128i k_delta = _mm_set1_epi8(delta);

+            sig -= sig_stride * 8;  // rewind all three pointers to the first row

+            mc_running_avg -= mc_avg_stride * 8;

+            running_avg -= avg_stride * 8;

+            for (r = 0; r < 4; ++r) {  // two 8-pixel rows per iteration

+              // Calculate differences.

+              const __m128i v_sig_low = _mm_castpd_si128(

+                  _mm_load_sd((double *)(&sig[0])));

+              const __m128i v_sig = _mm_castpd_si128(

+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),

+                               (double *)(&sig[sig_stride])));

+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(

+                  _mm_load_sd((double *)(&mc_running_avg[0])));

+              const __m128i v_mc_running_avg = _mm_castpd_si128(

+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),

+                               (double *)(&mc_running_avg[mc_avg_stride])));

+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);

+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);

+              // Obtain the sign. FF where the mc pixel is not above sig.

+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);

+              // Clamp absolute difference to delta to get the adjustment.

+              const __m128i adj =

+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);

+              // Restore the sign and get positive and negative adjustments.

+              __m128i padj, nadj;

+              const __m128i v_running_avg_low = _mm_castpd_si128(

+                  _mm_load_sd((double *)(&running_avg[0])));

+              __m128i v_running_avg = _mm_castpd_si128(

+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),

+                               (double *)(&running_avg[avg_stride])));

+              padj = _mm_andnot_si128(diff_sign, adj);

+              nadj = _mm_and_si128(diff_sign, adj);

+              // Calculate filtered value: pull the first-pass result back

+              // toward sig, the opposite direction of the first pass.

+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);

+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);

+              _mm_storel_pd((double *)&running_avg[0],

+                            _mm_castsi128_pd(v_running_avg));

+              _mm_storeh_pd((double *)&running_avg[avg_stride],

+                            _mm_castsi128_pd(v_running_avg));

+             // Accumulate the adjustments.

+             acc_diff = _mm_subs_epi8(acc_diff, padj);

+             acc_diff = _mm_adds_epi8(acc_diff, nadj);

+             // Update pointers for next iteration.

+             sig += sig_stride * 2;

+             mc_running_avg += mc_avg_stride * 2;

+             running_avg += avg_stride * 2;

+            }

+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);

+            if (abs_sum_diff > sum_diff_thresh) {  // still too far from sig after the weaker pass

+              return COPY_BLOCK;

+            }

+          } else {

+            return COPY_BLOCK;

+          }

+        }

+    }

+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);  /*

+     * NOTE(review): presumably copies the filtered 8x8 block over sig

+     * (source first, destination second); confirm against the

+     * vp8_copy_mem8x8 prototype. */

     return FILTER_BLOCK;

--

⑨