shithub: libvpx

--- a/test/temporal_filter_test.cc

+++ /dev/null

@@ -1,280 +1,0 @@

-/*

- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <limits>

-#include "third_party/googletest/src/include/gtest/gtest.h"

-#include "./vp9_rtcd.h"

-#include "test/acm_random.h"

-#include "test/buffer.h"

-#include "test/register_state_check.h"

-#include "vpx_ports/vpx_timer.h"

-namespace {

-using ::libvpx_test::ACMRandom;

-using ::libvpx_test::Buffer;

-typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,

-                                   const uint8_t *b, unsigned int w,

-                                   unsigned int h, int filter_strength,

-                                   int filter_weight, unsigned int *accumulator,

-                                   uint16_t *count);

-// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply

-// filter based on strength and weight. Store the resulting filter amount in

-// 'count' and apply it to 'b' and store it in 'accumulator'.

-void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,

-                      int h, int filter_strength, int filter_weight,

-                      Buffer<unsigned int> *accumulator,

-                      Buffer<uint16_t> *count) {

-  Buffer<int> diff_sq = Buffer<int>(w, h, 0);

-  ASSERT_TRUE(diff_sq.Init());

-  diff_sq.Set(0);

-  int rounding = 0;

-  if (filter_strength > 0) {

-    rounding = 1 << (filter_strength - 1);

-  }

-  ASSERT_TRUE(a.TopLeftPixel() != NULL);

-  ASSERT_TRUE(b.TopLeftPixel() != NULL);

-  ASSERT_TRUE(diff_sq.TopLeftPixel() != NULL);

-  // Calculate all the differences. Avoids re-calculating a bunch of extra

-  // values.

-  for (int height = 0; height < h; ++height) {

-    for (int width = 0; width < w; ++width) {

-      int diff = a.TopLeftPixel()[height * a.stride() + width] -

-                 b.TopLeftPixel()[height * b.stride() + width];

-      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;

-    }

-  }

-  // For any given point, sum the neighboring values and calculate the

-  // modifier.

-  for (int height = 0; height < h; ++height) {

-    for (int width = 0; width < w; ++width) {

-      // Determine how many values are being summed.

-      int summed_values = 9;

-      if (height == 0 || height == (h - 1)) {

-        summed_values -= 3;

-      }

-      if (width == 0 || width == (w - 1)) {

-        if (summed_values == 6) {  // corner

-          summed_values -= 2;

-        } else {

-          summed_values -= 3;

-        }

-      }

-      // Sum the diff_sq of the surrounding values.

-      int sum = 0;

-      for (int idy = -1; idy <= 1; ++idy) {

-        for (int idx = -1; idx <= 1; ++idx) {

-          const int y = height + idy;

-          const int x = width + idx;

-          // If inside the border.

-          if (y >= 0 && y < h && x >= 0 && x < w) {

-            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];

-          }

-        }

-      }

-      sum *= 3;

-      sum /= summed_values;

-      sum += rounding;

-      sum >>= filter_strength;

-      // Clamp the value and invert it.

-      if (sum > 16) sum = 16;

-      sum = 16 - sum;

-      sum *= filter_weight;

-      count->TopLeftPixel()[height * count->stride() + width] += sum;

-      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=

-          sum * b.TopLeftPixel()[height * b.stride() + width];

-    }

-  }

-}

-class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {

- public:

-  virtual void SetUp() {

-    filter_func_ = GetParam();

-    rnd_.Reset(ACMRandom::DeterministicSeed());

-  }

- protected:

-  TemporalFilterFunc filter_func_;

-  ACMRandom rnd_;

-};

-TEST_P(TemporalFilterTest, SizeCombinations) {

-  // Depending on subsampling this function may be called with values of 8 or 16

-  // for width and height, in any combination.

-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);

-  ASSERT_TRUE(a.Init());

-  const int filter_weight = 2;

-  const int filter_strength = 6;

-  for (int width = 8; width <= 16; width += 8) {

-    for (int height = 8; height <= 16; height += 8) {

-      // The second buffer must not have any border.

-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);

-      ASSERT_TRUE(b.Init());

-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_ref.Init());

-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_chk.Init());

-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_ref.Init());

-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_chk.Init());

-      // The difference between the buffers must be small to pass the threshold

-      // to apply the filter.

-      a.Set(&rnd_, 0, 7);

-      b.Set(&rnd_, 0, 7);

-      accum_ref.Set(rnd_.Rand8());

-      accum_chk.CopyFrom(accum_ref);

-      count_ref.Set(rnd_.Rand8());

-      count_chk.CopyFrom(count_ref);

-      reference_filter(a, b, width, height, filter_strength, filter_weight,

-                       &accum_ref, &count_ref);

-      ASM_REGISTER_STATE_CHECK(

-          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,

-                       height, filter_strength, filter_weight,

-                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));

-      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));

-      EXPECT_TRUE(count_chk.CheckValues(count_ref));

-      if (HasFailure()) {

-        printf("Width: %d Height: %d\n", width, height);

-        count_chk.PrintDifference(count_ref);

-        accum_chk.PrintDifference(accum_ref);

-        return;

-      }

-    }

-  }

-}

-TEST_P(TemporalFilterTest, CompareReferenceRandom) {

-  for (int width = 8; width <= 16; width += 8) {

-    for (int height = 8; height <= 16; height += 8) {

-      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);

-      ASSERT_TRUE(a.Init());

-      // The second buffer must not have any border.

-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);

-      ASSERT_TRUE(b.Init());

-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_ref.Init());

-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_chk.Init());

-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_ref.Init());

-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_chk.Init());

-      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {

-        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {

-          for (int repeat = 0; repeat < 100; ++repeat) {

-            if (repeat < 50) {

-              a.Set(&rnd_, 0, 7);

-              b.Set(&rnd_, 0, 7);

-            } else {

-              // Check large (but close) values as well.

-              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,

-                    std::numeric_limits<uint8_t>::max());

-              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,

-                    std::numeric_limits<uint8_t>::max());

-            }

-            accum_ref.Set(rnd_.Rand8());

-            accum_chk.CopyFrom(accum_ref);

-            count_ref.Set(rnd_.Rand8());

-            count_chk.CopyFrom(count_ref);

-            reference_filter(a, b, width, height, filter_strength,

-                             filter_weight, &accum_ref, &count_ref);

-            ASM_REGISTER_STATE_CHECK(filter_func_(

-                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,

-                filter_strength, filter_weight, accum_chk.TopLeftPixel(),

-                count_chk.TopLeftPixel()));

-            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));

-            EXPECT_TRUE(count_chk.CheckValues(count_ref));

-            if (HasFailure()) {

-              printf("Weight: %d Strength: %d\n", filter_weight,

-                     filter_strength);

-              count_chk.PrintDifference(count_ref);

-              accum_chk.PrintDifference(accum_ref);

-              return;

-            }

-          }

-        }

-      }

-    }

-  }

-}

-TEST_P(TemporalFilterTest, DISABLED_Speed) {

-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);

-  ASSERT_TRUE(a.Init());

-  const int filter_weight = 2;

-  const int filter_strength = 6;

-  for (int width = 8; width <= 16; width += 8) {

-    for (int height = 8; height <= 16; height += 8) {

-      // The second buffer must not have any border.

-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);

-      ASSERT_TRUE(b.Init());

-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_ref.Init());

-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);

-      ASSERT_TRUE(accum_chk.Init());

-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_ref.Init());

-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);

-      ASSERT_TRUE(count_chk.Init());

-      a.Set(&rnd_, 0, 7);

-      b.Set(&rnd_, 0, 7);

-      accum_chk.Set(0);

-      count_chk.Set(0);

-      vpx_usec_timer timer;

-      vpx_usec_timer_start(&timer);

-      for (int i = 0; i < 10000; ++i) {

-        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,

-                     height, filter_strength, filter_weight,

-                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());

-      }

-      vpx_usec_timer_mark(&timer);

-      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));

-      printf("Temporal filter %dx%d time: %5d us\n", width, height,

-             elapsed_time);

-    }

-  }

-}

-INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,

-                        ::testing::Values(&vp9_temporal_filter_apply_c));

-#if HAVE_SSE4_1

-INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,

-                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));

-#endif  // HAVE_SSE4_1

-}  // namespace

--- a/test/test.mk

+++ b/test/test.mk

@@ -170,7 +170,6 @@

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc

 ifneq ($(CONFIG_REALTIME_ONLY),yes)

-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc

 endif

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -187,9 +187,6 @@

 # Apply temporal filter

 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {

-add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";

-specialize qw/vp9_temporal_filter_apply sse4_1/;

 add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";

 specialize qw/vp9_apply_temporal_filter sse4_1/;

--- a/vp9/encoder/vp9_temporal_filter.c

+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -386,137 +386,7 @@

-// TODO(any): This function is not used anymore. Should be removed.

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,

-                                 const uint8_t *frame2,

-                                 unsigned int block_width,

-                                 unsigned int block_height, int strength,

-                                 int filter_weight, uint32_t *accumulator,

-                                 uint16_t *count) {

-  unsigned int i, j, k;

-  int modifier;

-  int byte = 0;

-  const int rounding = (1 << strength) >> 1;

-  assert(strength >= 0);

-  assert(strength <= 6);

-  assert(filter_weight >= 0);

-  assert(filter_weight <= 2);

-  for (i = 0, k = 0; i < block_height; i++) {

-    for (j = 0; j < block_width; j++, k++) {

-      int pixel_value = *frame2;

-      // non-local mean approach

-      int diff_sse[9] = { 0 };

-      int idx, idy, index = 0;

-      for (idy = -1; idy <= 1; ++idy) {

-        for (idx = -1; idx <= 1; ++idx) {

-          int row = (int)i + idy;

-          int col = (int)j + idx;

-          if (row >= 0 && row < (int)block_height && col >= 0 &&

-              col < (int)block_width) {

-            int diff = frame1[byte + idy * (int)stride + idx] -

-                       frame2[idy * (int)block_width + idx];

-            diff_sse[index] = diff * diff;

-            ++index;

-          }

-        }

-      }

-      assert(index > 0);

-      modifier = 0;

-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];

-      modifier *= 3;

-      modifier /= index;

-      ++frame2;

-      modifier += rounding;

-      modifier >>= strength;

-      if (modifier > 16) modifier = 16;

-      modifier = 16 - modifier;

-      modifier *= filter_weight;

-      count[k] += modifier;

-      accumulator[k] += modifier * pixel_value;

-      byte++;

-    }

-    byte += stride - block_width;

-  }

-}

 #if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_temporal_filter_apply_c(

-    const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8,

-    unsigned int block_width, unsigned int block_height, int strength,

-    int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) {

-  const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);

-  const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);

-  unsigned int i, j, k;

-  int modifier;

-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

-  int diff_sse[BLK_PELS] = { 0 };

-  int this_idx = 0;

-  for (i = 0; i < block_height; i++) {

-    for (j = 0; j < block_width; j++) {

-      const int diff =

-          frame1[i * (int)stride + j] - frame2[i * (int)block_width + j];

-      diff_sse[this_idx++] = diff * diff;

-    }

-  }

-  modifier = 0;

-  for (i = 0, k = 0; i < block_height; i++) {

-    for (j = 0; j < block_width; j++, k++) {

-      int pixel_value = frame2[i * (int)block_width + j];

-      int filter_weight =

-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);

-      int idx, idy, index = 0;

-      for (idy = -1; idy <= 1; ++idy) {

-        for (idx = -1; idx <= 1; ++idx) {

-          int row = (int)i + idy;

-          int col = (int)j + idx;

-          if (row >= 0 && row < (int)block_height && col >= 0 &&

-              col < (int)block_width) {

-            modifier += diff_sse[row * (int)block_width + col];

-            ++index;

-          }

-        }

-      }

-      assert(index > 0);

-      modifier *= 3;

-      modifier /= index;

-      modifier += rounding;

-      modifier >>= strength;

-      if (modifier > 16) modifier = 16;

-      modifier = 16 - modifier;

-      modifier *= filter_weight;

-      count[k] += modifier;

-      accumulator[k] += modifier * pixel_value;

-    }

-  }

-}

 void vp9_highbd_apply_temporal_filter_c(

     const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,

     int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,

--- a/vp9/encoder/x86/temporal_filter_sse4.c

+++ b/vp9/encoder/x86/temporal_filter_sse4.c

@@ -18,71 +18,6 @@

 #include "vp9/encoder/vp9_temporal_filter.h"

 #include "vp9/encoder/x86/temporal_filter_constants.h"

-// Load values from 'a' and 'b'. Compute the difference squared and sum

-// neighboring values such that:

-// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2

-// Values to the left and right of the row are set to 0.

-// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.

-static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {

-  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);

-  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

-  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);

-  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

-  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);

-  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

-  // Shift all the values one place to the left/right so we can efficiently sum

-  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].

-  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);

-  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

-  // It becomes necessary to treat the values as unsigned at this point. The

-  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point

-  // forward since the filter is only applied to smooth small pixel changes.

-  // Once the value has saturated to uint16_t it is well outside the useful

-  // range.

-  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);

-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

-  *sum = sum_u16;

-}

-static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,

-                   __m128i *sum_1) {

-  const __m128i zero = _mm_setzero_si128();

-  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);

-  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);

-  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);

-  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);

-  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);

-  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);

-  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);

-  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);

-  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);

-  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

-  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);

-  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].

-  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

-  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);

-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

-  *sum_0 = sum_u16;

-  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);

-  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

-  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);

-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

-  *sum_1 = sum_u16;

-}

 // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the

 // difference squared, and store as unsigned 16-bit integer to dst.

 static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,

@@ -310,148 +245,6 @@

                               __m128i *sum_second) {

   get_sum_8(y_dist, sum_first);

   get_sum_8(y_dist + 8, sum_second);

-}

-void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,

-                                      const uint8_t *b, unsigned int width,

-                                      unsigned int height, int strength,

-                                      int weight, uint32_t *accumulator,

-                                      uint16_t *count) {

-  unsigned int h;

-  const int rounding = (1 << strength) >> 1;

-  assert(strength >= 0);

-  assert(strength <= 6);

-  assert(weight >= 0);

-  assert(weight <= 2);

-  assert(width == 8 || width == 16);

-  if (width == 8) {

-    __m128i sum_row_a, sum_row_b, sum_row_c;

-    __m128i mul_constants = _mm_setr_epi16(

-        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

-    sum_8(a, b, &sum_row_a);

-    sum_8(a + stride, b + width, &sum_row_b);

-    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);

-    sum_row_c =

-        average_8(sum_row_c, &mul_constants, strength, rounding, weight);

-    accumulate_and_store_8(sum_row_c, b, count, accumulator);

-    a += stride + stride;

-    b += width;

-    count += width;

-    accumulator += width;

-    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,

-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

-    for (h = 0; h < height - 2; ++h) {

-      sum_8(a, b + width, &sum_row_c);

-      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);

-      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);

-      sum_row_a =

-          average_8(sum_row_a, &mul_constants, strength, rounding, weight);

-      accumulate_and_store_8(sum_row_a, b, count, accumulator);

-      a += stride;

-      b += width;

-      count += width;

-      accumulator += width;

-      sum_row_a = sum_row_b;

-      sum_row_b = sum_row_c;

-    }

-    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,

-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

-    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);

-    sum_row_a =

-        average_8(sum_row_a, &mul_constants, strength, rounding, weight);

-    accumulate_and_store_8(sum_row_a, b, count, accumulator);

-  } else {  // width == 16

-    __m128i sum_row_a_0, sum_row_a_1;

-    __m128i sum_row_b_0, sum_row_b_1;

-    __m128i sum_row_c_0, sum_row_c_1;

-    __m128i mul_constants_0 = _mm_setr_epi16(

-                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),

-            mul_constants_1 = _mm_setr_epi16(

-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

-    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);

-    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

-    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);

-    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

-    average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,

-               strength, rounding, weight);

-    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

-    a += stride + stride;

-    b += width;

-    count += width;

-    accumulator += width;

-    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);

-    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,

-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

-    for (h = 0; h < height - 2; ++h) {

-      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

-      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);

-      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);

-      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

-      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

-      average_16(&sum_row_a_0, &sum_row_a_1, &mul_constants_0, &mul_constants_1,

-                 strength, rounding, weight);

-      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

-      a += stride;

-      b += width;

-      count += width;

-      accumulator += width;

-      sum_row_a_0 = sum_row_b_0;

-      sum_row_a_1 = sum_row_b_1;

-      sum_row_b_0 = sum_row_c_0;

-      sum_row_b_1 = sum_row_c_1;

-    }

-    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);

-    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,

-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

-    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);

-    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

-    average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,

-               strength, rounding, weight);

-    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

-  }

 // Read in a row of chroma values corresponds to a row of 16 luma values.

--

⑨