shithub: libvpx

ref: c3a9056df4d5144596f1bdcd99179d2565e3fba2
parent: 75c2c84bb50d7b03c650595888284fec8ef820f7
parent: 48032bfcdb412a8e7f9d89154c4ac8fbb3f8fe72
author: Debargha Mukherjee <debargha@google.com>
date: Fri Nov 14 16:11:27 EST 2014

Merge "Added sse2 acceleration for highbitdepth variance"

--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -7,16 +7,18 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
+
+#include <cstdlib>
 #include <new>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vpx/vpx_integer.h"
 #include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "./vp8_rtcd.h"
@@ -26,7 +28,6 @@
 # include "./vp9_rtcd.h"
 # include "vp9/encoder/vp9_variance.h"
 #endif
-#include "test/acm_random.h"
 
 namespace {
 
@@ -43,18 +44,50 @@
   return res;
 }
 
-static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
-                                 int l2w, int l2h, unsigned int *sse_ptr) {
+static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref,
+                                 int l2w, int l2h, int src_stride_coeff,
+                                 int ref_stride_coeff, uint32_t *sse_ptr,
+                                 bool use_high_bit_depth_,
+                                 vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      int diff;
+      if (!use_high_bit_depth_) {
+        diff = ref[w * y * ref_stride_coeff + x] -
+               src[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      int diff = ref[w * y + x] - src[w * y + x];
+      int diff = ref[w * y * ref_stride_coeff + x] -
+                 src[w * y * src_stride_coeff + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
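
Both branches of variance_ref finish with the one-pass identity
Var = SSE - SE^2 / N, where N = 2^(l2w + l2h), so the division reduces to
the shift in the return statement. For depths above 8 bits, sse and se are
first renormalized to 8-bit scale so the same expectations hold at every
depth. ROUND_POWER_OF_TWO is libvpx's round-half-up shift; a sketch,
assuming the usual definition from the vp9 common headers:

    // Round-half-up right shift: add half the divisor, then shift.
    // For 10-bit input the code above shifts sse by 2 * (10 - 8) = 4
    // bits and se by 2 bits, bringing both back to 8-bit units.
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))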
@@ -61,13 +94,56 @@
 
 static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
                                         int l2w, int l2h, int xoff, int yoff,
-                                        unsigned int *sse_ptr) {
+                                        unsigned int *sse_ptr,
+                                        bool use_high_bit_depth_,
+                                        vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth_) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
+      // Bilinear interpolation at a 16th pel step.
       const int a1 = ref[(w + 1) * (y + 0) + x + 0];
       const int a2 = ref[(w + 1) * (y + 0) + x + 1];
       const int b1 = ref[(w + 1) * (y + 1) + x + 0];
@@ -75,11 +151,12 @@
       const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
       const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
       const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = r - src[w * y + x];
+      const int diff = r - src[w * y + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
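
The four taps implement 1/16th-pel bilinear interpolation in fixed point:
a1 + (((a2 - a1) * xoff + 8) >> 4) evaluates a1 + (a2 - a1) * xoff / 16
with round-to-nearest (the + 8 is half of 16). A quick numeric check with
hypothetical sample values:

    // Hypothetical samples, quarter-pel horizontal offset (xoff = 4).
    const int a1 = 100, a2 = 116, xoff = 4;
    const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
    // (16 * 4 + 8) >> 4 == 4, so a == 104; the exact value
    // 100 + 16 * 4 / 16 is also 104, so no rounding error here.

The same filter is then applied vertically with yoff, giving the usual
separable bilinear kernel.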
@@ -130,27 +207,57 @@
 
 template<typename VarianceFunctionType>
 class VarianceTest
-    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            VarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
+    const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    ref_ = new uint8_t[block_size_];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+      ref_ = new uint8_t[block_size_ * 2];
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+    }
+#else
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+    ref_ = new uint8_t[block_size_ * 2];
+#endif
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+    }
+#else
     vpx_free(src_);
     delete[] ref_;
+#endif
     libvpx_test::ClearSystemState();
   }
 
@@ -157,13 +264,17 @@
  protected:
   void ZeroTest();
   void RefTest();
+  void RefStrideTest();
   void OneQuarterTest();
 
   ACMRandom rnd_;
-  uint8_t* src_;
-  uint8_t* ref_;
+  uint8_t *src_;
+  uint8_t *ref_;
   int width_, log2width_;
   int height_, log2height_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  bool use_high_bit_depth_;
   int block_size_;
   VarianceFunctionType variance_;
 };
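
Every parameter tuple now carries a fourth element:
<log2width, log2height, function, bit depth>. A value of 0 selects the
ordinary 8-bit path, while a nonzero value (8, 10 or 12) selects the
high-bit-depth path at that depth; note that 8 still means uint16_t
storage, exercising the highbd functions on 8-bit data. Examples taken
from the instantiations later in this patch:

    make_tuple(3, 3, variance8x8_c, 0)               // 8x8, plain 8-bit
    make_tuple(6, 6, highbd_10_variance64x64_c, 10)  // 64x64, 10-bit
    make_tuple(2, 2, highbd_variance4x4_c, 8)        // 4x4, 8-bit in uint16_t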
@@ -171,14 +282,32 @@
 template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::ZeroTest() {
   for (int i = 0; i <= 255; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      memset(src_, i, block_size_);
+    } else {
+      vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
+                   block_size_);
+    }
+#else
     memset(src_, i, block_size_);
+#endif
     for (int j = 0; j <= 255; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        memset(ref_, j, block_size_);
+      } else {
+      vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8),
+                     block_size_);
+      }
+#else
       memset(ref_, j, block_size_);
+#endif
       unsigned int sse;
       unsigned int var;
       ASM_REGISTER_STATE_CHECK(
           var = variance_(src_, width_, ref_, width_, &sse));
-      EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j;
+      EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
     }
   }
 }
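
The zero expectation follows directly from the variance identity: both
buffers are constant, so every per-pixel diff equals the same value d
(j - i, scaled up for the higher depths), and

    SSE = N * d^2,  SE = N * d
    Var = SSE - SE^2 / N = N * d^2 - (N * d)^2 / N = 0

for any pair of fill values i and j.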
@@ -187,15 +316,28 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
       src_[j] = rnd_.Rand8();
       ref_[j] = rnd_.Rand8();
+    } else {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
     }
+#else
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
+#endif
+    }
     unsigned int sse1, sse2;
     unsigned int var1;
+    const int stride_coeff = 1;
     ASM_REGISTER_STATE_CHECK(
         var1 = variance_(src_, width_, ref_, width_, &sse1));
     const unsigned int var2 = variance_ref(src_, ref_, log2width_,
-                                           log2height_, &sse2);
+                                           log2height_, stride_coeff,
+                                           stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
     EXPECT_EQ(sse1, sse2);
     EXPECT_EQ(var1, var2);
   }
@@ -202,11 +344,60 @@
 }
 
 template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+  for (int i = 0; i < 10; ++i) {
+    int ref_stride_coeff = i % 2;
+    int src_stride_coeff = (i >> 1) % 2;
+    for (int j = 0; j < block_size_; j++) {
+      int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
+      int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        src_[src_ind] = rnd_.Rand8();
+        ref_[ref_ind] = rnd_.Rand8();
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+      }
+#else
+      src_[src_ind] = rnd_.Rand8();
+      ref_[ref_ind] = rnd_.Rand8();
+#endif
+    }
+    unsigned int sse1, sse2;
+    unsigned int var1;
+
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_ * src_stride_coeff,
+                         ref_, width_ * ref_stride_coeff, &sse1));
+    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+                                           log2height_, src_stride_coeff,
+                                           ref_stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
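A stride coefficient is either 0 or 1 (ref_stride_coeff = i % 2,
src_stride_coeff = (i >> 1) % 2), so the ten iterations cycle through all
four combinations. Coefficient 1 is the packed case with a stride of
width_; coefficient 0 gives a stride of zero, collapsing the buffer to a
single row that every pass reads and rewrites. The fill loop and
variance_ref share the same mapping, sketched here as a hypothetical
helper:

    // Hypothetical helper mirroring the index math above: flat index j
    // mapped onto a buffer whose row stride is coeff * width.
    static inline int strided_index(int j, int width, int coeff) {
      return (j / width) * coeff * width + j % width;  // coeff 0: row 0 only
    }

Because the function under test and variance_ref read through the same
(possibly zero) strides, the comparison stays exact.
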
+template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
-  memset(src_, 255, block_size_);
   const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!use_high_bit_depth_) {
+    memset(src_, 255, block_size_);
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
+                 block_size_);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+  }
+#else
+  memset(src_, 255, block_size_);
   memset(ref_, 255, half);
   memset(ref_ + half, 0, half);
+#endif
   unsigned int sse;
   unsigned int var;
   ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
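
The test's name reflects the expected value (the assertion itself lies
past the visible context): src_ is all 255 while ref_ is 255 on one half
and 0 on the other, so the diff is 0 on half the pixels and 255 on the
rest, giving

    SSE = (N / 2) * 255^2,  |SE| = (N / 2) * 255
    Var = SSE - SE^2 / N = (N / 2 - N / 4) * 255^2 = (N / 4) * 255^2

one quarter of the maximum possible SSE of N * 255^2. The high-bit-depth
branch scales 255 up by bit_depth_ - 8 bits and renormalizes back down,
landing on the same number.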
@@ -264,8 +455,10 @@
       ref_[j] = rnd.Rand8();
     }
     unsigned int sse1, sse2;
+    const int stride_coeff = 1;
     ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
-    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
     EXPECT_EQ(sse1, sse2);
   }
 }
@@ -279,9 +472,10 @@
     }
     unsigned int sse2;
     unsigned int var1;
-    ASM_REGISTER_STATE_CHECK(
-        var1 = mse_(src_, width_, ref_, width_));
-    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
     EXPECT_EQ(var1, sse2);
   }
 }
@@ -308,16 +502,59 @@
 #endif
 
 #if CONFIG_VP9_ENCODER
-
 unsigned int subpel_avg_variance_ref(const uint8_t *ref,
                                      const uint8_t *src,
                                      const uint8_t *second_pred,
                                      int l2w, int l2h,
                                      int xoff, int yoff,
-                                     unsigned int *sse_ptr) {
+                                     unsigned int *sse_ptr,
+                                     bool use_high_bit_depth,
+                                     vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff =
+            ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
       // bilinear interpolation at a 16th pel step
@@ -328,11 +565,12 @@
       const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
       const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
       const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+      const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
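
The only change from subpel_variance_ref is the compound-prediction step:
the interpolated value r is averaged with the second predictor using a
round-to-nearest halving before the diff is taken, matching the encoder's
two-reference averaging. In isolation (values hypothetical):

    // Round-to-nearest average of the filtered ref and second predictor.
    const int avg = (r + second_pred[w * y + x] + 1) >> 1;
    // e.g. r = 101, second_pred value = 104: (101 + 104 + 1) >> 1 == 103.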
@@ -340,10 +578,10 @@
 template<typename SubpelVarianceFunctionType>
 class SubpelVarianceTest
     : public ::testing::TestWithParam<tuple<int, int,
-                                            SubpelVarianceFunctionType> > {
+                                            SubpelVarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, SubpelVarianceFunctionType>& params =
+    const tuple<int, int, SubpelVarianceFunctionType, int>& params =
         this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
@@ -350,12 +588,37 @@
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     subpel_variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+      sec_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(
+          new uint16_t[block_size_ + width_ + height_ + 1]);
+    }
+#else
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
@@ -362,22 +625,37 @@
   }
 
   virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+      vpx_free(sec_);
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+      vpx_free(CONVERT_TO_SHORTPTR(sec_));
+    }
+#else
     vpx_free(src_);
     delete[] ref_;
     vpx_free(sec_);
+#endif
     libvpx_test::ClearSystemState();
   }
 
  protected:
   void RefTest();
+  void ExtremeRefTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
+  bool use_high_bit_depth_;
+  vpx_bit_depth_t bit_depth_;
   int width_, log2width_;
   int height_, log2height_;
-  int block_size_;
+  int block_size_, mask_;
   SubpelVarianceFunctionType subpel_variance_;
 };
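
Note the ref_ allocation of block_size_ + width_ + height_ + 1 samples:
the subpel reference walks ref on a (w + 1)-sample stride and its
bottom-right tap reads one row and one column past the block, so the
largest index touched is

    (w + 1) * (y + 1) + (x + 1)  with x = w - 1, y = h - 1
      = (w + 1) * h + w = w * h + w + h

hence the extra width_ + height_ + 1 samples beyond block_size_.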
 
@@ -385,6 +663,23 @@
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+      }
+#else
       for (int j = 0; j < block_size_; j++) {
         src_[j] = rnd_.Rand8();
       }
@@ -391,12 +686,15 @@
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
         ref_[j] = rnd_.Rand8();
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
                                                        src_, width_, &sse1));
       const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
-                                                    log2height_, x, y, &sse2);
+                                                    log2height_, x, y, &sse2,
+                                                    use_high_bit_depth_,
+                                                    bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -403,10 +701,69 @@
   }
 }
 
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  // Compare against reference.
+  // Src: Set one half of the values to 0, the other half to the maximum.
+  // Ref: Set the halves the opposite way. (The 8-bit and high-bit-depth
+  // branches swap which half is which; variance is symmetric, so the
+  // ordering does not matter.)
+  for (int x = 0; x < 16; ++x) {
+    for (int y = 0; y < 16; ++y) {
+      const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, half + width_ + height_ + 1);
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
+                     half + width_ + height_ + 1);
+      }
+#else
+      memset(src_, 0, half);
+      memset(src_ + half, 255, half);
+      memset(ref_, 255, half);
+      memset(ref_ + half, 0, half + width_ + height_ + 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
+      const unsigned int var2 =
+          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
+                              use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
 template<>
 void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+      }
+#else
       for (int j = 0; j < block_size_; j++) {
         src_[j] = rnd_.Rand8();
         sec_[j] = rnd_.Rand8();
@@ -414,6 +771,7 @@
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
         ref_[j] = rnd_.Rand8();
       }
+#endif
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
@@ -421,7 +779,9 @@
                                   src_, width_, &sse1, sec_));
       const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
                                                         log2width_, log2height_,
-                                                        x, y, &sse2);
+                                                        x, y, &sse2,
+                                                        use_high_bit_depth_,
+                                                        bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -468,11 +828,11 @@
 const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0)));
 
 #if HAVE_NEON
 const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
@@ -491,13 +851,12 @@
 const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP8VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(3, 4, variance8x16_neon),
-                      make_tuple(4, 3, variance16x8_neon),
-                      make_tuple(4, 4, variance16x16_neon)));
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+                      make_tuple(3, 4, variance8x16_neon, 0),
+                      make_tuple(4, 3, variance16x8_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0)));
 #endif
 
-
 #if HAVE_MMX
 const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
 const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
@@ -506,11 +865,11 @@
 const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
 INSTANTIATE_TEST_CASE_P(
     MMX, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
-                      make_tuple(3, 3, variance8x8_mmx),
-                      make_tuple(3, 4, variance8x16_mmx),
-                      make_tuple(4, 3, variance16x8_mmx),
-                      make_tuple(4, 4, variance16x16_mmx)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0),
+                      make_tuple(3, 3, variance8x8_mmx, 0),
+                      make_tuple(3, 4, variance8x16_mmx, 0),
+                      make_tuple(4, 3, variance16x8_mmx, 0),
+                      make_tuple(4, 4, variance16x16_mmx, 0)));
 #endif
 
 #if HAVE_SSE2
@@ -521,11 +880,11 @@
 const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_wmt),
-                      make_tuple(3, 3, variance8x8_wmt),
-                      make_tuple(3, 4, variance8x16_wmt),
-                      make_tuple(4, 3, variance16x8_wmt),
-                      make_tuple(4, 4, variance16x16_wmt)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0),
+                      make_tuple(3, 3, variance8x8_wmt, 0),
+                      make_tuple(3, 4, variance8x16_wmt, 0),
+                      make_tuple(4, 3, variance16x8_wmt, 0),
+                      make_tuple(4, 4, variance16x16_wmt, 0)));
 #endif
 #endif  // CONFIG_VP8_ENCODER
 
@@ -537,7 +896,6 @@
 namespace vp9 {
 
 #if CONFIG_VP9_ENCODER
-
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 
@@ -550,10 +908,27 @@
 
 TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
 TEST_P(VP9VarianceTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest;
+typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
+    VP9SubpelAvgVarianceHighTest;
+
+TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); }
+TEST_P(VP9VarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); }
+TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
 const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
 const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
@@ -569,20 +944,115 @@
 const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(2, 3, variance4x8_c),
-                      make_tuple(3, 2, variance8x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c),
-                      make_tuple(4, 5, variance16x32_c),
-                      make_tuple(5, 4, variance32x16_c),
-                      make_tuple(5, 5, variance32x32_c),
-                      make_tuple(5, 6, variance32x64_c),
-                      make_tuple(6, 5, variance64x32_c),
-                      make_tuple(6, 6, variance64x64_c)));
-
+    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+                      make_tuple(2, 3, variance4x8_c, 0),
+                      make_tuple(3, 2, variance8x4_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0),
+                      make_tuple(4, 5, variance16x32_c, 0),
+                      make_tuple(5, 4, variance32x16_c, 0),
+                      make_tuple(5, 5, variance32x32_c, 0),
+                      make_tuple(5, 6, variance32x64_c, 0),
+                      make_tuple(6, 5, variance64x32_c, 0),
+                      make_tuple(6, 6, variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c;
+const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c;
+const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c;
+const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c;
+const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c;
+const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c;
+const vp9_variance_fn_t highbd_10_variance16x16_c =
+    vp9_highbd_10_variance16x16_c;
+const vp9_variance_fn_t highbd_10_variance16x32_c =
+    vp9_highbd_10_variance16x32_c;
+const vp9_variance_fn_t highbd_10_variance32x16_c =
+    vp9_highbd_10_variance32x16_c;
+const vp9_variance_fn_t highbd_10_variance32x32_c =
+    vp9_highbd_10_variance32x32_c;
+const vp9_variance_fn_t highbd_10_variance32x64_c =
+    vp9_highbd_10_variance32x64_c;
+const vp9_variance_fn_t highbd_10_variance64x32_c =
+    vp9_highbd_10_variance64x32_c;
+const vp9_variance_fn_t highbd_10_variance64x64_c =
+    vp9_highbd_10_variance64x64_c;
+const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c;
+const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c;
+const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c;
+const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c;
+const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c;
+const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c;
+const vp9_variance_fn_t highbd_12_variance16x16_c =
+    vp9_highbd_12_variance16x16_c;
+const vp9_variance_fn_t highbd_12_variance16x32_c =
+    vp9_highbd_12_variance16x32_c;
+const vp9_variance_fn_t highbd_12_variance32x16_c =
+    vp9_highbd_12_variance32x16_c;
+const vp9_variance_fn_t highbd_12_variance32x32_c =
+    vp9_highbd_12_variance32x32_c;
+const vp9_variance_fn_t highbd_12_variance32x64_c =
+    vp9_highbd_12_variance32x64_c;
+const vp9_variance_fn_t highbd_12_variance64x32_c =
+    vp9_highbd_12_variance64x32_c;
+const vp9_variance_fn_t highbd_12_variance64x64_c =
+    vp9_highbd_12_variance64x64_c;
+const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c;
+const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c;
+const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c;
+const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c;
+const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c;
+const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c;
+const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c;
+const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c;
+const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c;
+const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c;
+const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c;
+const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c;
+const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9VarianceHighTest,
+    ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+                      make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+                      make_tuple(2, 2, highbd_variance4x4_c, 8),
+                      make_tuple(2, 3, highbd_variance4x8_c, 8),
+                      make_tuple(3, 2, highbd_variance8x4_c, 8),
+                      make_tuple(3, 3, highbd_variance8x8_c, 8),
+                      make_tuple(3, 4, highbd_variance8x16_c, 8),
+                      make_tuple(4, 3, highbd_variance16x8_c, 8),
+                      make_tuple(4, 4, highbd_variance16x16_c, 8),
+                      make_tuple(4, 5, highbd_variance16x32_c, 8),
+                      make_tuple(5, 4, highbd_variance32x16_c, 8),
+                      make_tuple(5, 5, highbd_variance32x32_c, 8),
+                      make_tuple(5, 6, highbd_variance32x64_c, 8),
+                      make_tuple(6, 5, highbd_variance64x32_c, 8),
+                      make_tuple(6, 6, highbd_variance64x64_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 const vp9_subpixvariance_fn_t subpel_variance4x4_c =
     vp9_sub_pixel_variance4x4_c;
 const vp9_subpixvariance_fn_t subpel_variance4x8_c =
@@ -611,20 +1081,19 @@
     vp9_sub_pixel_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
-                      make_tuple(2, 3, subpel_variance4x8_c),
-                      make_tuple(3, 2, subpel_variance8x4_c),
-                      make_tuple(3, 3, subpel_variance8x8_c),
-                      make_tuple(3, 4, subpel_variance8x16_c),
-                      make_tuple(4, 3, subpel_variance16x8_c),
-                      make_tuple(4, 4, subpel_variance16x16_c),
-                      make_tuple(4, 5, subpel_variance16x32_c),
-                      make_tuple(5, 4, subpel_variance32x16_c),
-                      make_tuple(5, 5, subpel_variance32x32_c),
-                      make_tuple(5, 6, subpel_variance32x64_c),
-                      make_tuple(6, 5, subpel_variance64x32_c),
-                      make_tuple(6, 6, subpel_variance64x64_c)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
+                      make_tuple(2, 3, subpel_variance4x8_c, 0),
+                      make_tuple(3, 2, subpel_variance8x4_c, 0),
+                      make_tuple(3, 3, subpel_variance8x8_c, 0),
+                      make_tuple(3, 4, subpel_variance8x16_c, 0),
+                      make_tuple(4, 3, subpel_variance16x8_c, 0),
+                      make_tuple(4, 4, subpel_variance16x16_c, 0),
+                      make_tuple(4, 5, subpel_variance16x32_c, 0),
+                      make_tuple(5, 4, subpel_variance32x16_c, 0),
+                      make_tuple(5, 5, subpel_variance32x32_c, 0),
+                      make_tuple(5, 6, subpel_variance32x64_c, 0),
+                      make_tuple(6, 5, subpel_variance64x32_c, 0),
+                      make_tuple(6, 6, subpel_variance64x64_c, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
     vp9_sub_pixel_avg_variance4x4_c;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
@@ -653,23 +1122,263 @@
     vp9_sub_pixel_avg_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
-                      make_tuple(2, 3, subpel_avg_variance4x8_c),
-                      make_tuple(3, 2, subpel_avg_variance8x4_c),
-                      make_tuple(3, 3, subpel_avg_variance8x8_c),
-                      make_tuple(3, 4, subpel_avg_variance8x16_c),
-                      make_tuple(4, 3, subpel_avg_variance16x8_c),
-                      make_tuple(4, 4, subpel_avg_variance16x16_c),
-                      make_tuple(4, 5, subpel_avg_variance16x32_c),
-                      make_tuple(5, 4, subpel_avg_variance32x16_c),
-                      make_tuple(5, 5, subpel_avg_variance32x32_c),
-                      make_tuple(5, 6, subpel_avg_variance32x64_c),
-                      make_tuple(6, 5, subpel_avg_variance64x32_c),
-                      make_tuple(6, 6, subpel_avg_variance64x64_c)));
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_c, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x4_c =
+    vp9_highbd_10_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x8_c =
+    vp9_highbd_10_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_c =
+    vp9_highbd_10_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_c =
+    vp9_highbd_10_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_c =
+    vp9_highbd_10_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_c =
+    vp9_highbd_10_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_c =
+    vp9_highbd_10_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_c =
+    vp9_highbd_10_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_c =
+    vp9_highbd_10_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_c =
+    vp9_highbd_10_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_c =
+    vp9_highbd_10_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_c =
+    vp9_highbd_10_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_c =
+    vp9_highbd_10_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x4_c =
+    vp9_highbd_12_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x8_c =
+    vp9_highbd_12_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_c =
+    vp9_highbd_12_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_c =
+    vp9_highbd_12_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_c =
+    vp9_highbd_12_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_c =
+    vp9_highbd_12_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_c =
+    vp9_highbd_12_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_c =
+    vp9_highbd_12_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_c =
+    vp9_highbd_12_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_c =
+    vp9_highbd_12_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_c =
+    vp9_highbd_12_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_c =
+    vp9_highbd_12_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_c =
+    vp9_highbd_12_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x4_c =
+    vp9_highbd_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x8_c =
+    vp9_highbd_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_c =
+    vp9_highbd_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_c =
+    vp9_highbd_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_c =
+    vp9_highbd_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_c =
+    vp9_highbd_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_c =
+    vp9_highbd_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_c =
+    vp9_highbd_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_c =
+    vp9_highbd_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_c =
+    vp9_highbd_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_c =
+    vp9_highbd_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_c =
+    vp9_highbd_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_c =
+    vp9_highbd_sub_pixel_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9SubpelVarianceHighTest,
+    ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10),
+                      make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10),
+                      make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
+                      make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12),
+                      make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
+                      make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
+                      make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
+                      make_tuple(2, 2, highbd_subpel_variance4x4_c, 8),
+                      make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
+                      make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
+                      make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
+                      make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
+                      make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
+                      make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
+                      make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
+                      make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
+                      make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
+                      make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
+                      make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
+                      make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c =
+    vp9_highbd_10_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
+    vp9_highbd_10_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c =
+    vp9_highbd_12_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
+    vp9_highbd_12_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c =
+    vp9_highbd_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
+    vp9_highbd_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
+    vp9_highbd_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
+    vp9_highbd_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
+    vp9_highbd_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
+    vp9_highbd_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
+    vp9_highbd_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
+    vp9_highbd_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
+    vp9_highbd_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
+    vp9_highbd_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
+    vp9_highbd_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
+    vp9_highbd_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
+    vp9_highbd_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9SubpelAvgVarianceHighTest,
+    ::testing::Values(
+        make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10),
+        make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
+        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
+        make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12),
+        make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
+        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
+        make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8),
+        make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
+        make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
+        make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
+        make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
+        make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
+        make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
+        make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
+        make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
+        make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
+        make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
+        make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
+        make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
-
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(vp9_get_mb_ss_sse2));
 
@@ -688,20 +1397,19 @@
 const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_sse2),
-                      make_tuple(2, 3, variance4x8_sse2),
-                      make_tuple(3, 2, variance8x4_sse2),
-                      make_tuple(3, 3, variance8x8_sse2),
-                      make_tuple(3, 4, variance8x16_sse2),
-                      make_tuple(4, 3, variance16x8_sse2),
-                      make_tuple(4, 4, variance16x16_sse2),
-                      make_tuple(4, 5, variance16x32_sse2),
-                      make_tuple(5, 4, variance32x16_sse2),
-                      make_tuple(5, 5, variance32x32_sse2),
-                      make_tuple(5, 6, variance32x64_sse2),
-                      make_tuple(6, 5, variance64x32_sse2),
-                      make_tuple(6, 6, variance64x64_sse2)));
-
+    ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0),
+                      make_tuple(2, 3, variance4x8_sse2, 0),
+                      make_tuple(3, 2, variance8x4_sse2, 0),
+                      make_tuple(3, 3, variance8x8_sse2, 0),
+                      make_tuple(3, 4, variance8x16_sse2, 0),
+                      make_tuple(4, 3, variance16x8_sse2, 0),
+                      make_tuple(4, 4, variance16x16_sse2, 0),
+                      make_tuple(4, 5, variance16x32_sse2, 0),
+                      make_tuple(5, 4, variance32x16_sse2, 0),
+                      make_tuple(5, 5, variance32x32_sse2, 0),
+                      make_tuple(5, 6, variance32x64_sse2, 0),
+                      make_tuple(6, 5, variance64x32_sse2, 0),
+                      make_tuple(6, 6, variance64x64_sse2, 0)));
 const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
     vp9_sub_pixel_variance4x4_sse;
 const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
@@ -730,20 +1438,19 @@
     vp9_sub_pixel_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
-                      make_tuple(2, 3, subpel_variance4x8_sse),
-                      make_tuple(3, 2, subpel_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_variance64x64_sse2)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0),
+                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
+                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+                      make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
     vp9_sub_pixel_avg_variance4x4_sse;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
@@ -772,22 +1479,316 @@
     vp9_sub_pixel_avg_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
-                      make_tuple(2, 3, subpel_avg_variance4x8_sse),
-                      make_tuple(3, 2, subpel_avg_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_avg_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_avg_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_avg_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_avg_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_avg_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_avg_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_avg_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_avg_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
-#endif
-#endif
-
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
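
The wrapped functions above all follow the common variance prototype (src, src_stride, ref, ref_stride, sse out-param) that the vp9_rtcd_defs.pl hunk below spells out for the high-bit-depth variants. A hedged, self-contained usage sketch, with buffer setup of my own invention:

    #include <string.h>
    #include "./vp9_rtcd.h"
    #include "vpx_mem/vpx_mem.h"

    static unsigned int example_variance_call(void) {
      /* Illustrative 16x16 buffers with a constant difference of 1. */
      uint8_t *src = (uint8_t *)vpx_memalign(16, 16 * 16);
      uint8_t *ref = (uint8_t *)vpx_memalign(16, 16 * 16);
      unsigned int sse = 0;
      unsigned int var;
      memset(src, 128, 16 * 16);
      memset(ref, 129, 16 * 16);
      var = vp9_variance16x16_sse2(src, 16, ref, 16, &sse);
      /* sse receives the sum of squared differences (256 here); the
       * return value subtracts the squared-mean term, so a constant
       * offset gives variance 0. */
      vpx_free(src);
      vpx_free(ref);
      return var;
    }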
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2;
+const vp9_variance_fn_t highbd_10_variance8x8_sse2 =
+    vp9_highbd_10_variance8x8_sse2;
+const vp9_variance_fn_t highbd_12_variance8x8_sse2 =
+    vp9_highbd_12_variance8x8_sse2;
+const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2;
+const vp9_variance_fn_t highbd_10_variance8x16_sse2 =
+    vp9_highbd_10_variance8x16_sse2;
+const vp9_variance_fn_t highbd_12_variance8x16_sse2 =
+    vp9_highbd_12_variance8x16_sse2;
+const vp9_variance_fn_t highbd_variance16x8_sse2 =
+    vp9_highbd_variance16x8_sse2;
+const vp9_variance_fn_t highbd_10_variance16x8_sse2 =
+    vp9_highbd_10_variance16x8_sse2;
+const vp9_variance_fn_t highbd_12_variance16x8_sse2 =
+    vp9_highbd_12_variance16x8_sse2;
+const vp9_variance_fn_t highbd_variance16x16_sse2 =
+    vp9_highbd_variance16x16_sse2;
+const vp9_variance_fn_t highbd_10_variance16x16_sse2 =
+    vp9_highbd_10_variance16x16_sse2;
+const vp9_variance_fn_t highbd_12_variance16x16_sse2 =
+    vp9_highbd_12_variance16x16_sse2;
+const vp9_variance_fn_t highbd_variance16x32_sse2 =
+    vp9_highbd_variance16x32_sse2;
+const vp9_variance_fn_t highbd_10_variance16x32_sse2 =
+    vp9_highbd_10_variance16x32_sse2;
+const vp9_variance_fn_t highbd_12_variance16x32_sse2 =
+    vp9_highbd_12_variance16x32_sse2;
+const vp9_variance_fn_t highbd_variance32x16_sse2 =
+    vp9_highbd_variance32x16_sse2;
+const vp9_variance_fn_t highbd_10_variance32x16_sse2 =
+    vp9_highbd_10_variance32x16_sse2;
+const vp9_variance_fn_t highbd_12_variance32x16_sse2 =
+    vp9_highbd_12_variance32x16_sse2;
+const vp9_variance_fn_t highbd_variance32x32_sse2 =
+    vp9_highbd_variance32x32_sse2;
+const vp9_variance_fn_t highbd_10_variance32x32_sse2 =
+    vp9_highbd_10_variance32x32_sse2;
+const vp9_variance_fn_t highbd_12_variance32x32_sse2 =
+    vp9_highbd_12_variance32x32_sse2;
+const vp9_variance_fn_t highbd_variance32x64_sse2 =
+    vp9_highbd_variance32x64_sse2;
+const vp9_variance_fn_t highbd_10_variance32x64_sse2 =
+    vp9_highbd_10_variance32x64_sse2;
+const vp9_variance_fn_t highbd_12_variance32x64_sse2 =
+    vp9_highbd_12_variance32x64_sse2;
+const vp9_variance_fn_t highbd_variance64x32_sse2 =
+    vp9_highbd_variance64x32_sse2;
+const vp9_variance_fn_t highbd_10_variance64x32_sse2 =
+    vp9_highbd_10_variance64x32_sse2;
+const vp9_variance_fn_t highbd_12_variance64x32_sse2 =
+    vp9_highbd_12_variance64x32_sse2;
+const vp9_variance_fn_t highbd_variance64x64_sse2 =
+    vp9_highbd_variance64x64_sse2;
+const vp9_variance_fn_t highbd_10_variance64x64_sse2 =
+    vp9_highbd_10_variance64x64_sse2;
+const vp9_variance_fn_t highbd_12_variance64x64_sse2 =
+    vp9_highbd_12_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9VarianceHighTest,
+    ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+                      make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+                      make_tuple(3, 3, highbd_variance8x8_sse2, 8),
+                      make_tuple(3, 4, highbd_variance8x16_sse2, 8),
+                      make_tuple(4, 3, highbd_variance16x8_sse2, 8),
+                      make_tuple(4, 4, highbd_variance16x16_sse2, 8),
+                      make_tuple(4, 5, highbd_variance16x32_sse2, 8),
+                      make_tuple(5, 4, highbd_variance32x16_sse2, 8),
+                      make_tuple(5, 5, highbd_variance32x32_sse2, 8),
+                      make_tuple(5, 6, highbd_variance32x64_sse2, 8),
+                      make_tuple(6, 5, highbd_variance64x32_sse2, 8),
+                      make_tuple(6, 6, highbd_variance64x64_sse2, 8)));
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 =
+    vp9_highbd_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 =
+    vp9_highbd_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_sse2 =
+    vp9_highbd_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_sse2 =
+    vp9_highbd_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_sse2 =
+    vp9_highbd_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_sse2 =
+    vp9_highbd_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_sse2 =
+    vp9_highbd_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_sse2 =
+    vp9_highbd_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_sse2 =
+    vp9_highbd_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_sse2 =
+    vp9_highbd_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_sse2 =
+    vp9_highbd_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_sse2 =
+    vp9_highbd_10_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_sse2 =
+    vp9_highbd_12_sub_pixel_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9SubpelVarianceHighTest,
+    ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
+                      make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
+                      make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8),
+                      make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
+                      make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
+                      make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
+                      make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
+                      make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
+                      make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
+                      make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
+                      make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
+                      make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
+                      make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9SubpelAvgVarianceHighTest,
+    ::testing::Values(
+                  make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+                  make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+                  make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+                  make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+                  make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+                  make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+                  make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+                  make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+                  make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+                  make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+                  make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+                  make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+                  make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+                  make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+                  make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+                  make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+                  make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+                  make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+                  make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+                  make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+                  make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+                  make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+                  make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8),
+                  make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
+                  make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
+                  make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
+                  make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
+                  make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
+                  make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
+                  make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
+                  make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
+                  make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
+                  make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
 
@@ -819,20 +1820,19 @@
     vp9_sub_pixel_variance64x64_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_variance64x64_ssse3)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
+                      make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
     vp9_sub_pixel_avg_variance4x4_ssse3;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
@@ -861,21 +1861,21 @@
     vp9_sub_pixel_avg_variance64x64_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
-#endif
-#endif
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
 
@@ -886,11 +1886,11 @@
 const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9VarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
-                      make_tuple(5, 4, variance32x16_avx2),
-                      make_tuple(5, 5, variance32x32_avx2),
-                      make_tuple(6, 5, variance64x32_avx2),
-                      make_tuple(6, 6, variance64x64_avx2)));
+    ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0),
+                      make_tuple(5, 4, variance32x16_avx2, 0),
+                      make_tuple(5, 5, variance32x32_avx2, 0),
+                      make_tuple(6, 5, variance64x32_avx2, 0),
+                      make_tuple(6, 6, variance64x64_avx2, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
     vp9_sub_pixel_variance32x32_avx2;
@@ -898,8 +1898,8 @@
     vp9_sub_pixel_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_variance64x64_avx2)));
+    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
+                      make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
 
 const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
     vp9_sub_pixel_avg_variance32x32_avx2;
@@ -907,8 +1907,8 @@
     vp9_sub_pixel_avg_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
 #endif  // HAVE_AVX2
 #if HAVE_NEON
 const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
@@ -916,9 +1916,9 @@
 const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(4, 4, variance16x16_neon),
-                      make_tuple(5, 5, variance32x32_neon)));
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0),
+                      make_tuple(5, 5, variance32x32_neon, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
     vp9_sub_pixel_variance8x8_neon;
@@ -928,12 +1928,11 @@
     vp9_sub_pixel_variance32x32_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon),
-                      make_tuple(4, 4, subpel_variance16x16_neon),
-                      make_tuple(5, 5, subpel_variance32x32_neon)));
+    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
+                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
+                      make_tuple(5, 5, subpel_variance32x32_neon, 0)));
 #endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
 }  // namespace vp9
-
 }  // namespace
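
Once built, the new cases can be exercised in isolation through the usual gtest filter, e.g. ./test_libvpx --gtest_filter='*VarianceHighTest*' (assuming the standard test_libvpx target); the C instantiations run everywhere, while the SSE2 ones additionally require CONFIG_USE_X86INC.
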
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -112,6 +112,9 @@
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
+#if CONFIG_FILTERINTRA
+  // Per-block filter intra flags for the luma and chroma planes.
+  int filterbit, uv_filterbit;
+#endif
   TX_SIZE tx_size;
   int8_t skip;
   int8_t segment_id;
@@ -126,11 +129,18 @@
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   uint8_t mode_context[MAX_REF_FRAMES];
   INTERP_FILTER interp_filter;
+
+#if CONFIG_EXT_TX
+  EXT_TX_TYPE ext_txfrm;  // Extended transform choice, used for inter blocks.
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
   struct MODE_INFO *src_mi;
   MB_MODE_INFO mbmi;
+#if CONFIG_FILTERINTRA
+  // Filter intra info for the four 4x4 sub-blocks, parallel to bmi[].
+  int b_filter_info[4];
+#endif
   b_mode_info bmi[4];
 } MODE_INFO;
 
@@ -139,6 +149,17 @@
                                       : mi->mbmi.mode;
 }
 
+#if CONFIG_FILTERINTRA
+// Filtering is currently allowed for every intra prediction mode; the
+// parameter is kept so call sites read uniformly.
+static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
+  (void)mode;
+  return 1;
+}
+
+// True for every in-range transform size (txsize < TX_SIZES always
+// holds for valid values), presumably a hook for later restriction.
+static INLINE int is_filter_enabled(TX_SIZE txsize) {
+  return (txsize < TX_SIZES);
+}
+#endif
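
A hedged reading (not code from this patch): these predicates presumably gate the filterbit syntax declared in MB_MODE_INFO above, along the lines of:

    /* Hypothetical decode-side use; 'r' and 'prob' are stand-ins for a
     * real vp9_reader and probability. */
    if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size))
      mbmi->filterbit = vp9_read(r, prob);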
+
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[0] > INTRA_FRAME;
 }
@@ -236,12 +257,33 @@
 
 extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
 
+#if CONFIG_EXT_TX
+// Maps the extended-transform syntax element to a transform type:
+// NORM keeps the default DCT in both directions, ALT selects ADST.
+static INLINE TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
+  switch (ext_tx) {
+    case NORM:
+    default:
+      return DCT_DCT;
+    case ALT:
+      return ADST_ADST;
+  }
+}
+#endif
+
 static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
                                   const MACROBLOCKD *xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
 
-  if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+#if CONFIG_EXT_TX
+  if (plane_type != PLANE_TYPE_Y || xd->lossless)
+    return DCT_DCT;
+
+  if (is_inter_block(mbmi)) {
+    return ext_tx_to_txtype(mbmi->ext_txfrm);
+  }
+#else
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
     return DCT_DCT;
+#endif
   return intra_mode_to_tx_type_lookup[mbmi->mode];
 }
 
@@ -249,8 +291,17 @@
                                       const MACROBLOCKD *xd, int ib) {
   const MODE_INFO *const mi = xd->mi[0].src_mi;
 
+#if CONFIG_EXT_TX
+  if (plane_type != PLANE_TYPE_Y || xd->lossless)
+    return DCT_DCT;
+
+  if (is_inter_block(&mi->mbmi)) {
+    return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
+  }
+#else
   if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
     return DCT_DCT;
+#endif
 
   return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
 }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1283,34 +1283,34 @@
 
   # variance
   add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x16/;
+  specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
 
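For readers unfamiliar with the rtcd machinery: specialize qw/name/, "$sse2_x86inc" registers an SSE2 implementation for the prototype above it, and $sse2_x86inc expands to "sse2" only when the x86inc assembly is usable in the current build (hence the CONFIG_USE_X86INC guards in the test file). The generated vp9_rtcd.h then exposes, roughly (a sketch from memory, not part of this patch):

    unsigned int vp9_highbd_variance32x16_c(const uint8_t *src_ptr,
                                            int source_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            unsigned int *sse);
    unsigned int vp9_highbd_variance32x16_sse2(const uint8_t *src_ptr,
                                               int source_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride,
                                               unsigned int *sse);
    RTCD_EXTERN unsigned int (*vp9_highbd_variance32x16)(
        const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,
        int ref_stride, unsigned int *sse);
    /* setup_rtcd_internal() points the function pointer at the _sse2
     * version when the CPU reports SSE2 support. */
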
   add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x32/;
+  specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x32/;
+  specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x64/;
+  specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x32/;
+  specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x64/;
+  specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x16/;
+  specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x8/;
+  specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x16/;
+  specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x8/;
+  specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_variance8x4/;
@@ -1322,40 +1322,40 @@
   specialize qw/vp9_highbd_variance4x4/;
 
   add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get8x8var/;
+  specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get16x16var/;
+  specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x16/;
+  specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x32/;
+  specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x32/;
+  specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x64/;
+  specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x32/;
+  specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x64/;
+  specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x16/;
+  specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x8/;
+  specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x16/;
+  specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x8/;
+  specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_variance8x4/;
@@ -1367,40 +1367,40 @@
   specialize qw/vp9_highbd_10_variance4x4/;
 
   add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get8x8var/;
+  specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get16x16var/;
+  specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x16/;
+  specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x32/;
+  specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x32/;
+  specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x64/;
+  specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x32/;
+  specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x64/;
+  specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x16/;
+  specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x8/;
+  specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x16/;
+  specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x8/;
+  specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_variance8x4/;
@@ -1412,76 +1412,76 @@
   specialize qw/vp9_highbd_12_variance4x4/;
 
   add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get8x8var/;
+  specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get16x16var/;
+  specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1496,70 +1496,70 @@
   specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1574,70 +1574,70 @@
   specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1817,7 +1817,7 @@
   specialize qw/vp9_highbd_sad4x4x4d sse2/;
 
   add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse16x16/;
+  specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_mse8x16/;
@@ -1826,10 +1826,10 @@
   specialize qw/vp9_highbd_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse8x8/;
+  specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse16x16/;
+  specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_mse8x16/;
@@ -1838,10 +1838,10 @@
   specialize qw/vp9_highbd_10_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse8x8/;
+  specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse16x16/;
+  specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_mse8x16/;
@@ -1850,7 +1850,7 @@
   specialize qw/vp9_highbd_12_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse8x8/;
+  specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
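Each `specialize ..., "$sse2_x86inc"` change above asks the RTCD generator to
emit a runtime-dispatched entry point that starts at the C reference and is
upgraded to the new SSE2 x86inc kernel when the host CPU supports it. A
minimal C sketch of that dispatch pattern, where setup_rtcd and has_sse2 are
illustrative names rather than the generated API:

    #include <stdint.h>

    typedef unsigned int (*highbd_var_fn)(const uint8_t *src_ptr,
                                          int source_stride, int xoffset,
                                          int yoffset, const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

    /* Both implementations share the prototype declared by add_proto. */
    unsigned int vp9_highbd_10_sub_pixel_variance16x16_c(
        const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *);
    unsigned int vp9_highbd_10_sub_pixel_variance16x16_sse2(
        const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *);

    highbd_var_fn vp9_highbd_10_sub_pixel_variance16x16;

    static void setup_rtcd(int has_sse2) {
      vp9_highbd_10_sub_pixel_variance16x16 =
          vp9_highbd_10_sub_pixel_variance16x16_c;
      if (has_sse2)  /* prefer the accelerated kernel when present */
        vp9_highbd_10_sub_pixel_variance16x16 =
            vp9_highbd_10_sub_pixel_variance16x16_sse2;
    }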
 
   # ENCODEMB INVOKE
 
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
@@ -1,0 +1,1043 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 15
+                     times  8 dw  1
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 13
+                     times  8 dw  3
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 11
+                     times  8 dw  5
+                     times  8 dw 10
+                     times  8 dw  6
+                     times  8 dw  9
+                     times  8 dw  7
+                     times 16 dw  8
+                     times  8 dw  7
+                     times  8 dw  9
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  5
+                     times  8 dw 11
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  3
+                     times  8 dw 13
+                     times  8 dw  2
+                     times  8 dw 14
+                     times  8 dw  1
+                     times  8 dw 15
+
+SECTION .text
+
+; int vp9_highbd_sub_pixel_varianceNxh(const uint16_t *src,
+;                                      ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst,
+;                                      ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the SSE in the
+; given pointer.
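+; (Its C wrapper then forms the variance as SSE - SE*SE/(W*H); for a 16x16
+;  block that is sse - ((int64_t)se * se >> 8).)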
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  mova                 %4, %3       ; make copies to manipulate for the sum calc
+  mova                 %2, %1       ; keep the originals for the sse calc
+  pmaddwd              %3, %3
+  paddw                %4, %2
+  pmaddwd              %1, %1
+  movhlps              %2, %4
+  paddd                %6, %3
+  paddw                %4, %2
+  pxor                 %2, %2
+  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
+  punpcklwd            %4, %2       ; sign-extend word to dword
+  paddd                %6, %1
+  paddd                %5, %4
+
+%endmacro
+
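In scalar terms, one SUM_SSE invocation folds sixteen pixel differences (two
registers of eight words each) into the running sum and sum of squares. A C
model of the semantics, assuming the name sum_sse_model is illustrative only:

    #include <stdint.h>

    /* Scalar model of one SUM_SSE invocation over 2 x 8 word lanes. */
    static void sum_sse_model(const int16_t *src1, const int16_t *dst1,
                              const int16_t *src2, const int16_t *dst2,
                              int32_t *sum, uint32_t *sse) {
      int i;
      for (i = 0; i < 8; i++) {
        const int d0 = src1[i] - dst1[i];  /* psubw %1, %2 */
        const int d1 = src2[i] - dst2[i];  /* psubw %3, %4 */
        *sum += d0 + d1;            /* paddw, later sign-extended into %5 */
        *sse += d0 * d0 + d1 * d1;  /* pmaddwd pairs, accumulated into %6 */
      }
    }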
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+  movhlps              m3, m7
+  movhlps              m4, m6
+  paddd                m7, m3
+  paddd                m6, m4
+  pshufd               m3, m7, 0x1
+  pshufd               m4, m6, 0x1
+  paddd                m7, m3
+  paddd                m6, m4
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  movd               [r1], m7           ; store sse
+  movd                rax, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*2]
+%else
+  lea                srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*4]
+%else
+  lea                srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W, optional avg flag (default 0)
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define h heightd
+  %define bilin_filter sseq
+%else
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define h dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                x_offset, y_offset, dst, dst_stride, height, \
+                                sse, g_bilin_filter, g_pw_8
+      %define h heightd
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define h heightd
+      %define sec_str sec_strideq
+      %else
+      %define h dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                              x_offset, y_offset, dst, dst_stride, height, sse
+      %define h heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+
+%if %1 < 16
+  sar                   h, 1        ; for w < 16 each loop iteration covers two rows
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + 16]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+16]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + src_strideq*2]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq + sec_str*2]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pavgw                m0, m1
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
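+  ; e.g. with num=16, rnd=8, in1=100, in2=200, x=5:
+  ;   ((16-5)*100 + 5*200 + 8) >> 4 = 2108 >> 4 = 131
+  ;   100 + ((5*(200-100) + 8) >> 4) = 100 + (508 >> 4) = 131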
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m4, m1
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + 16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m1, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m2, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m1, filter_rnd
+  paddw                m1, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m1, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m4, filter_rnd
+  paddw                m4, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m4, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m4, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2+2]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m1, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m2, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filters - this is the same as in the 8-bit case
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [bilin_filter+y_offsetq]
+  mova                m11, [bilin_filter+y_offsetq+16]
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register: we reuse the src_stride register,
+; so src_stride has to be reloaded from the stack whenever it is needed later.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  movu                 m1, [srcq+16]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+16]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m1, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m4, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m4, m3
+  psrlw                m0, 4
+  psrlw                m4, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m4, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  INC_SRC_BY_SRC_2STRIDE
+  lea                dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
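Taken together, SUBPEL_VARIANCE implements the usual two-pass bilinear
interpolation ahead of the difference: a horizontal filter selected by
x_offset, a vertical filter selected by y_offset (each a two-tap pair summing
to 16, rounded by 8), and in the avg variant a final pavgw against the second
predictor. A scalar sketch of the plain variant follows; the function name and
the in-place variance computation are illustrative (the asm itself returns SE
and stores SSE, leaving the final subtraction to its C wrapper):

    #include <stdint.h>

    /* Scalar model of highbd_sub_pixel_variance for a w x h block; like the
       asm, it reads one extra row/column of src for the filters. */
    static uint32_t highbd_subpel_var_model(const uint16_t *src, int src_stride,
                                            int xoff, int yoff,  /* 0..15 */
                                            const uint16_t *dst, int dst_stride,
                                            int w, int h, uint32_t *sse) {
      int64_t se = 0;
      uint64_t sum_sq = 0;
      int x, y;
      for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
          /* first pass: horizontal bilinear filter on rows y and y + 1 */
          const int a = ((16 - xoff) * src[y * src_stride + x] +
                         xoff * src[y * src_stride + x + 1] + 8) >> 4;
          const int b = ((16 - xoff) * src[(y + 1) * src_stride + x] +
                         xoff * src[(y + 1) * src_stride + x + 1] + 8) >> 4;
          /* second pass: vertical bilinear filter between the two rows */
          const int p = ((16 - yoff) * a + yoff * b + 8) >> 4;
          const int diff = p - dst[y * dst_stride + x];
          se += diff;
          sum_sq += (int64_t)diff * diff;
        }
      }
      *sse = (uint32_t)sum_sq;
      return (uint32_t)(sum_sq - (uint64_t)((se * se) / (w * h)));
    }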
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -1,0 +1,313 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2
+;(
+;    uint16_t        *  src_ptr,
+;    int             source_stride,
+;    uint16_t        *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+16]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax+16]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+16]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            2
+        jnz         .var16loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2
+;(
+;    uint16_t        *  src_ptr,
+;    int             source_stride,
+;    uint16_t        *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8
+
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rbx+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        lea             rbx,    [rbx+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            4
+        jnz         .var8loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -1,0 +1,580 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
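+// Accumulates the kernel's per-block sum and sse over a w x h region,
+// block_size x block_size blocks at a time.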
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+                                 const uint16_t *ref, int ref_stride,
+                                 int w, int h, uint32_t *sse, int *sum,
+                                 high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
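+// 10-bit input: pixel diffs are 4x their 8-bit equivalents, so
+// accumulate in 64 bits, then scale the sum by 2^2 and the sse by 2^4
+// to return values on the 8-bit scale.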
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
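+// 12-bit input: pixel diffs are 16x their 8-bit equivalents; scale the
+// sum by 2^4 and the sse by 2^8.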
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
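+// Emits the get##S##x##S##var entry points for 8-, 10- and 12-bit
+// input; the deeper variants rescale the raw kernel output back to the
+// 8-bit domain.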
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
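+// variance = sse - sum^2 / (w * h); shift is log2(w * h).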
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+                       block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_10_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_12_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+                                               ptrdiff_t src_stride, \
+                                               int x_offset, int y_offset, \
+                                               const uint16_t *dst, \
+                                               ptrdiff_t dst_stride, \
+                                               int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
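+// Builds full-block subpel variance from the wf-wide column kernels:
+// blocks wider than wf are split into 16-column strips whose se/sse
+// are summed.  The 12-bit variant also walks the block 16 rows at a
+// time, accumulating into a 64-bit long_sse, so that each kernel
+// call's 32-bit sse cannot overflow at 12 bits.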
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+                                                        int src_stride, \
+                                                        int x_offset, \
+                                                        int y_offset, \
+                                                        const uint8_t *dst8, \
+                                                        int dst_stride, \
+                                                        uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, h, \
+                                                       &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, \
+                                                       h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 48, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      }\
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
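+// The avg variants average the subpel prediction with a second
+// predictor, sec, before taking the difference against dst.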
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+                                                   ptrdiff_t src_stride, \
+                                                   int x_offset, int y_offset, \
+                                                   const uint16_t *dst, \
+                                                   ptrdiff_t dst_stride, \
+                                                   const uint16_t *sec, \
+                                                   ptrdiff_t sec_stride, \
+                                                   int height, \
+                                                   unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, x_offset, y_offset, \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, x_offset, y_offset, \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, x_offset, y_offset, \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src, src_stride, x_offset, \
+                                            y_offset, dst, dst_stride, \
+                                            sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 16, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 16, dst_stride, \
+                                            sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 32, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 32, dst_stride, \
+                                            sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 48, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 48, dst_stride, \
+                                            sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + (start_row * src_stride), src_stride, x_offset, \
+                y_offset, dst + (start_row * dst_stride), dst_stride, \
+                sec + (start_row * w), w, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 16 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 16 + (start_row * dst_stride), dst_stride, \
+                sec + 16 + (start_row * w), w, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 32 + (start_row * dst_stride), dst_stride, \
+                sec + 32 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 48 + (start_row * dst_stride), dst_stride, \
+                sec + 48 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -104,6 +104,7 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
@@ -115,6 +116,8 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
 endif
 endif