shithub: libvpx

ref: c3a9056df4d5144596f1bdcd99179d2565e3fba2
parent: 75c2c84bb50d7b03c650595888284fec8ef820f7
parent: 48032bfcdb412a8e7f9d89154c4ac8fbb3f8fe72
author: Debargha Mukherjee <debargha@google.com>
date: Fri Nov 14 16:11:27 EST 2014

Merge "Added sse2 acceleration for highbitdepth variance"

--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -7,16 +7,18 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
+
+#include <cstdlib>
 #include <new>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vpx/vpx_integer.h"
 #include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "./vp8_rtcd.h"
@@ -26,7 +28,6 @@
 # include "./vp9_rtcd.h"
 # include "vp9/encoder/vp9_variance.h"
 #endif
-#include "test/acm_random.h"
 
 namespace {
 
@@ -43,18 +44,50 @@
   return res;
 }
 
-static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
-                                 int l2w, int l2h, unsigned int *sse_ptr) {
+static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref,
+                                 int l2w, int l2h, int src_stride_coeff,
+                                 int ref_stride_coeff, uint32_t *sse_ptr,
+                                 bool use_high_bit_depth_,
+                                 vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      int diff;
+      if (!use_high_bit_depth_) {
+        diff = ref[w * y * ref_stride_coeff + x] -
+               src[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      int diff = ref[w * y + x] - src[w * y + x];
+      int diff = ref[w * y * ref_stride_coeff + x] -
+                 src[w * y * src_stride_coeff + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
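
Both branches of variance_ref finish with the one-pass identity
Var = SSE - SE^2 / N, where N = 2^(l2w + l2h), so the division reduces to
the shift in the return statement. For depths above 8 bits, sse and se are
first renormalized to 8-bit scale so the same expectations hold at every
depth. ROUND_POWER_OF_TWO is libvpx's round-half-up shift; a sketch,
assuming the usual definition from the vp9 common headers:

    // Round-half-up right shift: add half the divisor, then shift.
    // For 10-bit input the code above shifts sse by 2 * (10 - 8) = 4
    // bits and se by 2 bits, bringing both back to 8-bit units.
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))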
@@ -61,13 +94,56 @@
 
 static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
                                         int l2w, int l2h, int xoff, int yoff,
-                                        unsigned int *sse_ptr) {
+                                        unsigned int *sse_ptr,
+                                        bool use_high_bit_depth_,
+                                        vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth_) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
+      // Bilinear interpolation at a 16th pel step.
       const int a1 = ref[(w + 1) * (y + 0) + x + 0];
       const int a2 = ref[(w + 1) * (y + 0) + x + 1];
       const int b1 = ref[(w + 1) * (y + 1) + x + 0];
@@ -75,11 +151,12 @@
       const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
       const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
       const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = r - src[w * y + x];
+      const int diff = r - src[w * y + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
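
The four taps implement 1/16th-pel bilinear interpolation in fixed point:
a1 + (((a2 - a1) * xoff + 8) >> 4) evaluates a1 + (a2 - a1) * xoff / 16
with round-to-nearest (the + 8 is half of 16). A quick numeric check with
hypothetical sample values:

    // Hypothetical samples, quarter-pel horizontal offset (xoff = 4).
    const int a1 = 100, a2 = 116, xoff = 4;
    const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
    // (16 * 4 + 8) >> 4 == 4, so a == 104; the exact value
    // 100 + 16 * 4 / 16 is also 104, so no rounding error here.

The same filter is then applied vertically with yoff, giving the usual
separable bilinear kernel.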
@@ -130,27 +207,57 @@
 
 template<typename VarianceFunctionType>
 class VarianceTest
-    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            VarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
+    const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    ref_ = new uint8_t[block_size_];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+      ref_ = new uint8_t[block_size_ * 2];
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+    }
+#else
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+    ref_ = new uint8_t[block_size_ * 2];
+#endif
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+    }
+#else
     vpx_free(src_);
     delete[] ref_;
+#endif
     libvpx_test::ClearSystemState();
   }
 
@@ -157,13 +264,17 @@
  protected:
   void ZeroTest();
   void RefTest();
+  void RefStrideTest();
   void OneQuarterTest();
 
   ACMRandom rnd_;
-  uint8_t* src_;
-  uint8_t* ref_;
+  uint8_t *src_;
+  uint8_t *ref_;
   int width_, log2width_;
   int height_, log2height_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  bool use_high_bit_depth_;
   int block_size_;
   VarianceFunctionType variance_;
 };
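
Every parameter tuple now carries a fourth element:
<log2width, log2height, function, bit depth>. A value of 0 selects the
ordinary 8-bit path, while a nonzero value (8, 10 or 12) selects the
high-bit-depth path at that depth; note that 8 still means uint16_t
storage, exercising the highbd functions on 8-bit data. Examples taken
from the instantiations later in this patch:

    make_tuple(3, 3, variance8x8_c, 0)               // 8x8, plain 8-bit
    make_tuple(6, 6, highbd_10_variance64x64_c, 10)  // 64x64, 10-bit
    make_tuple(2, 2, highbd_variance4x4_c, 8)        // 4x4, 8-bit in uint16_t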
@@ -171,14 +282,32 @@
 template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::ZeroTest() {
   for (int i = 0; i <= 255; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      memset(src_, i, block_size_);
+    } else {
+      vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
+                   block_size_);
+    }
+#else
     memset(src_, i, block_size_);
+#endif
     for (int j = 0; j <= 255; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        memset(ref_, j, block_size_);
+      } else {
+      vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8),
+                     block_size_);
+      }
+#else
       memset(ref_, j, block_size_);
+#endif
       unsigned int sse;
       unsigned int var;
       ASM_REGISTER_STATE_CHECK(
           var = variance_(src_, width_, ref_, width_, &sse));
-      EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j;
+      EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
     }
   }
 }
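
The zero expectation follows directly from the variance identity: both
buffers are constant, so every per-pixel diff equals the same value d
(j - i, scaled up for the higher depths), and

    SSE = N * d^2,  SE = N * d
    Var = SSE - SE^2 / N = N * d^2 - (N * d)^2 / N = 0

for any pair of fill values i and j.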
@@ -187,15 +316,28 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
       src_[j] = rnd_.Rand8();
       ref_[j] = rnd_.Rand8();
+    } else {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
     }
+#else
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
+#endif
+    }
     unsigned int sse1, sse2;
     unsigned int var1;
+    const int stride_coeff = 1;
     ASM_REGISTER_STATE_CHECK(
         var1 = variance_(src_, width_, ref_, width_, &sse1));
     const unsigned int var2 = variance_ref(src_, ref_, log2width_,
-                                           log2height_, &sse2);
+                                           log2height_, stride_coeff,
+                                           stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
     EXPECT_EQ(sse1, sse2);
     EXPECT_EQ(var1, var2);
   }
@@ -202,11 +344,60 @@
 }
 
 template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+  for (int i = 0; i < 10; ++i) {
+    int ref_stride_coeff = i % 2;
+    int src_stride_coeff = (i >> 1) % 2;
+    for (int j = 0; j < block_size_; j++) {
+      int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
+      int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        src_[src_ind] = rnd_.Rand8();
+        ref_[ref_ind] = rnd_.Rand8();
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+      }
+#else
+      src_[src_ind] = rnd_.Rand8();
+      ref_[ref_ind] = rnd_.Rand8();
+#endif
+    }
+    unsigned int sse1, sse2;
+    unsigned int var1;
+
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_ * src_stride_coeff,
+                         ref_, width_ * ref_stride_coeff, &sse1));
+    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+                                           log2height_, src_stride_coeff,
+                                           ref_stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
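A stride coefficient is either 0 or 1 (ref_stride_coeff = i % 2,
src_stride_coeff = (i >> 1) % 2), so the ten iterations cycle through all
four combinations. Coefficient 1 is the packed case with a stride of
width_; coefficient 0 gives a stride of zero, collapsing the buffer to a
single row that every pass reads and rewrites. The fill loop and
variance_ref share the same mapping, sketched here as a hypothetical
helper:

    // Hypothetical helper mirroring the index math above: flat index j
    // mapped onto a buffer whose row stride is coeff * width.
    static inline int strided_index(int j, int width, int coeff) {
      return (j / width) * coeff * width + j % width;  // coeff 0: row 0 only
    }

Because the function under test and variance_ref read through the same
(possibly zero) strides, the comparison stays exact.
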
+template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
-  memset(src_, 255, block_size_);
   const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!use_high_bit_depth_) {
+    memset(src_, 255, block_size_);
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
+                 block_size_);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+  }
+#else
+  memset(src_, 255, block_size_);
   memset(ref_, 255, half);
   memset(ref_ + half, 0, half);
+#endif
   unsigned int sse;
   unsigned int var;
   ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
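
The test's name reflects the expected value (the assertion itself lies
past the visible context): src_ is all 255 while ref_ is 255 on one half
and 0 on the other, so the diff is 0 on half the pixels and 255 on the
rest, giving

    SSE = (N / 2) * 255^2,  |SE| = (N / 2) * 255
    Var = SSE - SE^2 / N = (N / 2 - N / 4) * 255^2 = (N / 4) * 255^2

one quarter of the maximum possible SSE of N * 255^2. The high-bit-depth
branch scales 255 up by bit_depth_ - 8 bits and renormalizes back down,
landing on the same number.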
@@ -264,8 +455,10 @@
       ref_[j] = rnd.Rand8();
     }
     unsigned int sse1, sse2;
+    const int stride_coeff = 1;
     ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
-    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
     EXPECT_EQ(sse1, sse2);
   }
 }
@@ -279,9 +472,10 @@
     }
     unsigned int sse2;
     unsigned int var1;
-    ASM_REGISTER_STATE_CHECK(
-        var1 = mse_(src_, width_, ref_, width_));
-    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
     EXPECT_EQ(var1, sse2);
   }
 }
@@ -308,16 +502,59 @@
 #endif
 
 #if CONFIG_VP9_ENCODER
-
 unsigned int subpel_avg_variance_ref(const uint8_t *ref,
                                      const uint8_t *src,
                                      const uint8_t *second_pred,
                                      int l2w, int l2h,
                                      int xoff, int yoff,
-                                     unsigned int *sse_ptr) {
+                                     unsigned int *sse_ptr,
+                                     bool use_high_bit_depth,
+                                     vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff =
+            ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  if (bit_depth > VPX_BITS_8) {
+    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+  }
+#else
   int se = 0;
   unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
       // bilinear interpolation at a 16th pel step
@@ -328,11 +565,12 @@
       const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
       const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
       const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+      const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
       se += diff;
       sse += diff * diff;
     }
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
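
The only change from subpel_variance_ref is the compound-prediction step:
the interpolated value r is averaged with the second predictor using a
round-to-nearest halving before the diff is taken, matching the encoder's
two-reference averaging. In isolation (values hypothetical):

    // Round-to-nearest average of the filtered ref and second predictor.
    const int avg = (r + second_pred[w * y + x] + 1) >> 1;
    // e.g. r = 101, second_pred value = 104: (101 + 104 + 1) >> 1 == 103.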
@@ -340,10 +578,10 @@
 template<typename SubpelVarianceFunctionType>
 class SubpelVarianceTest
     : public ::testing::TestWithParam<tuple<int, int,
-                                            SubpelVarianceFunctionType> > {
+                                            SubpelVarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, SubpelVarianceFunctionType>& params =
+    const tuple<int, int, SubpelVarianceFunctionType, int>& params =
         this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
@@ -350,12 +588,37 @@
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     subpel_variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+      sec_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(
+          new uint16_t[block_size_ + width_ + height_ + 1]);
+    }
+#else
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
@@ -362,22 +625,37 @@
   }
 
   virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+      vpx_free(sec_);
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+      vpx_free(CONVERT_TO_SHORTPTR(sec_));
+    }
+#else
     vpx_free(src_);
     delete[] ref_;
     vpx_free(sec_);
+#endif
     libvpx_test::ClearSystemState();
   }
 
  protected:
   void RefTest();
+  void ExtremeRefTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
+  bool use_high_bit_depth_;
+  vpx_bit_depth_t bit_depth_;
   int width_, log2width_;
   int height_, log2height_;
-  int block_size_;
+  int block_size_, mask_;
   SubpelVarianceFunctionType subpel_variance_;
 };
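
Note the ref_ allocation of block_size_ + width_ + height_ + 1 samples:
the subpel reference walks ref on a (w + 1)-sample stride and its
bottom-right tap reads one row and one column past the block, so the
largest index touched is

    (w + 1) * (y + 1) + (x + 1)  with x = w - 1, y = h - 1
      = (w + 1) * h + w = w * h + w + h

hence the extra width_ + height_ + 1 samples beyond block_size_.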
 
@@ -385,6 +663,23 @@
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+      }
+#else
       for (int j = 0; j < block_size_; j++) {
         src_[j] = rnd_.Rand8();
       }
@@ -391,12 +686,15 @@
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
         ref_[j] = rnd_.Rand8();
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
                                                        src_, width_, &sse1));
       const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
-                                                    log2height_, x, y, &sse2);
+                                                    log2height_, x, y, &sse2,
+                                                    use_high_bit_depth_,
+                                                    bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -403,10 +701,69 @@
   }
 }
 
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  // Compare against reference.
+  // Src: Set one half of the values to 0, the other half to the maximum.
+  // Ref: Set the halves the opposite way. (The 8-bit and high-bit-depth
+  // branches swap which half is which; variance is symmetric, so the
+  // ordering does not matter.)
+  for (int x = 0; x < 16; ++x) {
+    for (int y = 0; y < 16; ++y) {
+      const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, half + width_ + height_ + 1);
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
+                     half + width_ + height_ + 1);
+      }
+#else
+      memset(src_, 0, half);
+      memset(src_ + half, 255, half);
+      memset(ref_, 255, half);
+      memset(ref_ + half, 0, half + width_ + height_ + 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
+      const unsigned int var2 =
+          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
+                              use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
 template<>
 void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+      }
+#else
       for (int j = 0; j < block_size_; j++) {
         src_[j] = rnd_.Rand8();
         sec_[j] = rnd_.Rand8();
@@ -414,6 +771,7 @@
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
         ref_[j] = rnd_.Rand8();
       }
+#endif
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
@@ -421,7 +779,9 @@
                                   src_, width_, &sse1, sec_));
       const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
                                                         log2width_, log2height_,
-                                                        x, y, &sse2);
+                                                        x, y, &sse2,
+                                                        use_high_bit_depth_,
+                                                        bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -468,11 +828,11 @@
 const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0)));
 
 #if HAVE_NEON
 const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
@@ -491,13 +851,12 @@
 const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP8VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(3, 4, variance8x16_neon),
-                      make_tuple(4, 3, variance16x8_neon),
-                      make_tuple(4, 4, variance16x16_neon)));
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+                      make_tuple(3, 4, variance8x16_neon, 0),
+                      make_tuple(4, 3, variance16x8_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0)));
 #endif
 
-
 #if HAVE_MMX
 const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
 const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
@@ -506,11 +865,11 @@
 const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
 INSTANTIATE_TEST_CASE_P(
     MMX, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
-                      make_tuple(3, 3, variance8x8_mmx),
-                      make_tuple(3, 4, variance8x16_mmx),
-                      make_tuple(4, 3, variance16x8_mmx),
-                      make_tuple(4, 4, variance16x16_mmx)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0),
+                      make_tuple(3, 3, variance8x8_mmx, 0),
+                      make_tuple(3, 4, variance8x16_mmx, 0),
+                      make_tuple(4, 3, variance16x8_mmx, 0),
+                      make_tuple(4, 4, variance16x16_mmx, 0)));
 #endif
 
 #if HAVE_SSE2
@@ -521,11 +880,11 @@
 const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_wmt),
-                      make_tuple(3, 3, variance8x8_wmt),
-                      make_tuple(3, 4, variance8x16_wmt),
-                      make_tuple(4, 3, variance16x8_wmt),
-                      make_tuple(4, 4, variance16x16_wmt)));
+    ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0),
+                      make_tuple(3, 3, variance8x8_wmt, 0),
+                      make_tuple(3, 4, variance8x16_wmt, 0),
+                      make_tuple(4, 3, variance16x8_wmt, 0),
+                      make_tuple(4, 4, variance16x16_wmt, 0)));
 #endif
 #endif  // CONFIG_VP8_ENCODER
 
@@ -537,7 +896,6 @@
 namespace vp9 {
 
 #if CONFIG_VP9_ENCODER
-
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 
@@ -550,10 +908,27 @@
 
 TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
 TEST_P(VP9VarianceTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest;
+typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
+    VP9SubpelAvgVarianceHighTest;
+
+TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); }
+TEST_P(VP9VarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); }
+TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
 const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
 const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
@@ -569,20 +944,115 @@
 const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(2, 3, variance4x8_c),
-                      make_tuple(3, 2, variance8x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c),
-                      make_tuple(4, 5, variance16x32_c),
-                      make_tuple(5, 4, variance32x16_c),
-                      make_tuple(5, 5, variance32x32_c),
-                      make_tuple(5, 6, variance32x64_c),
-                      make_tuple(6, 5, variance64x32_c),
-                      make_tuple(6, 6, variance64x64_c)));
-
+    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+                      make_tuple(2, 3, variance4x8_c, 0),
+                      make_tuple(3, 2, variance8x4_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0),
+                      make_tuple(4, 5, variance16x32_c, 0),
+                      make_tuple(5, 4, variance32x16_c, 0),
+                      make_tuple(5, 5, variance32x32_c, 0),
+                      make_tuple(5, 6, variance32x64_c, 0),
+                      make_tuple(6, 5, variance64x32_c, 0),
+                      make_tuple(6, 6, variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c;
+const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c;
+const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c;
+const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c;
+const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c;
+const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c;
+const vp9_variance_fn_t highbd_10_variance16x16_c =
+    vp9_highbd_10_variance16x16_c;
+const vp9_variance_fn_t highbd_10_variance16x32_c =
+    vp9_highbd_10_variance16x32_c;
+const vp9_variance_fn_t highbd_10_variance32x16_c =
+    vp9_highbd_10_variance32x16_c;
+const vp9_variance_fn_t highbd_10_variance32x32_c =
+    vp9_highbd_10_variance32x32_c;
+const vp9_variance_fn_t highbd_10_variance32x64_c =
+    vp9_highbd_10_variance32x64_c;
+const vp9_variance_fn_t highbd_10_variance64x32_c =
+    vp9_highbd_10_variance64x32_c;
+const vp9_variance_fn_t highbd_10_variance64x64_c =
+    vp9_highbd_10_variance64x64_c;
+const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c;
+const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c;
+const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c;
+const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c;
+const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c;
+const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c;
+const vp9_variance_fn_t highbd_12_variance16x16_c =
+    vp9_highbd_12_variance16x16_c;
+const vp9_variance_fn_t highbd_12_variance16x32_c =
+    vp9_highbd_12_variance16x32_c;
+const vp9_variance_fn_t highbd_12_variance32x16_c =
+    vp9_highbd_12_variance32x16_c;
+const vp9_variance_fn_t highbd_12_variance32x32_c =
+    vp9_highbd_12_variance32x32_c;
+const vp9_variance_fn_t highbd_12_variance32x64_c =
+    vp9_highbd_12_variance32x64_c;
+const vp9_variance_fn_t highbd_12_variance64x32_c =
+    vp9_highbd_12_variance64x32_c;
+const vp9_variance_fn_t highbd_12_variance64x64_c =
+    vp9_highbd_12_variance64x64_c;
+const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c;
+const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c;
+const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c;
+const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c;
+const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c;
+const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c;
+const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c;
+const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c;
+const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c;
+const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c;
+const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c;
+const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c;
+const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9VarianceHighTest,
+    ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+                      make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+                      make_tuple(2, 2, highbd_variance4x4_c, 8),
+                      make_tuple(2, 3, highbd_variance4x8_c, 8),
+                      make_tuple(3, 2, highbd_variance8x4_c, 8),
+                      make_tuple(3, 3, highbd_variance8x8_c, 8),
+                      make_tuple(3, 4, highbd_variance8x16_c, 8),
+                      make_tuple(4, 3, highbd_variance16x8_c, 8),
+                      make_tuple(4, 4, highbd_variance16x16_c, 8),
+                      make_tuple(4, 5, highbd_variance16x32_c, 8),
+                      make_tuple(5, 4, highbd_variance32x16_c, 8),
+                      make_tuple(5, 5, highbd_variance32x32_c, 8),
+                      make_tuple(5, 6, highbd_variance32x64_c, 8),
+                      make_tuple(6, 5, highbd_variance64x32_c, 8),
+                      make_tuple(6, 6, highbd_variance64x64_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 const vp9_subpixvariance_fn_t subpel_variance4x4_c =
     vp9_sub_pixel_variance4x4_c;
 const vp9_subpixvariance_fn_t subpel_variance4x8_c =
@@ -611,20 +1081,19 @@
     vp9_sub_pixel_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
-                      make_tuple(2, 3, subpel_variance4x8_c),
-                      make_tuple(3, 2, subpel_variance8x4_c),
-                      make_tuple(3, 3, subpel_variance8x8_c),
-                      make_tuple(3, 4, subpel_variance8x16_c),
-                      make_tuple(4, 3, subpel_variance16x8_c),
-                      make_tuple(4, 4, subpel_variance16x16_c),
-                      make_tuple(4, 5, subpel_variance16x32_c),
-                      make_tuple(5, 4, subpel_variance32x16_c),
-                      make_tuple(5, 5, subpel_variance32x32_c),
-                      make_tuple(5, 6, subpel_variance32x64_c),
-                      make_tuple(6, 5, subpel_variance64x32_c),
-                      make_tuple(6, 6, subpel_variance64x64_c)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
+                      make_tuple(2, 3, subpel_variance4x8_c, 0),
+                      make_tuple(3, 2, subpel_variance8x4_c, 0),
+                      make_tuple(3, 3, subpel_variance8x8_c, 0),
+                      make_tuple(3, 4, subpel_variance8x16_c, 0),
+                      make_tuple(4, 3, subpel_variance16x8_c, 0),
+                      make_tuple(4, 4, subpel_variance16x16_c, 0),
+                      make_tuple(4, 5, subpel_variance16x32_c, 0),
+                      make_tuple(5, 4, subpel_variance32x16_c, 0),
+                      make_tuple(5, 5, subpel_variance32x32_c, 0),
+                      make_tuple(5, 6, subpel_variance32x64_c, 0),
+                      make_tuple(6, 5, subpel_variance64x32_c, 0),
+                      make_tuple(6, 6, subpel_variance64x64_c, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
     vp9_sub_pixel_avg_variance4x4_c;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
@@ -653,23 +1122,263 @@
     vp9_sub_pixel_avg_variance64x64_c;
 INSTANTIATE_TEST_CASE_P(
     C, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
-                      make_tuple(2, 3, subpel_avg_variance4x8_c),
-                      make_tuple(3, 2, subpel_avg_variance8x4_c),
-                      make_tuple(3, 3, subpel_avg_variance8x8_c),
-                      make_tuple(3, 4, subpel_avg_variance8x16_c),
-                      make_tuple(4, 3, subpel_avg_variance16x8_c),
-                      make_tuple(4, 4, subpel_avg_variance16x16_c),
-                      make_tuple(4, 5, subpel_avg_variance16x32_c),
-                      make_tuple(5, 4, subpel_avg_variance32x16_c),
-                      make_tuple(5, 5, subpel_avg_variance32x32_c),
-                      make_tuple(5, 6, subpel_avg_variance32x64_c),
-                      make_tuple(6, 5, subpel_avg_variance64x32_c),
-                      make_tuple(6, 6, subpel_avg_variance64x64_c)));
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_c, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x4_c =
+    vp9_highbd_10_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x8_c =
+    vp9_highbd_10_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_c =
+    vp9_highbd_10_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_c =
+    vp9_highbd_10_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_c =
+    vp9_highbd_10_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_c =
+    vp9_highbd_10_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_c =
+    vp9_highbd_10_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_c =
+    vp9_highbd_10_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_c =
+    vp9_highbd_10_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_c =
+    vp9_highbd_10_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_c =
+    vp9_highbd_10_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_c =
+    vp9_highbd_10_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_c =
+    vp9_highbd_10_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x4_c =
+    vp9_highbd_12_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x8_c =
+    vp9_highbd_12_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_c =
+    vp9_highbd_12_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_c =
+    vp9_highbd_12_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_c =
+    vp9_highbd_12_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_c =
+    vp9_highbd_12_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_c =
+    vp9_highbd_12_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_c =
+    vp9_highbd_12_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_c =
+    vp9_highbd_12_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_c =
+    vp9_highbd_12_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_c =
+    vp9_highbd_12_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_c =
+    vp9_highbd_12_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_c =
+    vp9_highbd_12_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x4_c =
+    vp9_highbd_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x8_c =
+    vp9_highbd_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_c =
+    vp9_highbd_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_c =
+    vp9_highbd_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_c =
+    vp9_highbd_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_c =
+    vp9_highbd_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_c =
+    vp9_highbd_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_c =
+    vp9_highbd_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_c =
+    vp9_highbd_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_c =
+    vp9_highbd_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_c =
+    vp9_highbd_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_c =
+    vp9_highbd_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_c =
+    vp9_highbd_sub_pixel_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9SubpelVarianceHighTest,
+    ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10),
+                      make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10),
+                      make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
+                      make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12),
+                      make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
+                      make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
+                      make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
+                      make_tuple(2, 2, highbd_subpel_variance4x4_c, 8),
+                      make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
+                      make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
+                      make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
+                      make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
+                      make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
+                      make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
+                      make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
+                      make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
+                      make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
+                      make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
+                      make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
+                      make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c =
+    vp9_highbd_10_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
+    vp9_highbd_10_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
+    vp9_highbd_10_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
+    vp9_highbd_10_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c =
+    vp9_highbd_12_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
+    vp9_highbd_12_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
+    vp9_highbd_12_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
+    vp9_highbd_12_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c =
+    vp9_highbd_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
+    vp9_highbd_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
+    vp9_highbd_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
+    vp9_highbd_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
+    vp9_highbd_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
+    vp9_highbd_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
+    vp9_highbd_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
+    vp9_highbd_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
+    vp9_highbd_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
+    vp9_highbd_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
+    vp9_highbd_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
+    vp9_highbd_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
+    vp9_highbd_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9SubpelAvgVarianceHighTest,
+    ::testing::Values(
+        make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10),
+        make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
+        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
+        make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12),
+        make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
+        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
+        make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8),
+        make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
+        make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
+        make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
+        make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
+        make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
+        make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
+        make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
+        make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
+        make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
+        make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
+        make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
+        make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
-
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(vp9_get_mb_ss_sse2));
 
@@ -688,20 +1397,19 @@
 const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_sse2),
-                      make_tuple(2, 3, variance4x8_sse2),
-                      make_tuple(3, 2, variance8x4_sse2),
-                      make_tuple(3, 3, variance8x8_sse2),
-                      make_tuple(3, 4, variance8x16_sse2),
-                      make_tuple(4, 3, variance16x8_sse2),
-                      make_tuple(4, 4, variance16x16_sse2),
-                      make_tuple(4, 5, variance16x32_sse2),
-                      make_tuple(5, 4, variance32x16_sse2),
-                      make_tuple(5, 5, variance32x32_sse2),
-                      make_tuple(5, 6, variance32x64_sse2),
-                      make_tuple(6, 5, variance64x32_sse2),
-                      make_tuple(6, 6, variance64x64_sse2)));
-
+    ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0),
+                      make_tuple(2, 3, variance4x8_sse2, 0),
+                      make_tuple(3, 2, variance8x4_sse2, 0),
+                      make_tuple(3, 3, variance8x8_sse2, 0),
+                      make_tuple(3, 4, variance8x16_sse2, 0),
+                      make_tuple(4, 3, variance16x8_sse2, 0),
+                      make_tuple(4, 4, variance16x16_sse2, 0),
+                      make_tuple(4, 5, variance16x32_sse2, 0),
+                      make_tuple(5, 4, variance32x16_sse2, 0),
+                      make_tuple(5, 5, variance32x32_sse2, 0),
+                      make_tuple(5, 6, variance32x64_sse2, 0),
+                      make_tuple(6, 5, variance64x32_sse2, 0),
+                      make_tuple(6, 6, variance64x64_sse2, 0)));
 const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
     vp9_sub_pixel_variance4x4_sse;
 const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
@@ -730,20 +1438,19 @@
     vp9_sub_pixel_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
-                      make_tuple(2, 3, subpel_variance4x8_sse),
-                      make_tuple(3, 2, subpel_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_variance64x64_sse2)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0),
+                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
+                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+                      make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
     vp9_sub_pixel_avg_variance4x4_sse;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
@@ -772,22 +1479,316 @@
     vp9_sub_pixel_avg_variance64x64_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
-                      make_tuple(2, 3, subpel_avg_variance4x8_sse),
-                      make_tuple(3, 2, subpel_avg_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_avg_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_avg_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_avg_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_avg_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_avg_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_avg_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_avg_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_avg_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
-#endif
-#endif
-
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
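
The wrapped functions above all follow the common variance prototype (src, src_stride, ref, ref_stride, sse out-param) that the vp9_rtcd_defs.pl hunk below spells out for the high-bit-depth variants. A hedged, self-contained usage sketch, with buffer setup of my own invention:

    #include <string.h>
    #include "./vp9_rtcd.h"
    #include "vpx_mem/vpx_mem.h"

    static unsigned int example_variance_call(void) {
      /* Illustrative 16x16 buffers with a constant difference of 1. */
      uint8_t *src = (uint8_t *)vpx_memalign(16, 16 * 16);
      uint8_t *ref = (uint8_t *)vpx_memalign(16, 16 * 16);
      unsigned int sse = 0;
      unsigned int var;
      memset(src, 128, 16 * 16);
      memset(ref, 129, 16 * 16);
      var = vp9_variance16x16_sse2(src, 16, ref, 16, &sse);
      /* sse receives the sum of squared differences (256 here); the
       * return value subtracts the squared-mean term, so a constant
       * offset gives variance 0. */
      vpx_free(src);
      vpx_free(ref);
      return var;
    }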
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2;
+const vp9_variance_fn_t highbd_10_variance8x8_sse2 =
+    vp9_highbd_10_variance8x8_sse2;
+const vp9_variance_fn_t highbd_12_variance8x8_sse2 =
+    vp9_highbd_12_variance8x8_sse2;
+const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2;
+const vp9_variance_fn_t highbd_10_variance8x16_sse2 =
+    vp9_highbd_10_variance8x16_sse2;
+const vp9_variance_fn_t highbd_12_variance8x16_sse2 =
+    vp9_highbd_12_variance8x16_sse2;
+const vp9_variance_fn_t highbd_variance16x8_sse2 =
+    vp9_highbd_variance16x8_sse2;
+const vp9_variance_fn_t highbd_10_variance16x8_sse2 =
+    vp9_highbd_10_variance16x8_sse2;
+const vp9_variance_fn_t highbd_12_variance16x8_sse2 =
+    vp9_highbd_12_variance16x8_sse2;
+const vp9_variance_fn_t highbd_variance16x16_sse2 =
+    vp9_highbd_variance16x16_sse2;
+const vp9_variance_fn_t highbd_10_variance16x16_sse2 =
+    vp9_highbd_10_variance16x16_sse2;
+const vp9_variance_fn_t highbd_12_variance16x16_sse2 =
+    vp9_highbd_12_variance16x16_sse2;
+const vp9_variance_fn_t highbd_variance16x32_sse2 =
+    vp9_highbd_variance16x32_sse2;
+const vp9_variance_fn_t highbd_10_variance16x32_sse2 =
+    vp9_highbd_10_variance16x32_sse2;
+const vp9_variance_fn_t highbd_12_variance16x32_sse2 =
+    vp9_highbd_12_variance16x32_sse2;
+const vp9_variance_fn_t highbd_variance32x16_sse2 =
+    vp9_highbd_variance32x16_sse2;
+const vp9_variance_fn_t highbd_10_variance32x16_sse2 =
+    vp9_highbd_10_variance32x16_sse2;
+const vp9_variance_fn_t highbd_12_variance32x16_sse2 =
+    vp9_highbd_12_variance32x16_sse2;
+const vp9_variance_fn_t highbd_variance32x32_sse2 =
+    vp9_highbd_variance32x32_sse2;
+const vp9_variance_fn_t highbd_10_variance32x32_sse2 =
+    vp9_highbd_10_variance32x32_sse2;
+const vp9_variance_fn_t highbd_12_variance32x32_sse2 =
+    vp9_highbd_12_variance32x32_sse2;
+const vp9_variance_fn_t highbd_variance32x64_sse2 =
+    vp9_highbd_variance32x64_sse2;
+const vp9_variance_fn_t highbd_10_variance32x64_sse2 =
+    vp9_highbd_10_variance32x64_sse2;
+const vp9_variance_fn_t highbd_12_variance32x64_sse2 =
+    vp9_highbd_12_variance32x64_sse2;
+const vp9_variance_fn_t highbd_variance64x32_sse2 =
+    vp9_highbd_variance64x32_sse2;
+const vp9_variance_fn_t highbd_10_variance64x32_sse2 =
+    vp9_highbd_10_variance64x32_sse2;
+const vp9_variance_fn_t highbd_12_variance64x32_sse2 =
+    vp9_highbd_12_variance64x32_sse2;
+const vp9_variance_fn_t highbd_variance64x64_sse2 =
+    vp9_highbd_variance64x64_sse2;
+const vp9_variance_fn_t highbd_10_variance64x64_sse2 =
+    vp9_highbd_10_variance64x64_sse2;
+const vp9_variance_fn_t highbd_12_variance64x64_sse2 =
+    vp9_highbd_12_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9VarianceHighTest,
+    ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+                      make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+                      make_tuple(3, 3, highbd_variance8x8_sse2, 8),
+                      make_tuple(3, 4, highbd_variance8x16_sse2, 8),
+                      make_tuple(4, 3, highbd_variance16x8_sse2, 8),
+                      make_tuple(4, 4, highbd_variance16x16_sse2, 8),
+                      make_tuple(4, 5, highbd_variance16x32_sse2, 8),
+                      make_tuple(5, 4, highbd_variance32x16_sse2, 8),
+                      make_tuple(5, 5, highbd_variance32x32_sse2, 8),
+                      make_tuple(5, 6, highbd_variance32x64_sse2, 8),
+                      make_tuple(6, 5, highbd_variance64x32_sse2, 8),
+                      make_tuple(6, 6, highbd_variance64x64_sse2, 8)));
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 =
+    vp9_highbd_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 =
+    vp9_highbd_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_sse2 =
+    vp9_highbd_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_sse2 =
+    vp9_highbd_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_sse2 =
+    vp9_highbd_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_sse2 =
+    vp9_highbd_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_sse2 =
+    vp9_highbd_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_sse2 =
+    vp9_highbd_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_sse2 =
+    vp9_highbd_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_sse2 =
+    vp9_highbd_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_sse2 =
+    vp9_highbd_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_sse2 =
+    vp9_highbd_10_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_sse2 =
+    vp9_highbd_10_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_sse2 =
+    vp9_highbd_10_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_sse2 =
+    vp9_highbd_12_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_sse2 =
+    vp9_highbd_12_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_sse2 =
+    vp9_highbd_12_sub_pixel_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9SubpelVarianceHighTest,
+    ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
+                      make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
+                      make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8),
+                      make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
+                      make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
+                      make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
+                      make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
+                      make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
+                      make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
+                      make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
+                      make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
+                      make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
+                      make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
+    vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9SubpelAvgVarianceHighTest,
+    ::testing::Values(
+                  make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+                  make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+                  make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+                  make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+                  make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+                  make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+                  make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+                  make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+                  make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+                  make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+                  make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+                  make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+                  make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+                  make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+                  make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+                  make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+                  make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+                  make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+                  make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+                  make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+                  make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+                  make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+                  make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8),
+                  make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
+                  make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
+                  make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
+                  make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
+                  make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
+                  make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
+                  make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
+                  make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
+                  make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
+                  make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
 
@@ -819,20 +1820,19 @@
     vp9_sub_pixel_variance64x64_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_variance64x64_ssse3)));
-
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
+                      make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
     vp9_sub_pixel_avg_variance4x4_ssse3;
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
@@ -861,21 +1861,21 @@
     vp9_sub_pixel_avg_variance64x64_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
-#endif
-#endif
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
 
@@ -886,11 +1886,11 @@
 const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9VarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
-                      make_tuple(5, 4, variance32x16_avx2),
-                      make_tuple(5, 5, variance32x32_avx2),
-                      make_tuple(6, 5, variance64x32_avx2),
-                      make_tuple(6, 6, variance64x64_avx2)));
+    ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0),
+                      make_tuple(5, 4, variance32x16_avx2, 0),
+                      make_tuple(5, 5, variance32x32_avx2, 0),
+                      make_tuple(6, 5, variance64x32_avx2, 0),
+                      make_tuple(6, 6, variance64x64_avx2, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
     vp9_sub_pixel_variance32x32_avx2;
@@ -898,8 +1898,8 @@
     vp9_sub_pixel_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_variance64x64_avx2)));
+    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
+                      make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
 
 const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
     vp9_sub_pixel_avg_variance32x32_avx2;
@@ -907,8 +1907,8 @@
     vp9_sub_pixel_avg_variance64x64_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
+                      make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
 #endif  // HAVE_AVX2
 #if HAVE_NEON
 const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
@@ -916,9 +1916,9 @@
 const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(4, 4, variance16x16_neon),
-                      make_tuple(5, 5, variance32x32_neon)));
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0),
+                      make_tuple(5, 5, variance32x32_neon, 0)));
 
 const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
     vp9_sub_pixel_variance8x8_neon;
@@ -928,12 +1928,11 @@
     vp9_sub_pixel_variance32x32_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon),
-                      make_tuple(4, 4, subpel_variance16x16_neon),
-                      make_tuple(5, 5, subpel_variance32x32_neon)));
+    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
+                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
+                      make_tuple(5, 5, subpel_variance32x32_neon, 0)));
 #endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
 }  // namespace vp9
-
 }  // namespace
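
Once built, the new cases can be exercised in isolation through the usual gtest filter, e.g. ./test_libvpx --gtest_filter='*VarianceHighTest*' (assuming the standard test_libvpx target); the C instantiations run everywhere, while the SSE2 ones additionally require CONFIG_USE_X86INC.
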
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -112,6 +112,9 @@
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
+#if CONFIG_FILTERINTRA
+  // Per-block filter intra flags for the luma and chroma planes.
+  int filterbit, uv_filterbit;
+#endif
   TX_SIZE tx_size;
   int8_t skip;
   int8_t segment_id;
@@ -126,11 +129,18 @@
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   uint8_t mode_context[MAX_REF_FRAMES];
   INTERP_FILTER interp_filter;
+
+#if CONFIG_EXT_TX
+  EXT_TX_TYPE ext_txfrm;  // Extended transform choice, used for inter blocks.
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
   struct MODE_INFO *src_mi;
   MB_MODE_INFO mbmi;
+#if CONFIG_FILTERINTRA
+  // Filter intra info for the four 4x4 sub-blocks, parallel to bmi[].
+  int b_filter_info[4];
+#endif
   b_mode_info bmi[4];
 } MODE_INFO;
 
@@ -139,6 +149,17 @@
                                       : mi->mbmi.mode;
 }
 
+#if CONFIG_FILTERINTRA
+// Filtering is currently allowed for every intra prediction mode; the
+// parameter is kept so call sites read uniformly.
+static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
+  (void)mode;
+  return 1;
+}
+
+// True for every in-range transform size (txsize < TX_SIZES always
+// holds for valid values), presumably a hook for later restriction.
+static INLINE int is_filter_enabled(TX_SIZE txsize) {
+  return (txsize < TX_SIZES);
+}
+#endif
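
A hedged reading (not code from this patch): these predicates presumably gate the filterbit syntax declared in MB_MODE_INFO above, along the lines of:

    /* Hypothetical decode-side use; 'r' and 'prob' are stand-ins for a
     * real vp9_reader and probability. */
    if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size))
      mbmi->filterbit = vp9_read(r, prob);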
+
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[0] > INTRA_FRAME;
 }
@@ -236,12 +257,33 @@
 
 extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
 
+#if CONFIG_EXT_TX
+// Maps the extended-transform syntax element to a transform type:
+// NORM keeps the default DCT in both directions, ALT selects ADST.
+static INLINE TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
+  switch (ext_tx) {
+    case NORM:
+    default:
+      return DCT_DCT;
+    case ALT:
+      return ADST_ADST;
+  }
+}
+#endif
+
 static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
                                   const MACROBLOCKD *xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
 
-  if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+#if CONFIG_EXT_TX
+  if (plane_type != PLANE_TYPE_Y || xd->lossless)
+    return DCT_DCT;
+
+  if (is_inter_block(mbmi)) {
+    return ext_tx_to_txtype(mbmi->ext_txfrm);
+  }
+#else
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
     return DCT_DCT;
+#endif
   return intra_mode_to_tx_type_lookup[mbmi->mode];
 }
 
@@ -249,8 +291,17 @@
                                       const MACROBLOCKD *xd, int ib) {
   const MODE_INFO *const mi = xd->mi[0].src_mi;
 
+#if CONFIG_EXT_TX
+  if (plane_type != PLANE_TYPE_Y || xd->lossless)
+    return DCT_DCT;
+
+  if (is_inter_block(&mi->mbmi)) {
+    return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
+  }
+#else
   if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
     return DCT_DCT;
+#endif
 
   return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
 }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1283,34 +1283,34 @@
 
   # variance
   add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x16/;
+  specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
 
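For readers unfamiliar with the rtcd machinery: specialize qw/name/, "$sse2_x86inc" registers an SSE2 implementation for the prototype above it, and $sse2_x86inc expands to "sse2" only when the x86inc assembly is usable in the current build (hence the CONFIG_USE_X86INC guards in the test file). The generated vp9_rtcd.h then exposes, roughly (a sketch from memory, not part of this patch):

    unsigned int vp9_highbd_variance32x16_c(const uint8_t *src_ptr,
                                            int source_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            unsigned int *sse);
    unsigned int vp9_highbd_variance32x16_sse2(const uint8_t *src_ptr,
                                               int source_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride,
                                               unsigned int *sse);
    RTCD_EXTERN unsigned int (*vp9_highbd_variance32x16)(
        const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,
        int ref_stride, unsigned int *sse);
    /* setup_rtcd_internal() points the function pointer at the _sse2
     * version when the CPU reports SSE2 support. */
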
   add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x32/;
+  specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x32/;
+  specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x64/;
+  specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x32/;
+  specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x64/;
+  specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x16/;
+  specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x8/;
+  specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x16/;
+  specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x8/;
+  specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_variance8x4/;
@@ -1322,40 +1322,40 @@
   specialize qw/vp9_highbd_variance4x4/;
 
   add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get8x8var/;
+  specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get16x16var/;
+  specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x16/;
+  specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x32/;
+  specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x32/;
+  specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x64/;
+  specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x32/;
+  specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x64/;
+  specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x16/;
+  specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x8/;
+  specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x16/;
+  specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x8/;
+  specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_variance8x4/;
@@ -1367,40 +1367,40 @@
   specialize qw/vp9_highbd_10_variance4x4/;
 
   add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get8x8var/;
+  specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get16x16var/;
+  specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x16/;
+  specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x32/;
+  specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x32/;
+  specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x64/;
+  specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x32/;
+  specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x64/;
+  specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x16/;
+  specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x8/;
+  specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x16/;
+  specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x8/;
+  specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_variance8x4/;
@@ -1412,76 +1412,76 @@
   specialize qw/vp9_highbd_12_variance4x4/;
 
   add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get8x8var/;
+  specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
 
   add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get16x16var/;
+  specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1496,70 +1496,70 @@
   specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1574,70 +1574,70 @@
   specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1817,7 +1817,7 @@
   specialize qw/vp9_highbd_sad4x4x4d sse2/;
 
   add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse16x16/;
+  specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_mse8x16/;
@@ -1826,10 +1826,10 @@
   specialize qw/vp9_highbd_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse8x8/;
+  specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse16x16/;
+  specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_10_mse8x16/;
@@ -1838,10 +1838,10 @@
   specialize qw/vp9_highbd_10_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse8x8/;
+  specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse16x16/;
+  specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_12_mse8x16/;
@@ -1850,7 +1850,7 @@
   specialize qw/vp9_highbd_12_mse16x8/;
 
   add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse8x8/;
+  specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
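Each `specialize ..., "$sse2_x86inc"` change above asks the RTCD generator to
emit a runtime-dispatched entry point that starts at the C reference and is
upgraded to the new SSE2 x86inc kernel when the host CPU supports it. A
minimal C sketch of that dispatch pattern, where setup_rtcd and has_sse2 are
illustrative names rather than the generated API:

    #include <stdint.h>

    typedef unsigned int (*highbd_var_fn)(const uint8_t *src_ptr,
                                          int source_stride, int xoffset,
                                          int yoffset, const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

    /* Both implementations share the prototype declared by add_proto. */
    unsigned int vp9_highbd_10_sub_pixel_variance16x16_c(
        const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *);
    unsigned int vp9_highbd_10_sub_pixel_variance16x16_sse2(
        const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *);

    highbd_var_fn vp9_highbd_10_sub_pixel_variance16x16;

    static void setup_rtcd(int has_sse2) {
      vp9_highbd_10_sub_pixel_variance16x16 =
          vp9_highbd_10_sub_pixel_variance16x16_c;
      if (has_sse2)  /* prefer the accelerated kernel when present */
        vp9_highbd_10_sub_pixel_variance16x16 =
            vp9_highbd_10_sub_pixel_variance16x16_sse2;
    }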
 
   # ENCODEMB INVOKE
 
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
@@ -1,0 +1,1043 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 15
+                     times  8 dw  1
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 13
+                     times  8 dw  3
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 11
+                     times  8 dw  5
+                     times  8 dw 10
+                     times  8 dw  6
+                     times  8 dw  9
+                     times  8 dw  7
+                     times 16 dw  8
+                     times  8 dw  7
+                     times  8 dw  9
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  5
+                     times  8 dw 11
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  3
+                     times  8 dw 13
+                     times  8 dw  2
+                     times  8 dw 14
+                     times  8 dw  1
+                     times  8 dw 15
+
+SECTION .text
+
+; int vp9_highbd_sub_pixel_varianceNxh(const uint16_t *src,
+;                                      ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst,
+;                                      ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the SSE in the
+; given pointer.
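+; (Its C wrapper then forms the variance as SSE - SE*SE/(W*H); for a 16x16
+;  block that is sse - ((int64_t)se * se >> 8).)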
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  mova                 %4, %3       ; make copies to manipulate for the sum calc
+  mova                 %2, %1       ; keep the originals for the sse calc
+  pmaddwd              %3, %3
+  paddw                %4, %2
+  pmaddwd              %1, %1
+  movhlps              %2, %4
+  paddd                %6, %3
+  paddw                %4, %2
+  pxor                 %2, %2
+  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
+  punpcklwd            %4, %2       ; sign-extend word to dword
+  paddd                %6, %1
+  paddd                %5, %4
+
+%endmacro
+
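In scalar terms, one SUM_SSE invocation folds sixteen pixel differences (two
registers of eight words each) into the running sum and sum of squares. A C
model of the semantics, assuming the name sum_sse_model is illustrative only:

    #include <stdint.h>

    /* Scalar model of one SUM_SSE invocation over 2 x 8 word lanes. */
    static void sum_sse_model(const int16_t *src1, const int16_t *dst1,
                              const int16_t *src2, const int16_t *dst2,
                              int32_t *sum, uint32_t *sse) {
      int i;
      for (i = 0; i < 8; i++) {
        const int d0 = src1[i] - dst1[i];  /* psubw %1, %2 */
        const int d1 = src2[i] - dst2[i];  /* psubw %3, %4 */
        *sum += d0 + d1;            /* paddw, later sign-extended into %5 */
        *sse += d0 * d0 + d1 * d1;  /* pmaddwd pairs, accumulated into %6 */
      }
    }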
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+  movhlps              m3, m7
+  movhlps              m4, m6
+  paddd                m7, m3
+  paddd                m6, m4
+  pshufd               m3, m7, 0x1
+  pshufd               m4, m6, 0x1
+  paddd                m7, m3
+  paddd                m6, m4
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  movd               [r1], m7           ; store sse
+  movd                rax, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*2]
+%else
+  lea                srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*4]
+%else
+  lea                srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W, optional avg flag (default 0)
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define h heightd
+  %define bilin_filter sseq
+%else
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define h dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                x_offset, y_offset, dst, dst_stride, height, \
+                                sse, g_bilin_filter, g_pw_8
+      %define h heightd
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define h heightd
+      %define sec_str sec_strideq
+      %else
+      %define h dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                              x_offset, y_offset, dst, dst_stride, height, sse
+      %define h heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+
+%if %1 < 16
+  sar                   h, 1        ; for w < 16 each loop iteration covers two rows
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + 16]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+16]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + src_strideq*2]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq + sec_str*2]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pavgw                m0, m1
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
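+  ; e.g. with num=16, rnd=8, in1=100, in2=200, x=5:
+  ;   ((16-5)*100 + 5*200 + 8) >> 4 = 2108 >> 4 = 131
+  ;   100 + ((5*(200-100) + 8) >> 4) = 100 + (508 >> 4) = 131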
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m4, m1
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + 16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m1, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m2, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m1, filter_rnd
+  paddw                m1, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m1, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m4, filter_rnd
+  paddw                m4, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m4, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m4, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2+2]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m1, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m2, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filters - this is the same as in the 8-bit case
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [bilin_filter+y_offsetq]
+  mova                m11, [bilin_filter+y_offsetq+16]
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register: we reuse the src_stride register,
+; so src_stride has to be reloaded from the stack whenever it is needed later.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  movu                 m1, [srcq+16]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+16]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m1, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m4, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m4, m3
+  psrlw                m0, 4
+  psrlw                m4, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m4, [secq+sec_str*2]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  INC_SRC_BY_SRC_2STRIDE
+  lea                dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+  lea                secq, [secq + sec_str*4]
+%endif
+%endif
+  dec                   h
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
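Taken together, SUBPEL_VARIANCE implements the usual two-pass bilinear
interpolation ahead of the difference: a horizontal filter selected by
x_offset, a vertical filter selected by y_offset (each a two-tap pair summing
to 16, rounded by 8), and in the avg variant a final pavgw against the second
predictor. A scalar sketch of the plain variant follows; the function name and
the in-place variance computation are illustrative (the asm itself returns SE
and stores SSE, leaving the final subtraction to its C wrapper):

    #include <stdint.h>

    /* Scalar model of highbd_sub_pixel_variance for a w x h block; like the
       asm, it reads one extra row/column of src for the filters. */
    static uint32_t highbd_subpel_var_model(const uint16_t *src, int src_stride,
                                            int xoff, int yoff,  /* 0..15 */
                                            const uint16_t *dst, int dst_stride,
                                            int w, int h, uint32_t *sse) {
      int64_t se = 0;
      uint64_t sum_sq = 0;
      int x, y;
      for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
          /* first pass: horizontal bilinear filter on rows y and y + 1 */
          const int a = ((16 - xoff) * src[y * src_stride + x] +
                         xoff * src[y * src_stride + x + 1] + 8) >> 4;
          const int b = ((16 - xoff) * src[(y + 1) * src_stride + x] +
                         xoff * src[(y + 1) * src_stride + x + 1] + 8) >> 4;
          /* second pass: vertical bilinear filter between the two rows */
          const int p = ((16 - yoff) * a + yoff * b + 8) >> 4;
          const int diff = p - dst[y * dst_stride + x];
          se += diff;
          sum_sq += (int64_t)diff * diff;
        }
      }
      *sse = (uint32_t)sum_sq;
      return (uint32_t)(sum_sq - (uint64_t)((se * se) / (w * h)));
    }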
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -1,0 +1,313 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2
+;(
+;    uint16_t        *  src_ptr,
+;    int             source_stride,
+;    uint16_t        *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+16]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax+16]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+16]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            2
+        jnz         .var16loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2
+;(
+;    uint16_t        *  src_ptr,
+;    int             source_stride,
+;    uint16_t        *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8
+
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rbx+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        lea             rbx,    [rbx+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            4
+        jnz         .var8loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -1,0 +1,580 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
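+// Accumulates the kernel's per-block sum and sse over a w x h region,
+// block_size x block_size blocks at a time.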
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+                                 const uint16_t *ref, int ref_stride,
+                                 int w, int h, uint32_t *sse, int *sum,
+                                 high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
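+// 10-bit input: pixel diffs are 4x their 8-bit equivalents, so
+// accumulate in 64 bits, then scale the sum by 2^2 and the sse by 2^4
+// to return values on the 8-bit scale.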
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
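+// 12-bit input: pixel diffs are 16x their 8-bit equivalents; scale the
+// sum by 2^4 and the sse by 2^8.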
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
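+// Emits the get##S##x##S##var entry points for 8-, 10- and 12-bit
+// input; the deeper variants rescale the raw kernel output back to the
+// 8-bit domain.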
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
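+// variance = sse - sum^2 / (w * h); shift is log2(w * h).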
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+                       block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_10_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_12_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+                                               ptrdiff_t src_stride, \
+                                               int x_offset, int y_offset, \
+                                               const uint16_t *dst, \
+                                               ptrdiff_t dst_stride, \
+                                               int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
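+// Builds full-block subpel variance from the wf-wide column kernels:
+// blocks wider than wf are split into 16-column strips whose se/sse
+// are summed.  The 12-bit variant also walks the block 16 rows at a
+// time, accumulating into a 64-bit long_sse, so that each kernel
+// call's 32-bit sse cannot overflow at 12 bits.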
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+                                                        int src_stride, \
+                                                        int x_offset, \
+                                                        int y_offset, \
+                                                        const uint8_t *dst8, \
+                                                        int dst_stride, \
+                                                        uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, h, \
+                                                       &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, \
+                                                       h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 48, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      }\
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
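+// The avg variants average the subpel prediction with a second
+// predictor, sec, before taking the difference against dst.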
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+                                                   ptrdiff_t src_stride, \
+                                                   int x_offset, int y_offset, \
+                                                   const uint16_t *dst, \
+                                                   ptrdiff_t dst_stride, \
+                                                   const uint16_t *sec, \
+                                                   ptrdiff_t sec_stride, \
+                                                   int height, \
+                                                   unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, x_offset, y_offset, \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, x_offset, y_offset, \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, x_offset, y_offset, \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src, src_stride, x_offset, \
+                                            y_offset, dst, dst_stride, \
+                                            sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 16, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 16, dst_stride, \
+                                            sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 32, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 32, dst_stride, \
+                                            sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 48, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 48, dst_stride, \
+                                            sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + (start_row * src_stride), src_stride, x_offset, \
+                y_offset, dst + (start_row * dst_stride), dst_stride, \
+                sec + (start_row * w), w, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 16 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 16 + (start_row * dst_stride), dst_stride, \
+                sec + 16 + (start_row * w), w, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 32 + (start_row * dst_stride), dst_stride, \
+                sec + 32 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 48 + (start_row * dst_stride), dst_stride, \
+                sec + 48 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -104,6 +104,7 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
@@ -115,6 +116,8 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
 endif
 endif