ref: 48032bfcdb412a8e7f9d89154c4ac8fbb3f8fe72
parent: 807885b5e01f0f00edba27b611b9a0cfd49b5796
author: Peter de Rivaz <peter.derivaz@gmail.com>
date: Thu Oct 16 10:00:54 EDT 2014
Added sse2 acceleration for highbitdepth variance Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f (cherry picked from commit d7422b2b1eb9f0011a8c379c2be680d6892b16bc) (cherry picked from commit 6d741e4d76a7d9ece69ca117d1d9e2f9ee48ef8c)
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -7,16 +7,18 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdlib.h>
+
+#include <cstdlib>
#include <new>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
+#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx/vpx_integer.h"
#include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#if CONFIG_VP8_ENCODER
# include "./vp8_rtcd.h"
@@ -26,7 +28,6 @@
# include "./vp9_rtcd.h"
# include "vp9/encoder/vp9_variance.h"
#endif
-#include "test/acm_random.h"
namespace {
@@ -43,18 +44,50 @@
return res;
}
-static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
- int l2w, int l2h, unsigned int *sse_ptr) {
+static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref,
+ int l2w, int l2h, int src_stride_coeff,
+ int ref_stride_coeff, uint32_t *sse_ptr,
+ bool use_high_bit_depth_,
+ vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ int diff;
+ if (!use_high_bit_depth_) {
+ diff = ref[w * y * ref_stride_coeff + x] -
+ src[w * y * src_stride_coeff + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
+ CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ if (bit_depth > VPX_BITS_8) {
+ sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+ se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+ }
+#else
int se = 0;
unsigned int sse = 0;
- const int w = 1 << l2w, h = 1 << l2h;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
- int diff = ref[w * y + x] - src[w * y + x];
+ int diff = ref[w * y * ref_stride_coeff + x] -
+ src[w * y * src_stride_coeff + x];
se += diff;
sse += diff * diff;
}
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -61,13 +94,56 @@
static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, int xoff, int yoff,
- unsigned int *sse_ptr) {
+ unsigned int *sse_ptr,
+ bool use_high_bit_depth_,
+ vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // Bilinear interpolation at a 16th pel step.
+ if (!use_high_bit_depth_) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ if (bit_depth > VPX_BITS_8) {
+ sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
+ se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
+ }
+#else
int se = 0;
unsigned int sse = 0;
- const int w = 1 << l2w, h = 1 << l2h;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
- // bilinear interpolation at a 16th pel step
+ // Bilinear interpolation at a 16th pel step.
const int a1 = ref[(w + 1) * (y + 0) + x + 0];
const int a2 = ref[(w + 1) * (y + 0) + x + 1];
const int b1 = ref[(w + 1) * (y + 1) + x + 0];
@@ -75,11 +151,12 @@
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
- int diff = r - src[w * y + x];
+ const int diff = r - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -130,27 +207,57 @@
template<typename VarianceFunctionType>
class VarianceTest
- : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+ : public ::testing::TestWithParam<tuple<int, int,
+ VarianceFunctionType, int> > {
public:
virtual void SetUp() {
- const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
+ const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
log2width_ = get<0>(params);
width_ = 1 << log2width_;
log2height_ = get<1>(params);
height_ = 1 << log2height_;
variance_ = get<2>(params);
+ if (get<3>(params)) {
+ bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+ use_high_bit_depth_ = true;
+ } else {
+ bit_depth_ = VPX_BITS_8;
+ use_high_bit_depth_ = false;
+ }
+ mask_ = (1 << bit_depth_) - 1;
rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- ref_ = new uint8_t[block_size_];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+ ref_ = new uint8_t[block_size_ * 2];
+ } else {
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
+ ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+ }
+#else
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+ ref_ = new uint8_t[block_size_ * 2];
+#endif
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
}
virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ vpx_free(src_);
+ delete[] ref_;
+ } else {
+ vpx_free(CONVERT_TO_SHORTPTR(src_));
+ delete[] CONVERT_TO_SHORTPTR(ref_);
+ }
+#else
vpx_free(src_);
delete[] ref_;
+#endif
libvpx_test::ClearSystemState();
}
@@ -157,13 +264,17 @@
protected:
void ZeroTest();
void RefTest();
+ void RefStrideTest();
void OneQuarterTest();
ACMRandom rnd_;
- uint8_t* src_;
- uint8_t* ref_;
+ uint8_t *src_;
+ uint8_t *ref_;
int width_, log2width_;
int height_, log2height_;
+ vpx_bit_depth_t bit_depth_;
+ int mask_;
+ bool use_high_bit_depth_;
int block_size_;
VarianceFunctionType variance_;
};
@@ -171,14 +282,32 @@
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::ZeroTest() {
for (int i = 0; i <= 255; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ memset(src_, i, block_size_);
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
+ block_size_);
+ }
+#else
memset(src_, i, block_size_);
+#endif
for (int j = 0; j <= 255; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ memset(ref_, j, block_size_);
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8),
+ block_size_);
+ }
+#else
memset(ref_, j, block_size_);
+#endif
unsigned int sse;
unsigned int var;
ASM_REGISTER_STATE_CHECK(
var = variance_(src_, width_, ref_, width_, &sse));
- EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j;
+ EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
}
}
}
@@ -187,15 +316,28 @@
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
src_[j] = rnd_.Rand8();
ref_[j] = rnd_.Rand8();
+ } else {  // Bitwise AND with mask_ keeps samples within bit_depth_ bits.
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
}
+#else
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#endif
+ }
unsigned int sse1, sse2;
unsigned int var1;
+ const int stride_coeff = 1;
ASM_REGISTER_STATE_CHECK(
var1 = variance_(src_, width_, ref_, width_, &sse1));
const unsigned int var2 = variance_ref(src_, ref_, log2width_,
- log2height_, &sse2);
+ log2height_, stride_coeff,
+ stride_coeff, &sse2,
+ use_high_bit_depth_, bit_depth_);
EXPECT_EQ(sse1, sse2);
EXPECT_EQ(var1, var2);
}
@@ -202,11 +344,60 @@
}
template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+ for (int i = 0; i < 10; ++i) {
+ int ref_stride_coeff = i % 2;  // 0 or 1: ref row stride is 0 or width_.
+ int src_stride_coeff = (i >> 1) % 2;  // 0 or 1: src row stride likewise.
+ for (int j = 0; j < block_size_; j++) {
+ int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
+ int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ src_[src_ind] = rnd_.Rand8();
+ ref_[ref_ind] = rnd_.Rand8();
+ } else {  // Bitwise AND with mask_ keeps samples within bit_depth_ bits.
+ CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+ CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+ }
+#else
+ src_[src_ind] = rnd_.Rand8();
+ ref_[ref_ind] = rnd_.Rand8();
+#endif
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ // Run variance_ and variance_ref with the same strides; results must match.
+ ASM_REGISTER_STATE_CHECK(
+ var1 = variance_(src_, width_ * src_stride_coeff,
+ ref_, width_ * ref_stride_coeff, &sse1));
+ const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+ log2height_, src_stride_coeff,
+ ref_stride_coeff, &sse2,
+ use_high_bit_depth_, bit_depth_);
+ EXPECT_EQ(sse1, sse2);
+ EXPECT_EQ(var1, var2);
+ }
+}
+
+template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
- memset(src_, 255, block_size_);
const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ memset(src_, 255, block_size_);
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half);
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
+ block_size_);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+ }
+#else
+ memset(src_, 255, block_size_);
memset(ref_, 255, half);
memset(ref_ + half, 0, half);
+#endif
unsigned int sse;
unsigned int var;
ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
@@ -264,8 +455,10 @@
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
+ const int stride_coeff = 1;
ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
- variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+ variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+ stride_coeff, &sse2, false, VPX_BITS_8);
EXPECT_EQ(sse1, sse2);
}
}
@@ -279,9 +472,10 @@
}
unsigned int sse2;
unsigned int var1;
- ASM_REGISTER_STATE_CHECK(
- var1 = mse_(src_, width_, ref_, width_));
- variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+ const int stride_coeff = 1;
+ ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
+ variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+ stride_coeff, &sse2, false, VPX_BITS_8);
EXPECT_EQ(var1, sse2);
}
}
@@ -308,16 +502,59 @@
#endif
#if CONFIG_VP9_ENCODER
-
unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const uint8_t *src,
const uint8_t *second_pred,
int l2w, int l2h,
int xoff, int yoff,
- unsigned int *sse_ptr) {
+ unsigned int *sse_ptr,
+ bool use_high_bit_depth,
+ vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ if (bit_depth > 8) {
+ sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8));
+ se = ROUND_POWER_OF_TWO(se, bit_depth-8);
+ }
+#else
int se = 0;
unsigned int sse = 0;
- const int w = 1 << l2w, h = 1 << l2h;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
// bilinear interpolation at a 16th pel step
@@ -328,11 +565,12 @@
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
- int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
@@ -340,10 +578,10 @@
template<typename SubpelVarianceFunctionType>
class SubpelVarianceTest
: public ::testing::TestWithParam<tuple<int, int,
- SubpelVarianceFunctionType> > {
+ SubpelVarianceFunctionType, int> > {
public:
virtual void SetUp() {
- const tuple<int, int, SubpelVarianceFunctionType>& params =
+ const tuple<int, int, SubpelVarianceFunctionType, int>& params =
this->GetParam();
log2width_ = get<0>(params);
width_ = 1 << log2width_;
@@ -350,12 +588,37 @@
log2height_ = get<1>(params);
height_ = 1 << log2height_;
subpel_variance_ = get<2>(params);
+ if (get<3>(params)) {
+ bit_depth_ = (vpx_bit_depth_t) get<3>(params);
+ use_high_bit_depth_ = true;
+ } else {
+ bit_depth_ = VPX_BITS_8;
+ use_high_bit_depth_ = false;
+ }
+ mask_ = (1 << bit_depth_)-1;
rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+ sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+ ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+ } else {
+ src_ = CONVERT_TO_BYTEPTR(
+ reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, block_size_*sizeof(uint16_t))));
+ sec_ = CONVERT_TO_BYTEPTR(
+ reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, block_size_*sizeof(uint16_t))));
+ ref_ = CONVERT_TO_BYTEPTR(
+ new uint16_t[block_size_ + width_ + height_ + 1]);
+ }
+#else
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#endif // CONFIG_VP9_HIGHBITDEPTH
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(sec_ != NULL);
ASSERT_TRUE(ref_ != NULL);
@@ -362,22 +625,37 @@
}
virtual void TearDown() {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ vpx_free(src_);
+ delete[] ref_;
+ vpx_free(sec_);
+ } else {
+ vpx_free(CONVERT_TO_SHORTPTR(src_));
+ delete[] CONVERT_TO_SHORTPTR(ref_);
+ vpx_free(CONVERT_TO_SHORTPTR(sec_));
+ }
+#else
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
+#endif
libvpx_test::ClearSystemState();
}
protected:
void RefTest();
+ void ExtremeRefTest();
ACMRandom rnd_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
+ bool use_high_bit_depth_;
+ vpx_bit_depth_t bit_depth_;
int width_, log2width_;
int height_, log2height_;
- int block_size_;
+ int block_size_, mask_;
SubpelVarianceFunctionType subpel_variance_;
};
@@ -385,6 +663,23 @@
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ for (int j = 0; j < block_size_; j++) {
+ src_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size_; j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ }
+ }
+#else
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd_.Rand8();
}
@@ -391,12 +686,15 @@
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd_.Rand8();
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1));
const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
- log2height_, x, y, &sse2);
+ log2height_, x, y, &sse2,
+ use_high_bit_depth_,
+ bit_depth_);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
@@ -403,10 +701,69 @@
}
}
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+ // Compare against reference using extreme (all-min / all-max) pixel values.
+ // 8-bit: src = 0 then 255 by halves; ref = 255 then 0 (to end of buffer).
+ // High bit depth mirrors this: src is mask_ then 0; ref is 0 then mask_.
+ for (int x = 0; x < 16; ++x) {
+ for (int y = 0; y < 16; ++y) {
+ const int half = block_size_ / 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ memset(src_, 0, half);
+ memset(src_ + half, 255, half);
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half + width_ + height_ + 1);
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
+ half + width_ + height_ + 1);
+ }
+#else
+ memset(src_, 0, half);
+ memset(src_ + half, 255, half);
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half + width_ + height_ + 1);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ ASM_REGISTER_STATE_CHECK(
+ var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
+ const unsigned int var2 =
+ subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
+ use_high_bit_depth_, bit_depth_);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
template<>
void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ for (int j = 0; j < block_size_; j++) {
+ src_[j] = rnd_.Rand8();
+ sec_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size_; j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+ CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ }
+ }
+#else
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd_.Rand8();
sec_[j] = rnd_.Rand8();
@@ -414,6 +771,7 @@
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd_.Rand8();
}
+#endif
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
@@ -421,7 +779,9 @@
src_, width_, &sse1, sec_));
const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
log2width_, log2height_,
- x, y, &sse2);
+ x, y, &sse2,
+ use_high_bit_depth_,
+ bit_depth_);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
@@ -468,11 +828,11 @@
const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
INSTANTIATE_TEST_CASE_P(
C, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_c),
- make_tuple(3, 3, variance8x8_c),
- make_tuple(3, 4, variance8x16_c),
- make_tuple(4, 3, variance16x8_c),
- make_tuple(4, 4, variance16x16_c)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+ make_tuple(3, 3, variance8x8_c, 0),
+ make_tuple(3, 4, variance8x16_c, 0),
+ make_tuple(4, 3, variance16x8_c, 0),
+ make_tuple(4, 4, variance16x16_c, 0)));
#if HAVE_NEON
const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
@@ -491,13 +851,12 @@
const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
INSTANTIATE_TEST_CASE_P(
NEON, VP8VarianceTest,
- ::testing::Values(make_tuple(3, 3, variance8x8_neon),
- make_tuple(3, 4, variance8x16_neon),
- make_tuple(4, 3, variance16x8_neon),
- make_tuple(4, 4, variance16x16_neon)));
+ ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+ make_tuple(3, 4, variance8x16_neon, 0),
+ make_tuple(4, 3, variance16x8_neon, 0),
+ make_tuple(4, 4, variance16x16_neon, 0)));
#endif
-
#if HAVE_MMX
const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
@@ -506,11 +865,11 @@
const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
- make_tuple(3, 3, variance8x8_mmx),
- make_tuple(3, 4, variance8x16_mmx),
- make_tuple(4, 3, variance16x8_mmx),
- make_tuple(4, 4, variance16x16_mmx)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0),
+ make_tuple(3, 3, variance8x8_mmx, 0),
+ make_tuple(3, 4, variance8x16_mmx, 0),
+ make_tuple(4, 3, variance16x8_mmx, 0),
+ make_tuple(4, 4, variance16x16_mmx, 0)));
#endif
#if HAVE_SSE2
@@ -521,11 +880,11 @@
const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
INSTANTIATE_TEST_CASE_P(
SSE2, VP8VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_wmt),
- make_tuple(3, 3, variance8x8_wmt),
- make_tuple(3, 4, variance8x16_wmt),
- make_tuple(4, 3, variance16x8_wmt),
- make_tuple(4, 4, variance16x16_wmt)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0),
+ make_tuple(3, 3, variance8x8_wmt, 0),
+ make_tuple(3, 4, variance8x16_wmt, 0),
+ make_tuple(4, 3, variance16x8_wmt, 0),
+ make_tuple(4, 4, variance16x16_wmt, 0)));
#endif
#endif // CONFIG_VP8_ENCODER
@@ -537,7 +896,6 @@
namespace vp9 {
#if CONFIG_VP9_ENCODER
-
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
@@ -550,10 +908,27 @@
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest;
+typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
+ VP9SubpelAvgVarianceHighTest;
+
+TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); }
+TEST_P(VP9VarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); }
+TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
+TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
@@ -569,20 +944,115 @@
const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_c),
- make_tuple(2, 3, variance4x8_c),
- make_tuple(3, 2, variance8x4_c),
- make_tuple(3, 3, variance8x8_c),
- make_tuple(3, 4, variance8x16_c),
- make_tuple(4, 3, variance16x8_c),
- make_tuple(4, 4, variance16x16_c),
- make_tuple(4, 5, variance16x32_c),
- make_tuple(5, 4, variance32x16_c),
- make_tuple(5, 5, variance32x32_c),
- make_tuple(5, 6, variance32x64_c),
- make_tuple(6, 5, variance64x32_c),
- make_tuple(6, 6, variance64x64_c)));
-
+ ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
+ make_tuple(2, 3, variance4x8_c, 0),
+ make_tuple(3, 2, variance8x4_c, 0),
+ make_tuple(3, 3, variance8x8_c, 0),
+ make_tuple(3, 4, variance8x16_c, 0),
+ make_tuple(4, 3, variance16x8_c, 0),
+ make_tuple(4, 4, variance16x16_c, 0),
+ make_tuple(4, 5, variance16x32_c, 0),
+ make_tuple(5, 4, variance32x16_c, 0),
+ make_tuple(5, 5, variance32x32_c, 0),
+ make_tuple(5, 6, variance32x64_c, 0),
+ make_tuple(6, 5, variance64x32_c, 0),
+ make_tuple(6, 6, variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c;
+const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c;
+const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c;
+const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c;
+const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c;
+const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c;
+const vp9_variance_fn_t highbd_10_variance16x16_c =
+ vp9_highbd_10_variance16x16_c;
+const vp9_variance_fn_t highbd_10_variance16x32_c =
+ vp9_highbd_10_variance16x32_c;
+const vp9_variance_fn_t highbd_10_variance32x16_c =
+ vp9_highbd_10_variance32x16_c;
+const vp9_variance_fn_t highbd_10_variance32x32_c =
+ vp9_highbd_10_variance32x32_c;
+const vp9_variance_fn_t highbd_10_variance32x64_c =
+ vp9_highbd_10_variance32x64_c;
+const vp9_variance_fn_t highbd_10_variance64x32_c =
+ vp9_highbd_10_variance64x32_c;
+const vp9_variance_fn_t highbd_10_variance64x64_c =
+ vp9_highbd_10_variance64x64_c;
+const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c;
+const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c;
+const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c;
+const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c;
+const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c;
+const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c;
+const vp9_variance_fn_t highbd_12_variance16x16_c =
+ vp9_highbd_12_variance16x16_c;
+const vp9_variance_fn_t highbd_12_variance16x32_c =
+ vp9_highbd_12_variance16x32_c;
+const vp9_variance_fn_t highbd_12_variance32x16_c =
+ vp9_highbd_12_variance32x16_c;
+const vp9_variance_fn_t highbd_12_variance32x32_c =
+ vp9_highbd_12_variance32x32_c;
+const vp9_variance_fn_t highbd_12_variance32x64_c =
+ vp9_highbd_12_variance32x64_c;
+const vp9_variance_fn_t highbd_12_variance64x32_c =
+ vp9_highbd_12_variance64x32_c;
+const vp9_variance_fn_t highbd_12_variance64x64_c =
+ vp9_highbd_12_variance64x64_c;
+const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c;
+const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c;
+const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c;
+const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c;
+const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c;
+const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c;
+const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c;
+const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c;
+const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c;
+const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c;
+const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c;
+const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c;
+const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VP9VarianceHighTest,
+ ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+ make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+ make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+ make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+ make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+ make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+ make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+ make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+ make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+ make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+ make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+ make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+ make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+ make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+ make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+ make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+ make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+ make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+ make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+ make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+ make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+ make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+ make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+ make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+ make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+ make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+ make_tuple(2, 2, highbd_variance4x4_c, 8),
+ make_tuple(2, 3, highbd_variance4x8_c, 8),
+ make_tuple(3, 2, highbd_variance8x4_c, 8),
+ make_tuple(3, 3, highbd_variance8x8_c, 8),
+ make_tuple(3, 4, highbd_variance8x16_c, 8),
+ make_tuple(4, 3, highbd_variance16x8_c, 8),
+ make_tuple(4, 4, highbd_variance16x16_c, 8),
+ make_tuple(4, 5, highbd_variance16x32_c, 8),
+ make_tuple(5, 4, highbd_variance32x16_c, 8),
+ make_tuple(5, 5, highbd_variance32x32_c, 8),
+ make_tuple(5, 6, highbd_variance32x64_c, 8),
+ make_tuple(6, 5, highbd_variance64x32_c, 8),
+ make_tuple(6, 6, highbd_variance64x64_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
const vp9_subpixvariance_fn_t subpel_variance4x4_c =
vp9_sub_pixel_variance4x4_c;
const vp9_subpixvariance_fn_t subpel_variance4x8_c =
@@ -611,20 +1081,19 @@
vp9_sub_pixel_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
- make_tuple(2, 3, subpel_variance4x8_c),
- make_tuple(3, 2, subpel_variance8x4_c),
- make_tuple(3, 3, subpel_variance8x8_c),
- make_tuple(3, 4, subpel_variance8x16_c),
- make_tuple(4, 3, subpel_variance16x8_c),
- make_tuple(4, 4, subpel_variance16x16_c),
- make_tuple(4, 5, subpel_variance16x32_c),
- make_tuple(5, 4, subpel_variance32x16_c),
- make_tuple(5, 5, subpel_variance32x32_c),
- make_tuple(5, 6, subpel_variance32x64_c),
- make_tuple(6, 5, subpel_variance64x32_c),
- make_tuple(6, 6, subpel_variance64x64_c)));
-
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
+ make_tuple(2, 3, subpel_variance4x8_c, 0),
+ make_tuple(3, 2, subpel_variance8x4_c, 0),
+ make_tuple(3, 3, subpel_variance8x8_c, 0),
+ make_tuple(3, 4, subpel_variance8x16_c, 0),
+ make_tuple(4, 3, subpel_variance16x8_c, 0),
+ make_tuple(4, 4, subpel_variance16x16_c, 0),
+ make_tuple(4, 5, subpel_variance16x32_c, 0),
+ make_tuple(5, 4, subpel_variance32x16_c, 0),
+ make_tuple(5, 5, subpel_variance32x32_c, 0),
+ make_tuple(5, 6, subpel_variance32x64_c, 0),
+ make_tuple(6, 5, subpel_variance64x32_c, 0),
+ make_tuple(6, 6, subpel_variance64x64_c, 0)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
vp9_sub_pixel_avg_variance4x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
@@ -653,23 +1122,263 @@
vp9_sub_pixel_avg_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
- make_tuple(2, 3, subpel_avg_variance4x8_c),
- make_tuple(3, 2, subpel_avg_variance8x4_c),
- make_tuple(3, 3, subpel_avg_variance8x8_c),
- make_tuple(3, 4, subpel_avg_variance8x16_c),
- make_tuple(4, 3, subpel_avg_variance16x8_c),
- make_tuple(4, 4, subpel_avg_variance16x16_c),
- make_tuple(4, 5, subpel_avg_variance16x32_c),
- make_tuple(5, 4, subpel_avg_variance32x16_c),
- make_tuple(5, 5, subpel_avg_variance32x32_c),
- make_tuple(5, 6, subpel_avg_variance32x64_c),
- make_tuple(6, 5, subpel_avg_variance64x32_c),
- make_tuple(6, 6, subpel_avg_variance64x64_c)));
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0), // tuple: log2(W), log2(H) of the WxH in the name, fn, 0
+ make_tuple(2, 3, subpel_avg_variance4x8_c, 0), // highbd suites pass 8/10/12 as 4th field; 0 presumably = not highbd (TODO confirm)
+ make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
+ make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
+ make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
+ make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x4_c = // Aliases for the vp9_highbd_10_sub_pixel_variance*_c functions; used as test parameters below.
+ vp9_highbd_10_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x8_c =
+ vp9_highbd_10_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_c =
+ vp9_highbd_10_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_c =
+ vp9_highbd_10_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_c =
+ vp9_highbd_10_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_c =
+ vp9_highbd_10_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_c =
+ vp9_highbd_10_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_c =
+ vp9_highbd_10_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_c =
+ vp9_highbd_10_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_c =
+ vp9_highbd_10_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_c =
+ vp9_highbd_10_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_c =
+ vp9_highbd_10_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_c =
+ vp9_highbd_10_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x4_c = // Same, for the vp9_highbd_12_sub_pixel_variance*_c functions.
+ vp9_highbd_12_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x8_c =
+ vp9_highbd_12_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_c =
+ vp9_highbd_12_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_c =
+ vp9_highbd_12_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_c =
+ vp9_highbd_12_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_c =
+ vp9_highbd_12_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_c =
+ vp9_highbd_12_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_c =
+ vp9_highbd_12_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_c =
+ vp9_highbd_12_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_c =
+ vp9_highbd_12_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_c =
+ vp9_highbd_12_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_c =
+ vp9_highbd_12_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_c =
+ vp9_highbd_12_sub_pixel_variance64x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x4_c = // Same, for the unprefixed vp9_highbd_sub_pixel_variance*_c functions.
+ vp9_highbd_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance4x8_c =
+ vp9_highbd_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_c =
+ vp9_highbd_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_c =
+ vp9_highbd_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_c =
+ vp9_highbd_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_c =
+ vp9_highbd_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_c =
+ vp9_highbd_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_c =
+ vp9_highbd_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_c =
+ vp9_highbd_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_c =
+ vp9_highbd_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_c =
+ vp9_highbd_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_c =
+ vp9_highbd_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_c =
+ vp9_highbd_sub_pixel_variance64x64_c;
+INSTANTIATE_TEST_CASE_P( // Instantiates VP9SubpelVarianceHighTest with the C highbd sub-pixel variance functions.
+ C, VP9SubpelVarianceHighTest,
+ ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10), // tuple: log2(W), log2(H) of the WxH in the name, fn, 4th field
+ make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10), // 4th field tracks the name: 10 for highbd_10, 12 for highbd_12, 8 for plain highbd (presumably bit depth)
+ make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
+ make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
+ make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12), // highbd_12 entries
+ make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
+ make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
+ make_tuple(2, 2, highbd_subpel_variance4x4_c, 8), // plain highbd entries
+ make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
+ make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
+ make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
+ make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
+ make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
+ make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
+ make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
+ make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
+ make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
+ make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
+ make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
+ make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c = // Aliases for the vp9_highbd_10_sub_pixel_avg_variance*_c functions; used as test parameters below.
+ vp9_highbd_10_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
+ vp9_highbd_10_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
+ vp9_highbd_10_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
+ vp9_highbd_10_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
+ vp9_highbd_10_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
+ vp9_highbd_10_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
+ vp9_highbd_10_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
+ vp9_highbd_10_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
+ vp9_highbd_10_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
+ vp9_highbd_10_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
+ vp9_highbd_10_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
+ vp9_highbd_10_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
+ vp9_highbd_10_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c = // Same, for the vp9_highbd_12_sub_pixel_avg_variance*_c functions.
+ vp9_highbd_12_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
+ vp9_highbd_12_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
+ vp9_highbd_12_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
+ vp9_highbd_12_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
+ vp9_highbd_12_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
+ vp9_highbd_12_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
+ vp9_highbd_12_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
+ vp9_highbd_12_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
+ vp9_highbd_12_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
+ vp9_highbd_12_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
+ vp9_highbd_12_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
+ vp9_highbd_12_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
+ vp9_highbd_12_sub_pixel_avg_variance64x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c = // Same, for the unprefixed vp9_highbd_sub_pixel_avg_variance*_c functions.
+ vp9_highbd_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
+ vp9_highbd_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
+ vp9_highbd_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
+ vp9_highbd_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
+ vp9_highbd_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
+ vp9_highbd_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
+ vp9_highbd_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
+ vp9_highbd_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
+ vp9_highbd_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
+ vp9_highbd_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
+ vp9_highbd_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
+ vp9_highbd_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
+ vp9_highbd_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P( // Instantiates VP9SubpelAvgVarianceHighTest with the C highbd sub-pixel avg variance functions.
+ C, VP9SubpelAvgVarianceHighTest,
+ ::testing::Values(
+ make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10), // tuple: log2(W), log2(H) of the WxH in the name, fn, 4th field
+ make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10), // 4th field tracks the name: 10 for highbd_10, 12 for highbd_12, 8 for plain highbd (presumably bit depth)
+ make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
+ make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
+ make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12), // highbd_12 entries
+ make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
+ make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
+ make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8), // plain highbd entries
+ make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
+ make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
+ make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
+ make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
+ make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
+ make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
+ make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
+ make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
+ make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
+ make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
+ make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
+ make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2
#if CONFIG_USE_X86INC
-
INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
::testing::Values(vp9_get_mb_ss_sse2));
@@ -688,20 +1397,19 @@
const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_sse2),
- make_tuple(2, 3, variance4x8_sse2),
- make_tuple(3, 2, variance8x4_sse2),
- make_tuple(3, 3, variance8x8_sse2),
- make_tuple(3, 4, variance8x16_sse2),
- make_tuple(4, 3, variance16x8_sse2),
- make_tuple(4, 4, variance16x16_sse2),
- make_tuple(4, 5, variance16x32_sse2),
- make_tuple(5, 4, variance32x16_sse2),
- make_tuple(5, 5, variance32x32_sse2),
- make_tuple(5, 6, variance32x64_sse2),
- make_tuple(6, 5, variance64x32_sse2),
- make_tuple(6, 6, variance64x64_sse2)));
-
+ ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0), // tuple: log2(W), log2(H) of the WxH in the name, fn, 0
+ make_tuple(2, 3, variance4x8_sse2, 0), // highbd suites pass 8/10/12 as 4th field; 0 presumably = not highbd (TODO confirm)
+ make_tuple(3, 2, variance8x4_sse2, 0),
+ make_tuple(3, 3, variance8x8_sse2, 0),
+ make_tuple(3, 4, variance8x16_sse2, 0),
+ make_tuple(4, 3, variance16x8_sse2, 0),
+ make_tuple(4, 4, variance16x16_sse2, 0),
+ make_tuple(4, 5, variance16x32_sse2, 0),
+ make_tuple(5, 4, variance32x16_sse2, 0),
+ make_tuple(5, 5, variance32x32_sse2, 0),
+ make_tuple(5, 6, variance32x64_sse2, 0),
+ make_tuple(6, 5, variance64x32_sse2, 0),
+ make_tuple(6, 6, variance64x64_sse2, 0)));
const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
vp9_sub_pixel_variance4x4_sse;
const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
@@ -730,20 +1438,19 @@
vp9_sub_pixel_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
- make_tuple(2, 3, subpel_variance4x8_sse),
- make_tuple(3, 2, subpel_variance8x4_sse2),
- make_tuple(3, 3, subpel_variance8x8_sse2),
- make_tuple(3, 4, subpel_variance8x16_sse2),
- make_tuple(4, 3, subpel_variance16x8_sse2),
- make_tuple(4, 4, subpel_variance16x16_sse2),
- make_tuple(4, 5, subpel_variance16x32_sse2),
- make_tuple(5, 4, subpel_variance32x16_sse2),
- make_tuple(5, 5, subpel_variance32x32_sse2),
- make_tuple(5, 6, subpel_variance32x64_sse2),
- make_tuple(6, 5, subpel_variance64x32_sse2),
- make_tuple(6, 6, subpel_variance64x64_sse2)));
-
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0), // tuple: log2(W), log2(H) of the WxH in the name, fn, 0
+ make_tuple(2, 3, subpel_variance4x8_sse, 0), // highbd suites pass 8/10/12 as 4th field; 0 presumably = not highbd (TODO confirm)
+ make_tuple(3, 2, subpel_variance8x4_sse2, 0), // note: the two 4-wide entries above use *_sse names, the rest *_sse2
+ make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+ make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+ make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+ make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+ make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+ make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+ make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+ make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+ make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+ make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
vp9_sub_pixel_avg_variance4x4_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
@@ -772,22 +1479,316 @@
vp9_sub_pixel_avg_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
- make_tuple(2, 3, subpel_avg_variance4x8_sse),
- make_tuple(3, 2, subpel_avg_variance8x4_sse2),
- make_tuple(3, 3, subpel_avg_variance8x8_sse2),
- make_tuple(3, 4, subpel_avg_variance8x16_sse2),
- make_tuple(4, 3, subpel_avg_variance16x8_sse2),
- make_tuple(4, 4, subpel_avg_variance16x16_sse2),
- make_tuple(4, 5, subpel_avg_variance16x32_sse2),
- make_tuple(5, 4, subpel_avg_variance32x16_sse2),
- make_tuple(5, 5, subpel_avg_variance32x32_sse2),
- make_tuple(5, 6, subpel_avg_variance32x64_sse2),
- make_tuple(6, 5, subpel_avg_variance64x32_sse2),
- make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
-#endif
-#endif
-
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0), // tuple: log2(W), log2(H) of the WxH in the name, fn, 0
+ make_tuple(2, 3, subpel_avg_variance4x8_sse, 0), // highbd suites pass 8/10/12 as 4th field; 0 presumably = not highbd (TODO confirm)
+ make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0), // note: the two 4-wide entries above use *_sse names, the rest *_sse2
+ make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+ make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+ make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+ make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2; // Aliases for the SSE2 highbd variance functions (plain, _10, _12 per block size); used as test parameters below.
+const vp9_variance_fn_t highbd_10_variance8x8_sse2 =
+ vp9_highbd_10_variance8x8_sse2;
+const vp9_variance_fn_t highbd_12_variance8x8_sse2 =
+ vp9_highbd_12_variance8x8_sse2;
+const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2;
+const vp9_variance_fn_t highbd_10_variance8x16_sse2 =
+ vp9_highbd_10_variance8x16_sse2;
+const vp9_variance_fn_t highbd_12_variance8x16_sse2 =
+ vp9_highbd_12_variance8x16_sse2;
+const vp9_variance_fn_t highbd_variance16x8_sse2 =
+ vp9_highbd_variance16x8_sse2;
+const vp9_variance_fn_t highbd_10_variance16x8_sse2 =
+ vp9_highbd_10_variance16x8_sse2;
+const vp9_variance_fn_t highbd_12_variance16x8_sse2 =
+ vp9_highbd_12_variance16x8_sse2;
+const vp9_variance_fn_t highbd_variance16x16_sse2 =
+ vp9_highbd_variance16x16_sse2;
+const vp9_variance_fn_t highbd_10_variance16x16_sse2 =
+ vp9_highbd_10_variance16x16_sse2;
+const vp9_variance_fn_t highbd_12_variance16x16_sse2 =
+ vp9_highbd_12_variance16x16_sse2;
+const vp9_variance_fn_t highbd_variance16x32_sse2 =
+ vp9_highbd_variance16x32_sse2;
+const vp9_variance_fn_t highbd_10_variance16x32_sse2 =
+ vp9_highbd_10_variance16x32_sse2;
+const vp9_variance_fn_t highbd_12_variance16x32_sse2 =
+ vp9_highbd_12_variance16x32_sse2;
+const vp9_variance_fn_t highbd_variance32x16_sse2 =
+ vp9_highbd_variance32x16_sse2;
+const vp9_variance_fn_t highbd_10_variance32x16_sse2 =
+ vp9_highbd_10_variance32x16_sse2;
+const vp9_variance_fn_t highbd_12_variance32x16_sse2 =
+ vp9_highbd_12_variance32x16_sse2;
+const vp9_variance_fn_t highbd_variance32x32_sse2 =
+ vp9_highbd_variance32x32_sse2;
+const vp9_variance_fn_t highbd_10_variance32x32_sse2 =
+ vp9_highbd_10_variance32x32_sse2;
+const vp9_variance_fn_t highbd_12_variance32x32_sse2 =
+ vp9_highbd_12_variance32x32_sse2;
+const vp9_variance_fn_t highbd_variance32x64_sse2 =
+ vp9_highbd_variance32x64_sse2;
+const vp9_variance_fn_t highbd_10_variance32x64_sse2 =
+ vp9_highbd_10_variance32x64_sse2;
+const vp9_variance_fn_t highbd_12_variance32x64_sse2 =
+ vp9_highbd_12_variance32x64_sse2;
+const vp9_variance_fn_t highbd_variance64x32_sse2 =
+ vp9_highbd_variance64x32_sse2;
+const vp9_variance_fn_t highbd_10_variance64x32_sse2 =
+ vp9_highbd_10_variance64x32_sse2;
+const vp9_variance_fn_t highbd_12_variance64x32_sse2 =
+ vp9_highbd_12_variance64x32_sse2;
+const vp9_variance_fn_t highbd_variance64x64_sse2 =
+ vp9_highbd_variance64x64_sse2;
+const vp9_variance_fn_t highbd_10_variance64x64_sse2 =
+ vp9_highbd_10_variance64x64_sse2;
+const vp9_variance_fn_t highbd_12_variance64x64_sse2 =
+ vp9_highbd_12_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P( // Instantiates VP9VarianceHighTest with the SSE2 highbd variance functions (8x8 through 64x64 only).
+ SSE2, VP9VarianceHighTest,
+ ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), // tuple: log2(W), log2(H) of the WxH in the name, fn, 4th field
+ make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), // 4th field tracks the name: 10 for highbd_10, 12 for highbd_12, 8 for plain highbd (presumably bit depth)
+ make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+ make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+ make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+ make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+ make_tuple(3, 3, highbd_12_variance8x8_sse2, 12), // highbd_12 entries
+ make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+ make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+ make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+ make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+ make_tuple(3, 3, highbd_variance8x8_sse2, 8), // plain highbd entries
+ make_tuple(3, 4, highbd_variance8x16_sse2, 8),
+ make_tuple(4, 3, highbd_variance16x8_sse2, 8),
+ make_tuple(4, 4, highbd_variance16x16_sse2, 8),
+ make_tuple(4, 5, highbd_variance16x32_sse2, 8),
+ make_tuple(5, 4, highbd_variance32x16_sse2, 8),
+ make_tuple(5, 5, highbd_variance32x32_sse2, 8),
+ make_tuple(5, 6, highbd_variance32x64_sse2, 8),
+ make_tuple(6, 5, highbd_variance64x32_sse2, 8),
+ make_tuple(6, 6, highbd_variance64x64_sse2, 8)));
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 = // Aliases for the SSE2 vp9_highbd_sub_pixel_variance* functions (plain family); used as test parameters below.
+ vp9_highbd_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 =
+ vp9_highbd_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_sse2 =
+ vp9_highbd_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_sse2 =
+ vp9_highbd_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_sse2 =
+ vp9_highbd_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_sse2 =
+ vp9_highbd_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_sse2 =
+ vp9_highbd_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_sse2 =
+ vp9_highbd_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_sse2 =
+ vp9_highbd_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_sse2 =
+ vp9_highbd_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_sse2 =
+ vp9_highbd_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_sse2 = // Same, for the vp9_highbd_10_sub_pixel_variance*_sse2 functions.
+ vp9_highbd_10_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_sse2 =
+ vp9_highbd_10_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_sse2 =
+ vp9_highbd_10_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_sse2 =
+ vp9_highbd_10_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_sse2 =
+ vp9_highbd_10_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_sse2 =
+ vp9_highbd_10_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_sse2 =
+ vp9_highbd_10_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_sse2 =
+ vp9_highbd_10_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_sse2 =
+ vp9_highbd_10_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_sse2 =
+ vp9_highbd_10_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_sse2 =
+ vp9_highbd_10_sub_pixel_variance64x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_sse2 = // Same, for the vp9_highbd_12_sub_pixel_variance*_sse2 functions.
+ vp9_highbd_12_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_sse2 =
+ vp9_highbd_12_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_sse2 =
+ vp9_highbd_12_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_sse2 =
+ vp9_highbd_12_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_sse2 =
+ vp9_highbd_12_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_sse2 =
+ vp9_highbd_12_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_sse2 =
+ vp9_highbd_12_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_sse2 =
+ vp9_highbd_12_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_sse2 =
+ vp9_highbd_12_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_sse2 =
+ vp9_highbd_12_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_sse2 =
+ vp9_highbd_12_sub_pixel_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P( // Instantiates VP9SubpelVarianceHighTest with the SSE2 highbd sub-pixel variance functions (8x4 through 64x64; no 4-wide entries).
+ SSE2, VP9SubpelVarianceHighTest,
+ ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10), // tuple: log2(W), log2(H) of the WxH in the name, fn, 4th field
+ make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10), // 4th field tracks the name: 10 for highbd_10, 12 for highbd_12, 8 for plain highbd (presumably bit depth)
+ make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+ make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+ make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12), // highbd_12 entries
+ make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
+ make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
+ make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8), // plain highbd entries
+ make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
+ make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
+ make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
+ make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
+ make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
+ make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
+ make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
+ make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
+ make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
+ make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 = // Aliases for the SSE2 vp9_highbd_sub_pixel_avg_variance* functions (plain family); used as test parameters below.
+ vp9_highbd_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
+ vp9_highbd_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
+ vp9_highbd_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
+ vp9_highbd_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
+ vp9_highbd_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
+ vp9_highbd_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
+ vp9_highbd_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
+ vp9_highbd_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
+ vp9_highbd_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
+ vp9_highbd_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
+ vp9_highbd_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 = // Same, for the vp9_highbd_10_sub_pixel_avg_variance*_sse2 functions.
+ vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
+ vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 = // Same, for the vp9_highbd_12_sub_pixel_avg_variance*_sse2 functions.
+ vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
+ vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P( // Instantiates VP9SubpelAvgVarianceHighTest with the SSE2 highbd sub-pixel avg variance functions (8x4 through 64x64; no 4-wide entries).
+ SSE2, VP9SubpelAvgVarianceHighTest,
+ ::testing::Values(
+ make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10), // tuple: log2(W), log2(H) of the WxH in the name, fn, 4th field
+ make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10), // 4th field tracks the name: 10 for highbd_10, 12 for highbd_12, 8 for plain highbd (presumably bit depth)
+ make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+ make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+ make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12), // highbd_12 entries
+ make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+ make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+ make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8), // plain highbd entries
+ make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
+ make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
+ make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
+ make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
+ make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
+ make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
+ make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
+ make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
+ make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
+ make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_USE_X86INC
+#endif // HAVE_SSE2
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
@@ -819,20 +1820,19 @@
vp9_sub_pixel_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
- make_tuple(2, 3, subpel_variance4x8_ssse3),
- make_tuple(3, 2, subpel_variance8x4_ssse3),
- make_tuple(3, 3, subpel_variance8x8_ssse3),
- make_tuple(3, 4, subpel_variance8x16_ssse3),
- make_tuple(4, 3, subpel_variance16x8_ssse3),
- make_tuple(4, 4, subpel_variance16x16_ssse3),
- make_tuple(4, 5, subpel_variance16x32_ssse3),
- make_tuple(5, 4, subpel_variance32x16_ssse3),
- make_tuple(5, 5, subpel_variance32x32_ssse3),
- make_tuple(5, 6, subpel_variance32x64_ssse3),
- make_tuple(6, 5, subpel_variance64x32_ssse3),
- make_tuple(6, 6, subpel_variance64x64_ssse3)));
-
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
+ make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+ make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+ make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+ make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+ make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+ make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+ make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+ make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+ make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+ make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+ make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
+ make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
vp9_sub_pixel_avg_variance4x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
@@ -861,21 +1861,21 @@
vp9_sub_pixel_avg_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
- make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
- make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
- make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
- make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
- make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
- make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
- make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
- make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
- make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
- make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
- make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
- make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
-#endif
-#endif
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
+ make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+ make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+ make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+ make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
+ make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+#endif // CONFIG_USE_X86INC
+#endif // HAVE_SSSE3
#if HAVE_AVX2
@@ -886,11 +1886,11 @@
const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
AVX2, VP9VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
- make_tuple(5, 4, variance32x16_avx2),
- make_tuple(5, 5, variance32x32_avx2),
- make_tuple(6, 5, variance64x32_avx2),
- make_tuple(6, 6, variance64x64_avx2)));
+ ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0),
+ make_tuple(5, 4, variance32x16_avx2, 0),
+ make_tuple(5, 5, variance32x32_avx2, 0),
+ make_tuple(6, 5, variance64x32_avx2, 0),
+ make_tuple(6, 6, variance64x64_avx2, 0)));
const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
vp9_sub_pixel_variance32x32_avx2;
@@ -898,8 +1898,8 @@
vp9_sub_pixel_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
AVX2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
- make_tuple(6, 6, subpel_variance64x64_avx2)));
+ ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
+ make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
vp9_sub_pixel_avg_variance32x32_avx2;
@@ -907,8 +1907,8 @@
vp9_sub_pixel_avg_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
AVX2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
- make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+ ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
+ make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
#endif // HAVE_AVX2
#if HAVE_NEON
const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
@@ -916,9 +1916,9 @@
const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
INSTANTIATE_TEST_CASE_P(
NEON, VP9VarianceTest,
- ::testing::Values(make_tuple(3, 3, variance8x8_neon),
- make_tuple(4, 4, variance16x16_neon),
- make_tuple(5, 5, variance32x32_neon)));
+ ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
+ make_tuple(4, 4, variance16x16_neon, 0),
+ make_tuple(5, 5, variance32x32_neon, 0)));
const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
vp9_sub_pixel_variance8x8_neon;
@@ -928,12 +1928,11 @@
vp9_sub_pixel_variance32x32_neon;
INSTANTIATE_TEST_CASE_P(
NEON, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon),
- make_tuple(4, 4, subpel_variance16x16_neon),
- make_tuple(5, 5, subpel_variance32x32_neon)));
+ ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
+ make_tuple(4, 4, subpel_variance16x16_neon, 0),
+ make_tuple(5, 5, subpel_variance32x32_neon, 0)));
#endif // HAVE_NEON
#endif // CONFIG_VP9_ENCODER
} // namespace vp9
-
} // namespace
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -112,6 +112,9 @@
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
+#if CONFIG_FILTERINTRA
+ int filterbit, uv_filterbit;
+#endif
TX_SIZE tx_size;
int8_t skip;
int8_t segment_id;
@@ -126,11 +129,18 @@
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
+
+#if CONFIG_EXT_TX
+ EXT_TX_TYPE ext_txfrm;
+#endif
} MB_MODE_INFO;
typedef struct MODE_INFO {
struct MODE_INFO *src_mi;
MB_MODE_INFO mbmi;
+#if CONFIG_FILTERINTRA
+ int b_filter_info[4];
+#endif
b_mode_info bmi[4];
} MODE_INFO;
@@ -139,6 +149,17 @@
: mi->mbmi.mode;
}
+#if CONFIG_FILTERINTRA
+static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
+ (void)mode;  // mode is not consulted; the cast silences the unused warning
+ return 1;  // every prediction mode is reported as allowed
+}
+
+static INLINE int is_filter_enabled(TX_SIZE txsize) {
+ return (txsize < TX_SIZES);  // nonzero for any txsize below TX_SIZES
+}
+#endif  // CONFIG_FILTERINTRA
+
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
@@ -236,12 +257,33 @@
extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
+#if CONFIG_EXT_TX
+static INLINE TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
+ switch (ext_tx) {
+ case NORM:
+ default:  // any value other than ALT is handled like NORM
+ return DCT_DCT;
+ case ALT:
+ return ADST_ADST;
+ }
+}
+#endif  // CONFIG_EXT_TX
+
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+#if CONFIG_EXT_TX
+ if (plane_type != PLANE_TYPE_Y || xd->lossless)
+ return DCT_DCT;  // non-Y planes and lossless mode always use DCT_DCT
+
+ if (is_inter_block(mbmi)) {  // inter blocks: type follows ext_txfrm
+ return ext_tx_to_txtype(mbmi->ext_txfrm);
+ }
+#else  // !CONFIG_EXT_TX
+ if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
+ return DCT_DCT;
+#endif  // CONFIG_EXT_TX
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
@@ -249,8 +291,17 @@
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mi[0].src_mi;
+#if CONFIG_EXT_TX
+ if (plane_type != PLANE_TYPE_Y || xd->lossless)
+ return DCT_DCT;
+
+ if (is_inter_block(&mi->mbmi)) {
+ return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
+ }
+#else
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
+#endif
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
}
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1283,34 +1283,34 @@
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x16/;
+ specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x32/;
+ specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x32/;
+ specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x64/;
+ specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x32/;
+ specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x64/;
+ specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x16/;
+ specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x8/;
+ specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x16/;
+ specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x8/;
+ specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1322,40 +1322,40 @@
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get8x8var/;
+ specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get16x16var/;
+ specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x16/;
+ specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x32/;
+ specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x32/;
+ specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x64/;
+ specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x32/;
+ specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x64/;
+ specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x16/;
+ specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x8/;
+ specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x16/;
+ specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x8/;
+ specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1367,40 +1367,40 @@
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get8x8var/;
+ specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get16x16var/;
+ specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x16/;
+ specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x32/;
+ specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x32/;
+ specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x64/;
+ specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x32/;
+ specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x64/;
+ specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x16/;
+ specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x8/;
+ specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x16/;
+ specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x8/;
+ specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1412,76 +1412,76 @@
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get8x8var/;
+ specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get16x16var/;
+ specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1496,70 +1496,70 @@
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1574,70 +1574,70 @@
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1817,7 +1817,7 @@
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x16/;
+ specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1826,10 +1826,10 @@
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x8/;
+ specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x16/;
+ specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1838,10 +1838,10 @@
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x8/;
+ specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x16/;
+ specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1850,7 +1850,7 @@
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x8/;
+ specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
@@ -1,0 +1,1043 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8 ; rounding term added before the ">> 4" of every bilinear filter step
+bilin_filter_m_sse2: times 8 dw 16 ; offset 0: weight of the first tap, (16 - k)
+ times 8 dw 0 ; offset 0: weight of the second tap, k
+ times 8 dw 15 ; offset 1
+ times 8 dw 1
+ times 8 dw 14 ; offset 2
+ times 8 dw 2
+ times 8 dw 13 ; offset 3
+ times 8 dw 3
+ times 8 dw 12 ; offset 4
+ times 8 dw 4
+ times 8 dw 11 ; offset 5
+ times 8 dw 5
+ times 8 dw 10 ; offset 6
+ times 8 dw 6
+ times 8 dw 9 ; offset 7
+ times 8 dw 7
+ times 16 dw 8 ; offset 8: both taps weigh 8 (one 32-byte entry covers the pair)
+ times 8 dw 7 ; offset 9
+ times 8 dw 9
+ times 8 dw 6 ; offset 10
+ times 8 dw 10
+ times 8 dw 5 ; offset 11
+ times 8 dw 11
+ times 8 dw 4 ; offset 12
+ times 8 dw 12
+ times 8 dw 3 ; offset 13
+ times 8 dw 13
+ times 8 dw 2 ; offset 14
+ times 8 dw 14
+ times 8 dw 1 ; offset 15
+ times 8 dw 15
+
+SECTION .text
+
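+; int vp9_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint16_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+; Pixels are 16-bit words; the code scales both strides by 2 to get byte offsets.
+; This function returns the sum of differences (SE) and stores the sum of squared differences (SSE) through the given pointer.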
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse (vector regs; %1-%4 are clobbered)
+ psubw %3, %4 ; %3 = src2 - dst2 as eight signed words
+ psubw %1, %2 ; %1 = src1 - dst1 as eight signed words
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3 ; square the second set of diffs, adjacent pairs added into dwords
+ paddw %4, %2 ; %4 = lane-wise word sum of both diff vectors
+ pmaddwd %1, %1 ; square the first set of diffs the same way
+ movhlps %2, %4 ; copy the upper four word sums into the low half of %2
+ paddd %6, %3 ; sse += squares of the second set
+ paddw %4, %2 ; low four words of %4 = folded word sums
+ pxor %2, %2 ; zero %2 so the compare below yields a sign mask
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1 ; sse += squares of the first set
+ paddd %5, %4 ; sum += four sign-extended dword partial sums
+
+%endmacro
+
+%macro STORE_AND_RET 0 ; reduce m6 (sum) and m7 (sse), store *sse, return sum
+%if mmsize == 16
+ ; SUM_SSE sign-extends its word sums before accumulating, so m6 (sum) and
+ ; m7 (sse) each hold four dword partial totals here. Fold the upper half
+ ; onto the lower half, then dword 1 onto dword 0, which leaves each grand
+ ; total in the low dword of its register.
+ movhlps m3, m7 ; m3 low half = upper two sse dwords
+ movhlps m4, m6 ; m4 low half = upper two sum dwords
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1 ; bring dword 1 down to dword 0
+ pshufd m4, m6, 0x1
+ paddd m7, m3 ; m7[0] = total sse
+ paddd m6, m4 ; m6[0] = total sum
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; return the 32-bit sum (a 64-bit rax destination would also copy dword 1)
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0 ; srcq += one row; the stride counts 2-byte pixels
+%if ARCH_X86=1 && CONFIG_PIC=1
+ times 2 add srcq, src_stridemp ; stride is read from the stack; a memory operand cannot be scaled inside lea
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE 0 ; srcq += two rows; the stride counts 2-byte pixels
+%if ARCH_X86=1 && CONFIG_PIC=1
+ times 4 add srcq, src_stridemp ; stride is read from the stack; a memory operand cannot be scaled inside lea
+%else
+ lea srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W, avg (optional; 1 = also average with the second predictor "sec")
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+; filter_idx_shift: every offset owns two 16-byte rows of the filter table (32 bytes).
+; The cglobal variant below is picked by PIC / ARCH_X86 / CONFIG_PIC and by the avg flag.
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ ; NOTE(review): sec_str is a stack operand here, yet the loops use [secq + sec_str*N] - confirm this assembles
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else ; x86-32: only 7 args are loaded, so height and sec_stride are used from the stack (NOTE(review): [secq + sec_str*N] with a stack operand - confirm)
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; wider blocks are not handled by the row loops below
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar h, 1 ; the narrow path consumes two rows per loop iteration
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq] ; pixels 0-7 of the source row (unaligned load)
+ movu m2, [srcq + 16] ; pixels 8-15 of the source row
+ mova m1, [dstq] ; dst is read with aligned loads
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq] ; rounded average with the second predictor
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2] ; next row: stride in pixels * 2 bytes
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq] ; row n
+ movu m2, [srcq + src_strideq*2] ; row n + 1
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq + sec_str*2]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4] ; advance two rows
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8 ; offset 8 is the half-pel position
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq] ; row n, pixels 0-7
+ movu m1, [srcq+16] ; row n, pixels 8-15
+ movu m4, [srcq+src_strideq*2] ; row n + 1, pixels 0-7
+ movu m5, [srcq+src_strideq*2+16] ; row n + 1, pixels 8-15
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4 ; half-pel vertical filter = rounded average of the two rows
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq] ; row n
+ movu m1, [srcq+src_strideq*2] ; row n + 1
+ movu m5, [srcq+src_strideq*4] ; row n + 2
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1 ; output row n = avg(row n, row n + 1)
+ pavgw m1, m5 ; output row n + 1 = avg(row n + 1, row n + 2)
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m] ; bilin_filter aliases sseq in this build; STORE_AND_RET re-reads the pointer from ssem
+%endif
+ shl y_offsetd, filter_idx_shift ; turn the offset into a byte index into the filter table
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq] ; row n, pixels 0-7
+ movu m1, [srcq + 16] ; row n, pixels 8-15
+ movu m4, [srcq+src_strideq*2] ; row n + 1, pixels 0-7
+ movu m5, [srcq+src_strideq*2+16] ; row n + 1, pixels 8-15
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a ; row n * first-tap weight
+ pmullw m5, filter_y_b ; row n + 1 * second-tap weight
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4 ; divide by 16, the sum of the two weights
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq] ; row n
+ movu m1, [srcq+src_strideq*2] ; row n + 1
+ movu m5, [srcq+src_strideq*4] ; row n + 2
+ mova m4, m1 ; row n + 1 is the second tap of output n and the first tap of output n + 1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8 ; offset 8 is the half-pel position
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq] ; pixels 0-7
+ movu m1, [srcq + 16] ; pixels 8-15
+ movu m4, [srcq + 2] ; pixels 1-8 (one 2-byte pixel to the right)
+ movu m5, [srcq + 18] ; pixels 9-16
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4 ; half-pel horizontal filter = rounded average with the right neighbour
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq] ; row n
+ movu m1, [srcq + src_strideq*2] ; row n + 1
+ movu m4, [srcq + 2] ; row n, shifted one pixel right
+ movu m5, [srcq + src_strideq*2 + 2] ; row n + 1, shifted one pixel right
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2 ; m0/m1 = horizontally averaged first row, carried into the loop
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4 ; m2/m3 = horizontally averaged current row
+ pavgw m3, m5
+ pavgw m0, m2 ; vertical average of previous and current row
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2 ; current row becomes the previous row of the next iteration
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2 ; m0 = horizontally averaged first row
+.x_half_y_half_loop:
+ movu m2, [srcq] ; row n + 1
+ movu m3, [srcq + src_strideq*2] ; row n + 2
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2 ; output row n
+ pavgw m2, m3 ; output row n + 1
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3 ; last filtered row is reused by the next iteration
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift ; turn the offset into a byte index into the filter table
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2 ; m0/m1 = horizontally averaged first row
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4 ; m2/m3 = horizontally averaged current row
+ pavgw m3, m5
+ mova m4, m2 ; keep unscaled copies for the next iteration
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4 ; current row becomes the previous row
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2 ; m0 = horizontally averaged first row
+.x_half_y_other_loop:
+ movu m2, [srcq] ; row n + 1
+ movu m3, [srcq+src_strideq*2] ; row n + 2
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2 ; row n + 1 also serves as the first tap of output n + 1
+ mova m5, m3 ; unscaled row n + 2, kept for the next iteration
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m4, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; turn the offset into a byte index into the filter table
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq] ; pixels 0-7
+ movu m1, [srcq+16] ; pixels 8-15
+ movu m2, [srcq+2] ; pixels 1-8 (right neighbours)
+ movu m3, [srcq+18] ; pixels 9-16
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a ; pixel * first-tap weight
+ pmullw m3, filter_x_b ; right neighbour * second-tap weight
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4 ; divide by 16, the sum of the two weights
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq] ; row n
+ movu m1, [srcq+src_strideq*2] ; row n + 1
+ movu m2, [srcq+2] ; row n, shifted one pixel right
+ movu m3, [srcq+src_strideq*2+2] ; row n + 1, shifted one pixel right
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; turn the offset into a byte index into the filter table
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4 ; m0/m1 = horizontally filtered first row, carried into the loop
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4 ; m2/m3 = horizontally filtered current row
+ psrlw m3, 4
+ pavgw m0, m2 ; half-pel vertical step = average of previous and current row
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2 ; current row becomes the previous row
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4 ; m0 = horizontally filtered first row
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq] ; row n + 1
+ movu m3, [srcq+src_strideq*2] ; row n + 2
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2 ; output row n
+ pavgw m2, m3 ; output row n + 1
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3 ; last filtered row is reused by the next iteration
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading filter - this is same as in 8-bit depth
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m ; from here on src_strideq holds the pw_8 address, not the stride
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4 ; m0/m1 = horizontally filtered first row, carried into the loop
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4 ; m2/m3 = horizontally filtered current row
+ psrlw m3, 4
+ mova m4, m2 ; keep unscaled copies for the next iteration
+ mova m5, m3
+ pmullw m0, filter_y_a ; vertical filter: previous row * first tap
+ pmullw m2, filter_y_b ; current row * second tap
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4 ; current row becomes the previous row
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*2]
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4 ; m0 = horizontally filtered first row
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq] ; row n + 1
+ movu m4, [srcq+2] ; src_strideq may hold pw_8 here (x86-32 PIC), so step srcq instead of indexing by it
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq] ; row n + 2
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2 ; row n + 1 also serves as the first tap of output n + 1
+ mova m5, m3 ; unscaled row n + 2, kept for the next iteration
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m4, [secq+sec_str*2]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5 ; srcq already moved one row above; one more step completes the two-row advance
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ lea secq, [secq + sec_str*4]
+%endif
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2 ; the macro bodies rely on mmsize == 16 (see STORE_AND_RET)
+SUBPEL_VARIANCE 8 ; emits highbd_sub_pixel_variance8xh
+SUBPEL_VARIANCE 16 ; emits highbd_sub_pixel_variance16xh
+
+INIT_XMM sse2 ; same setting as above, repeated before the avg variants
+SUBPEL_VARIANCE 8, 1 ; emits highbd_sub_pixel_avg_variance8xh
+SUBPEL_VARIANCE 16, 1 ; emits highbd_sub_pixel_avg_variance16xh
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -1,0 +1,313 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2
+;(
+; const uint16_t * src_ptr, ; 16x16 block of 16-bit samples
+; int source_stride, ; in 16-bit samples (doubled to bytes below)
+; const uint16_t * ref_ptr, ; 16x16 block of 16-bit samples
+; int recon_stride, ; in 16-bit samples (doubled to bytes below)
+; unsigned int * SSE, ; out: sum of squared (src - ref)
+; int * Sum ; out: sum of (src - ref)
+;) ; NOTE: rax is not loaded with a result; outputs go through SSE and Sum
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi] ; src rows 0-3, at byte offsets 0 and 16 of each row
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi] ; ref rows 0-3, at byte offsets 0 and 16 of each row
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16 ; rows left to process
+
+.var16loop: ; each pass handles two rows of 16 samples
+ movdqu xmm1, XMMWORD PTR [rsi] ; src row 0, samples 0-7
+ movdqu xmm2, XMMWORD PTR [rdi] ; ref row 0, samples 0-7
+
+ lea rbx, [rsi+rax*2] ; prefetch the two src rows after this pair
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2] ; prefetch the two ref rows after this pair
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5 ; xmm5 = 16-bit per-lane diff sums for this pass
+
+ psubw xmm1, xmm2 ; 16-bit differences src - ref
+ movdqu xmm3, XMMWORD PTR [rsi+16] ; src row 0, samples 8-15
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1 ; square the diffs, adding adjacent pairs into dwords
+ movdqu xmm2, XMMWORD PTR [rdi+16] ; ref row 0, samples 8-15
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax] ; src row 1, samples 0-7
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] ; ref row 1, samples 0-7
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16] ; src row 1, samples 8-15
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16] ; ref row 1, samples 8-15
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5 ; build a sign mask for the word sums in xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0 ; 0xffff where the word is > 0
+ pcmpeqw xmm2, xmm0 ; 0xffff where the word is == 0
+ por xmm1, xmm2 ; 0xffff where the word is >= 0
+ pcmpeqw xmm1, xmm0 ; inverted: 0xffff where the word is negative
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1 ; sign-extend the low four words to dwords
+ punpckhwd xmm2, xmm1 ; sign-extend the high four words to dwords
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax] ; advance both pointers by two rows
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0 ; sse dwords 0,1 moved to dword lanes 0 and 2
+
+ punpckhdq xmm4, xmm0 ; sse dwords 2,3 moved to dword lanes 0 and 2
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4 ; lane 0 = sse0 + sse2, lane 2 = sse1 + sse3
+ punpckldq xmm7, xmm0 ; same reduction for the sum dwords
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8 ; bring lane 2 down to lane 0
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4 ; low dword of xmm6 = total sse
+ paddd xmm7, xmm5 ; low dword of xmm7 = total sum
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2
+;(
+; const uint16_t * src_ptr, ; 8x8 block of 16-bit samples
+; int source_stride, ; in 16-bit samples (doubled to bytes below)
+; const uint16_t * ref_ptr, ; 8x8 block of 16-bit samples
+; int recon_stride, ; in 16-bit samples (doubled to bytes below)
+; unsigned int * SSE, ; out: sum of squared (src - ref)
+; int * Sum ; out: sum of (src - ref)
+;) ; NOTE: rax is not loaded with a result; outputs go through SSE and Sum
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi] ; src rows 0-3
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi] ; ref rows 0-3
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8 ; rows left to process
+
+.var8loop: ; each pass handles four rows of 8 samples
+ movdqu xmm1, XMMWORD PTR [rsi] ; src row 0
+ movdqu xmm2, XMMWORD PTR [rdi] ; ref row 0
+
+ lea rbx, [rsi+rax*4] ; prefetch the four src rows after this group
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4] ; prefetch the four ref rows after this group
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5 ; xmm5 = 16-bit per-lane diff sums for this pass
+
+ psubw xmm1, xmm2 ; 16-bit differences src - ref
+ movdqu xmm3, XMMWORD PTR [rsi+rax] ; src row 1
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1 ; square the diffs, adding adjacent pairs into dwords
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] ; ref row 1
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax] ; step to rows 2 and 3 of this group
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi] ; src row 2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi] ; ref row 2
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax] ; src row 3
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx] ; ref row 3
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5 ; build a sign mask for the word sums in xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0 ; 0xffff where the word is > 0
+ pcmpeqw xmm2, xmm0 ; 0xffff where the word is == 0
+ por xmm1, xmm2 ; 0xffff where the word is >= 0
+ pcmpeqw xmm1, xmm0 ; inverted: 0xffff where the word is negative
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1 ; sign-extend the low four words to dwords
+ punpckhwd xmm2, xmm1 ; sign-extend the high four words to dwords
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax] ; step to the next group of four rows
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0 ; sse dwords 0,1 moved to dword lanes 0 and 2
+
+ punpckhdq xmm4, xmm0 ; sse dwords 2,3 moved to dword lanes 0 and 2
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4 ; lane 0 = sse0 + sse2, lane 2 = sse1 + sse3
+ punpckldq xmm7, xmm0 ; same reduction for the sum dwords
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8 ; bring lane 2 down to lane 0
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4 ; low dword of xmm6 = total sse
+ paddd xmm7, xmm5 ; low dword of xmm7 = total sum
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -1,0 +1,580 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+// Kernel for one 8x8 tile; its results are written through sse and sum.
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+// Kernel for one 16x16 tile; its results are written through sse and sum.
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+// Adds up the raw SSE and sum over a w x h area, one square tile per call.
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+                                 const uint16_t *ref, int ref_stride,
+                                 int w, int h, uint32_t *sse, int *sum,
+                                 high_variance_fn_t var_fn, int block_size) {
+  uint32_t total_sse = 0;
+  int row, col, total_sum = 0;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      unsigned int tile_sse;
+      int tile_sum;
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+  *sse = total_sse;
+  *sum = total_sum;
+}
+
+// Tiled SSE/sum with 64-bit totals; stores sum rounded by 2 bits, SSE by 4.
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  uint64_t total_sse = 0;
+  int64_t total_sum = 0;
+  int row, col;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      unsigned int tile_sse;
+      int tile_sum;
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(total_sum, 2);
+  *sse = ROUND_POWER_OF_TWO(total_sse, 4);
+}
+
+// Tiled SSE/sum with 64-bit totals; stores sum rounded by 4 bits, SSE by 8.
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  uint64_t total_sse = 0;
+  int64_t total_sum = 0;
+  int row, col;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      unsigned int tile_sse;
+      int tile_sum;
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(total_sum, 4);
+  *sse = ROUND_POWER_OF_TWO(total_sse, 8);
+}
+
+// HIGH_GET_VAR(S): SxS get-var entry points taking CONVERT_TO_SHORTPTR inputs.
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+} \
+/* _10_ variant: kernel output, then sum rounded by 2 bits and sse by 4. */ \
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+/* _12_ variant: kernel output, then sum rounded by 4 bits and sse by 8. */ \
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+// One set of entry points per available kernel size (16x16 and 8x8).
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+// VAR_FN: w x h variance in three flavours; shift equals log2(w * h).
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+ block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+/* _10_ variant: same formula on highbd_10_variance_sse2's rounded totals. */ \
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+/* _12_ variant: same formula on highbd_12_variance_sse2's rounded totals. */ \
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+}
+// block_size picks the 16x16 or 8x8 kernel used to tile the w x h area.
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+// Returns the 16x16 sum of squared errors (also stored in *sse); sum unused.
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int ignored_sum;
+  highbd_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                       CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                       &ignored_sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// 16x16 SSE after the 4-bit rounding shift of highbd_10_variance_sse2.
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int ignored_sum;
+  highbd_10_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                          &ignored_sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// 16x16 SSE after the 8-bit rounding shift of highbd_12_variance_sse2.
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int ignored_sum;
+  highbd_12_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                          &ignored_sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// Returns the 8x8 sum of squared errors (also stored in *sse); sum unused.
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int ignored_sum;
+  highbd_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                       CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                       &ignored_sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// 8x8 SSE after the 4-bit rounding shift of highbd_10_variance_sse2.
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int ignored_sum;
+  highbd_10_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                          &ignored_sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// 8x8 SSE after the 8-bit rounding shift of highbd_12_variance_sse2.
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int ignored_sum;
+  highbd_12_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                          &ignored_sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// Prototypes for the sub-pixel kernels implemented outside this file. Each
+// covers a column w samples wide and `height` rows tall; the wrappers below
+// use the return value as the sum of differences and *sse as the SSE.
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt( \
+    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+    const uint16_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+// FN: w x h wrappers built from wf-wide kernel calls at 16-sample steps.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst8, \
+ int dst_stride, \
+ uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, h, \
+ &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+/* _10_ variant: same columns; se is rounded by 2 bits and sse by 4 bits. */ \
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+/* _12_: kernel sees at most 16 rows per call; SSE totals in 64 bits. */ \
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ }\
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+// Arguments per row: w, h, kernel width, log2(w), log2(h), suffix, cast.
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+// Prototypes for the averaging sub-pixel kernels implemented outside this
+// file. They take the arguments of the non-averaging kernels plus a second
+// buffer `sec` with its own stride `sec_stride`; the wrappers below pass the
+// block width w as that stride.
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+    const uint16_t *dst, ptrdiff_t dst_stride, \
+    const uint16_t *sec, ptrdiff_t sec_stride, \
+    int height, unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+// FN: w x h avg wrappers; wf-wide kernel calls, sec read with stride w.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+/* _10_ variant: same columns; se is rounded by 2 bits and sse by 4 bits. */ \
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+/* _12_: kernel sees at most 16 rows per call; SSE totals in 64 bits. */ \
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + (start_row * dst_stride), dst_stride, \
+ sec + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+// Arguments per row: w, h, kernel width, log2(w), log2(h), suffix, cast.
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -104,6 +104,7 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
endif
ifeq ($(CONFIG_USE_X86INC),yes)
@@ -115,6 +116,8 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif