ref: 904b957ae965bd3d67f15a75cd9db7954f810d33
parent: aa911e8b41733a950dfe96866dbf8118940bc996
author: Johann <johannkoenig@google.com>
date: Thu Feb 16 12:57:44 EST 2017
consolidate block_error functions vp9_highbd_block_error_8bit_c was a very simple wrapper around vp9_block_error_c. The SSE2 implementation was practically identical to the non-HBD one. It was missing some minor improvements which only went into the original version. In quick speed tests, the AVX implementation showed minimal improvement over SSE2 when it does not detect overflow. However, when overflow is detected the function is run a second time. The OperationCheck test seems to trigger this case and reverses any speed benefits by running ~60% slower. AVX2 on the other hand is always 30-40% faster. Change-Id: I9fcb9afbcb560f234c7ae1b13ddb69eca3988ba1
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -234,11 +234,11 @@
typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, int block_size);
-typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestParam;
+typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
-class BlockErrorTest
+class BlockErrorTestFP
: public ::testing::Test,
- public ::testing::WithParamInterface<BlockErrorTestParam> {
+ public ::testing::WithParamInterface<BlockErrorTestFPParam> {
protected:
virtual void SetUp() {
txfm_size_ = GET_PARAM(0);
@@ -367,7 +367,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, MinValue) {
+TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
FillConstant(kMin, 0);
@@ -374,7 +374,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, MaxValue) {
+TEST_P(BlockErrorTestFP, MaxValue) {
const int64_t kMax = 32640;
const int64_t expected = kMax * kMax * txfm_size_;
FillConstant(kMax, 0);
@@ -381,7 +381,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, Random) {
+TEST_P(BlockErrorTestFP, Random) {
int64_t expected;
switch (txfm_size_) {
case 16: expected = 2051681432; break;
@@ -410,7 +410,7 @@
make_tuple(1024, &vpx_satd_c)));
INSTANTIATE_TEST_CASE_P(
- C, BlockErrorTest,
+ C, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
make_tuple(64, &vp9_block_error_fp_c),
make_tuple(256, &vp9_block_error_fp_c),
@@ -447,7 +447,7 @@
make_tuple(1024, &vpx_satd_sse2)));
INSTANTIATE_TEST_CASE_P(
- SSE2, BlockErrorTest,
+ SSE2, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
make_tuple(64, &vp9_block_error_fp_sse2),
make_tuple(256, &vp9_block_error_fp_sse2),
@@ -488,7 +488,7 @@
// in place.
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
- NEON, BlockErrorTest,
+ NEON, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
make_tuple(64, &vp9_block_error_fp_neon),
make_tuple(256, &vp9_block_error_fp_neon),
--- a/test/test.mk
+++ b/test/test.mk
@@ -157,7 +157,7 @@
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
--- /dev/null
+++ b/test/vp9_block_error_test.cc
@@ -1,0 +1,198 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps);
+
+typedef std::tr1::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+ BlockErrorParam;
+
+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz);
+
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bps) {
+ EXPECT_EQ(8, bps);  // fn takes no 'bps' argument, so only 8-bit is expected.
+ return fn(coeff, dqcoeff, block_size, ssz);
+}
+
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
+ public:
+ virtual ~BlockErrorTest() {}
+ virtual void SetUp() {
+ error_block_op_ = GET_PARAM(0);
+ ref_error_block_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ vpx_bit_depth_t bit_depth_;  // Passed as 'bps'; also sizes the test inputs.
+ HBDBlockErrorFunc error_block_op_;  // Function under test.
+ HBDBlockErrorFunc ref_error_block_op_;  // Reference its results must match.
+};
+
+TEST_P(BlockErrorTest, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;  // Inputs are drawn from rnd(1 << msb).
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ block_size = 16 << (i % 9); // 16, 32, ..., 4096 coeffs (4x4 up to 64x64)
+ for (int j = 0; j < block_size; j++) {
+ // coeff and dqcoeff never have opposite signs, and optimized versions
+ // may rely on this, so generate test input that respects it.
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << msb);
+ dqcoeff[j] = rnd(1 << msb);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << msb);
+ dqcoeff[j] = -rnd(1 << msb);
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);  // Either output differs.
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(BlockErrorTest, ExtremeValues) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ int max_val = ((1 << msb) - 1);
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ int k = (i / 9) % 9;  // Input pattern; changes every 9 iterations.
+
+ // Halve max_val each time k reaches 8, to test different bit boundaries
+ if (k == 8 && (i % 9) == 0) {
+ max_val >>= 1;
+ }
+ block_size = 16 << (i % 9); // 16, 32, ..., 4096 coeffs (4x4 up to 64x64)
+ for (int j = 0; j < block_size; j++) {
+ if (k < 4) {
+ // coeff and dqcoeff are each 0 or +max_val (one combination per k)
+ coeff[j] = k % 2 ? max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+ } else if (k < 8) {
+ // coeff and dqcoeff are each 0 or -max_val (one combination per k)
+ coeff[j] = k % 2 ? -max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
+ } else {  // k == 8: random same-signed values drawn from rnd(1 << 14).
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);  // Either output differs.
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_10),
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_12),
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_8),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+ &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
+ ::testing::ValuesIn(sse2_block_error_tests));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlockErrorTest,
+ ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+ &BlockError8BitWrapper<vp9_block_error_c>,
+ VPX_BITS_8)));
+#endif // HAVE_AVX2
+} // namespace
--- a/test/vp9_error_block_test.cc
+++ /dev/null
@@ -1,199 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vpx/vpx_codec.h"
-#include "vpx/vpx_integer.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
-const int kNumIterations = 1000;
-
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz, int bps);
-
-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
- ErrorBlockParam;
-
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- EXPECT_EQ(8, bps);
- return fn(coeff, dqcoeff, block_size, ssz);
-}
-
-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
- public:
- virtual ~ErrorBlockTest() {}
- virtual void SetUp() {
- error_block_op_ = GET_PARAM(0);
- ref_error_block_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- vpx_bit_depth_t bit_depth_;
- ErrorBlockFunc error_block_op_;
- ErrorBlockFunc ref_error_block_op_;
-};
-
-TEST_P(ErrorBlockTest, OperationCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
- int err_count_total = 0;
- int first_failure = -1;
- intptr_t block_size;
- int64_t ssz;
- int64_t ret;
- int64_t ref_ssz;
- int64_t ref_ret;
- const int msb = bit_depth_ + 8 - 1;
- for (int i = 0; i < kNumIterations; ++i) {
- int err_count = 0;
- block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
- for (int j = 0; j < block_size; j++) {
- // coeff and dqcoeff will always have at least the same sign, and this
- // can be used for optimization, so generate test input precisely.
- if (rnd(2)) {
- // Positive number
- coeff[j] = rnd(1 << msb);
- dqcoeff[j] = rnd(1 << msb);
- } else {
- // Negative number
- coeff[j] = -rnd(1 << msb);
- dqcoeff[j] = -rnd(1 << msb);
- }
- }
- ref_ret =
- ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
- ASM_REGISTER_STATE_CHECK(
- ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
- err_count += (ref_ret != ret) | (ref_ssz != ssz);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match optimized output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(ErrorBlockTest, ExtremeValues) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
- int err_count_total = 0;
- int first_failure = -1;
- intptr_t block_size;
- int64_t ssz;
- int64_t ret;
- int64_t ref_ssz;
- int64_t ref_ret;
- const int msb = bit_depth_ + 8 - 1;
- int max_val = ((1 << msb) - 1);
- for (int i = 0; i < kNumIterations; ++i) {
- int err_count = 0;
- int k = (i / 9) % 9;
-
- // Change the maximum coeff value, to test different bit boundaries
- if (k == 8 && (i % 9) == 0) {
- max_val >>= 1;
- }
- block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
- for (int j = 0; j < block_size; j++) {
- if (k < 4) {
- // Test at positive maximum values
- coeff[j] = k % 2 ? max_val : 0;
- dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
- } else if (k < 8) {
- // Test at negative maximum values
- coeff[j] = k % 2 ? -max_val : 0;
- dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
- } else {
- if (rnd(2)) {
- // Positive number
- coeff[j] = rnd(1 << 14);
- dqcoeff[j] = rnd(1 << 14);
- } else {
- // Negative number
- coeff[j] = -rnd(1 << 14);
- dqcoeff[j] = -rnd(1 << 14);
- }
- }
- }
- ref_ret =
- ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
- ASM_REGISTER_STATE_CHECK(
- ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
- err_count += (ref_ret != ret) | (ref_ssz != ssz);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match optimized output. "
- << "First failed at test case " << first_failure;
-}
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
- SSE2, ErrorBlockTest,
- ::testing::Values(
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_10),
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_12),
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_8),
- make_tuple(
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
- VPX_BITS_8)));
-#endif // HAVE_SSE2
-
-#if HAVE_AVX
-INSTANTIATE_TEST_CASE_P(
- AVX, ErrorBlockTest,
- ::testing::Values(make_tuple(
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
- VPX_BITS_8)));
-#endif // HAVE_AVX
-
-#endif // CONFIG_VP9_HIGHBITDEPTH
-} // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/;
- add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp sse2/;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@
return error;
}
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz) {
- // Note that the C versions of these 2 functions (vp9_block_error and
- // vp9_highbd_block_error_8bit are the same, but the optimized assembly
- // routines are not compatible in the non high bitdepth configuration, so
- // they still cannot share the same name.
- return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bd) {
if (bd == 8) {
- return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+ return vp9_block_error(coeff, dqcoeff, block_size, ssz);
} else {
return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
}
@@ -1130,16 +1120,9 @@
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- distortion +=
- vp9_highbd_block_error_8bit(
- coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
- 2;
-#else
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &unused) >>
2;
-#endif
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +1,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
- vzeroupper
-
- ; If only one iteration is required, then handle this as a special case.
- ; It is the most frequent case, so we can have a significant gain here
- ; by not setting up a loop and accumulators.
- cmp sizeq, 16
- jne .generic
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Common case of size == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- ; Load input vectors
- mova xm0, [dqcq]
- packssdw xm0, [dqcq+16]
- mova xm2, [uqcq]
- packssdw xm2, [uqcq+16]
-
- mova xm1, [dqcq+32]
- packssdw xm1, [dqcq+48]
- mova xm3, [uqcq+32]
- packssdw xm3, [uqcq+48]
-
- ; Compute the errors.
- psubw xm0, xm2
- psubw xm1, xm3
-
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- pmaddwd xm2, xm2
- pmaddwd xm3, xm3
-
- pmaddwd xm0, xm0
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; Accumulate horizontally in 64 bits, there is no chance of overflow here
- pxor xm5, xm5
-
- pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
-
- pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- psrldq xm3, xm2, 8
- psrldq xm1, xm0, 8
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm0
- movq [sszq], xm2
-%else
- movd eax, xm0
- pextrd edx, xm0, 1
- movq [sszd], xm2
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, speculative low precision
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ALIGN 16
-.generic:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; overflow detection register for xm4
- pxor xm6, xm6 ; ssz accumulator
- pxor xm7, xm7 ; overflow detection register for xm6
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ; Push the negative size as the high precision code might need it
- push sizeq
-
-.loop:
- ; Load input vectors
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; Compute the squared errors.
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; We accumulate using 32 bit arithmetic, but detect potential overflow
- ; by checking if the MSB of the accumulators have ever been a set bit.
- ; If yes, we redo the whole compute at the end on higher precision, but
- ; this happens extremely rarely, so we still achieve a net gain.
- paddd xm4, xm0
- paddd xm6, xm2
- por xm5, xm4 ; OR in the accumulator for overflow detection
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- jnz .loop
-
- ; Add pairs horizontally (still only on 32 bits)
- phaddd xm4, xm4
- por xm5, xm4 ; OR in the accumulator for overflow detection
- phaddd xm6, xm6
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- ; Check for possibility of overflow by testing if bit 32 of each dword lane
- ; have ever been set. If they were not, then there was no overflow and the
- ; final sum will fit in 32 bits. If overflow happened, then
- ; we redo the whole computation on higher precision.
- por xm7, xm5
- pmovmskb r4, xm7
- test r4, 0x8888
- jnz .highprec
-
- phaddd xm4, xm4
- phaddd xm6, xm6
- pmovzxdq xm4, xm4
- pmovzxdq xm6, xm6
-
- ; Restore stack
- pop sizeq
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, high precision case
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; dedicated zero register
- pxor xm6, xm6 ; ssz accumulator
- pop sizeq
-
-.loophp:
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; accumulate in 64bit
- punpckldq xm7, xm0, xm5
- punpckhdq xm0, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm2, xm5
- punpckhdq xm2, xm5
- paddq xm6, xm7
-
- punpckldq xm7, xm1, xm5
- punpckhdq xm1, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm3, xm5
- punpckhdq xm3, xm5
- paddq xm6, xm7
-
- paddq xm4, xm0
- paddq xm4, xm1
- paddq xm6, xm2
- paddq xm6, xm3
-
- jnz .loophp
-
- ; Accumulate horizontally
- movhlps xm5, xm4
- movhlps xm7, xm6
- paddq xm4, xm5
- paddq xm6, xm7
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
-END
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
- pxor m4, m4 ; sse accumulator
- pxor m6, m6 ; ssz accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ALIGN 16
-
-.loop:
- mova m0, [dqcq+sizeq*4]
- packssdw m0, [dqcq+sizeq*4+mmsize]
- mova m2, [uqcq+sizeq*4]
- packssdw m2, [uqcq+sizeq*4+mmsize]
-
- mova m1, [dqcq+sizeq*4+mmsize*2]
- packssdw m1, [dqcq+sizeq*4+mmsize*3]
- mova m3, [uqcq+sizeq*4+mmsize*2]
- packssdw m3, [uqcq+sizeq*4+mmsize*3]
-
- add sizeq, mmsize
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw m0, m2
- pmaddwd m2, m2
- pmaddwd m0, m0
-
- psubw m1, m3
- pmaddwd m3, m3
- pmaddwd m1, m1
-
- ; accumulate in 64bit
- punpckldq m7, m0, m5
- punpckhdq m0, m5
- paddq m4, m7
-
- punpckldq m7, m2, m5
- punpckhdq m2, m5
- paddq m6, m7
-
- punpckldq m7, m1, m5
- punpckhdq m1, m5
- paddq m4, m7
-
- punpckldq m7, m3, m5
- punpckhdq m3, m5
- paddq m6, m7
-
- paddq m4, m0
- paddq m4, m1
- paddq m6, m2
- paddq m6, m3
-
- jnz .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- movhlps m7, m6
- paddq m4, m5
- paddq m6, m7
-
-%if ARCH_X86_64
- movq rax, m4
- movq [sszq], m6
-%else
- mov eax, sszm
- pshufd m5, m4, 0x1
- movq [eax], m6
- movd eax, m4
- movd edx, m5
-%endif
- RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm