ref: e3f12b520fe2d963d4aecf18c30ebbb594d50899
parent: dff4d376eaff3460e930711807ce865ff76aff3a
author: Johann <johannkoenig@google.com>
date: Mon Apr 29 09:05:30 EDT 2019
vp8 quantize: use native abs/sign implementations. ~4% improvement with a very rudimentary speed test. Change-Id: Iad8868327e3276dbead783a79849295b0e4b135c
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -13,9 +13,10 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
#include "test/acm_random.h"
+#include "test/bench.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
@@ -117,7 +118,8 @@
};
class QuantizeTest : public QuantizeTestBase,
- public ::testing::TestWithParam<VP8QuantizeParam> {
+ public ::testing::TestWithParam<VP8QuantizeParam>,
+ public AbstractBench {
protected:
virtual void SetUp() {
SetupCompressor();
@@ -125,6 +127,10 @@
c_quant_ = GET_PARAM(1);
}
+ virtual void Run() {
+ asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
+ }
+
void RunComparison() {
for (int i = 0; i < kNumBlocks; ++i) {
ASM_REGISTER_STATE_CHECK(
@@ -165,6 +171,13 @@
FillCoeffRandom();
RunComparison();
}
+}
+
+TEST_P(QuantizeTest, DISABLED_Speed) {
+ FillCoeffRandom();
+
+ RunNTimes(10000000);
+ PrintMedian("vp8 quantize");
}
#if HAVE_SSE2
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -11,8 +11,8 @@
#include <smmintrin.h> /* SSE4.1 */
#include "./vp8_rtcd.h"
-#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+#include "vp8/encoder/block.h"
#define SELECT_EOB(i, z, x, y, q) \
do { \
@@ -31,8 +31,7 @@
char eob = 0;
short *zbin_boost_ptr = b->zrun_zbin_boost;
- __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,
- dqcoeff1;
+ __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
__m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
__m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
__m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
@@ -53,16 +52,10 @@
zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
- /* Sign of z: z >> 15 */
- sz0 = _mm_srai_epi16(z0, 15);
- sz1 = _mm_srai_epi16(z1, 15);
+ /* x = abs(z) */
+ x0 = _mm_abs_epi16(z0);
+ x1 = _mm_abs_epi16(z1);
- /* x = abs(z): (z ^ sz) - sz */
- x0 = _mm_xor_si128(z0, sz0);
- x1 = _mm_xor_si128(z1, sz1);
- x0 = _mm_sub_epi16(x0, sz0);
- x1 = _mm_sub_epi16(x1, sz1);
-
/* zbin[] + zbin_extra */
zbin0 = _mm_add_epi16(zbin0, zbin_extra);
zbin1 = _mm_add_epi16(zbin1, zbin_extra);
@@ -89,11 +82,9 @@
y0 = _mm_mulhi_epi16(y0, quant_shift0);
y1 = _mm_mulhi_epi16(y1, quant_shift1);
- /* Return the sign: (y ^ sz) - sz */
- y0 = _mm_xor_si128(y0, sz0);
- y1 = _mm_xor_si128(y1, sz1);
- y0 = _mm_sub_epi16(y0, sz0);
- y1 = _mm_sub_epi16(y1, sz1);
+ /* Restore the sign. */
+ y0 = _mm_sign_epi16(y0, z0);
+ y1 = _mm_sign_epi16(y1, z1);
/* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);