ref: 2a5aa98a35d295bbfd17d107630be84a5ccc1077
parent: 13eed991f9450cdd8b03fc03c38ea0f9b2667bf8
author: Johann <johannkoenig@google.com>
date: Mon Aug 21 07:23:49 EDT 2017
quantize neon: round dqcoeff towards zero. Add 1 to each negative value before shifting so that dqcoeff rounds towards zero. This is 10-15% faster than converting to positive before shifting. Change-Id: I01a62fd0c9bca786b6885b318bd447bb9229903d
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
@@ -154,6 +155,10 @@
}
}
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {  // Per 32-bit lane: yields 1 when the lane of |a| is negative, 0 otherwise.
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));  // Shift as unsigned so bit 31 lands in bit 0 with zero fill (a signed shift would give -1 instead of 1).
+}
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(
@@ -205,7 +210,7 @@
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1, dqcoeff_0_sign, dqcoeff_1_sign;
+ int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -230,21 +235,12 @@
dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
- // The way the C shifts the values requires us to convert to positive before
- // shifting or even narrowing, then put the sign back.
- dqcoeff_0_sign = vshrq_n_s32(dqcoeff_0, 31);
- dqcoeff_1_sign = vshrq_n_s32(dqcoeff_1, 31);
- dqcoeff_0 = vabsq_s32(dqcoeff_0);
- dqcoeff_1 = vabsq_s32(dqcoeff_1);
- dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
- dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
- dqcoeff_0 = veorq_s32(dqcoeff_0, dqcoeff_0_sign);
- dqcoeff_1 = veorq_s32(dqcoeff_1, dqcoeff_1_sign);
- dqcoeff_0 = vsubq_s32(dqcoeff_0, dqcoeff_0_sign);
- dqcoeff_1 = vsubq_s32(dqcoeff_1, dqcoeff_1_sign);
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
- // Narrow *without saturation* because that's what the C does.
- dqcoeff = vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1));
+ dqcoeff =
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
dqcoeff_ptr += 8;
@@ -274,7 +270,7 @@
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1, dqcoeff_0_sign, dqcoeff_1_sign;
+ int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -300,18 +296,11 @@
dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
- dqcoeff_0_sign = vshrq_n_s32(dqcoeff_0, 31);
- dqcoeff_1_sign = vshrq_n_s32(dqcoeff_1, 31);
- dqcoeff_0 = vabsq_s32(dqcoeff_0);
- dqcoeff_1 = vabsq_s32(dqcoeff_1);
- dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
- dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
- dqcoeff_0 = veorq_s32(dqcoeff_0, dqcoeff_0_sign);
- dqcoeff_1 = veorq_s32(dqcoeff_1, dqcoeff_1_sign);
- dqcoeff_0 = vsubq_s32(dqcoeff_0, dqcoeff_0_sign);
- dqcoeff_1 = vsubq_s32(dqcoeff_1, dqcoeff_1_sign);
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
- dqcoeff = vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1));
+ dqcoeff =
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
dqcoeff_ptr += 8;