ref: 81a98509dcd44e939656d8c281e5a5ae5b9926be
parent: af355dacd568dfe5c109ae2d6d22104f94cdcf7f
author: Luc Trudeau <luc@trud.ca>
date: Thu May 10 12:54:13 EDT 2018
Faster VSX vpx_quantize_b. Process 16 coefficients on the first iteration (a full 4x4) and 24 coefficients on each subsequent iteration. VSX/VP9QuantizeTest.DISABLED_Speed: Before: 4x4 176 ms, 8x8 91 ms, 16x16 72 ms. After: 4x4 152 ms, 8x8 82 ms, 16x16 64 ms. Change-Id: I07cb130833504206ccdc5bc12ae5af369364999a
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -41,10 +41,9 @@
}
static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
- const int16_t *iscan_ptr) {
- bool16x8_t zero_coeff;
- int16x8_t scan = vec_vsx_ld(0, iscan_ptr);
- zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+ const int16_t *iscan_ptr, int index) {
+ int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+ bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
scan = vec_sub(scan, mask);
return vec_andc(scan, zero_coeff);
}
@@ -64,7 +63,8 @@
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
- int16x8_t qcoeff, dqcoeff, eob;
+ int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+ bool16x8_t zero_mask0, zero_mask1;
// First set of 8 coeff starts with DC + 7 AC
int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
@@ -73,51 +73,86 @@
int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
- int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);
- int16x8_t coeff_abs = vec_abs(coeff);
- bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);
+ int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+ int16x8_t coeff0_abs = vec_abs(coeff0);
+ int16x8_t coeff1_abs = vec_abs(coeff1);
+
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zbin = vec_splat(zbin, 1);
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
- qcoeff =
- quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
- vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
- dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
- vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
- eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);
-
- // All other sets of 8 coeffs will only contain AC
- zbin = vec_splat(zbin, 1);
+ qcoeff0 =
+ quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+ vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
round = vec_splat(round, 1);
quant = vec_splat(quant, 1);
- dequant = vec_splat(dequant, 1);
quant_shift = vec_splat(quant_shift, 1);
+ qcoeff1 =
+ quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+ vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
- n_coeffs -= 8;
- do {
- coeff_ptr += 8;
- qcoeff_ptr += 8;
- dqcoeff_ptr += 8;
- iscan_ptr += 8;
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+ dequant = vec_splat(dequant, 1);
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
- coeff = vec_vsx_ld(0, coeff_ptr);
- coeff_abs = vec_abs(coeff);
- zero_mask = vec_cmpge(coeff_abs, zbin);
- qcoeff =
- quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
- vec_vsx_st(qcoeff, 0, qcoeff_ptr);
+ eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
- dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
- vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
+ if (n_coeffs > 16) {
+ int index = 16;
+ int off0 = 32;
+ int off1 = 48;
+ int off2 = 64;
+ do {
+ int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+ bool16x8_t zero_mask2;
+ coeff0 = vec_vsx_ld(off0, coeff_ptr);
+ coeff1 = vec_vsx_ld(off1, coeff_ptr);
+ coeff2 = vec_vsx_ld(off2, coeff_ptr);
+ coeff0_abs = vec_abs(coeff0);
+ coeff1_abs = vec_abs(coeff1);
+ coeff2_abs = vec_abs(coeff2);
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+ zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+ qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+ zero_mask0);
+ qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+ zero_mask1);
+ qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+ zero_mask2);
+ vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+ vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+ vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
- eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
- n_coeffs -= 8;
- } while (n_coeffs > 0);
+ vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+ eob =
+ vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, eob2);
+
+ index += 24;
+ off0 += 48;
+ off1 += 48;
+ off2 += 48;
+ } while (index < n_coeffs);
+ }
eob = vec_max_across(eob);
*eob_ptr = eob[0];