shithub: libvpx

--- a/vpx_dsp/ppc/quantize_vsx.c

+++ b/vpx_dsp/ppc/quantize_vsx.c

@@ -41,10 +41,9 @@

 static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,

-                                          const int16_t *iscan_ptr) {

-  bool16x8_t zero_coeff;

-  int16x8_t scan = vec_vsx_ld(0, iscan_ptr);

-  zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);

+                                          const int16_t *iscan_ptr, int index) {

+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);

+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);

   scan = vec_sub(scan, mask);

   return vec_andc(scan, zero_coeff);

@@ -64,7 +63,8 @@

                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

                         uint16_t *eob_ptr, const int16_t *scan_ptr,

                         const int16_t *iscan_ptr) {

-  int16x8_t qcoeff, dqcoeff, eob;

+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;

+  bool16x8_t zero_mask0, zero_mask1;

   // First set of 8 coeff starts with DC + 7 AC

   int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);

@@ -73,51 +73,86 @@

   int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);

   int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);

-  int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);

-  int16x8_t coeff_abs = vec_abs(coeff);

-  bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);

+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);

+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);

+  int16x8_t coeff0_abs = vec_abs(coeff0);

+  int16x8_t coeff1_abs = vec_abs(coeff1);

+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);

+  zbin = vec_splat(zbin, 1);

+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);

   (void)scan_ptr;

   (void)skip_block;

   assert(!skip_block);

-  qcoeff =

-      quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);

-  vec_vsx_st(qcoeff, 0, qcoeff_ptr);

-  dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);

-  vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);

-  eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);

-  // All other sets of 8 coeffs will only contain AC

-  zbin = vec_splat(zbin, 1);

+  qcoeff0 =

+      quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);

+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);

   round = vec_splat(round, 1);

   quant = vec_splat(quant, 1);

-  dequant = vec_splat(dequant, 1);

   quant_shift = vec_splat(quant_shift, 1);

+  qcoeff1 =

+      quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);

+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);

-  n_coeffs -= 8;

-  do {

-    coeff_ptr += 8;

-    qcoeff_ptr += 8;

-    dqcoeff_ptr += 8;

-    iscan_ptr += 8;

+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);

+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);

+  dequant = vec_splat(dequant, 1);

+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);

+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);

-    coeff = vec_vsx_ld(0, coeff_ptr);

-    coeff_abs = vec_abs(coeff);

-    zero_mask = vec_cmpge(coeff_abs, zbin);

-    qcoeff =

-        quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);

-    vec_vsx_st(qcoeff, 0, qcoeff_ptr);

+  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),

+                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));

-    dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);

-    vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);

+  if (n_coeffs > 16) {

+    int index = 16;

+    int off0 = 32;

+    int off1 = 48;

+    int off2 = 64;

+    do {

+      int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;

+      bool16x8_t zero_mask2;

+      coeff0 = vec_vsx_ld(off0, coeff_ptr);

+      coeff1 = vec_vsx_ld(off1, coeff_ptr);

+      coeff2 = vec_vsx_ld(off2, coeff_ptr);

+      coeff0_abs = vec_abs(coeff0);

+      coeff1_abs = vec_abs(coeff1);

+      coeff2_abs = vec_abs(coeff2);

+      zero_mask0 = vec_cmpge(coeff0_abs, zbin);

+      zero_mask1 = vec_cmpge(coeff1_abs, zbin);

+      zero_mask2 = vec_cmpge(coeff2_abs, zbin);

+      qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,

+                               zero_mask0);

+      qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,

+                               zero_mask1);

+      qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,

+                               zero_mask2);

+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);

+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);

+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);

-    eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));

+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);

+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);

+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);

-    n_coeffs -= 8;

-  } while (n_coeffs > 0);

+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);

+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);

+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);

+      eob =

+          vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));

+      eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),

+                     nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));

+      eob = vec_max(eob, eob2);

+      index += 24;

+      off0 += 48;

+      off1 += 48;

+      off2 += 48;

+    } while (index < n_coeffs);

+  }

   eob = vec_max_across(eob);

   *eob_ptr = eob[0];

--

⑨