shithub: libvpx

--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c

+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c

@@ -26,9 +26,11 @@

                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);

   int16x8_t x0, x1, sz0, sz1, y0, y1;

   uint16x8_t eob0, eob1;

+#ifndef __aarch64__

   uint16x4_t eob_d16;

   uint32x2_t eob_d32;

   uint32x4_t eob_q32;

+#endif  // __aarch64__

   /* sign of z: z >> 15 */

   sz0 = vshrq_n_s16(z0, 15);

@@ -66,11 +68,17 @@

   /* select the largest value */

   eob0 = vmaxq_u16(eob0, eob1);

+#ifdef __aarch64__

+  *d->eob = (int8_t)vmaxvq_u16(eob0);

+#else

   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));

   eob_q32 = vmovl_u16(eob_d16);

   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));

   eob_d32 = vpmax_u32(eob_d32, eob_d32);

+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);

+#endif  // __aarch64__

   /* qcoeff = x */

   vst1q_s16(d->qcoeff, x0);

   vst1q_s16(d->qcoeff + 8, x1);

@@ -78,6 +86,4 @@

   /* dqcoeff = x * dequant */

   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));

   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));

-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);

--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c

+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c

@@ -97,6 +97,9 @@

     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);

     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);

+#ifdef __aarch64__

+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);

+#else

     const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),

                                              vget_high_s16(v_eobmax_76543210));

@@ -111,6 +114,7 @@

     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);

+#endif  // __aarch64__

 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {

@@ -226,6 +230,9 @@

       dqcoeff_ptr += 8;

+#ifdef __aarch64__

+    *eob_ptr = vmaxvq_u16(eob_max);

+#else

       const uint16x4_t eob_max_0 =

           vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));

@@ -233,5 +240,6 @@

       const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);

       vst1_lane_u16(eob_ptr, eob_max_2, 0);

+#endif  // __aarch64__

--- a/vpx_dsp/arm/quantize_neon.c

+++ b/vpx_dsp/arm/quantize_neon.c

@@ -135,6 +135,9 @@

     } while (n_coeffs > 0);

+#ifdef __aarch64__

+  *eob_ptr = vmaxvq_u16(eob_max);

+#else

     const uint16x4_t eob_max_0 =

         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));

@@ -142,6 +145,7 @@

     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);

     vst1_lane_u16(eob_ptr, eob_max_2, 0);

+#endif  // __aarch64__

 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {

@@ -288,6 +292,9 @@

+#ifdef __aarch64__

+  *eob_ptr = vmaxvq_u16(eob_max);

+#else

     const uint16x4_t eob_max_0 =

         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));

@@ -295,4 +302,5 @@

     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);

     vst1_lane_u16(eob_ptr, eob_max_2, 0);

+#endif  // __aarch64__

--

⑨