shithub: libvpx

--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c

+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c

@@ -9,7 +9,7 @@

*/

 #include <arm_neon.h>

+#include <assert.h>

 #include <math.h>

 #include "vpx_mem/vpx_mem.h"

@@ -31,86 +31,83 @@

                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

                           uint16_t *eob_ptr, const int16_t *scan,

                           const int16_t *iscan) {

+  // Quantization pass: All coefficients with index >= zero_flag are

+  // skippable. Note: zero_flag can be zero.

+  int i;

+  const int16x8_t v_zero = vdupq_n_s16(0);

+  const int16x8_t v_one = vdupq_n_s16(1);

+  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);

+  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);

+  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);

+  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);

   (void)scan;

+  (void)skip_block;

+  assert(!skip_block);

-  if (!skip_block) {

-    // Quantization pass: All coefficients with index >= zero_flag are

-    // skippable. Note: zero_flag can be zero.

-    int i;

-    const int16x8_t v_zero = vdupq_n_s16(0);

-    const int16x8_t v_one = vdupq_n_s16(1);

-    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);

-    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);

-    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);

-    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);

-    // adjust for dc

-    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);

-    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);

-    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);

-    // process dc and the first seven ac coeffs

-    {

-      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);

-      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);

-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);

-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);

-      const int32x4_t v_tmp_lo =

-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));

-      const int32x4_t v_tmp_hi =

-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));

-      const int16x8_t v_tmp2 =

-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));

-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);

-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);

-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);

-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);

-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);

-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);

-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);

-      store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);

-      store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);

-      v_round = vmovq_n_s16(round_ptr[1]);

-      v_quant = vmovq_n_s16(quant_ptr[1]);

-      v_dequant = vmovq_n_s16(dequant_ptr[1]);

-    }

-    // now process the rest of the ac coeffs

-    for (i = 8; i < count; i += 8) {

-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);

-      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);

-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);

-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);

-      const int32x4_t v_tmp_lo =

-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));

-      const int32x4_t v_tmp_hi =

-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));

-      const int16x8_t v_tmp2 =

-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));

-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);

-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);

-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);

-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);

-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);

-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);

-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);

-      store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);

-      store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);

-    }

-    {

-      const int16x4_t v_eobmax_3210 = vmax_s16(

-          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));

-      const int64x1_t v_eobmax_xx32 =

-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);

-      const int16x4_t v_eobmax_tmp =

-          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));

-      const int64x1_t v_eobmax_xxx3 =

-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);

-      const int16x4_t v_eobmax_final =

-          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));

+  // adjust for dc

+  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);

+  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);

+  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);

+  // process dc and the first seven ac coeffs

+  {

+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);

+    const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);

+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);

+    const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);

+    const int32x4_t v_tmp_lo =

+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));

+    const int32x4_t v_tmp_hi =

+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));

+    const int16x8_t v_tmp2 =

+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));

+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);

+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);

+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);

+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);

+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);

+    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);

+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);

+    store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);

+    store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);

+    v_round = vmovq_n_s16(round_ptr[1]);

+    v_quant = vmovq_n_s16(quant_ptr[1]);

+    v_dequant = vmovq_n_s16(dequant_ptr[1]);

+  }

+  // now process the rest of the ac coeffs

+  for (i = 8; i < count; i += 8) {

+    const int16x8_t v_iscan = vld1q_s16(&iscan[i]);

+    const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);

+    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);

+    const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);

+    const int32x4_t v_tmp_lo =

+        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));

+    const int32x4_t v_tmp_hi =

+        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));

+    const int16x8_t v_tmp2 =

+        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));

+    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);

+    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);

+    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);

+    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);

+    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);

+    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);

+    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);

+    store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);

+    store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);

+  }

+  {

+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),

+                                             vget_high_s16(v_eobmax_76543210));

+    const int64x1_t v_eobmax_xx32 =

+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);

+    const int16x4_t v_eobmax_tmp =

+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));

+    const int64x1_t v_eobmax_xxx3 =

+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);

+    const int16x4_t v_eobmax_final =

+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));

-      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);

-    }

-  } else {

-    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));

-    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

-    *eob_ptr = 0;

+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);

--- a/vp9/encoder/x86/vp9_quantize_sse2.c

+++ b/vp9/encoder/x86/vp9_quantize_sse2.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include <emmintrin.h>

 #include <xmmintrin.h>

@@ -25,8 +26,12 @@

   __m128i zero;

   __m128i thr;

   int16_t nzflag;

+  __m128i eob;

+  __m128i round, quant, dequant;

   (void)scan_ptr;

+  (void)skip_block;

+  assert(!skip_block);

   coeff_ptr += n_coeffs;

   iscan_ptr += n_coeffs;

@@ -35,40 +40,106 @@

   n_coeffs = -n_coeffs;

   zero = _mm_setzero_si128();

-  if (!skip_block) {

-    __m128i eob;

-    __m128i round, quant, dequant;

+  {

+    __m128i coeff0, coeff1;

+    // Setup global values

-      __m128i coeff0, coeff1;

+      round = _mm_load_si128((const __m128i *)round_ptr);

+      quant = _mm_load_si128((const __m128i *)quant_ptr);

+      dequant = _mm_load_si128((const __m128i *)dequant_ptr);

+    }

-      // Setup global values

-      {

-        round = _mm_load_si128((const __m128i *)round_ptr);

-        quant = _mm_load_si128((const __m128i *)quant_ptr);

-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);

-      }

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      // Do DC and first 15 AC

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        // Do DC and first 15 AC

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+      round = _mm_unpackhi_epi64(round, round);

+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+      quant = _mm_unpackhi_epi64(quant, quant);

+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+      // Reinsert signs

+      qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+      dequant = _mm_unpackhi_epi64(dequant, dequant);

+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

+    }

+    {

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob = _mm_max_epi16(eob, eob1);

+    }

+    n_coeffs += 8 * 2;

+  }

+  thr = _mm_srai_epi16(dequant, 1);

+  // AC only loop

+  while (n_coeffs < 0) {

+    __m128i coeff0, coeff1;

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |

+               _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

+      if (nzflag) {

         qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        round = _mm_unpackhi_epi64(round, round);

         qcoeff1 = _mm_adds_epi16(qcoeff1, round);

         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        quant = _mm_unpackhi_epi64(quant, quant);

         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

         // Reinsert signs

@@ -81,131 +152,51 @@

         store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        dequant = _mm_unpackhi_epi64(dequant, dequant);

         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

         store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

         store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-      }

+      } else {

+        store_zero_tran_low(qcoeff_ptr + n_coeffs);

+        store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob = _mm_max_epi16(eob, eob1);

+        store_zero_tran_low(dqcoeff_ptr + n_coeffs);

+        store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

-      n_coeffs += 8 * 2;

-    thr = _mm_srai_epi16(dequant, 1);

-    // AC only loop

-    while (n_coeffs < 0) {

-      __m128i coeff0, coeff1;

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |

-                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

-        if (nzflag) {

-          qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-          qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-          // Reinsert signs

-          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);

-          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);

-          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

-          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

-          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

-          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-        } else {

-          store_zero_tran_low(qcoeff_ptr + n_coeffs);

-          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-          store_zero_tran_low(dqcoeff_ptr + n_coeffs);

-          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

-        }

-      }

-      if (nzflag) {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob0, eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob0 = _mm_max_epi16(eob0, eob1);

-        eob = _mm_max_epi16(eob, eob0);

-      }

-      n_coeffs += 8 * 2;

+    if (nzflag) {

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob0, eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob0 = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob0 = _mm_max_epi16(eob0, eob1);

+      eob = _mm_max_epi16(eob, eob0);

+    n_coeffs += 8 * 2;

+  }

-    // Accumulate EOB

-    {

-      __m128i eob_shuffled;

-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      *eob_ptr = _mm_extract_epi16(eob, 1);

-    }

-  } else {

-    do {

-      store_zero_tran_low(qcoeff_ptr + n_coeffs);

-      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);

-      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

-      n_coeffs += 8 * 2;

-    } while (n_coeffs < 0);

-    *eob_ptr = 0;

+  // Accumulate EOB

+  {

+    __m128i eob_shuffled;

+    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    *eob_ptr = _mm_extract_epi16(eob, 1);

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -22,8 +22,6 @@

 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \

                                 qcoeff, dqcoeff, dequant, \

                                 eob, scan, iscan

-  cmp                    dword skipm, 0

-  jne .blank

   ; actual quantize loop - setup pointers, rounders, etc.

   movifnidn                   coeffq, coeffmp

@@ -172,27 +170,6 @@

   pmaxsw                          m8, m7

   pextrw                          r6, m8, 0

   mov                           [r2], r6

-  RET

-  ; skip-block, i.e. just write all zeroes

-.blank:

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-  lea                            r0q, [r0q+ncoeffq*2]

-  lea                            r2q, [r2q+ncoeffq*2]

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq

-  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8

-  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq

-  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                     word [r3q], 0

RET

 %endmacro

--- a/vpx_dsp/arm/quantize_neon.c

+++ b/vpx_dsp/arm/quantize_neon.c

@@ -9,6 +9,7 @@

*/

 #include <arm_neon.h>

+#include <assert.h>

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/mem_neon.h"

@@ -20,25 +21,13 @@

                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

                          uint16_t *eob_ptr, const int16_t *scan_ptr,

                          const int16_t *iscan_ptr) {

-  const int16x8_t zero = vdupq_n_s16(0);

   const int16x8_t one = vdupq_n_s16(1);

   const int16x8_t neg_one = vdupq_n_s16(-1);

   uint16x8_t eob_max;

   (void)scan_ptr;

+  (void)skip_block;

+  assert(!skip_block);

-  if (skip_block) {

-    do {

-      store_s16q_to_tran_low(qcoeff_ptr, zero);

-      store_s16q_to_tran_low(dqcoeff_ptr, zero);

-      qcoeff_ptr += 8;

-      dqcoeff_ptr += 8;

-      n_coeffs -= 8;

-    } while (n_coeffs > 0);

-    *eob_ptr = 0;

-    return;

-  }

   // Process first 8 values which include a dc component.

     // Only the first element of each vector is DC.

@@ -162,7 +151,6 @@

     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,

     const int16_t *scan_ptr, const int16_t *iscan_ptr) {

-  const int16x8_t zero = vdupq_n_s16(0);

   const int16x8_t one = vdupq_n_s16(1);

   const int16x8_t neg_one = vdupq_n_s16(-1);

   uint16x8_t eob_max;

@@ -169,17 +157,8 @@

   int i;

   (void)scan_ptr;

   (void)n_coeffs;  // Because we will always calculate 32*32.

-  if (skip_block) {

-    for (i = 0; i < 32 * 32 / 8; ++i) {

-      store_s16q_to_tran_low(qcoeff_ptr, zero);

-      store_s16q_to_tran_low(dqcoeff_ptr, zero);

-      qcoeff_ptr += 8;

-      dqcoeff_ptr += 8;

-    }

-    *eob_ptr = 0;

-    return;

-  }

+  (void)skip_block;

+  assert(!skip_block);

   // Process first 8 values which include a dc component.

--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include <emmintrin.h>

 #include "vpx_dsp/vpx_dsp_common.h"

@@ -37,54 +38,54 @@

   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

   (void)scan;

+  (void)skip_block;

+  assert(!skip_block);

   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));

   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = ((int)count / 4) - 1; i >= 0; i--) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (test == 0xffff)

-        non_zero_regs--;

-      else

-        break;

-    }

+  // Pre-scan pass

+  for (i = ((int)count / 4) - 1; i >= 0; i--) {

+    __m128i coeffs, cmp1, cmp2;

+    int test;

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+    cmp1 = _mm_and_si128(cmp1, cmp2);

+    test = _mm_movemask_epi8(cmp1);

+    if (test == 0xffff)

+      non_zero_regs--;

+    else

+      break;

+  }

-    // Quantization pass:

-    for (i = 0; i < non_zero_regs; i++) {

-      __m128i coeffs, coeffs_sign, tmp1, tmp2;

-      int test;

-      int abs_coeff[4];

-      int coeff_sign[4];

+  // Quantization pass:

+  for (i = 0; i < non_zero_regs; i++) {

+    __m128i coeffs, coeffs_sign, tmp1, tmp2;

+    int test;

+    int abs_coeff[4];

+    int coeff_sign[4];

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      coeffs_sign = _mm_srai_epi32(coeffs, 31);

-      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

-      tmp1 = _mm_or_si128(tmp1, tmp2);

-      test = _mm_movemask_epi8(tmp1);

-      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);

-      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    coeffs_sign = _mm_srai_epi32(coeffs, 31);

+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

+    tmp1 = _mm_or_si128(tmp1, tmp2);

+    test = _mm_movemask_epi8(tmp1);

+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);

+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

-      for (j = 0; j < 4; j++) {

-        if (test & (1 << (4 * j))) {

-          int k = 4 * i + j;

-          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];

-          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;

-          const uint32_t abs_qcoeff =

-              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);

-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

-          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

-        }

+    for (j = 0; j < 4; j++) {

+      if (test & (1 << (4 * j))) {

+        int k = 4 * i + j;

+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];

+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;

+        const uint32_t abs_qcoeff =

+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);

+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

@@ -105,6 +106,9 @@

   const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);

   const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);

   (void)scan;

+  (void)skip_block;

+  assert(!skip_block);

   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);

   zbins[1] = _mm_set1_epi32(zbin1_tmp);

@@ -116,38 +120,35 @@

   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = 0; i < n_coeffs / 4; i++) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (!(test & 0xf)) idx_arr[idx++] = i * 4;

-      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;

-      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;

-      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;

-    }

+  // Pre-scan pass

+  for (i = 0; i < n_coeffs / 4; i++) {

+    __m128i coeffs, cmp1, cmp2;

+    int test;

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+    cmp1 = _mm_and_si128(cmp1, cmp2);

+    test = _mm_movemask_epi8(cmp1);

+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;

+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;

+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;

+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;

+  }

-    // Quantization pass: only process the coefficients selected in

-    // pre-scan pass. Note: idx can be zero.

-    for (i = 0; i < idx; i++) {

-      const int rc = idx_arr[i];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      const int64_t tmp1 =

-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

-      const uint32_t abs_qcoeff =

-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

-      if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

-    }

+  // Quantization pass: only process the coefficients selected in

+  // pre-scan pass. Note: idx can be zero.

+  for (i = 0; i < idx; i++) {

+    const int rc = idx_arr[i];

+    const int coeff = coeff_ptr[rc];

+    const int coeff_sign = (coeff >> 31);

+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

+    const uint32_t abs_qcoeff =

+        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

   *eob_ptr = eob + 1;

--- a/vpx_dsp/x86/quantize_avx_x86_64.asm

+++ b/vpx_dsp/x86/quantize_avx_x86_64.asm

@@ -19,10 +19,6 @@

   vzeroupper

-  ; If we can skip this block, then just zero the output

-  cmp                         skipmp, 0

-  jne .blank

 %ifnidn %1, b_32x32

   ; Special case for ncoeff == 16, as it is frequent and we can save on

@@ -491,48 +487,6 @@

   pmaxsw                          m8, m7

   movq                           rax, m8

   mov                           [r2], ax

-  vzeroupper

-  RET

-  ; Skip-block, i.e. just write all zeroes

-.blank:

-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \

-            qcoeff, dqcoeff, dequant, eob, scan, iscan

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]

-%else

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-%endif

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7

-  mova       [dqcoeffq+ncoeffq*4+32], ymm7

-  mova        [qcoeffq+ncoeffq*4+ 0], ymm7

-  mova        [qcoeffq+ncoeffq*4+32], ymm7

-%else

-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7

-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7

-%endif

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                         [eobq], word 0

   vzeroupper

   RET

 %endmacro

--- a/vpx_dsp/x86/quantize_sse2.c

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

  */

+#include <assert.h>

 #include <emmintrin.h>

 #include <xmmintrin.h>

@@ -23,7 +24,12 @@

                          uint16_t *eob_ptr, const int16_t *scan_ptr,

                          const int16_t *iscan_ptr) {

   __m128i zero;

+  __m128i eob;

+  __m128i zbin;

+  __m128i round, quant, dequant, shift;

   (void)scan_ptr;

+  (void)skip_block;

+  assert(!skip_block);

   coeff_ptr += n_coeffs;

   iscan_ptr += n_coeffs;

@@ -31,193 +37,179 @@

   dqcoeff_ptr += n_coeffs;

   n_coeffs = -n_coeffs;

   zero = _mm_setzero_si128();

-  if (!skip_block) {

-    __m128i eob;

-    __m128i zbin;

-    __m128i round, quant, dequant, shift;

+  {

+    __m128i coeff0, coeff1;

+    // Setup global values

-      __m128i coeff0, coeff1;

+      __m128i pw_1;

+      zbin = _mm_load_si128((const __m128i *)zbin_ptr);

+      round = _mm_load_si128((const __m128i *)round_ptr);

+      quant = _mm_load_si128((const __m128i *)quant_ptr);

+      pw_1 = _mm_set1_epi16(1);

+      zbin = _mm_sub_epi16(zbin, pw_1);

+      dequant = _mm_load_si128((const __m128i *)dequant_ptr);

+      shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

+    }

-      // Setup global values

-      {

-        __m128i pw_1;

-        zbin = _mm_load_si128((const __m128i *)zbin_ptr);

-        round = _mm_load_si128((const __m128i *)round_ptr);

-        quant = _mm_load_si128((const __m128i *)quant_ptr);

-        pw_1 = _mm_set1_epi16(1);

-        zbin = _mm_sub_epi16(zbin, pw_1);

-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);

-        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

-      }

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      __m128i cmp_mask0, cmp_mask1;

+      // Do DC and first 15 AC

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

-        // Do DC and first 15 AC

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+      zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+      round = _mm_unpackhi_epi64(round, round);

+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+      quant = _mm_unpackhi_epi64(quant, quant);

+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+      shift = _mm_unpackhi_epi64(shift, shift);

+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        round = _mm_unpackhi_epi64(round, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        quant = _mm_unpackhi_epi64(quant, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        shift = _mm_unpackhi_epi64(shift, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+      // Reinsert signs

+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Mask out zbin threshold coeffs

+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+      dequant = _mm_unpackhi_epi64(dequant, dequant);

+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        dequant = _mm_unpackhi_epi64(dequant, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

+    }

-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob = _mm_max_epi16(eob, eob1);

-      }

-      n_coeffs += 8 * 2;

+    {

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob = _mm_max_epi16(eob, eob1);

+    n_coeffs += 8 * 2;

+  }

-    // AC only loop

-    while (n_coeffs < 0) {

-      __m128i coeff0, coeff1;

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

+  // AC only loop

+  while (n_coeffs < 0) {

+    __m128i coeff0, coeff1;

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      __m128i cmp_mask0, cmp_mask1;

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Reinsert signs

+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+      // Mask out zbin threshold coeffs

+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob0, eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob0 = _mm_max_epi16(eob0, eob1);

-        eob = _mm_max_epi16(eob, eob0);

-      }

-      n_coeffs += 8 * 2;

+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-    // Accumulate EOB

-      __m128i eob_shuffled;

-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      *eob_ptr = _mm_extract_epi16(eob, 1);

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob0, eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob0 = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob0 = _mm_max_epi16(eob0, eob1);

+      eob = _mm_max_epi16(eob, eob0);

-  } else {

-    do {

-      store_tran_low(zero, dqcoeff_ptr + n_coeffs);

-      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);

-      store_tran_low(zero, qcoeff_ptr + n_coeffs);

-      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);

-      n_coeffs += 8 * 2;

-    } while (n_coeffs < 0);

-    *eob_ptr = 0;

+    n_coeffs += 8 * 2;

+  }

+  // Accumulate EOB

+  {

+    __m128i eob_shuffled;

+    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    *eob_ptr = _mm_extract_epi16(eob, 1);

--- a/vpx_dsp/x86/quantize_ssse3.c

+++ b/vpx_dsp/x86/quantize_ssse3.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

  */

+#include <assert.h>

 #include <tmmintrin.h>

 #include "./vpx_dsp_rtcd.h"

@@ -28,18 +29,8 @@

   __m128i round, quant, dequant, shift;

   intptr_t index = 0;

   (void)scan_ptr;

-  if (skip_block) {

-    do {

-      store_tran_low(zero, dqcoeff_ptr + index);

-      store_tran_low(zero, dqcoeff_ptr + index + 8);

-      store_tran_low(zero, qcoeff_ptr + index);

-      store_tran_low(zero, qcoeff_ptr + index + 8);

-      index += 16;

-    } while (index < n_coeffs);

-    *eob_ptr = 0;

-    return;

-  }

+  (void)skip_block;

+  assert(!skip_block);

   // Setup global values

--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm

+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -19,9 +19,6 @@

 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

                                 shift, qcoeff, dqcoeff, dequant, \

                                 eob, scan, iscan

-  cmp                    dword skipm, 0

-  jne .blank

   ; actual quantize loop - setup pointers, rounders, etc.

   movifnidn                   coeffq, coeffmp

   movifnidn                  ncoeffq, ncoeffmp

@@ -299,46 +296,6 @@

   pmaxsw                          m8, m7

   pextrw                          r6, m8, 0

   mov                             [r2], r6

-  RET

-  ; skip-block, i.e. just write all zeroes

-.blank:

-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \

-            qcoeff, dqcoeff, dequant, eob, scan, iscan

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]

-%else

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-%endif

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova       [dqcoeffq+ncoeffq*4+ 0], m7

-  mova       [dqcoeffq+ncoeffq*4+16], m7

-  mova       [dqcoeffq+ncoeffq*4+32], m7

-  mova       [dqcoeffq+ncoeffq*4+48], m7

-  mova        [qcoeffq+ncoeffq*4+ 0], m7

-  mova        [qcoeffq+ncoeffq*4+16], m7

-  mova        [qcoeffq+ncoeffq*4+32], m7

-  mova        [qcoeffq+ncoeffq*4+48], m7

-%else

-  mova       [dqcoeffq+ncoeffq*2+ 0], m7

-  mova       [dqcoeffq+ncoeffq*2+16], m7

-  mova        [qcoeffq+ncoeffq*2+ 0], m7

-  mova        [qcoeffq+ncoeffq*2+16], m7

-%endif

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                    word [eobq], 0

   RET

 %endmacro

--

⑨