ref: 103e4e50a889376ba6c4f1db5ea52bb135d32f30
parent: 7f2993f5e439cdfe76b5708058b4569b4e7d47d1
parent: b9c1dcc5fa3674f6d4fdcfb5d7d0e324216d6bb3
author: Johann Koenig <johannkoenig@google.com>
date: Tue Aug 22 18:27:56 EDT 2017
Merge changes I53f8a160,I48f282bf

* changes:
  quantize ssse3: copy style from sse2
  quantize sse2: copy opts from ssse3
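
For orientation while reading the intrinsics below: both files vectorize the same per-coefficient arithmetic, sixteen 16-bit lanes per pass (DC plus the first 15 AC, then an AC-only loop). The following is a rough scalar model of that math, not code from this change; the function name, the clamp16() helper, and the narrowing of tran_low_t to int16_t are assumptions made for the sketch, while the (rc != 0) DC/AC table indexing mirrors the "Switch DC to AC" unpackhi step in the vector code.

    #include <stdint.h>

    /* Hypothetical helper: saturate to int16_t range, mirroring _mm_adds_epi16. */
    static int16_t clamp16(int v) {
      return (int16_t)(v < INT16_MIN ? INT16_MIN : (v > INT16_MAX ? INT16_MAX : v));
    }

    /* Illustrative scalar model of the quantize step (tran_low_t narrowed to int16_t). */
    static void quantize_b_scalar_model(
        const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
        const int16_t *round_ptr, const int16_t *quant_ptr,
        const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
        const int16_t *dequant_ptr) {
      intptr_t rc;
      for (rc = 0; rc < n_coeffs; ++rc) {
        const int coeff = coeff_ptr[rc];
        const int abs_coeff = coeff < 0 ? -coeff : coeff;   /* "poor man's abs" */
        const int band = (rc != 0);        /* table entry 0 is DC, the rest AC */
        int tmp = 0;
        /* zbin gate; the vector code uses a strict ">" against zbin - 1. */
        if (abs_coeff >= zbin_ptr[band]) {
          tmp = clamp16(abs_coeff + round_ptr[band]);            /* adds      */
          tmp = ((((tmp * quant_ptr[band]) >> 16) + tmp) *       /* mulhi+add */
                 quant_shift_ptr[band]) >> 16;                   /* mulhi     */
        }
        qcoeff_ptr[rc] = (int16_t)(coeff < 0 ? -tmp : tmp);      /* sign back */
        dqcoeff_ptr[rc] = (int16_t)(qcoeff_ptr[rc] * dequant_ptr[band]); /* mullo */
      }
    }
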
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -23,185 +23,151 @@
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
- __m128i zero;
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i qtmp0, qtmp1;
+ __m128i zero_coeff0, zero_coeff1, iscan0, iscan1;
+ __m128i eob, eob0, eob1;
+
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
- coeff_ptr += n_coeffs;
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
- {
- __m128i coeff0, coeff1;
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- // Setup global values
- {
- __m128i pw_1;
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- pw_1 = _mm_set1_epi16(1);
- zbin = _mm_sub_epi16(zbin, pw_1);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ // Scan for eob.
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
+ iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
+ eob = _mm_andnot_si128(zero_coeff0, iscan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
+ eob = _mm_max_epi16(eob, eob1);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+ qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(coeff0, dqcoeff_ptr + index);
+ store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
+ iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
+ iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, iscan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
}
- // Accumulate EOB
+ // Accumulate eob.
{
__m128i eob_shuffled;
eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
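
A note on the eob search that both files now share: iscan_ptr holds scan-order indices, and cmp_mask is all-ones in exactly the lanes that passed the zbin gate, so _mm_sub_epi16(iscan, cmp_mask) turns an index into a count only where a nonzero output is possible; _mm_andnot_si128 with zero_coeff then drops lanes whose dequantized value is zero. A scalar sketch of the equivalent result follows; it is illustrative only, with the function name chosen for the sketch.

    #include <stdint.h>

    /* Illustrative scalar equivalent of the vectorized eob search. */
    static void eob_scalar_model(const int16_t *dqcoeff_ptr, const int16_t *iscan_ptr,
                                 intptr_t n_coeffs, uint16_t *eob_ptr) {
      uint16_t eob = 0;
      intptr_t i;
      for (i = 0; i < n_coeffs; ++i) {
        if (dqcoeff_ptr[i] != 0) {                              /* andnot(zero_coeff, ...) */
          const uint16_t count = (uint16_t)(iscan_ptr[i] + 1);  /* iscan - cmp_mask */
          if (count > eob) eob = count;                         /* _mm_max_epi16 */
        }
      }
      *eob_ptr = eob;
    }
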
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -23,104 +23,88 @@
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
const __m128i zero = _mm_setzero_si128();
+ intptr_t index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
- intptr_t index = 0;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i qtmp0, qtmp1;
+ __m128i zero_coeff0, zero_coeff1, iscan0, iscan1;
+ __m128i eob, eob0, eob1;
+
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
- // Setup global values
- {
- const __m128i one = _mm_set1_epi16(1);
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- {
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- __m128i zero_coeff0, zero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + index);
- coeff1 = load_tran_low(coeff_ptr + index + 8);
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
- qcoeff0 = _mm_abs_epi16(coeff0);
- qcoeff1 = _mm_abs_epi16(coeff1);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- // Overwrite DC component.
- zbin = _mm_unpackhi_epi64(zbin, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
- // Reinsert signs
- qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
- qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
- store_tran_low(qcoeff0, qcoeff_ptr + index);
- store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
- store_tran_low(coeff0, dqcoeff_ptr + index);
- store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+ // Scan for eob.
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
+ iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
+ eob = _mm_andnot_si128(zero_coeff0, iscan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
+ eob = _mm_max_epi16(eob, eob1);
- // Scan for eob
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
- iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
- eob = _mm_andnot_si128(zero_coeff0, iscan0);
- eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
- eob = _mm_max_epi16(eob, eob1);
- }
- index += 16;
-
- // AC only loop
+ // AC only loop.
while (index < n_coeffs) {
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- __m128i zero_coeff0, zero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
-
coeff0 = load_tran_low(coeff_ptr + index);
coeff1 = load_tran_low(coeff_ptr + index + 8);
@@ -142,11 +126,9 @@
qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- // Reinsert signs
qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
- // Mask out zbin threshold coeffs
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
@@ -159,12 +141,10 @@
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- // Scan for eob
zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
- // Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
eob0 = _mm_andnot_si128(zero_coeff0, iscan0);
@@ -175,7 +155,7 @@
index += 16;
}
- // Accumulate EOB
+ // Accumulate eob.
{
__m128i eob_shuffled;
eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
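
Both hunks are cut off above in the middle of the final "Accumulate eob" step, a horizontal maximum over the eight 16-bit counts held in eob. A sketch of the usual SSE2 reduction pattern is given below for completeness; it is not necessarily the exact code elided from the diff.

    eob = _mm_max_epi16(eob, eob_shuffled);        /* lanes 0-3 vs lanes 4-7 */
    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);        /* lanes 0-1 vs lanes 2-3 */
    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    eob = _mm_max_epi16(eob, eob_shuffled);        /* lane 0 vs lane 1 */
    *eob_ptr = _mm_extract_epi16(eob, 1);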