shithub: libvpx

--- a/test/vp9_quantize_test.cc

+++ b/test/vp9_quantize_test.cc

@@ -19,7 +19,7 @@

 #include "test/register_state_check.h"

 #include "test/util.h"

 #include "./vpx_config.h"

-#include "./vp9_rtcd.h"

+#include "./vpx_dsp_rtcd.h"

 #include "vp9/common/vp9_entropy.h"

 #include "vp9/common/vp9_scan.h"

 #include "vpx/vpx_codec.h"

--- a/vp9/common/vp9_common.h

+++ b/vp9/common/vp9_common.h

@@ -56,20 +56,6 @@

       return (uint16_t)clamp(val, 0, 4095);

-// Note:

-// tran_low_t  is the datatype used for final transform coefficients.

-// tran_high_t is the datatype used for intermediate transform stages.

-typedef int64_t tran_high_t;

-typedef int32_t tran_low_t;

-#else

-// Note:

-// tran_low_t  is the datatype used for final transform coefficients.

-// tran_high_t is the datatype used for intermediate transform stages.

-typedef int32_t tran_high_t;

-typedef int16_t tran_low_t;

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if CONFIG_DEBUG

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -781,12 +781,6 @@

   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_quantize_fp_32x32/;

-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_b/;

-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_b_32x32/;

   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_fdct8x8_quant/;

 } else {

@@ -802,12 +796,6 @@

   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";

-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";

-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";

   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;

@@ -934,12 +922,6 @@

   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_highbd_quantize_fp_32x32/;

-  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_highbd_quantize_b sse2/;

-  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;

   # Structured Similarity (SSIM)

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -13,6 +13,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/quantize.h"

 #include "vpx_mem/vpx_mem.h"

 #include "vpx_ports/mem.h"

@@ -23,7 +24,6 @@

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/encoder/vp9_encodemb.h"

-#include "vp9/encoder/vp9_quantize.h"

 #include "vp9/encoder/vp9_rd.h"

 #include "vp9/encoder/vp9_tokenize.h"

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -9,7 +9,7 @@

*/

 #include <math.h>

+#include "./vpx_dsp_rtcd.h"

 #include "vpx_mem/vpx_mem.h"

 #include "vpx_ports/mem.h"

@@ -20,113 +20,6 @@

 #include "vp9/encoder/vp9_quantize.h"

 #include "vp9/encoder/vp9_rd.h"

-void vp9_quantize_dc(const tran_low_t *coeff_ptr,

-                     int n_coeffs, int skip_block,

-                     const int16_t *round_ptr, const int16_t quant,

-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {

-  const int rc = 0;

-  const int coeff = coeff_ptr[rc];

-  const int coeff_sign = (coeff >> 31);

-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-  int tmp, eob = -1;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);

-    tmp = (tmp * quant) >> 16;

-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;

-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;

-    if (tmp)

-      eob = 0;

-  }

-  *eob_ptr = eob + 1;

-}

-#if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,

-                            int n_coeffs, int skip_block,

-                            const int16_t *round_ptr, const int16_t quant,

-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                            const int16_t dequant_ptr, uint16_t *eob_ptr) {

-  int eob = -1;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    const int coeff = coeff_ptr[0];

-    const int coeff_sign = (coeff >> 31);

-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-    const int64_t tmp = abs_coeff + round_ptr[0];

-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);

-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;

-    if (abs_qcoeff)

-      eob = 0;

-  }

-  *eob_ptr = eob + 1;

-}

-#endif

-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

-                           const int16_t *round_ptr, const int16_t quant,

-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {

-  const int n_coeffs = 1024;

-  const int rc = 0;

-  const int coeff = coeff_ptr[rc];

-  const int coeff_sign = (coeff >> 31);

-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-  int tmp, eob = -1;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),

-                INT16_MIN, INT16_MAX);

-    tmp = (tmp * quant) >> 15;

-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;

-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;

-    if (tmp)

-      eob = 0;

-  }

-  *eob_ptr = eob + 1;

-}

-#if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,

-                                  int skip_block,

-                                  const int16_t *round_ptr,

-                                  const int16_t quant,

-                                  tran_low_t *qcoeff_ptr,

-                                  tran_low_t *dqcoeff_ptr,

-                                  const int16_t dequant_ptr,

-                                  uint16_t *eob_ptr) {

-  const int n_coeffs = 1024;

-  int eob = -1;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    const int coeff = coeff_ptr[0];

-    const int coeff_sign = (coeff >> 31);

-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);

-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);

-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;

-    if (abs_qcoeff)

-      eob = 0;

-  }

-  *eob_ptr = eob + 1;

-}

-#endif

 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                        int skip_block,

                        const int16_t *zbin_ptr, const int16_t *round_ptr,

@@ -292,224 +185,6 @@

       if (abs_qcoeff)

         eob = i;

-    }

-  }

-  *eob_ptr = eob + 1;

-}

-#endif

-void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

-                      int skip_block,

-                      const int16_t *zbin_ptr, const int16_t *round_ptr,

-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,

-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                      const int16_t *dequant_ptr,

-                      uint16_t *eob_ptr,

-                      const int16_t *scan, const int16_t *iscan) {

-  int i, non_zero_count = (int)n_coeffs, eob = -1;

-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};

-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};

-  (void)iscan;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = (int)n_coeffs - 1; i >= 0; i--) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])

-        non_zero_count--;

-      else

-        break;

-    }

-    // Quantization pass: All coefficients with index >= zero_flag are

-    // skippable. Note: zero_flag can be zero.

-    for (i = 0; i < non_zero_count; i++) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      if (abs_coeff >= zbins[rc != 0]) {

-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);

-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *

-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization

-        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;

-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

-        if (tmp)

-          eob = i;

-      }

-    }

-  }

-  *eob_ptr = eob + 1;

-}

-#if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

-                             int skip_block, const int16_t *zbin_ptr,

-                             const int16_t *round_ptr, const int16_t *quant_ptr,

-                             const int16_t *quant_shift_ptr,

-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                             const int16_t *dequant_ptr,

-                             uint16_t *eob_ptr, const int16_t *scan,

-                             const int16_t *iscan) {

-  int i, non_zero_count = (int)n_coeffs, eob = -1;

-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};

-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};

-  (void)iscan;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = (int)n_coeffs - 1; i >= 0; i--) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])

-        non_zero_count--;

-      else

-        break;

-    }

-    // Quantization pass: All coefficients with index >= zero_flag are

-    // skippable. Note: zero_flag can be zero.

-    for (i = 0; i < non_zero_count; i++) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      if (abs_coeff >= zbins[rc != 0]) {

-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];

-        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

-        const uint32_t abs_qcoeff =

-            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);

-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

-        if (abs_qcoeff)

-          eob = i;

-      }

-    }

-  }

-  *eob_ptr = eob + 1;

-}

-#endif

-void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

-                            int skip_block,

-                            const int16_t *zbin_ptr, const int16_t *round_ptr,

-                            const int16_t *quant_ptr,

-                            const int16_t *quant_shift_ptr,

-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                            const int16_t *dequant_ptr,

-                            uint16_t *eob_ptr,

-                            const int16_t *scan, const int16_t *iscan) {

-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),

-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};

-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};

-  int idx = 0;

-  int idx_arr[1024];

-  int i, eob = -1;

-  (void)iscan;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = 0; i < n_coeffs; i++) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      // If the coefficient is out of the base ZBIN range, keep it for

-      // quantization.

-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])

-        idx_arr[idx++] = i;

-    }

-    // Quantization pass: only process the coefficients selected in

-    // pre-scan pass. Note: idx can be zero.

-    for (i = 0; i < idx; i++) {

-      const int rc = scan[idx_arr[i]];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      int tmp;

-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);

-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *

-               quant_shift_ptr[rc != 0]) >> 15;

-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;

-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

-      if (tmp)

-        eob = idx_arr[i];

-    }

-  }

-  *eob_ptr = eob + 1;

-}

-#if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,

-                                   intptr_t n_coeffs, int skip_block,

-                                   const int16_t *zbin_ptr,

-                                   const int16_t *round_ptr,

-                                   const int16_t *quant_ptr,

-                                   const int16_t *quant_shift_ptr,

-                                   tran_low_t *qcoeff_ptr,

-                                   tran_low_t *dqcoeff_ptr,

-                                   const int16_t *dequant_ptr,

-                                   uint16_t *eob_ptr,

-                                   const int16_t *scan, const int16_t *iscan) {

-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),

-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};

-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};

-  int idx = 0;

-  int idx_arr[1024];

-  int i, eob = -1;

-  (void)iscan;

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = 0; i < n_coeffs; i++) {

-      const int rc = scan[i];

-      const int coeff = coeff_ptr[rc];

-      // If the coefficient is out of the base ZBIN range, keep it for

-      // quantization.

-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])

-        idx_arr[idx++] = i;

-    }

-    // Quantization pass: only process the coefficients selected in

-    // pre-scan pass. Note: idx can be zero.

-    for (i = 0; i < idx; i++) {

-      const int rc = scan[idx_arr[i]];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      const int64_t tmp1 = abs_coeff

-                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

-      const uint32_t abs_qcoeff =

-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

-      if (abs_qcoeff)

-        eob = idx_arr[i];

   *eob_ptr = eob + 1;

--- a/vp9/encoder/vp9_quantize.h

+++ b/vp9/encoder/vp9_quantize.h

@@ -37,33 +37,8 @@

   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);

 } QUANTS;

-void vp9_quantize_dc(const tran_low_t *coeff_ptr,

-                     int n_coeffs, int skip_block,

-                     const int16_t *round_ptr, const int16_t quant_ptr,

-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                     const int16_t dequant_ptr, uint16_t *eob_ptr);

-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

-                           const int16_t *round_ptr, const int16_t quant_ptr,

-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                           const int16_t dequant_ptr, uint16_t *eob_ptr);

 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,

                                 const int16_t *scan, const int16_t *iscan);

-#if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,

-                            int n_coeffs, int skip_block,

-                            const int16_t *round_ptr, const int16_t quant_ptr,

-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                            const int16_t dequant_ptr, uint16_t *eob_ptr);

-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,

-                                  int skip_block,

-                                  const int16_t *round_ptr,

-                                  const int16_t quant_ptr,

-                                  tran_low_t *qcoeff_ptr,

-                                  tran_low_t *dqcoeff_ptr,

-                                  const int16_t dequant_ptr,

-                                  uint16_t *eob_ptr);

-#endif

 struct VP9_COMP;

 struct VP9Common;

--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c

+++ /dev/null

@@ -1,179 +1,0 @@

-/*

- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <emmintrin.h>

-#include "vpx_ports/mem.h"

-#include "vp9/common/vp9_common.h"

-#if CONFIG_VP9_HIGHBITDEPTH

-// from vp9_idct.h: typedef int32_t tran_low_t;

-void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,

-                                intptr_t count,

-                                int skip_block,

-                                const int16_t *zbin_ptr,

-                                const int16_t *round_ptr,

-                                const int16_t *quant_ptr,

-                                const int16_t *quant_shift_ptr,

-                                tran_low_t *qcoeff_ptr,

-                                tran_low_t *dqcoeff_ptr,

-                                const int16_t *dequant_ptr,

-                                uint16_t *eob_ptr,

-                                const int16_t *scan,

-                                const int16_t *iscan) {

-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;

-  __m128i zbins[2];

-  __m128i nzbins[2];

-  zbins[0] = _mm_set_epi32((int)zbin_ptr[1],

-                           (int)zbin_ptr[1],

-                           (int)zbin_ptr[1],

-                           (int)zbin_ptr[0]);

-  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

-  nzbins[0] = _mm_setzero_si128();

-  nzbins[1] = _mm_setzero_si128();

-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);

-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

-  (void)scan;

-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = ((int)count / 4) - 1; i >= 0; i--) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (test == 0xffff)

-        non_zero_regs--;

-      else

-        break;

-    }

-    // Quantization pass:

-    for (i = 0; i < non_zero_regs; i++) {

-      __m128i coeffs, coeffs_sign, tmp1, tmp2;

-      int test;

-      int abs_coeff[4];

-      int coeff_sign[4];

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      coeffs_sign = _mm_srai_epi32(coeffs, 31);

-      coeffs = _mm_sub_epi32(

-            _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

-      tmp1 = _mm_or_si128(tmp1, tmp2);

-      test = _mm_movemask_epi8(tmp1);

-      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);

-      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);

-      for (j = 0; j < 4; j++) {

-        if (test & (1 << (4 * j))) {

-          int k = 4 * i + j;

-          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];

-          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;

-          const uint32_t abs_qcoeff =

-              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);

-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

-          if (abs_qcoeff)

-            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

-        }

-      }

-    }

-  }

-  *eob_ptr = eob_i + 1;

-}

-void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,

-                                      intptr_t n_coeffs,

-                                      int skip_block,

-                                      const int16_t *zbin_ptr,

-                                      const int16_t *round_ptr,

-                                      const int16_t *quant_ptr,

-                                      const int16_t *quant_shift_ptr,

-                                      tran_low_t *qcoeff_ptr,

-                                      tran_low_t *dqcoeff_ptr,

-                                      const int16_t *dequant_ptr,

-                                      uint16_t *eob_ptr,

-                                      const int16_t *scan,

-                                      const int16_t *iscan) {

-  __m128i zbins[2];

-  __m128i nzbins[2];

-  int idx = 0;

-  int idx_arr[1024];

-  int i, eob = -1;

-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);

-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);

-  (void)scan;

-  zbins[0] = _mm_set_epi32(zbin1_tmp,

-                           zbin1_tmp,

-                           zbin1_tmp,

-                           zbin0_tmp);

-  zbins[1] = _mm_set1_epi32(zbin1_tmp);

-  nzbins[0] = _mm_setzero_si128();

-  nzbins[1] = _mm_setzero_si128();

-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);

-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = 0; i < n_coeffs / 4; i++) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (!(test & 0xf))

-        idx_arr[idx++] = i * 4;

-      if (!(test & 0xf0))

-        idx_arr[idx++] = i * 4 + 1;

-      if (!(test & 0xf00))

-        idx_arr[idx++] = i * 4 + 2;

-      if (!(test & 0xf000))

-        idx_arr[idx++] = i * 4 + 3;

-    }

-    // Quantization pass: only process the coefficients selected in

-    // pre-scan pass. Note: idx can be zero.

-    for (i = 0; i < idx; i++) {

-      const int rc = idx_arr[i];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      const int64_t tmp1 = abs_coeff

-                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

-      const uint32_t abs_qcoeff =

-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

-      if (abs_qcoeff)

-        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

-    }

-  }

-  *eob_ptr = eob + 1;

-}

-#endif

--- a/vp9/encoder/x86/vp9_quantize_sse2.c

+++ b/vp9/encoder/x86/vp9_quantize_sse2.c

@@ -14,214 +14,6 @@

 #include "./vp9_rtcd.h"

 #include "vpx/vpx_integer.h"

-void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,

-                         int skip_block, const int16_t* zbin_ptr,

-                         const int16_t* round_ptr, const int16_t* quant_ptr,

-                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,

-                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,

-                         uint16_t* eob_ptr,

-                         const int16_t* scan_ptr,

-                         const int16_t* iscan_ptr) {

-  __m128i zero;

-  (void)scan_ptr;

-  coeff_ptr += n_coeffs;

-  iscan_ptr += n_coeffs;

-  qcoeff_ptr += n_coeffs;

-  dqcoeff_ptr += n_coeffs;

-  n_coeffs = -n_coeffs;

-  zero = _mm_setzero_si128();

-  if (!skip_block) {

-    __m128i eob;

-    __m128i zbin;

-    __m128i round, quant, dequant, shift;

-    {

-      __m128i coeff0, coeff1;

-      // Setup global values

-      {

-        __m128i pw_1;

-        zbin = _mm_load_si128((const __m128i*)zbin_ptr);

-        round = _mm_load_si128((const __m128i*)round_ptr);

-        quant = _mm_load_si128((const __m128i*)quant_ptr);

-        pw_1 = _mm_set1_epi16(1);

-        zbin = _mm_sub_epi16(zbin, pw_1);

-        dequant = _mm_load_si128((const __m128i*)dequant_ptr);

-        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);

-      }

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

-        // Do DC and first 15 AC

-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));

-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        round = _mm_unpackhi_epi64(round, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        quant = _mm_unpackhi_epi64(quant, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        shift = _mm_unpackhi_epi64(shift, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);

-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        dequant = _mm_unpackhi_epi64(dequant, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);

-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob = _mm_max_epi16(eob, eob1);

-      }

-      n_coeffs += 8 * 2;

-    }

-    // AC only loop

-    while (n_coeffs < 0) {

-      __m128i coeff0, coeff1;

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));

-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);

-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);

-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob0, eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob0 = _mm_max_epi16(eob0, eob1);

-        eob = _mm_max_epi16(eob, eob0);

-      }

-      n_coeffs += 8 * 2;

-    }

-    // Accumulate EOB

-    {

-      __m128i eob_shuffled;

-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      *eob_ptr = _mm_extract_epi16(eob, 1);

-    }

-  } else {

-    do {

-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);

-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);

-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);

-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);

-      n_coeffs += 8 * 2;

-    } while (n_coeffs < 0);

-    *eob_ptr = 0;

-  }

-}

 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,

                           int skip_block, const int16_t* zbin_ptr,

                           const int16_t* round_ptr, const int16_t* quant_ptr,

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -15,206 +15,6 @@

 SECTION .text

-; TODO(yunqingwang)fix quantize_b code for skip=1 case.

-%macro QUANTIZE_FN 2

-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

-                                shift, qcoeff, dqcoeff, dequant, \

-                                eob, scan, iscan

-  cmp                    dword skipm, 0

-  jne .blank

-  ; actual quantize loop - setup pointers, rounders, etc.

-  movifnidn                   coeffq, coeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, dequantmp

-  movifnidn                    zbinq, zbinmp

-  movifnidn                   roundq, roundmp

-  movifnidn                   quantq, quantmp

-  mova                            m0, [zbinq]              ; m0 = zbin

-  mova                            m1, [roundq]             ; m1 = round

-  mova                            m2, [quantq]             ; m2 = quant

-%ifidn %1, b_32x32

-  pcmpeqw                         m5, m5

-  psrlw                           m5, 15

-  paddw                           m0, m5

-  paddw                           m1, m5

-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

-%endif

-  mova                            m3, [r2q]                ; m3 = dequant

-  psubw                           m0, [pw_1]

-  mov                             r2, shiftmp

-  mov                             r3, qcoeffmp

-  mova                            m4, [r2]                 ; m4 = shift

-  mov                             r4, dqcoeffmp

-  mov                             r5, iscanmp

-%ifidn %1, b_32x32

-  psllw                           m4, 1

-%endif

-  pxor                            m5, m5                   ; m5 = dedicated zero

-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob

-  lea                         coeffq, [  coeffq+ncoeffq*2]

-  lea                         iscanq, [  iscanq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  neg                        ncoeffq

-  ; get DC and first 15 AC coeffs

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

-  pabsw                           m6, m9                   ; m6 = abs(m9)

-  pabsw                          m11, m10                  ; m11 = abs(m10)

-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

-  punpckhqdq                      m0, m0

-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

-  paddsw                          m6, m1                   ; m6 += round

-  punpckhqdq                      m1, m1

-  paddsw                         m11, m1                   ; m11 += round

-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16

-  punpckhqdq                      m2, m2

-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

-  paddw                           m8, m6                   ; m8 += m6

-  paddw                          m13, m11                  ; m13 += m11

-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16

-  punpckhqdq                      m4, m4

-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

-  psignw                          m8, m9                   ; m8 = reinsert sign

-  psignw                         m13, m10                  ; m13 = reinsert sign

-  pand                            m8, m7

-  pand                           m13, m12

-  mova        [qcoeffq+ncoeffq*2+ 0], m8

-  mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

-  pabsw                           m8, m8

-  pabsw                          m13, m13

-%endif

-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

-  punpckhqdq                      m3, m3

-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

-  psrlw                           m8, 1

-  psrlw                          m13, 1

-  psignw                          m8, m9

-  psignw                         m13, m10

-%endif

-  mova       [dqcoeffq+ncoeffq*2+ 0], m8

-  mova       [dqcoeffq+ncoeffq*2+16], m13

-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0

-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]

-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]

-  psubw                           m6, m7                   ; m6 = scan[i] + 1

-  psubw                          m11, m12                  ; m11 = scan[i] + 1

-  pandn                           m8, m6                   ; m8 = max(eob)

-  pandn                          m13, m11                  ; m13 = max(eob)

-  pmaxsw                          m8, m13

-  add                        ncoeffq, mmsize

-  jz .accumulate_eob

-.ac_only_loop:

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

-  pabsw                           m6, m9                   ; m6 = abs(m9)

-  pabsw                          m11, m10                  ; m11 = abs(m10)

-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

-%ifidn %1, b_32x32

-  pmovmskb                       r6d, m7

-  pmovmskb                       r2d, m12

-  or                              r6, r2

-  jz .skip_iter

-%endif

-  paddsw                          m6, m1                   ; m6 += round

-  paddsw                         m11, m1                   ; m11 += round

-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

-  paddw                          m14, m6                   ; m14 += m6

-  paddw                          m13, m11                  ; m13 += m11

-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16

-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

-  psignw                         m14, m9                   ; m14 = reinsert sign

-  psignw                         m13, m10                  ; m13 = reinsert sign

-  pand                           m14, m7

-  pand                           m13, m12

-  mova        [qcoeffq+ncoeffq*2+ 0], m14

-  mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

-  pabsw                          m14, m14

-  pabsw                          m13, m13

-%endif

-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

-  psrlw                          m14, 1

-  psrlw                          m13, 1

-  psignw                         m14, m9

-  psignw                         m13, m10

-%endif

-  mova       [dqcoeffq+ncoeffq*2+ 0], m14

-  mova       [dqcoeffq+ncoeffq*2+16], m13

-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0

-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]

-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]

-  psubw                           m6, m7                   ; m6 = scan[i] + 1

-  psubw                          m11, m12                  ; m11 = scan[i] + 1

-  pandn                          m14, m6                   ; m14 = max(eob)

-  pandn                          m13, m11                  ; m13 = max(eob)

-  pmaxsw                          m8, m14

-  pmaxsw                          m8, m13

-  add                        ncoeffq, mmsize

-  jl .ac_only_loop

-%ifidn %1, b_32x32

-  jmp .accumulate_eob

-.skip_iter:

-  mova        [qcoeffq+ncoeffq*2+ 0], m5

-  mova        [qcoeffq+ncoeffq*2+16], m5

-  mova       [dqcoeffq+ncoeffq*2+ 0], m5

-  mova       [dqcoeffq+ncoeffq*2+16], m5

-  add                        ncoeffq, mmsize

-  jl .ac_only_loop

-%endif

-.accumulate_eob:

-  ; horizontally accumulate/max eobs and write into [eob] memory pointer

-  mov                             r2, eobmp

-  pshufd                          m7, m8, 0xe

-  pmaxsw                          m8, m7

-  pshuflw                         m7, m8, 0xe

-  pmaxsw                          m8, m7

-  pshuflw                         m7, m8, 0x1

-  pmaxsw                          m8, m7

-  pextrw                          r6, m8, 0

-  mov                             [r2], r6

-  RET

-  ; skip-block, i.e. just write all zeroes

-.blank:

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-  mova       [dqcoeffq+ncoeffq*2+ 0], m7

-  mova       [dqcoeffq+ncoeffq*2+16], m7

-  mova        [qcoeffq+ncoeffq*2+ 0], m7

-  mova        [qcoeffq+ncoeffq*2+16], m7

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                    word [eobq], 0

-  RET

-%endmacro

-INIT_XMM ssse3

-QUANTIZE_FN b, 7

-QUANTIZE_FN b_32x32, 7

 %macro QUANTIZE_FP 2

 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

                                 shift, qcoeff, dqcoeff, dequant, \

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -104,7 +104,6 @@

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c

 endif

--- /dev/null

+++ b/vpx_dsp/quantize.c

@@ -1,0 +1,337 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_dsp/quantize.h"

+#include "vpx_mem/vpx_mem.h"

+void vp9_quantize_dc(const tran_low_t *coeff_ptr,  // Quantizes only coeff_ptr[0]; both output buffers are zero-filled first.

+                     int n_coeffs, int skip_block,  // n_coeffs sizes the zero fill; nonzero skip_block leaves the outputs all zero.

+                     const int16_t *round_ptr, const int16_t quant,

+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {  // dequant_ptr is a value despite its name; *eob_ptr receives 0 or 1.

+  const int rc = 0;  // Fixed at 0, so every `rc != 0` index below selects entry 0.

+  const int coeff = coeff_ptr[rc];

+  const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift of a 32-bit int).

+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+  int tmp, eob = -1;  // eob stays -1 unless a nonzero quantized value is produced.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);  // Bound the rounded magnitude by the int16_t limits.

+    tmp = (tmp * quant) >> 16;

+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;  // Reapply the input sign.

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;

+    if (tmp)

+      eob = 0;

+  }

+  *eob_ptr = eob + 1;  // 1 when the quantized value is nonzero, otherwise 0.

+}

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,  // High-bitdepth variant: quantizes only coeff_ptr[0], zero-fills the rest.

+                            int n_coeffs, int skip_block,  // n_coeffs sizes the zero fill; nonzero skip_block leaves the outputs all zero.

+                            const int16_t *round_ptr, const int16_t quant,

+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {  // dequant_ptr is a value despite its name; *eob_ptr receives 0 or 1.

+  int eob = -1;  // Stays -1 unless a nonzero quantized value is produced.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    const int coeff = coeff_ptr[0];

+    const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+    const int64_t tmp = abs_coeff + round_ptr[0];  // Held in 64 bits; no clamp is applied in this variant.

+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);

+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);  // Reapply the input sign.

+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;

+    if (abs_qcoeff)

+      eob = 0;

+  }

+  *eob_ptr = eob + 1;  // 1 when the quantized value is nonzero, otherwise 0.

+}

+#endif

+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,  // DC-only quantizer with a fixed 1024-entry zero fill.

+                           const int16_t *round_ptr, const int16_t quant,

+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {  // dequant_ptr is a value despite its name; *eob_ptr receives 0 or 1.

+  const int n_coeffs = 1024;  // Number of entries cleared in each output buffer.

+  const int rc = 0;  // Fixed at 0, so every `rc != 0` index below selects entry 0.

+  const int coeff = coeff_ptr[rc];

+  const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+  int tmp, eob = -1;  // eob stays -1 unless a nonzero quantized value is produced.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),  // Macro is defined outside this file; presumably a rounded halving - TODO confirm.

+                INT16_MIN, INT16_MAX);

+    tmp = (tmp * quant) >> 15;  // Shift is 15 here, versus 16 in vp9_quantize_dc.

+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;  // Reapply the input sign.

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;  // Dequantized value is halved in this variant.

+    if (tmp)

+      eob = 0;

+  }

+  *eob_ptr = eob + 1;  // 1 when the quantized value is nonzero, otherwise 0.

+}

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,  // High-bitdepth DC-only quantizer with a fixed 1024-entry zero fill.

+                                  int skip_block,  // Nonzero leaves the outputs all zero.

+                                  const int16_t *round_ptr,

+                                  const int16_t quant,

+                                  tran_low_t *qcoeff_ptr,

+                                  tran_low_t *dqcoeff_ptr,

+                                  const int16_t dequant_ptr,  // Passed by value despite its name.

+                                  uint16_t *eob_ptr) {  // Receives 0 or 1.

+  const int n_coeffs = 1024;  // Number of entries cleared in each output buffer.

+  int eob = -1;  // Stays -1 unless a nonzero quantized value is produced.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    const int coeff = coeff_ptr[0];

+    const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);  // Macro defined outside this file; presumably a rounded halving - TODO confirm.

+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);  // Shift is 15 here, versus 16 in vp9_highbd_quantize_dc.

+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);  // Reapply the input sign.

+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;  // Dequantized value is halved in this variant.

+    if (abs_qcoeff)

+      eob = 0;

+  }

+  *eob_ptr = eob + 1;  // 1 when the quantized value is nonzero, otherwise 0.

+}

+#endif

+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Quantizes n_coeffs coefficients, visiting them in `scan` order.

+                      int skip_block,  // Nonzero leaves both outputs zeroed and *eob_ptr at 0.

+                      const int16_t *zbin_ptr, const int16_t *round_ptr,  // Two-entry tables: [0] is used for position 0, [1] for every other position.

+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,

+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                      const int16_t *dequant_ptr,

+                      uint16_t *eob_ptr,  // Receives (last scan index with a nonzero quantized value) + 1, or 0 if none.

+                      const int16_t *scan, const int16_t *iscan) {

+  int i, non_zero_count = (int)n_coeffs, eob = -1;

+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};

+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};  // Negated thresholds for testing negative coefficients.

+  (void)iscan;  // iscan is not used by this implementation.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass: walk the scan order backwards, dropping trailing coefficients strictly inside (-zbin, zbin).

+    for (i = (int)n_coeffs - 1; i >= 0; i--) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])

+        non_zero_count--;

+      else

+        break;  // First coefficient (from the end) outside the zero bin ends the pre-scan.

+    }

+    // Quantization pass: All coefficients with scan index >= non_zero_count are

+    // skippable. Note: non_zero_count can be zero.

+    for (i = 0; i < non_zero_count; i++) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+      if (abs_coeff >= zbins[rc != 0]) {

+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);  // Bound the rounded magnitude by the int16_t limits.

+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *

+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization

+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;  // Reapply the input sign.

+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

+        if (tmp)

+          eob = i;  // i only increases, so eob ends at the last nonzero scan index.

+      }

+    }

+  }

+  *eob_ptr = eob + 1;

+}

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // High-bitdepth quantizer over n_coeffs coefficients in `scan` order.

+                             int skip_block, const int16_t *zbin_ptr,  // Nonzero skip_block leaves both outputs zeroed and *eob_ptr at 0.

+                             const int16_t *round_ptr, const int16_t *quant_ptr,  // Two-entry tables: [0] for position 0, [1] for every other position.

+                             const int16_t *quant_shift_ptr,

+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                             const int16_t *dequant_ptr,

+                             uint16_t *eob_ptr, const int16_t *scan,  // *eob_ptr receives (last nonzero scan index) + 1, or 0 if none.

+                             const int16_t *iscan) {

+  int i, non_zero_count = (int)n_coeffs, eob = -1;

+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};

+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};  // Negated thresholds for testing negative coefficients.

+  (void)iscan;  // iscan is not used by this implementation.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass: walk the scan order backwards, dropping trailing coefficients strictly inside (-zbin, zbin).

+    for (i = (int)n_coeffs - 1; i >= 0; i--) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])

+        non_zero_count--;

+      else

+        break;  // First coefficient (from the end) outside the zero bin ends the pre-scan.

+    }

+    // Quantization pass: All coefficients with scan index >= non_zero_count are

+    // skippable. Note: non_zero_count can be zero.

+    for (i = 0; i < non_zero_count; i++) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+      if (abs_coeff >= zbins[rc != 0]) {

+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];  // Held in 64 bits; no clamp is applied in this variant.

+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

+        const uint32_t abs_qcoeff =

+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);

+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);  // Reapply the input sign.

+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

+        if (abs_qcoeff)

+          eob = i;  // i only increases, so eob ends at the last nonzero scan index.

+      }

+    }

+  }

+  *eob_ptr = eob + 1;

+}

+#endif

+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Quantizer variant using halved zbin/round, a 15-bit final shift and halved dequant output.

+                            int skip_block,  // Nonzero leaves both outputs zeroed and *eob_ptr at 0.

+                            const int16_t *zbin_ptr, const int16_t *round_ptr,  // Two-entry tables: [0] for position 0, [1] for every other position.

+                            const int16_t *quant_ptr,

+                            const int16_t *quant_shift_ptr,

+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                            const int16_t *dequant_ptr,

+                            uint16_t *eob_ptr,  // Receives (last scan index with a nonzero quantized value) + 1, or 0 if none.

+                            const int16_t *scan, const int16_t *iscan) {

+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),  // Macro defined outside this file; presumably a rounded halving - TODO confirm.

+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};

+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};  // Negated thresholds for testing negative coefficients.

+  int idx = 0;  // Number of entries stored in idx_arr.

+  int idx_arr[1024];  // Scan indices kept by the pre-scan. NOTE(review): fixed capacity, so this assumes n_coeffs <= 1024 - confirm against callers.

+  int i, eob = -1;

+  (void)iscan;  // iscan is not used by this implementation.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass: record, in increasing order, every scan index whose coefficient is outside the zero bin.

+    for (i = 0; i < n_coeffs; i++) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      // If the coefficient is out of the base ZBIN range, keep it for

+      // quantization.

+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])

+        idx_arr[idx++] = i;

+    }

+    // Quantization pass: only process the coefficients selected in

+    // pre-scan pass. Note: idx can be zero.

+    for (i = 0; i < idx; i++) {

+      const int rc = scan[idx_arr[i]];

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+      int tmp;

+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);  // Bound the rounded magnitude by the int16_t limits.

+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *

+               quant_shift_ptr[rc != 0]) >> 15;

+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // Reapply the input sign.

+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

+      if (tmp)

+        eob = idx_arr[i];  // idx_arr is ascending, so eob ends at the last nonzero scan index.

+    }

+  }

+  *eob_ptr = eob + 1;

+}

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,  // High-bitdepth variant using halved zbin/round, a 15-bit final shift and halved dequant output.

+                                   intptr_t n_coeffs, int skip_block,  // Nonzero skip_block leaves both outputs zeroed and *eob_ptr at 0.

+                                   const int16_t *zbin_ptr,  // Two-entry tables: [0] for position 0, [1] for every other position.

+                                   const int16_t *round_ptr,

+                                   const int16_t *quant_ptr,

+                                   const int16_t *quant_shift_ptr,

+                                   tran_low_t *qcoeff_ptr,

+                                   tran_low_t *dqcoeff_ptr,

+                                   const int16_t *dequant_ptr,

+                                   uint16_t *eob_ptr,  // Receives (last scan index with a nonzero quantized value) + 1, or 0 if none.

+                                   const int16_t *scan, const int16_t *iscan) {

+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),  // Macro defined outside this file; presumably a rounded halving - TODO confirm.

+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};

+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};  // Negated thresholds for testing negative coefficients.

+  int idx = 0;  // Number of entries stored in idx_arr.

+  int idx_arr[1024];  // Scan indices kept by the pre-scan. NOTE(review): fixed capacity, so this assumes n_coeffs <= 1024 - confirm against callers.

+  int i, eob = -1;

+  (void)iscan;  // iscan is not used by this implementation.

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass: record, in increasing order, every scan index whose coefficient is outside the zero bin.

+    for (i = 0; i < n_coeffs; i++) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      // If the coefficient is out of the base ZBIN range, keep it for

+      // quantization.

+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])

+        idx_arr[idx++] = i;

+    }

+    // Quantization pass: only process the coefficients selected in

+    // pre-scan pass. Note: idx can be zero.

+    for (i = 0; i < idx; i++) {

+      const int rc = scan[idx_arr[i]];

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);  // Sign mask; presumably -1 for negative input (relies on arithmetic shift).

+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Conditional negate through the sign mask.

+      const int64_t tmp1 = abs_coeff  // Held in 64 bits; no clamp is applied in this variant.

+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

+      const uint32_t abs_qcoeff =

+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);  // Reapply the input sign.

+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

+      if (abs_qcoeff)

+        eob = idx_arr[i];  // idx_arr is ascending, so eob ends at the last nonzero scan index.

+    }

+  }

+  *eob_ptr = eob + 1;

+}

+#endif

--- /dev/null

+++ b/vpx_dsp/quantize.h

@@ -1,0 +1,51 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_QUANTIZE_H_

+#define VPX_DSP_QUANTIZE_H_

+#include "./vpx_config.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+#ifdef __cplusplus

+extern "C" {

+#endif

+void vp9_quantize_dc(const tran_low_t *coeff_ptr,

+                     int n_coeffs, int skip_block,

+                     const int16_t *round_ptr, const int16_t quant_ptr,

+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                     const int16_t dequant_ptr, uint16_t *eob_ptr);

+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

+                           const int16_t *round_ptr, const int16_t quant_ptr,

+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                           const int16_t dequant_ptr, uint16_t *eob_ptr);

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,

+                            int n_coeffs, int skip_block,

+                            const int16_t *round_ptr, const int16_t quant_ptr,

+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                            const int16_t dequant_ptr, uint16_t *eob_ptr);

+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,

+                                  int skip_block,

+                                  const int16_t *round_ptr,

+                                  const int16_t quant_ptr,

+                                  tran_low_t *qcoeff_ptr,

+                                  tran_low_t *dqcoeff_ptr,

+                                  const int16_t dequant_ptr,

+                                  uint16_t *eob_ptr);

+#endif

+#ifdef __cplusplus

+}  // extern "C"

+#endif

+#endif  // VPX_DSP_QUANTIZE_H_

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -54,6 +54,21 @@

 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c

 endif  # CONFIG_VP9_HIGHBITDEPTH

+ifeq ($(CONFIG_VP9_ENCODER),yes)

+DSP_SRCS-yes            += quantize.c

+DSP_SRCS-yes            += quantize.h

+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c

+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c

+endif

+ifeq ($(ARCH_X86_64),yes)

+ifeq ($(CONFIG_USE_X86INC),yes)

+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm

+endif

+endif

+endif  # CONFIG_VP9_ENCODER

 ifeq ($(CONFIG_ENCODERS),yes)

 DSP_SRCS-yes            += sad.c

 DSP_SRCS-yes            += subtract.c

--- a/vpx_dsp/vpx_dsp_common.h

+++ b/vpx_dsp/vpx_dsp_common.h

@@ -24,6 +24,20 @@

 #define MIN(x, y) (((x) < (y)) ? (x) : (y))

 #define MAX(x, y) (((x) > (y)) ? (x) : (y))

+#if CONFIG_VP9_HIGHBITDEPTH

+// Note:

+// tran_low_t  is the datatype used for final transform coefficients.

+// tran_high_t is the datatype used for intermediate transform stages.

+typedef int64_t tran_high_t;

+typedef int32_t tran_low_t;

+#else

+// Note:

+// tran_low_t  is the datatype used for final transform coefficients.

+// tran_high_t is the datatype used for intermediate transform stages.

+typedef int32_t tran_high_t;

+typedef int16_t tran_low_t;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 static INLINE uint8_t clip_pixel(int val) {

   return (val > 255) ? 255 : (val < 0) ? 0 : val;

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -5,12 +5,18 @@

*/

 #include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

EOF

 forward_decls qw/vpx_dsp_forward_decls/;

-# Functions which use x86inc.asm instead of x86_abi_support.asm

+# x86inc.asm had specific constraints. break it out so it's easy to disable.

+# zero all the variables to avoid tricky else conditions.

+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =

+  $avx2_x86inc = '';

+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =

+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';

 if (vpx_config("CONFIG_USE_X86INC") eq "yes") {

   $mmx_x86inc = 'mmx';

   $sse_x86inc = 'sse';

@@ -18,12 +24,18 @@

   $ssse3_x86inc = 'ssse3';

   $avx_x86inc = 'avx';

   $avx2_x86inc = 'avx2';

-} else {

-  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =

-  $avx_x86inc = $avx2_x86inc = '';

+  if ($opts{arch} eq "x86_64") {

+    $mmx_x86_64_x86inc = 'mmx';

+    $sse_x86_64_x86inc = 'sse';

+    $sse2_x86_64_x86inc = 'sse2';

+    $ssse3_x86_64_x86inc = 'ssse3';

+    $avx_x86_64_x86inc = 'avx';

+    $avx2_x86_64_x86inc = 'avx2';

+  }

-# Functions which are 64 bit only.

+# functions that are 64 bit only.

+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';

 if ($opts{arch} eq "x86_64") {

   $mmx_x86_64 = 'mmx';

   $sse2_x86_64 = 'sse2';

@@ -30,9 +42,6 @@

   $ssse3_x86_64 = 'ssse3';

   $avx_x86_64 = 'avx';

   $avx2_x86_64 = 'avx2';

-} else {

-  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =

-  $avx_x86_64 = $avx2_x86_64 = '';

@@ -109,6 +118,31 @@

   add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";

   specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;

+}  # CONFIG_VP9_HIGHBITDEPTH

+#

+# Encoder functions.

+#

+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b/;

+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b_32x32/;

+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_highbd_quantize_b sse2/;

+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;

+} else {

+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";

+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";

+}  # CONFIG_VP9_HIGHBITDEPTH

 }  # CONFIG_VP9_HIGHBITDEPTH

 if (vpx_config("CONFIG_ENCODERS") eq "yes") {

--- /dev/null

+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

@@ -1,0 +1,180 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <emmintrin.h>

+#include "vpx_dsp/vpx_dsp_common.h"

+#include "vpx_mem/vpx_mem.h"

+#include "vpx_ports/mem.h"

+#if CONFIG_VP9_HIGHBITDEPTH

+// from vpx_dsp/vpx_dsp_common.h: typedef int32_t tran_low_t;

+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,

+                                intptr_t count,

+                                int skip_block,

+                                const int16_t *zbin_ptr,

+                                const int16_t *round_ptr,

+                                const int16_t *quant_ptr,

+                                const int16_t *quant_shift_ptr,

+                                tran_low_t *qcoeff_ptr,

+                                tran_low_t *dqcoeff_ptr,

+                                const int16_t *dequant_ptr,

+                                uint16_t *eob_ptr,

+                                const int16_t *scan,

+                                const int16_t *iscan) {

+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;

+  __m128i zbins[2];

+  __m128i nzbins[2];

+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1],

+                           (int)zbin_ptr[1],

+                           (int)zbin_ptr[1],

+                           (int)zbin_ptr[0]);

+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

+  nzbins[0] = _mm_setzero_si128();

+  nzbins[1] = _mm_setzero_si128();

+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);

+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

+  (void)scan;

+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass

+    for (i = ((int)count / 4) - 1; i >= 0; i--) {

+      __m128i coeffs, cmp1, cmp2;

+      int test;

+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+      cmp1 = _mm_and_si128(cmp1, cmp2);

+      test = _mm_movemask_epi8(cmp1);

+      if (test == 0xffff)

+        non_zero_regs--;

+      else

+        break;

+    }

+    // Quantization pass:

+    for (i = 0; i < non_zero_regs; i++) {

+      __m128i coeffs, coeffs_sign, tmp1, tmp2;

+      int test;

+      int abs_coeff[4];

+      int coeff_sign[4];

+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+      coeffs_sign = _mm_srai_epi32(coeffs, 31);

+      coeffs = _mm_sub_epi32(

+            _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

+      tmp1 = _mm_or_si128(tmp1, tmp2);

+      test = _mm_movemask_epi8(tmp1);

+      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);

+      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);

+      for (j = 0; j < 4; j++) {

+        if (test & (1 << (4 * j))) {

+          int k = 4 * i + j;

+          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];

+          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;

+          const uint32_t abs_qcoeff =

+              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);

+          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

+          if (abs_qcoeff)

+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

+        }

+      }

+    }

+  }

+  *eob_ptr = eob_i + 1;

+}

+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,

+                                      intptr_t n_coeffs,

+                                      int skip_block,

+                                      const int16_t *zbin_ptr,

+                                      const int16_t *round_ptr,

+                                      const int16_t *quant_ptr,

+                                      const int16_t *quant_shift_ptr,

+                                      tran_low_t *qcoeff_ptr,

+                                      tran_low_t *dqcoeff_ptr,

+                                      const int16_t *dequant_ptr,

+                                      uint16_t *eob_ptr,

+                                      const int16_t *scan,

+                                      const int16_t *iscan) {

+  __m128i zbins[2];

+  __m128i nzbins[2];

+  int idx = 0;

+  int idx_arr[1024];

+  int i, eob = -1;

+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);

+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);

+  (void)scan;

+  zbins[0] = _mm_set_epi32(zbin1_tmp,

+                           zbin1_tmp,

+                           zbin1_tmp,

+                           zbin0_tmp);

+  zbins[1] = _mm_set1_epi32(zbin1_tmp);

+  nzbins[0] = _mm_setzero_si128();

+  nzbins[1] = _mm_setzero_si128();

+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);

+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+  if (!skip_block) {

+    // Pre-scan pass

+    for (i = 0; i < n_coeffs / 4; i++) {

+      __m128i coeffs, cmp1, cmp2;

+      int test;

+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+      cmp1 = _mm_and_si128(cmp1, cmp2);

+      test = _mm_movemask_epi8(cmp1);

+      if (!(test & 0xf))

+        idx_arr[idx++] = i * 4;

+      if (!(test & 0xf0))

+        idx_arr[idx++] = i * 4 + 1;

+      if (!(test & 0xf00))

+        idx_arr[idx++] = i * 4 + 2;

+      if (!(test & 0xf000))

+        idx_arr[idx++] = i * 4 + 3;

+    }

+    // Quantization pass: only process the coefficients selected in

+    // pre-scan pass. Note: idx can be zero.

+    for (i = 0; i < idx; i++) {

+      const int rc = idx_arr[i];

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);

+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

+      const int64_t tmp1 = abs_coeff

+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

+      const uint32_t abs_qcoeff =

+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

+      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

+      if (abs_qcoeff)

+        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

+    }

+  }

+  *eob_ptr = eob + 1;

+}

+#endif

--- /dev/null

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -1,0 +1,223 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <emmintrin.h>

+#include <xmmintrin.h>

+#include "./vpx_dsp_rtcd.h"

+#include "vpx/vpx_integer.h"

+void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,

+                         int skip_block, const int16_t* zbin_ptr,

+                         const int16_t* round_ptr, const int16_t* quant_ptr,

+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,

+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,

+                         uint16_t* eob_ptr,

+                         const int16_t* scan_ptr,

+                         const int16_t* iscan_ptr) {

+  __m128i zero;

+  (void)scan_ptr;

+  coeff_ptr += n_coeffs;

+  iscan_ptr += n_coeffs;

+  qcoeff_ptr += n_coeffs;

+  dqcoeff_ptr += n_coeffs;

+  n_coeffs = -n_coeffs;

+  zero = _mm_setzero_si128();

+  if (!skip_block) {

+    __m128i eob;

+    __m128i zbin;

+    __m128i round, quant, dequant, shift;

+    {

+      __m128i coeff0, coeff1;

+      // Setup global values

+      {

+        __m128i pw_1;

+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);

+        round = _mm_load_si128((const __m128i*)round_ptr);

+        quant = _mm_load_si128((const __m128i*)quant_ptr);

+        pw_1 = _mm_set1_epi16(1);

+        zbin = _mm_sub_epi16(zbin, pw_1);

+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);

+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);

+      }

+      {

+        __m128i coeff0_sign, coeff1_sign;

+        __m128i qcoeff0, qcoeff1;

+        __m128i qtmp0, qtmp1;

+        __m128i cmp_mask0, cmp_mask1;

+        // Do DC and first 15 AC

+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));

+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

+        // Poor man's sign extract

+        coeff0_sign = _mm_srai_epi16(coeff0, 15);

+        coeff1_sign = _mm_srai_epi16(coeff1, 15);

+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+        round = _mm_unpackhi_epi64(round, round);

+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+        quant = _mm_unpackhi_epi64(quant, quant);

+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+        shift = _mm_unpackhi_epi64(shift, shift);

+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+        // Reinsert signs

+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+        // Mask out zbin threshold coeffs

+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);

+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+        dequant = _mm_unpackhi_epi64(dequant, dequant);

+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);

+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

+      }

+      {

+        // Scan for eob

+        __m128i zero_coeff0, zero_coeff1;

+        __m128i nzero_coeff0, nzero_coeff1;

+        __m128i iscan0, iscan1;

+        __m128i eob1;

+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));

+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);

+        // Add one to convert from indices to counts

+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+        eob = _mm_and_si128(iscan0, nzero_coeff0);

+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+        eob = _mm_max_epi16(eob, eob1);

+      }

+      n_coeffs += 8 * 2;

+    }

+    // AC only loop

+    while (n_coeffs < 0) {

+      __m128i coeff0, coeff1;

+      {

+        __m128i coeff0_sign, coeff1_sign;

+        __m128i qcoeff0, qcoeff1;

+        __m128i qtmp0, qtmp1;

+        __m128i cmp_mask0, cmp_mask1;

+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));

+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

+        // Poor man's sign extract

+        coeff0_sign = _mm_srai_epi16(coeff0, 15);

+        coeff1_sign = _mm_srai_epi16(coeff1, 15);

+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+        // Reinsert signs

+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+        // Mask out zbin threshold coeffs

+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);

+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);

+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

+      }

+      {

+        // Scan for eob

+        __m128i zero_coeff0, zero_coeff1;

+        __m128i nzero_coeff0, nzero_coeff1;

+        __m128i iscan0, iscan1;

+        __m128i eob0, eob1;

+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));

+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);

+        // Add one to convert from indices to counts

+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);

+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+        eob0 = _mm_max_epi16(eob0, eob1);

+        eob = _mm_max_epi16(eob, eob0);

+      }

+      n_coeffs += 8 * 2;

+    }

+    // Accumulate EOB

+    {

+      __m128i eob_shuffled;

+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

+      eob = _mm_max_epi16(eob, eob_shuffled);

+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

+      eob = _mm_max_epi16(eob, eob_shuffled);

+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

+      eob = _mm_max_epi16(eob, eob_shuffled);

+      *eob_ptr = _mm_extract_epi16(eob, 1);

+    }

+  } else {

+    do {

+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);

+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);

+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);

+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);

+      n_coeffs += 8 * 2;

+    } while (n_coeffs < 0);

+    *eob_ptr = 0;

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -1,0 +1,216 @@

+;

+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+SECTION_RODATA

+pw_1: times 8 dw 1              ; eight 16-bit words, each 1; QUANTIZE_FN subtracts it from every zbin lane

+SECTION .text

+; TODO(yunqingwang): fix quantize_b code for skip=1 case.

+;-----------------------------------------------------------------------------

+; QUANTIZE_FN <suffix>, <num_gprs>

+;

+; Emits the function quantize_<suffix>; the file instantiates it with

+; %1 = b and %1 = b_32x32. %2 is forwarded to cglobal as its third argument.

+;

+; Arguments, in order (scan is accepted but never read):

+;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,

+;   dequant, eob, scan, iscan

+;

+; Data layout assumptions visible in the code:

+;   * coefficients, tables and iscan are processed as 16-bit lanes;

+;   * 16 coefficients (two XMM registers) are handled per iteration with no

+;     tail handling, so ncoeff must be a non-zero multiple of 16;

+;   * every load/store uses mova, so all buffers and tables are presumably

+;     required to be 16-byte aligned (TODO confirm against the callers);

+;   * eob points at a single 16-bit value (uint16_t *eob_ptr in the C

+;     prototype) and exactly one word is written through it.

+;

+; Behaviour:

+;   skip != 0: qcoeff and dqcoeff are zero-filled, *eob = 0.

+;   skip == 0: lanes with |c| >= zbin are quantized as

+;                t = |c| + round                       (saturating add)

+;                q = ((t * quant >> 16) + t) * shift >> 16, sign of c restored

+;                dqcoeff = q * dequant

+;              all other lanes produce 0. For b_32x32, zbin and round are

+;              first replaced by (x + 1) >> 1, shift is doubled and the

+;              dequantized magnitude is halved before the sign is restored.

+;              *eob = max(iscan[i] + 1) over lanes whose stored dqcoeff is

+;              non-zero, or 0 when there is none.

+;

+; Register roles in the quantize path:

+;   m0 = zbin - 1, m1 = round, m2 = quant, m3 = dequant, m4 = shift,

+;   m5 = zero, m8 = running eob maximum.

+;   The first iteration uses the tables as loaded (per the original comment,

+;   lane 0 is the DC entry); each table's high qword is then copied into both

+;   halves with punpckhqdq so later iterations use only those values.

+;-----------------------------------------------------------------------------

+%macro QUANTIZE_FN 2

+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

+                                shift, qcoeff, dqcoeff, dequant, \

+                                eob, scan, iscan

+  cmp                    dword skipm, 0

+  jne .blank

+  ; actual quantize loop - setup pointers, rounders, etc.

+  movifnidn                   coeffq, coeffmp

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, dequantmp

+  movifnidn                    zbinq, zbinmp

+  movifnidn                   roundq, roundmp

+  movifnidn                   quantq, quantmp

+  mova                            m0, [zbinq]              ; m0 = zbin

+  mova                            m1, [roundq]             ; m1 = round

+  mova                            m2, [quantq]             ; m2 = quant

+%ifidn %1, b_32x32

+  pcmpeqw                         m5, m5                   ; m5 = all ones

+  psrlw                           m5, 15                   ; m5 = 1 in every word

+  paddw                           m0, m5

+  paddw                           m1, m5

+  psrlw                           m0, 1                    ; m0 = (zbin + 1) >> 1

+  psrlw                           m1, 1                    ; m1 = (round + 1) >> 1

+%endif

+  mova                            m3, [r2q]                ; m3 = dequant

+  psubw                           m0, [pw_1]               ; m0 = zbin - 1, so "> m0" means ">= zbin"

+  mov                             r2, shiftmp

+  mov                             r3, qcoeffmp

+  mova                            m4, [r2]                 ; m4 = shift

+  mov                             r4, dqcoeffmp

+  mov                             r5, iscanmp

+%ifidn %1, b_32x32

+  psllw                           m4, 1                    ; m4 = shift * 2

+%endif

+  pxor                            m5, m5                   ; m5 = dedicated zero

+  ; rename the registers now that the table pointers are no longer needed

+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob

+  ; point every array at its end and index with a negative counter that

+  ; counts up to zero

+  lea                         coeffq, [  coeffq+ncoeffq*2]

+  lea                         iscanq, [  iscanq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  neg                        ncoeffq

+  ; get DC and first 15 AC coeffs

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

+  punpckhqdq                      m0, m0                   ; high qword of zbin - 1 in both halves

+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

+  paddsw                          m6, m1                   ; m6 += round

+  punpckhqdq                      m1, m1                   ; high qword of round in both halves

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16

+  punpckhqdq                      m2, m2                   ; high qword of quant in both halves

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  paddw                           m8, m6                   ; m8 += m6

+  paddw                          m13, m11                  ; m13 += m11

+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16

+  punpckhqdq                      m4, m4                   ; high qword of shift in both halves

+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

+  psignw                          m8, m9                   ; m8 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  pand                            m8, m7                   ; zero the lanes below zbin

+  pand                           m13, m12

+  mova        [qcoeffq+ncoeffq*2+ 0], m8

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                           m8, m8                   ; work on |qc| so the shift below is unsigned

+  pabsw                          m13, m13

+%endif

+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

+  punpckhqdq                      m3, m3                   ; high qword of dequant in both halves

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                           m8, 1                    ; dqc[i] = |qc[i]| * q / 2

+  psrlw                          m13, 1

+  psignw                          m8, m9                   ; restore the sign of c[i]

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m8

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where c[i] >= zbin

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin

+  pandn                           m8, m6                   ; m8 = eob candidate, 0 where dqc[i] == 0

+  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc[i] == 0

+  pmaxsw                          m8, m13                  ; m8 = running eob maximum

+  add                        ncoeffq, mmsize

+  jz .accumulate_eob

+.ac_only_loop:

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

+%ifidn %1, b_32x32

+  ; if none of the 16 coefficients reaches zbin, just store zeros

+  pmovmskb                       r6d, m7

+  pmovmskb                       r2d, m12

+  or                              r6, r2

+  jz .skip_iter

+%endif

+  paddsw                          m6, m1                   ; m6 += round

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  paddw                          m14, m6                   ; m14 += m6

+  paddw                          m13, m11                  ; m13 += m11

+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16

+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

+  psignw                         m14, m9                   ; m14 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  pand                           m14, m7                   ; zero the lanes below zbin

+  pand                           m13, m12

+  mova        [qcoeffq+ncoeffq*2+ 0], m14

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                          m14, m14                  ; work on |qc| so the shift below is unsigned

+  pabsw                          m13, m13

+%endif

+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                          m14, 1                    ; dqc[i] = |qc[i]| * q / 2

+  psrlw                          m13, 1

+  psignw                         m14, m9                   ; restore the sign of c[i]

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m14

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where c[i] >= zbin

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin

+  pandn                          m14, m6                   ; m14 = eob candidate, 0 where dqc[i] == 0

+  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc[i] == 0

+  pmaxsw                          m8, m14                  ; fold both candidates into the running max

+  pmaxsw                          m8, m13

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%ifidn %1, b_32x32

+  jmp .accumulate_eob

+.skip_iter:

+  ; all 16 coefficients are below zbin: outputs are zero, eob is unchanged

+  mova        [qcoeffq+ncoeffq*2+ 0], m5

+  mova        [qcoeffq+ncoeffq*2+16], m5

+  mova       [dqcoeffq+ncoeffq*2+ 0], m5

+  mova       [dqcoeffq+ncoeffq*2+16], m5

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%endif

+.accumulate_eob:

+  ; horizontally accumulate/max eobs and write into [eob] memory pointer

+  mov                             r2, eobmp

+  pshufd                          m7, m8, 0xe              ; bring the upper 4 words down

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0xe              ; bring words 2-3 down

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0x1              ; bring word 1 down

+  pmaxsw                          m8, m7                   ; word 0 = max over all 8 lanes

+  pextrw                          r6, m8, 0

+  ; eob points at a single 16-bit value: store only the low word. A

+  ; full-width store of r6 would write past the destination.

+  mov                             [r2], r6w

+  RET

+  ; skip-block, i.e. just write all zeroes

+.blank:

+  mov                             r0, dqcoeffmp

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, qcoeffmp

+  mov                             r3, eobmp

+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

+  ; same end-pointer / negative-index scheme as the quantize path

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  neg                        ncoeffq

+  pxor                            m7, m7

+.blank_loop:

+  mova       [dqcoeffq+ncoeffq*2+ 0], m7

+  mova       [dqcoeffq+ncoeffq*2+16], m7

+  mova        [qcoeffq+ncoeffq*2+ 0], m7

+  mova        [qcoeffq+ncoeffq*2+16], m7

+  add                        ncoeffq, mmsize

+  jl .blank_loop

+  mov                    word [eobq], 0

+  RET

+%endmacro

+INIT_XMM ssse3                  ; x86inc setup: XMM registers, SSSE3 (the macro uses pabsw/psignw)

+QUANTIZE_FN b, 7                ; emit quantize_b: %1 = b, %2 = 7 (cglobal's third argument)

+QUANTIZE_FN b_32x32, 7          ; emit quantize_b_32x32: enables the %ifidn %1, b_32x32 paths

--

⑨