shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -529,9 +529,8 @@

 specialize vp9_get_mb_ss mmx sse2

 # ENCODEMB INVOKE

-prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"

-specialize vp9_block_error mmx sse2

-vp9_block_error_sse2=vp9_block_error_xmm

+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"

+specialize vp9_block_error sse2

 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"

 specialize vp9_subtract_block sse2

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -582,7 +582,7 @@

 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,

-                          TOKENEXTRA **tp, int *totalrate, int *totaldist,

+                          TOKENEXTRA **tp, int *totalrate, int64_t *totaldist,

                           BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {

   VP9_COMMON * const cm = &cpi->common;

   MACROBLOCK * const x = &cpi->mb;

@@ -1195,7 +1195,7 @@

 static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,

                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,

-                             int *rate, int *dist) {

+                             int *rate, int64_t *dist) {

   VP9_COMMON * const cm = &cpi->common;

   MACROBLOCK * const x = &cpi->mb;

   MACROBLOCKD *xd = &cpi->mb.e_mbd;

@@ -1211,7 +1211,8 @@

   BLOCK_SIZE_TYPE subsize;

   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];

   PARTITION_CONTEXT sl[8], sa[8];

-  int r = 0, d = 0;

+  int r = 0;

+  int64_t d = 0;

   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)

     return;

@@ -1252,7 +1253,8 @@

       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,

                     get_block_context(x, subsize));

       if (mi_row + (bh >> 1) <= cm->mi_rows) {

-        int rt, dt;

+        int rt;

+        int64_t dt;

         update_state(cpi, get_block_context(x, subsize), subsize, 0);

         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);

         *(get_sb_index(xd, subsize)) = 1;

@@ -1270,7 +1272,8 @@

       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,

                     get_block_context(x, subsize));

       if (mi_col + (bs >> 1) <= cm->mi_cols) {

-        int rt, dt;

+        int rt;

+        int64_t dt;

         update_state(cpi, get_block_context(x, subsize), subsize, 0);

         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);

         *(get_sb_index(xd, subsize)) = 1;

@@ -1289,7 +1292,8 @@

         int x_idx = (i & 1) * (bs >> 2);

         int y_idx = (i >> 1) * (bs >> 2);

         int jj = i >> 1, ii = i & 0x01;

-        int rt, dt;

+        int rt;

+        int64_t dt;

         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))

           continue;

@@ -1323,7 +1327,7 @@

 // results, for encoding speed-up.

 static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,

                               int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,

-                              int *dist) {

+                              int64_t *dist) {

   VP9_COMMON * const cm = &cpi->common;

   MACROBLOCK * const x = &cpi->mb;

   MACROBLOCKD * const xd = &x->e_mbd;

@@ -1334,7 +1338,8 @@

   TOKENEXTRA *tp_orig = *tp;

   int i, pl;

   BLOCK_SIZE_TYPE subsize;

-  int srate = INT_MAX, sdist = INT_MAX;

+  int srate = INT_MAX;

+  int64_t sdist = INT_MAX;

   if (bsize < BLOCK_SIZE_SB8X8)

     if (xd->ab_index != 0) {

@@ -1351,7 +1356,8 @@

       || (cpi->sf.use_partitions_greater_than

           && bsize > cpi->sf.greater_than_block_size)) {

     if (bsize >= BLOCK_SIZE_SB8X8) {

-      int r4 = 0, d4 = 0;

+      int r4 = 0;

+      int64_t d4 = 0;

       subsize = get_subsize(bsize, PARTITION_SPLIT);

       *(get_sb_partitioning(x, bsize)) = subsize;

@@ -1358,7 +1364,8 @@

       for (i = 0; i < 4; ++i) {

         int x_idx = (i & 1) * (ms >> 1);

         int y_idx = (i >> 1) * (ms >> 1);

-        int r = 0, d = 0;

+        int r = 0;

+        int64_t d = 0;

         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))

           continue;

@@ -1386,8 +1393,8 @@

           && bsize <= cpi->sf.less_than_block_size)) {

     // PARTITION_HORZ

     if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {

-      int r2, d2;

-      int r = 0, d = 0;

+      int r2, r = 0;

+      int64_t d2, d = 0;

       subsize = get_subsize(bsize, PARTITION_HORZ);

       *(get_sb_index(xd, subsize)) = 0;

       pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,

@@ -1418,13 +1425,15 @@

     // PARTITION_VERT

     if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {

-      int r2, d2;

+      int r2;

+      int64_t d2;

       subsize = get_subsize(bsize, PARTITION_VERT);

       *(get_sb_index(xd, subsize)) = 0;

       pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,

                     get_block_context(x, subsize));

       if (mi_col + (ms >> 1) < cm->mi_cols) {

-        int r = 0, d = 0;

+        int r = 0;

+        int64_t d = 0;

         update_state(cpi, get_block_context(x, subsize), subsize, 0);

         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);

@@ -1450,7 +1459,8 @@

     // PARTITION_NONE

     if ((mi_row + (ms >> 1) < cm->mi_rows) &&

         (mi_col + (ms >> 1) < cm->mi_cols)) {

-      int r, d;

+      int r;

+      int64_t d;

       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,

                     get_block_context(x, bsize));

       if (bsize >= BLOCK_SIZE_SB8X8) {

@@ -1497,7 +1507,8 @@

   // Code each SB in the row

   for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;

       mi_col += 64 / MI_SIZE) {

-    int dummy_rate, dummy_dist;

+    int dummy_rate;

+    int64_t dummy_dist;

     if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||

         cpi->sf.use_one_partition_size_always ) {

       const int idx_str = cm->mode_info_stride * mi_row + mi_col;

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -274,12 +274,14 @@

-int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {

-  int i, error = 0;

+int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,

+                          intptr_t block_size) {

+  int i;

+  int64_t error = 0;

   for (i = 0; i < block_size; i++) {

     int this_diff = coeff[i] - dqcoeff[i];

-    error += this_diff * this_diff;

+    error += (unsigned)this_diff * this_diff;

   return error;

@@ -417,7 +419,7 @@

 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,

                                      int (*r)[2], int *rate,

-                                     int *d, int *distortion,

+                                     int64_t *d, int64_t *distortion,

                                      int *s, int *skip,

                                      int64_t txfm_cache[NB_TXFM_MODES],

                                      TX_SIZE max_txfm_size) {

@@ -496,27 +498,15 @@

                                  rd[TX_4X4][1] : rd[TX_8X8][1];

-static int block_error(int16_t *coeff, int16_t *dqcoeff,

-                       int block_size, int shift) {

-  int i;

-  int64_t error = 0;

-  for (i = 0; i < block_size; i++) {

-    int this_diff = coeff[i] - dqcoeff[i];

-    error += (unsigned)this_diff * this_diff;

-  }

-  error >>= shift;

-  return error > INT_MAX ? INT_MAX : (int)error;

-}

-static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {

+static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,

+                               int shift) {

   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);

-  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,

-                     16 << (bwl + bhl), shift);

+  return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,

+                         16 << (bwl + bhl)) >> shift;

-static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {

+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,

+                                int shift) {

   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);

   int64_t sum = 0;

   int plane;

@@ -524,11 +514,10 @@

   for (plane = 1; plane < MAX_MB_PLANE; plane++) {

     const int subsampling = x->e_mbd.plane[plane].subsampling_x +

                             x->e_mbd.plane[plane].subsampling_y;

-    sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,

-                       16 << (bwl + bhl - subsampling), 0);

+    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,

+                           16 << (bwl + bhl - subsampling));

-  sum >>= shift;

-  return sum > INT_MAX ? INT_MAX : (int)sum;

+  return sum >> shift;

 struct rdcost_block_args {

@@ -586,7 +575,8 @@

 static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,

-                                     int *rate, int *distortion, int *skippable,

+                                     int *rate, int64_t *distortion,

+                                     int *skippable,

                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {

   MACROBLOCKD *const xd = &x->e_mbd;

   xd->mode_info_context->mbmi.txfm_size = tx_size;

@@ -602,11 +592,12 @@

 static void super_block_yrd(VP9_COMP *cpi,

-                            MACROBLOCK *x, int *rate, int *distortion,

+                            MACROBLOCK *x, int *rate, int64_t *distortion,

                             int *skip, BLOCK_SIZE_TYPE bs,

                             int64_t txfm_cache[NB_TXFM_MODES]) {

   VP9_COMMON *const cm = &cpi->common;

-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];

+  int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];

+  int64_t d[TX_SIZE_MAX_SB];

   MACROBLOCKD *xd = &x->e_mbd;

   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

@@ -651,13 +642,13 @@

                                      int *bmode_costs,

                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,

                                      int *bestrate, int *bestratey,

-                                     int *bestdistortion,

+                                     int64_t *bestdistortion,

                                      BLOCK_SIZE_TYPE bsize) {

   MB_PREDICTION_MODE mode;

   MACROBLOCKD *xd = &x->e_mbd;

   int64_t best_rd = INT64_MAX;

   int rate = 0;

-  int distortion;

+  int64_t distortion;

   VP9_COMMON *const cm = &cpi->common;

   const int src_stride = x->plane[0].src.stride;

   uint8_t *src, *dst;

@@ -777,7 +768,7 @@

 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,

                                          int *Rate, int *rate_y,

-                                         int *Distortion, int64_t best_rd) {

+                                         int64_t *Distortion, int64_t best_rd) {

   int i, j;

   MACROBLOCKD *const xd = &mb->e_mbd;

   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;

@@ -785,7 +776,7 @@

   int bh = 1 << b_height_log2(bsize);

   int idx, idy;

   int cost = 0;

-  int distortion = 0;

+  int64_t distortion = 0;

   int tot_rate_y = 0;

   int64_t total_rd = 0;

   ENTROPY_CONTEXT t_above[4], t_left[4];

@@ -802,7 +793,7 @@

       const int mis = xd->mode_info_stride;

       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);

       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);

-      int UNINITIALIZED_IS_SAFE(d);

+      int64_t UNINITIALIZED_IS_SAFE(d);

       i = idy * 2 + idx;

       if (xd->frame_type == KEY_FRAME) {

@@ -844,14 +835,14 @@

 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,

                                       int *rate, int *rate_tokenonly,

-                                      int *distortion, int *skippable,

+                                      int64_t *distortion, int *skippable,

                                       BLOCK_SIZE_TYPE bsize,

                                       int64_t txfm_cache[NB_TXFM_MODES]) {

   MB_PREDICTION_MODE mode;

   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);

   MACROBLOCKD *const xd = &x->e_mbd;

-  int this_rate, this_rate_tokenonly;

-  int this_distortion, s;

+  int this_rate, this_rate_tokenonly, s;

+  int64_t this_distortion;

   int64_t best_rd = INT64_MAX, this_rd;

   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);

   int i;

@@ -912,7 +903,7 @@

 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,

-                                      int *rate, int *distortion,

+                                      int *rate, int64_t *distortion,

                                       int *skippable, BLOCK_SIZE_TYPE bsize,

                                       TX_SIZE uv_tx_size) {

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -927,7 +918,7 @@

 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,

-                             int *rate, int *distortion, int *skippable,

+                             int *rate, int64_t *distortion, int *skippable,

                              BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD *const xd = &x->e_mbd;

   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

@@ -952,13 +943,13 @@

 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,

                                        int *rate, int *rate_tokenonly,

-                                       int *distortion, int *skippable,

+                                       int64_t *distortion, int *skippable,

                                        BLOCK_SIZE_TYPE bsize) {

   MB_PREDICTION_MODE mode;

   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);

   int64_t best_rd = INT64_MAX, this_rd;

-  int this_rate_tokenonly, this_rate;

-  int this_distortion, s;

+  int this_rate_tokenonly, this_rate, s;

+  int64_t this_distortion;

   for (mode = DC_PRED; mode <= TM_PRED; mode++) {

     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;

@@ -1101,7 +1092,7 @@

                                        MACROBLOCK *x,

                                        int i,

                                        int *labelyrate,

-                                       int *distortion,

+                                       int64_t *distortion,

                                        ENTROPY_CONTEXT *ta,

                                        ENTROPY_CONTEXT *tl) {

   int k;

@@ -1126,7 +1117,7 @@

   raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,

                             xd->plane[0].dst.buf,

                             xd->plane[0].dst.stride);

-  int thisdistortion = 0;

+  int64_t thisdistortion = 0;

   int thisrate = 0;

   *labelyrate = 0;

@@ -1189,7 +1180,7 @@

   int64_t segment_rd;

   int r;

-  int d;

+  int64_t d;

   int segment_yrate;

   MB_PREDICTION_MODE modes[4];

   int_mv mvs[4], second_mvs[4];

@@ -1281,21 +1272,18 @@

                                     BEST_SEG_INFO *bsi,

                                     int_mv seg_mvs[4][MAX_REF_FRAMES],

                                     int mi_row, int mi_col) {

-  int i, j;

-  int br = 0, bd = 0;

+  int i, j, br = 0, rate = 0, sbr = 0, idx, idy;

+  int64_t bd = 0, sbd = 0;

   MB_PREDICTION_MODE this_mode;

   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;

   const int label_count = 4;

   int64_t this_segment_rd = 0, other_segment_rd;

   int label_mv_thresh;

-  int rate = 0;

-  int sbr = 0, sbd = 0;

   int segmentyrate = 0;

   int best_eobs[4] = { 0 };

   BLOCK_SIZE_TYPE bsize = mbmi->sb_type;

   int bwl = b_width_log2(bsize), bw = 1 << bwl;

   int bhl = b_height_log2(bsize), bh = 1 << bhl;

-  int idx, idy;

   vp9_variance_fn_ptr_t *v_fn_ptr;

   ENTROPY_CONTEXT t_above[4], t_left[4];

   ENTROPY_CONTEXT t_above_b[4], t_left_b[4];

@@ -1340,7 +1328,7 @@

       // search for the best motion vector on this segment

       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {

         int64_t this_rd;

-        int distortion;

+        int64_t distortion;

         int labelyrate;

         ENTROPY_CONTEXT t_above_s[4], t_left_s[4];

         const struct buf_2d orig_src = x->plane[0].src;

@@ -1527,7 +1515,7 @@

                                        int64_t best_rd,

                                        int *returntotrate,

                                        int *returnyrate,

-                                       int *returndistortion,

+                                       int64_t *returndistortion,

                                        int *skippable, int mvthresh,

                                        int_mv seg_mvs[4][MAX_REF_FRAMES],

                                        int mi_row, int mi_col) {

@@ -1921,7 +1909,7 @@

 static void model_rd_from_var_lapndz(int var, int n, int qstep,

-                                     int *rate, int *dist) {

+                                     int *rate, int64_t *dist) {

   // This function models the rate and distortion for a Laplacian

   // source with given variance when quantized with a uniform quantizer

   // with given stepsize. The closed form expression is:

@@ -1958,12 +1946,13 @@

 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,

                             MACROBLOCK *x, MACROBLOCKD *xd,

-                            int *out_rate_sum, int *out_dist_sum) {

+                            int *out_rate_sum, int64_t *out_dist_sum) {

   // Note our transform coeffs are 8 times an orthogonal transform.

   // Hence quantizer step is also 8 times. To get effective quantizer

   // we need to divide by 8 before sending to modeling function.

   unsigned int sse;

-  int i, rate_sum = 0, dist_sum = 0;

+  int i, rate_sum = 0;

+  int64_t dist_sum = 0;

   for (i = 0; i < MAX_MB_PLANE; ++i) {

     struct macroblock_plane *const p = &x->plane[i];

@@ -1973,7 +1962,8 @@

     const int bw = plane_block_width(bsize, pd);

     const int bh = plane_block_height(bsize, pd);

     const enum BlockSize bs = get_block_size(bw, bh);

-    int rate, dist;

+    int rate;

+    int64_t dist;

     cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,

                        pd->dst.buf, pd->dst.stride, &sse);

     model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);

@@ -2238,9 +2228,10 @@

 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,

                                  BLOCK_SIZE_TYPE bsize,

                                  int64_t txfm_cache[],

-                                 int *rate2, int *distortion, int *skippable,

-                                 int *rate_y, int *distortion_y,

-                                 int *rate_uv, int *distortion_uv,

+                                 int *rate2, int64_t *distortion,

+                                 int *skippable,

+                                 int *rate_y, int64_t *distortion_y,

+                                 int *rate_uv, int64_t *distortion_uv,

                                  int *mode_excluded, int *disable_skip,

                                  INTERPOLATIONFILTERTYPE *best_filter,

                                  int_mv *frame_mv,

@@ -2344,7 +2335,8 @@

     *best_filter = EIGHTTAP;

   } else {

     int i, newbest;

-    int tmp_rate_sum = 0, tmp_dist_sum = 0;

+    int tmp_rate_sum = 0;

+    int64_t tmp_dist_sum = 0;

     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {

       int rs = 0;

       const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];

@@ -2359,7 +2351,8 @@

       if (interpolating_intpel_seen && is_intpel_interp) {

         rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);

       } else {

-        int rate_sum = 0, dist_sum = 0;

+        int rate_sum = 0;

+        int64_t dist_sum = 0;

         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);

         model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);

         rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);

@@ -2503,19 +2496,20 @@

 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,

-                               int *returnrate, int *returndist,

+                               int *returnrate, int64_t *returndist,

                                BLOCK_SIZE_TYPE bsize,

                                PICK_MODE_CONTEXT *ctx) {

   VP9_COMMON *cm = &cpi->common;

   MACROBLOCKD *xd = &x->e_mbd;

-  int rate_y = 0, rate_uv;

-  int rate_y_tokenonly = 0, rate_uv_tokenonly;

-  int dist_y = 0, dist_uv;

-  int y_skip = 0, uv_skip;

+  int rate_y = 0, rate_uv = 0;

+  int rate_y_tokenonly = 0, rate_uv_tokenonly = 0;

+  int64_t dist_y = 0, dist_uv = 0;

+  int y_skip = 0, uv_skip = 0;

   int64_t txfm_cache[NB_TXFM_MODES], err;

   MB_PREDICTION_MODE mode;

   TX_SIZE txfm_size;

-  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;

+  int rate4x4_y, rate4x4_y_tokenonly;

+  int64_t dist4x4_y;

   int64_t err4x4 = INT64_MAX;

   int i;

@@ -2566,7 +2560,7 @@

 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,

                                   int mi_row, int mi_col,

                                   int *returnrate,

-                                  int *returndistortion,

+                                  int64_t *returndistortion,

                                   BLOCK_SIZE_TYPE bsize,

                                   PICK_MODE_CONTEXT *ctx) {

   VP9_COMMON *cm = &cpi->common;

@@ -2601,7 +2595,8 @@

   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;

   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;

   int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];

-  int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];

+  int64_t dist_uv[TX_SIZE_MAX_SB];

+  int skip_uv[TX_SIZE_MAX_SB];

   MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];

   struct scale_factors scale_factor[4];

   unsigned int ref_frame_mask = 0;

@@ -2704,7 +2699,7 @@

     int disable_skip = 0;

     int compmode_cost = 0;

     int rate2 = 0, rate_y = 0, rate_uv = 0;

-    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;

+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;

     int skippable;

     int64_t txfm_cache[NB_TXFM_MODES];

     int i;

@@ -2891,11 +2886,13 @@

       distortion2 = distortion_y + distortion_uv;

     } else if (this_mode == SPLITMV) {

       const int is_comp_pred = mbmi->ref_frame[1] > 0;

-      int rate, distortion;

+      int rate;

+      int64_t distortion;

       int64_t this_rd_thresh;

       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;

       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;

-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;

+      int64_t tmp_best_distortion = INT_MAX;

+      int tmp_best_skippable = 0;

       int switchable_filter_index;

       int_mv *second_ref = is_comp_pred ?

           &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;

--- a/vp9/encoder/vp9_rdopt.h

+++ b/vp9/encoder/vp9_rdopt.h

@@ -20,12 +20,12 @@

 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);

 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,

-                               int *r, int *d, BLOCK_SIZE_TYPE bsize,

+                               int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,

                                PICK_MODE_CONTEXT *ctx);

 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,

                                   int mi_row, int mi_col,

-                                  int *r, int *d, BLOCK_SIZE_TYPE bsize,

+                                  int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,

                                   PICK_MODE_CONTEXT *ctx);

 void vp9_init_me_luts();

--- a/vp9/encoder/x86/vp9_encodeopt.asm

+++ /dev/null

@@ -1,125 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)

-global sym(vp9_block_error_xmm) PRIVATE

-sym(vp9_block_error_xmm):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 2

-    push rsi

-    push rdi

-    ; end prologue

-        mov         rsi,        arg(0) ;coeff_ptr

-        mov         rdi,        arg(1) ;dcoef_ptr

-        movdqa      xmm0,       [rsi]

-        movdqa      xmm1,       [rdi]

-        movdqa      xmm2,       [rsi+16]

-        movdqa      xmm3,       [rdi+16]

-        psubw       xmm0,       xmm1

-        psubw       xmm2,       xmm3

-        pmaddwd     xmm0,       xmm0

-        pmaddwd     xmm2,       xmm2

-        paddd       xmm0,       xmm2

-        pxor        xmm5,       xmm5

-        movdqa      xmm1,       xmm0

-        punpckldq   xmm0,       xmm5

-        punpckhdq   xmm1,       xmm5

-        paddd       xmm0,       xmm1

-        movdqa      xmm1,       xmm0

-        psrldq      xmm0,       8

-        paddd       xmm0,       xmm1

-        movq        rax,        xmm0

-    pop rdi

-    pop rsi

-    ; begin epilog

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)

-global sym(vp9_block_error_mmx) PRIVATE

-sym(vp9_block_error_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 2

-    push rsi

-    push rdi

-    ; end prolog

-        mov         rsi,        arg(0) ;coeff_ptr

-        pxor        mm7,        mm7

-        mov         rdi,        arg(1) ;dcoef_ptr

-        movq        mm3,        [rsi]

-        movq        mm4,        [rdi]

-        movq        mm5,        [rsi+8]

-        movq        mm6,        [rdi+8]

-        pxor        mm1,        mm1 ; from movd mm1, dc ; dc =0

-        movq        mm2,        mm7

-        psubw       mm5,        mm6

-        por         mm1,        mm2

-        pmaddwd     mm5,        mm5

-        pcmpeqw     mm1,        mm7

-        psubw       mm3,        mm4

-        pand        mm1,        mm3

-        pmaddwd     mm1,        mm1

-        paddd       mm1,        mm5

-        movq        mm3,        [rsi+16]

-        movq        mm4,        [rdi+16]

-        movq        mm5,        [rsi+24]

-        movq        mm6,        [rdi+24]

-        psubw       mm5,        mm6

-        pmaddwd     mm5,        mm5

-        psubw       mm3,        mm4

-        pmaddwd     mm3,        mm3

-        paddd       mm3,        mm5

-        paddd       mm1,        mm3

-        movq        mm0,        mm1

-        psrlq       mm1,        32

-        paddd       mm0,        mm1

-        movq        rax,        mm0

-    pop rdi

-    pop rsi

-    ; begin epilog

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- /dev/null

+++ b/vp9/encoder/x86/vp9_error_sse2.asm

@@ -1,0 +1,57 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+SECTION .text

+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)

+INIT_XMM sse2

+cglobal block_error, 3, 3, 6, uqc, dqc, size

+  pxor      m4, m4                 ; accumulator

+  pxor      m5, m5                 ; dedicated zero register

+  lea     uqcq, [uqcq+sizeq*2]

+  lea     dqcq, [dqcq+sizeq*2]

+  neg    sizeq

+.loop:

+  mova      m0, [uqcq+sizeq*2]

+  mova      m2, [dqcq+sizeq*2]

+  mova      m1, [uqcq+sizeq*2+mmsize]

+  mova      m3, [dqcq+sizeq*2+mmsize]

+  psubw     m0, m2

+  psubw     m1, m3

+  ; individual errors are max. 15bit+sign, so squares are 30bit, and

+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)

+  pmaddwd   m0, m0

+  pmaddwd   m1, m1

+  ; accumulate in 64bit

+  punpckldq m2, m0, m5

+  punpckhdq m0, m5

+  punpckldq m3, m1, m5

+  punpckhdq m1, m5

+  paddq     m4, m2

+  paddq     m4, m0

+  paddq     m4, m3

+  paddq     m4, m1

+  add    sizeq, mmsize

+  jl .loop

+  ; accumulate horizontally and store in return value

+  movhlps   m5, m4

+  paddq     m4, m5

+%if ARCH_X86_64

+  movq    rax, m4

+%else

+  pshufd   m5, m4, 0x1

+  movd    eax, m4

+  movd    edx, m5

+%endif

+  RET

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -85,12 +85,12 @@

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm

+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm

-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm

 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c

--

⑨