shithub: libvpx

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -122,13 +122,6 @@

 #define WHT_UPSCALE_FACTOR 2

-#define TX_SIZE_PROBS  6  // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)

-#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \

-                            (c)->fc.tx_probs_8x8p :    \

-                            (b) < BLOCK_SIZE_SB32X32 ? \

-                            (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)

 /* For keyframes, intra block modes are predicted by the (already decoded)

    modes for the Y blocks to the left and above us; for interframes, there

    is a single probability table. */

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1678,6 +1678,7 @@

 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {

   if (lossless) {

+    // printf("Switching to lossless\n");

     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;

     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;

     cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;

@@ -1687,6 +1688,7 @@

     cpi->zbin_mode_boost_enabled = 0;

     cpi->common.txfm_mode = ONLY_4X4;

   } else {

+    // printf("Not lossless\n");

     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;

     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;

     cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;

@@ -1695,7 +1697,7 @@

 static void switch_txfm_mode(VP9_COMP *cpi) {

-  if (cpi->sf.use_largest_txform &&

+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL &&

       cpi->common.txfm_mode >= ALLOW_32X32)

     cpi->common.txfm_mode = ALLOW_32X32;

@@ -1728,6 +1730,7 @@

   vp9_zero(cm->fc.switchable_interp_count);

   vp9_zero(cpi->best_switchable_interp_count);

+  vp9_zero(cpi->txfm_stepdown_count);

   xd->mode_info_context = cm->mi;

   xd->prev_mode_info_context = cm->prev_mi;

@@ -1930,6 +1933,47 @@

+// Classifies the frame being encoded into one of four buckets:
+//   0 - key frame,
+//   3 - golden-frame refresh whose source is the alt-ref frame,
+//   1 - any other golden-frame or alt-ref-frame refresh,
+//   2 - everything else.
+// The result is used in this file as the first index into the
+// rd_tx_select_threshes and rd_prediction_type_threshes tables.
+static int get_frame_type(VP9_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME)
+    return 0;
+  if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
+    return 3;
+  if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+    return 1;
+  return 2;
+}

+// Chooses cpi->common.txfm_mode for the frame about to be encoded.
+//   - Lossless coding always uses ONLY_4X4.
+//   - The very first frame (current_video_frame == 0) uses TX_MODE_SELECT.
+//   - Otherwise the choice follows sf.tx_size_search_method:
+//       USE_FULL_RD    : ALLOW_32X32 if its running RD threshold for this
+//                        frame type exceeds that of TX_MODE_SELECT, else
+//                        TX_MODE_SELECT.
+//       USE_LARGESTALL : ALLOW_32X32.
+//       anything else  : ALLOW_32X32 when more than 90% of the entries
+//                        accumulated in txfm_stepdown_count fall in bucket 0,
+//                        else TX_MODE_SELECT.  If nothing was counted the
+//                        mode is left as it was.
+static void select_txfm_mode(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.lossless) {
+    cm->txfm_mode = ONLY_4X4;
+    return;
+  }
+  if (cm->current_video_frame == 0) {
+    cm->txfm_mode = TX_MODE_SELECT;
+    return;
+  }
+  switch (cpi->sf.tx_size_search_method) {
+    case USE_FULL_RD: {
+      const int ft = get_frame_type(cpi);
+      if (cpi->rd_tx_select_threshes[ft][ALLOW_32X32] >
+          cpi->rd_tx_select_threshes[ft][TX_MODE_SELECT])
+        cm->txfm_mode = ALLOW_32X32;
+      else
+        cm->txfm_mode = TX_MODE_SELECT;
+      break;
+    }
+    case USE_LARGESTALL:
+      cm->txfm_mode = ALLOW_32X32;
+      break;
+    default: {
+      unsigned int counted = 0;
+      int bucket;
+      for (bucket = 0; bucket < TX_SIZE_MAX_SB; ++bucket)
+        counted += cpi->txfm_stepdown_count[bucket];
+      if (counted != 0) {
+        const double bucket0_share =
+            (double)cpi->txfm_stepdown_count[0] / counted;
+        if (bucket0_share > 0.90)
+          cm->txfm_mode = ALLOW_32X32;
+        else
+          cm->txfm_mode = TX_MODE_SELECT;
+      }
+      break;
+    }
+  }
+}

 void vp9_encode_frame(VP9_COMP *cpi) {

   VP9_COMMON * const cm = &cpi->common;

@@ -1940,7 +1984,7 @@

   // side behaviour is where the ALT ref buffer has oppositie sign bias to

   // the other two.

   if ((cm->ref_frame_sign_bias[ALTREF_FRAME]

-      == cm->ref_frame_sign_bias[GOLDEN_FRAME])

+       == cm->ref_frame_sign_bias[GOLDEN_FRAME])

       || (cm->ref_frame_sign_bias[ALTREF_FRAME]

           == cm->ref_frame_sign_bias[LAST_FRAME])) {

     cm->allow_comp_inter_inter = 0;

@@ -1952,9 +1996,7 @@

   if (cpi->sf.RD) {

-    int i, frame_type, pred_type;

-    TXFM_MODE txfm_type;

+    int i, pred_type;

/*

      * This code does a single RD pass over the whole frame assuming

      * either compound, single or hybrid prediction as per whatever has

@@ -1964,26 +2006,19 @@

      * that for subsequent frames.

      * It does the same analysis for transform size selection also.

*/

-    if (cpi->common.frame_type == KEY_FRAME)

-      frame_type = 0;

-    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)

-      frame_type = 3;

-    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)

-      frame_type = 1;

-    else

-      frame_type = 2;

+    int frame_type = get_frame_type(cpi);

     /* prediction (compound, single or hybrid) mode selection */

     if (frame_type == 3 || !cm->allow_comp_inter_inter)

       pred_type = SINGLE_PREDICTION_ONLY;

     else if (cpi->rd_prediction_type_threshes[frame_type][1]

-        > cpi->rd_prediction_type_threshes[frame_type][0]

-        && cpi->rd_prediction_type_threshes[frame_type][1]

-            > cpi->rd_prediction_type_threshes[frame_type][2]

-        && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)

+             > cpi->rd_prediction_type_threshes[frame_type][0]

+             && cpi->rd_prediction_type_threshes[frame_type][1]

+             > cpi->rd_prediction_type_threshes[frame_type][2]

+             && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)

       pred_type = COMP_PREDICTION_ONLY;

     else if (cpi->rd_prediction_type_threshes[frame_type][0]

-        > cpi->rd_prediction_type_threshes[frame_type][2])

+             > cpi->rd_prediction_type_threshes[frame_type][2])

       pred_type = SINGLE_PREDICTION_ONLY;

     else

       pred_type = HYBRID_PREDICTION;

@@ -1992,43 +2027,10 @@

     cpi->mb.e_mbd.lossless = 0;

     if (cpi->oxcf.lossless) {

-      txfm_type = ONLY_4X4;

       cpi->mb.e_mbd.lossless = 1;

-    } else

-#if 0

-      /* FIXME (rbultje): this code is disabled until we support cost updates

-       * while a frame is being encoded; the problem is that each time we

-       * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities

-       * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging

-       * further behind and not being chosen for subsequent frames either. This

-       * is essentially a local minimum problem that we can probably fix by

-       * estimating real costs more closely within a frame, perhaps by re-

-       * calculating costs on-the-fly as frame encoding progresses. */

-      if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >

-          cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&

-          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&

-          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {

-        txfm_type = TX_MODE_SELECT;

-      } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]

-          && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]

-      ) {

-        txfm_type = ONLY_4X4;

-      } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {

-        txfm_type = ALLOW_16X16;

-      } else

-      txfm_type = ALLOW_8X8;

-#else

-      txfm_type =

-          cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]

-              > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?

-              ALLOW_32X32 : TX_MODE_SELECT;

-#endif

-    cpi->common.txfm_mode = txfm_type;

+    }

+    select_txfm_mode(cpi);

     cpi->common.comp_pred_mode = pred_type;

     encode_frame_internal(cpi);

@@ -2043,7 +2045,7 @@

       int diff;

       if (i == TX_MODE_SELECT)

         pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,

-            2048 * (TX_SIZE_MAX_SB - 1), 0);

+                     2048 * (TX_SIZE_MAX_SB - 1), 0);

       diff = (int) (pd / cpi->common.MBs);

       cpi->rd_tx_select_threshes[frame_type][i] += diff;

       cpi->rd_tx_select_threshes[frame_type][i] /= 2;

@@ -2102,7 +2104,7 @@

         cpi->common.txfm_mode = ALLOW_8X8;

         reset_skip_txfm_size(cpi, TX_8X8);

       } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0

-          && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {

+                 && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {

         cpi->common.txfm_mode = ONLY_4X4;

         reset_skip_txfm_size(cpi, TX_4X4);

       } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {

--- a/vp9/encoder/vp9_onyx_if.c

+++ b/vp9/encoder/vp9_onyx_if.c

@@ -701,7 +701,7 @@

   sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;

   sf->adaptive_rd_thresh = 0;

   sf->use_lastframe_partitioning = 0;

-  sf->use_largest_txform = 0;

+  sf->tx_size_search_method = USE_FULL_RD;

   sf->use_8tap_always = 0;

   sf->use_avoid_tested_higherror = 0;

   sf->skip_lots_of_modes = 0;

@@ -744,17 +744,15 @@

       if (speed == 1) {

         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;

         sf->less_rectangular_check  = 1;

-        sf->use_largest_txform        = !(cpi->common.frame_type == KEY_FRAME ||

-                                          cpi->common.intra_only ||

-                                          cpi->common.show_frame == 0);

+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||

+                                      cpi->common.intra_only ||

+                                      cpi->common.show_frame == 0) ?

+                                     USE_FULL_RD :

+                                     USE_LARGESTINTRA);

         sf->disable_splitmv =

             (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;

       if (speed == 2) {

-        sf->use_largest_txform        = !(cpi->common.frame_type == KEY_FRAME ||

-                                                  cpi->common.intra_only ||

-                                                  cpi->common.show_frame == 0);

         sf->adjust_thresholds_by_speed = 1;

         sf->less_rectangular_check  = 1;

         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;

@@ -763,15 +761,30 @@

         sf->use_lastframe_partitioning = 1;

         sf->adjust_partitioning_from_last_frame = 1;

         sf->last_partitioning_redo_frequency = 3;

+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||

+                                      cpi->common.intra_only ||

+                                      cpi->common.show_frame == 0) ?

+                                     USE_FULL_RD :

+                                     USE_LARGESTALL);

       if (speed == 3) {

         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;

         sf->partition_by_variance = 1;

+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||

+                                      cpi->common.intra_only ||

+                                      cpi->common.show_frame == 0) ?

+                                     USE_FULL_RD :

+                                     USE_LARGESTALL);

       if (speed == 4) {

         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;

         sf->use_one_partition_size_always = 1;

         sf->always_this_block_size = BLOCK_SIZE_MB16X16;

+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||

+                                      cpi->common.intra_only ||

+                                      cpi->common.show_frame == 0) ?

+                                     USE_FULL_RD :

+                                     USE_LARGESTALL);

/*

       if (speed == 2) {

@@ -788,7 +801,7 @@

*/

-     break;

+      break;

   }; /* switch */

--- a/vp9/encoder/vp9_onyx_int.h

+++ b/vp9/encoder/vp9_onyx_int.h

@@ -200,6 +200,13 @@

   HEX = 2

 } SEARCH_METHODS;

+// Transform-size search strategy, stored in the speed-feature field
+// tx_size_search_method.  USE_FULL_RD is the default assigned when the
+// speed features are reset; the numeric values are written out explicitly
+// so they do not depend on declaration order.
+typedef enum {
+  USE_FULL_RD = 0,
+  USE_LARGESTINTRA = 1,
+  USE_LARGESTINTRA_MODELINTER = 2,
+  USE_LARGESTALL = 3
+} TX_SIZE_SEARCH_METHOD;

 typedef struct {

   int RD;

   SEARCH_METHODS search_method;

@@ -219,7 +226,7 @@

   int adaptive_rd_thresh;

   int skip_encode_sb;

   int use_lastframe_partitioning;

-  int use_largest_txform;

+  TX_SIZE_SEARCH_METHOD tx_size_search_method;

   int use_8tap_always;

   int use_avoid_tested_higherror;

   int skip_lots_of_modes;

@@ -588,6 +595,8 @@

   unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]

                                       [VP9_SWITCHABLE_FILTERS];

   unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];

+  unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB];

   int initial_width;

   int initial_height;

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -279,6 +279,242 @@

+// Converts a block width/height pair (in samples) into the corresponding
+// BLOCK_WxH enumerator.  Only the thirteen combinations listed below are
+// recognised; anything else trips the assert and, when asserts are
+// compiled out, returns -1.
+static enum BlockSize get_block_size(int bw, int bh) {
+  switch (bw) {
+    case 4:
+      if (bh == 4) return BLOCK_4X4;
+      if (bh == 8) return BLOCK_4X8;
+      break;
+    case 8:
+      if (bh == 4) return BLOCK_8X4;
+      if (bh == 8) return BLOCK_8X8;
+      if (bh == 16) return BLOCK_8X16;
+      break;
+    case 16:
+      if (bh == 8) return BLOCK_16X8;
+      if (bh == 16) return BLOCK_16X16;
+      if (bh == 32) return BLOCK_16X32;
+      break;
+    case 32:
+      if (bh == 16) return BLOCK_32X16;
+      if (bh == 32) return BLOCK_32X32;
+      if (bh == 64) return BLOCK_32X64;
+      break;
+    case 64:
+      if (bh == 32) return BLOCK_64X32;
+      if (bh == 64) return BLOCK_64X64;
+      break;
+    default:
+      break;
+  }
+  assert(0);
+  return -1;
+}

+// Convenience wrapper: queries the width and height of `bsize` within
+// plane `pd` and maps that pair to a BlockSize through get_block_size().
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  const int plane_w = plane_block_width(bsize, pd);
+  const int plane_h = plane_block_height(bsize, pd);
+  return get_block_size(plane_w, plane_h);
+}

+// Piecewise-linear lookup into `tab`, which holds `ntab` samples of a
+// function taken every 1/inv_step units of x, starting at x = 0.
+//   x        - abscissa; callers are expected to pass a non-negative value.
+//   ntab     - number of entries in `tab` (at least 1).
+//   inv_step - number of table entries per unit of x.
+//   tab      - the sample table.
+// Returns the interpolated value.  Anything at or beyond the last sample
+// (including +infinity and NaN) yields tab[ntab - 1]; anything at or below
+// zero yields tab[0].  The range tests are performed on the double before
+// it is converted to int, so the conversion cannot overflow and the table
+// is never indexed outside [0, ntab - 1].
+static double linear_interpolate(double x, int ntab, int inv_step,
+                                 const double *tab) {
+  const double y = x * inv_step;
+  if (!(y < ntab - 1)) {
+    // Written as a negated "<" so that NaN also lands here.
+    return tab[ntab - 1];
+  } else if (y <= 0.0) {
+    return tab[0];
+  } else {
+    const int d = (int) y;
+    const double a = y - d;
+    return tab[d] * (1 - a) + tab[d + 1] * a;
+  }
+}

+// Normalized rate model.
+// Approximates the rate of a Laplacian source with a given variance when
+// it is quantized by a uniform quantizer with a given step size.  The
+// closed-form expression is
+//   Rn(x) = H(sqrt(r)) + sqrt(r) * [1 + H(r) / (1 - r)],
+// where r = exp(-sqrt(2) * x), x = qpstep / sqrt(variance), and H() is the
+// binary entropy function.  The curve is stored as a table with 8 entries
+// per unit of x and evaluated by linear interpolation; x must be
+// non-negative.
+static double model_rate_norm(double x) {
+  enum { RATE_ENTRIES_PER_UNIT = 8 };
+  static const double rate_tab[] = {
+    64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
+    2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
+    1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
+    0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
+    0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
+    0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
+    0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
+    0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
+    0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
+    0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
+    0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
+    0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
+    0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
+    0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
+    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
+  };
+  assert(x >= 0.0);
+  return linear_interpolate(x, (int)(sizeof(rate_tab) / sizeof(rate_tab[0])),
+                            RATE_ENTRIES_PER_UNIT, rate_tab);
+}

+// Normalized distortion model.
+// Approximates the normalized distortion of a Laplacian source with a
+// given variance when it is quantized by a uniform quantizer with a given
+// step size.  The closed-form expression is
+//   Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)),
+// where x = qpstep / sqrt(variance).  The actual distortion is
+// Dn * variance.  The curve is stored as a table with 8 entries per unit
+// of x and evaluated by linear interpolation; x must be non-negative.
+static double model_dist_norm(double x) {
+  enum { DIST_ENTRIES_PER_UNIT = 8 };
+  static const double dist_tab[] = {
+    0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
+    0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
+    0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
+    0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
+    0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
+    0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
+    0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
+    0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
+    0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
+    0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
+    0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
+    0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
+    0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
+    0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
+    0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
+  };
+  assert(x >= 0.0);
+  return linear_interpolate(x, (int)(sizeof(dist_tab) / sizeof(dist_tab[0])),
+                            DIST_ENTRIES_PER_UNIT, dist_tab);
+}

+// Models rate and distortion for a Laplacian source quantized with a
+// uniform quantizer, following Hang and Chen, "Source Model for transform
+// video coder and its application - Part I: Fundamental Theory", IEEE
+// Trans. Circ. Sys. for Video Tech., April 1997.
+//   var   - total energy measure of the block; it is divided by n to get a
+//           per-sample value.
+//   n     - number of samples in the block.
+//   qstep - quantizer step size.
+//   rate  - receives n * Rn * 256, rounded (Rn from model_rate_norm).
+//   dist  - receives n * Dn * (var / n), rounded (Dn from model_dist_norm).
+// Both outputs are zero when var or n is zero.  vp9_clear_system_state()
+// is called on entry and again on exit, around the floating-point work.
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+                                     int *rate, int64_t *dist) {
+  vp9_clear_system_state();
+  if (var != 0 && n != 0) {
+    const double per_sample_var = (double) var / n;
+    const double step_ratio = qstep / sqrt(per_sample_var);
+    double norm_dist = model_dist_norm(step_ratio);
+    double norm_rate = model_rate_norm(step_ratio);
+    if (norm_rate < 0) {
+      norm_rate = 0;
+      norm_dist = var;
+    }
+    *rate = (n * norm_rate * 256 + 0.5);
+    *dist = (n * norm_dist * per_sample_var + 0.5);
+  } else {
+    *rate = 0;
+    *dist = 0;
+  }
+  vp9_clear_system_state();
+}

+// Estimates rate and distortion for block `bsize` across all MAX_MB_PLANE
+// planes without running the transform.  For each plane the SSE between
+// the source buffer and pd->dst is measured with the variance function for
+// that plane's block size and passed to model_rd_from_var_lapndz().
+//   out_rate_sum - receives the sum of the per-plane modelled rates.
+//   out_dist_sum - receives the sum of the per-plane modelled distortions,
+//                  shifted left by 4.
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int i, rate_sum = 0;
+  // Accumulate distortion in 64 bits: the per-plane values are int64_t and
+  // the final "<< 4" can exceed the range of int for large blocks.
+  int64_t dist_sum = 0;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+    // Only the SSE output is needed; the function's return value is dropped.
+    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, &sse);
+    // sse works better than var, since there is no dc prediction used
+    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}

+// Estimates luma (plane 0) rate and distortion for block `bsize` when it
+// is coded with transforms of size `tx_size`.  The block is walked in
+// tx_size x tx_size tiles; for each tile the SSE between the source buffer
+// and pd->dst is measured and passed to model_rd_from_var_lapndz().
+//   out_rate_sum - receives the sum of the per-tile modelled rates.
+//   out_dist_sum - receives the sum of the per-tile modelled distortions,
+//                  shifted left by 4.
+//   out_skip     - set to 1 only if every tile's modelled rate is below
+//                  1024, otherwise 0.
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                                 TX_SIZE tx_size,
+                                 MACROBLOCK *x, MACROBLOCKD *xd,
+                                 int *out_rate_sum, int64_t *out_dist_sum,
+                                 int *out_skip) {
+  int t, j, k;
+  enum BlockSize bs;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  int rate_sum = 0;
+  int64_t dist_sum = 0;
+  switch (tx_size) {
+    case TX_8X8:
+      bs = BLOCK_8X8;
+      t = 8;
+      break;
+    case TX_16X16:
+      bs = BLOCK_16X16;
+      t = 16;
+      break;
+    case TX_32X32:
+      bs = BLOCK_32X32;
+      t = 32;
+      break;
+    default:
+      // TX_4X4 is the only other valid size.  An unknown value still fails
+      // the assert in debug builds; with asserts compiled out it falls back
+      // to 4x4 tiles so that `bs` and `t` are never read uninitialized.
+      assert(tx_size == TX_4X4);
+      bs = BLOCK_4X4;
+      t = 4;
+      break;
+  }
+  assert(bs <= get_block_size(bw, bh));
+  *out_skip = 1;
+  for (j = 0; j < bh; j+=t) {
+    for (k = 0; k < bw; k+=t) {
+      int rate;
+      int64_t dist;
+      unsigned int sse;
+      // Only the SSE output is needed; the function's return value is dropped.
+      (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
+                                p->src.stride,
+                                pd->dst.buf + j * pd->dst.stride + k,
+                                pd->dst.stride, &sse);
+      // sse works better than var, since there is no dc prediction used
+      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+                               &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+      *out_skip &= (rate < 1024);
+    }
+  }
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = (dist_sum << 4);
+}

 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,

                           intptr_t block_size, int64_t *ssz) {

   int i;

@@ -423,105 +659,6 @@

   return cost;

-static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,

-                                     int (*r)[2], int *rate,

-                                     int64_t *d, int64_t *distortion,

-                                     int *s, int *skip,

-                                     int64_t txfm_cache[NB_TXFM_MODES],

-                                     TX_SIZE max_txfm_size) {

-  VP9_COMMON *const cm = &cpi->common;

-  MACROBLOCKD *const xd = &x->e_mbd;

-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

-  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);

-  int64_t rd[TX_SIZE_MAX_SB][2];

-  int n, m;

-  int s0, s1;

-  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);

-  for (n = TX_4X4; n <= max_txfm_size; n++) {

-    r[n][1] = r[n][0];

-    for (m = 0; m <= n - (n == max_txfm_size); m++) {

-      if (m == n)

-        r[n][1] += vp9_cost_zero(tx_probs[m]);

-      else

-        r[n][1] += vp9_cost_one(tx_probs[m]);

-    }

-  }

-  assert(skip_prob > 0);

-  s0 = vp9_cost_bit(skip_prob, 0);

-  s1 = vp9_cost_bit(skip_prob, 1);

-  for (n = TX_4X4; n <= max_txfm_size; n++) {

-    if (s[n]) {

-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);

-    } else {

-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);

-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);

-    }

-  }

-  if (max_txfm_size == TX_32X32 &&

-      (cm->txfm_mode == ALLOW_32X32 ||

-       (cm->txfm_mode == TX_MODE_SELECT &&

-        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&

-        rd[TX_32X32][1] < rd[TX_4X4][1]))) {

-    mbmi->txfm_size = TX_32X32;

-  } else if (max_txfm_size >= TX_16X16 &&

-             (cm->txfm_mode == ALLOW_16X16 ||

-              cm->txfm_mode == ALLOW_32X32 ||

-              (cm->txfm_mode == TX_MODE_SELECT &&

-               rd[TX_16X16][1] < rd[TX_8X8][1] &&

-               rd[TX_16X16][1] < rd[TX_4X4][1]))) {

-    mbmi->txfm_size = TX_16X16;

-  } else if (cm->txfm_mode == ALLOW_8X8 ||

-             cm->txfm_mode == ALLOW_16X16 ||

-             cm->txfm_mode == ALLOW_32X32 ||

-           (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {

-    mbmi->txfm_size = TX_8X8;

-  } else {

-    mbmi->txfm_size = TX_4X4;

-  }

-  *distortion = d[mbmi->txfm_size];

-  *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];

-  *skip       = s[mbmi->txfm_size];

-  txfm_cache[ONLY_4X4] = rd[TX_4X4][0];

-  txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];

-  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];

-  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];

-  if (max_txfm_size == TX_32X32 &&

-      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&

-      rd[TX_32X32][1] < rd[TX_4X4][1])

-    txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];

-  else if (max_txfm_size >= TX_16X16 &&

-           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])

-    txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];

-  else

-    txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?

-                                 rd[TX_4X4][1] : rd[TX_8X8][1];

-}

-static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,

-                                int shift, int64_t *sse) {

-  int64_t sum = 0, this_sse;

-  int plane;

-  *sse = 0;

-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {

-    struct macroblockd_plane *p = &x->e_mbd.plane[plane];

-    const int bw = plane_block_width(bsize, p);

-    const int bh = plane_block_height(bsize, p);

-    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,

-                           bw * bh, &this_sse);

-    *sse += this_sse;

-  }

-  *sse >>= shift;

-  return sum >> shift;

-}

 struct rdcost_block_args {

   VP9_COMMON *cm;

   MACROBLOCK *x;

@@ -568,6 +705,7 @@

                             args->bw * args->bh);

 static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,

                         BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {

   MACROBLOCKD * const xd = &x->e_mbd;

@@ -575,7 +713,7 @@

   const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;

   const int bw = 1 << bwl, bh = 1 << bhl;

   struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,

-                                    0, 0, 0, 0, 0 };

+    0, 0, 0, 0, 0 };

   vpx_memcpy(&args.t_above, xd->plane[plane].above_context,

              sizeof(ENTROPY_CONTEXT) * bw);

@@ -583,7 +721,6 @@

              sizeof(ENTROPY_CONTEXT) * bh);

   foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);

   return args.rate;

@@ -597,6 +734,49 @@

   return cost;

+// Sums the squared differences between the first `block_size` entries of
+// `coeff` and `dqcoeff`, shifts the total right by `shift`, and returns
+// the result saturated to INT_MAX.
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
+  int64_t total = 0;
+  int idx = 0;
+  while (idx < block_size) {
+    const int delta = coeff[idx] - dqcoeff[idx];
+    total += (unsigned)delta * delta;
+    ++idx;
+  }
+  total >>= shift;
+  if (total > INT_MAX)
+    return INT_MAX;
+  return (int)total;
+}

+// Computes the luma (plane 0) coefficient error for block `bsize`.
+//   shift - right shift applied to both the error and *sse.
+//   sse   - receives the value vp9_block_error() writes through its last
+//           argument, shifted right by `shift`.
+// Returns the shifted error.  The error is computed in 64 bits; it is
+// saturated to INT_MAX before being returned so that a large value is not
+// silently truncated by the int return type (block_error() in this file
+// clamps the same way).
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                           int shift, int64_t *sse) {
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e > INT_MAX ? INT_MAX : (int)e;
+}

+// Computes the combined coefficient error of planes 1..MAX_MB_PLANE-1 for
+// block `bsize`.
+//   shift - right shift applied to both totals.
+//   sse   - receives the sum of the per-plane values written by
+//           vp9_block_error() through its last argument, shifted right by
+//           `shift`.
+// Returns the summed per-plane error, shifted right by `shift`.
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                                int shift, int64_t *sse) {
+  int64_t err_total = 0;
+  int64_t sse_total = 0;
+  int pl;
+  for (pl = 1; pl < MAX_MB_PLANE; ++pl) {
+    struct macroblockd_plane *pd = &x->e_mbd.plane[pl];
+    const int plane_w = plane_block_width(bsize, pd);
+    const int plane_h = plane_block_height(bsize, pd);
+    int64_t plane_sse;
+    err_total += vp9_block_error(x->plane[pl].coeff,
+                                 x->e_mbd.plane[pl].dqcoeff,
+                                 plane_w * plane_h, &plane_sse);
+    sse_total += plane_sse;
+  }
+  *sse = sse_total >> shift;
+  return err_total >> shift;
+}

 static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,

                            int ss_txfrm_size, void *arg) {

   struct rdcost_block_args *args = arg;

@@ -635,6 +815,234 @@

   *skippable  = vp9_sby_is_skippable(xd, bsize);

+// Picks a transform size without any rate-distortion comparison, stores

+// it in the block's mbmi->txfm_size and evaluates it with

+// super_block_yrd_for_txfm().

+//

+// The size is limited by the block size bs (TX_32X32 only from

+// BLOCK_SIZE_SB32X32 up, TX_16X16 only from BLOCK_SIZE_MB16X16 up) and by

+// cm->txfm_mode. rate, distortion and skip are forwarded to

+// super_block_yrd_for_txfm() together with the sse entry of the chosen

+// size. cpi->txfm_stepdown_count[0] is incremented on every call.

+static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,

+                                     int *rate, int64_t *distortion,

+                                     int *skip, int64_t *sse,

+                                     BLOCK_SIZE_TYPE bs) {

+  VP9_COMMON *const cm = &cpi->common;

+  MB_MODE_INFO *const mbmi = &x->e_mbd.mode_info_context->mbmi;

+  // Largest size the block dimensions allow.

+  const TX_SIZE size_cap = TX_32X32

+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);

+  // Sizes that cm->txfm_mode allows.

+  const int mode_allows_32 = cm->txfm_mode == ALLOW_32X32 ||

+                             cm->txfm_mode == TX_MODE_SELECT;

+  const int mode_allows_16 = mode_allows_32 ||

+                             cm->txfm_mode == ALLOW_16X16;

+  TX_SIZE chosen;

+  if (size_cap == TX_32X32 && mode_allows_32)

+    chosen = TX_32X32;

+  else if (size_cap >= TX_16X16 && mode_allows_16)

+    chosen = TX_16X16;

+  else if (cm->txfm_mode != ONLY_4X4)

+    chosen = TX_8X8;

+  else

+    chosen = TX_4X4;

+  mbmi->txfm_size = chosen;

+  super_block_yrd_for_txfm(cm, x, rate, distortion, skip,

+                           &sse[chosen], bs, chosen);

+  cpi->txfm_stepdown_count[0]++;

+}

+// Selects the transform size for the current block from per-size rate,

+// distortion and skip figures supplied by the caller.

+//

+//   r[n][0]     in:  rate for transform size n.

+//   r[n][1]     out: r[n][0] plus the tx_probs cost terms added below.

+//   d[n], s[n]  in:  distortion and skip flag for transform size n.

+//   rate, distortion, skip

+//               out: the figures of the selected size.

+//   txfm_cache  out: one RD cost per transform mode.

+//

+// Only sizes TX_4X4 .. max_txfm_size are filled in and compared, where

+// max_txfm_size is derived from bs. The selected size is written to

+// mbmi->txfm_size and one cpi->txfm_stepdown_count entry is incremented.

+static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,

+                                     int (*r)[2], int *rate,

+                                     int64_t *d, int64_t *distortion,

+                                     int *s, int *skip,

+                                     int64_t txfm_cache[NB_TXFM_MODES],

+                                     BLOCK_SIZE_TYPE bs) {

+  // TX_32X32 from BLOCK_SIZE_SB32X32 up, one step less below that, two

+  // steps less below BLOCK_SIZE_MB16X16.

+  const TX_SIZE max_txfm_size = TX_32X32

+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);

+  VP9_COMMON *const cm = &cpi->common;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

+  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);

+  // rd[n][0] is built from r[n][0], rd[n][1] from r[n][1]. Entries above

+  // max_txfm_size are never written, so every read below is guarded by a

+  // max_txfm_size test that short-circuits first.

+  int64_t rd[TX_SIZE_MAX_SB][2];

+  int n, m;

+  int s0, s1;

+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);

+  // r[n][1] = r[n][0] + vp9_cost_one(tx_probs[m]) for every m < n, plus

+  // vp9_cost_zero(tx_probs[n]) unless n is max_txfm_size.

+  for (n = TX_4X4; n <= max_txfm_size; n++) {

+    r[n][1] = r[n][0];

+    for (m = 0; m <= n - (n == max_txfm_size); m++) {

+      if (m == n)

+        r[n][1] += vp9_cost_zero(tx_probs[m]);

+      else

+        r[n][1] += vp9_cost_one(tx_probs[m]);

+    }

+  }

+  assert(skip_prob > 0);

+  s0 = vp9_cost_bit(skip_prob, 0);

+  s1 = vp9_cost_bit(skip_prob, 1);

+  // When s[n] is set both columns use s1 as the only rate term; otherwise

+  // s0 is added to the respective rate.

+  for (n = TX_4X4; n <= max_txfm_size; n++) {

+    if (s[n]) {

+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);

+    } else {

+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);

+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);

+    }

+  }

+  // The ALLOW_* modes take the largest size that both the mode and

+  // max_txfm_size permit. Under TX_MODE_SELECT a size is taken only when

+  // its rd[..][1] is strictly lower than that of every smaller size.

+  if (max_txfm_size == TX_32X32 &&

+      (cm->txfm_mode == ALLOW_32X32 ||

+       (cm->txfm_mode == TX_MODE_SELECT &&

+        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&

+        rd[TX_32X32][1] < rd[TX_4X4][1]))) {

+    mbmi->txfm_size = TX_32X32;

+  } else if (max_txfm_size >= TX_16X16 &&

+             (cm->txfm_mode == ALLOW_16X16 ||

+              cm->txfm_mode == ALLOW_32X32 ||

+              (cm->txfm_mode == TX_MODE_SELECT &&

+               rd[TX_16X16][1] < rd[TX_8X8][1] &&

+               rd[TX_16X16][1] < rd[TX_4X4][1]))) {

+    mbmi->txfm_size = TX_16X16;

+  } else if (cm->txfm_mode == ALLOW_8X8 ||

+             cm->txfm_mode == ALLOW_16X16 ||

+             cm->txfm_mode == ALLOW_32X32 ||

+           (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {

+    mbmi->txfm_size = TX_8X8;

+  } else {

+    mbmi->txfm_size = TX_4X4;

+  }

+  // The reported rate uses column 1 of r only under TX_MODE_SELECT.

+  *distortion = d[mbmi->txfm_size];

+  *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];

+  *skip       = s[mbmi->txfm_size];

+  // Column-0 costs for the fixed modes; the MIN() keeps the index within

+  // the sizes that were filled in for this block.

+  txfm_cache[ONLY_4X4] = rd[TX_4X4][0];

+  txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];

+  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];

+  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];

+  // TX_MODE_SELECT entry: the column-1 cost of the size the strict

+  // comparison above would pick, whatever cm->txfm_mode currently is.

+  if (max_txfm_size == TX_32X32 &&

+      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&

+      rd[TX_32X32][1] < rd[TX_4X4][1])

+    txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];

+  else if (max_txfm_size >= TX_16X16 &&

+           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])

+    txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];

+  else

+    txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?

+                                 rd[TX_4X4][1] : rd[TX_8X8][1];

+  // Count how many sizes below max_txfm_size the rd-preferred size lies.

+  // This follows the rd comparison only and does not look at

+  // cm->txfm_mode.

+  if (max_txfm_size == TX_32X32 &&

+      rd[TX_32X32][1] < rd[TX_16X16][1] &&

+      rd[TX_32X32][1] < rd[TX_8X8][1] &&

+      rd[TX_32X32][1] < rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[0]++;

+  } else if (max_txfm_size >= TX_16X16 &&

+             rd[TX_16X16][1] < rd[TX_8X8][1] &&

+             rd[TX_16X16][1] < rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;

+  } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;

+  } else {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;

+  }

+}

+// Selects the transform size for the current block from per-size rate,

+// distortion and skip figures, after scaling each size's RD cost by a

+// fixed per-size factor.

+//

+//   r[n][0]     in:  rate for transform size n.

+//   r[n][1]     out: r[n][0] plus the tx_probs cost terms added below.

+//   d[n], s[n]  in:  distortion and skip flag for transform size n.

+//   sse         the entry of the selected size is passed to

+//               super_block_yrd_for_txfm() when that function is called.

+//   model_used  in:  non-zero entry n means the figures for size n are

+//               re-evaluated with super_block_yrd_for_txfm() if size n

+//               is selected.

+//   rate, distortion, skip

+//               out: the figures of the selected size.

+//

+// Only sizes TX_4X4 .. max_txfm_size are filled in and compared, where

+// max_txfm_size is derived from bs. The selected size is written to

+// mbmi->txfm_size and one cpi->txfm_stepdown_count entry is incremented.

+static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,

+                                          int (*r)[2], int *rate,

+                                          int64_t *d, int64_t *distortion,

+                                          int *s, int *skip, int64_t *sse,

+                                          BLOCK_SIZE_TYPE bs,

+                                          int *model_used) {

+  // TX_32X32 from BLOCK_SIZE_SB32X32 up, one step less below that, two

+  // steps less below BLOCK_SIZE_MB16X16.

+  const TX_SIZE max_txfm_size = TX_32X32

+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);

+  VP9_COMMON *const cm = &cpi->common;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

+  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);

+  // Entries above max_txfm_size are never written, so every read below is

+  // guarded by a max_txfm_size test that short-circuits first.

+  int64_t rd[TX_SIZE_MAX_SB][2];

+  int n, m;

+  int s0, s1;

+  // Multipliers applied to rd[n][*] below, indexed by transform size n.

+  // NOTE(review): the rationale for these constants is not visible here.

+  double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};

+  // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};

+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);

+  // for (n = TX_4X4; n <= max_txfm_size; n++)

+  //   r[n][0] = (r[n][0] * scale_r[n]);

+  // r[n][1] = r[n][0] + vp9_cost_one(tx_probs[m]) for every m < n, plus

+  // vp9_cost_zero(tx_probs[n]) unless n is max_txfm_size.

+  for (n = TX_4X4; n <= max_txfm_size; n++) {

+    r[n][1] = r[n][0];

+    for (m = 0; m <= n - (n == max_txfm_size); m++) {

+      if (m == n)

+        r[n][1] += vp9_cost_zero(tx_probs[m]);

+      else

+        r[n][1] += vp9_cost_one(tx_probs[m]);

+    }

+  }

+  assert(skip_prob > 0);

+  s0 = vp9_cost_bit(skip_prob, 0);

+  s1 = vp9_cost_bit(skip_prob, 1);

+  // When s[n] is set both columns use s1 as the only rate term; otherwise

+  // s0 is added to the respective rate.

+  for (n = TX_4X4; n <= max_txfm_size; n++) {

+    if (s[n]) {

+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);

+    } else {

+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);

+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);

+    }

+  }

+  // The product is computed in double and converted back to int64_t on

+  // assignment, which discards the fractional part.

+  for (n = TX_4X4; n <= max_txfm_size; n++) {

+    rd[n][0] = (scale_rd[n] * rd[n][0]);

+    rd[n][1] = (scale_rd[n] * rd[n][1]);

+  }

+  // The ALLOW_* modes take the largest size that both the mode and

+  // max_txfm_size permit. Under TX_MODE_SELECT a size is taken when its

+  // scaled rd[..][1] is no higher than that of every smaller size, so a

+  // tie goes to the larger size.

+  if (max_txfm_size == TX_32X32 &&

+      (cm->txfm_mode == ALLOW_32X32 ||

+       (cm->txfm_mode == TX_MODE_SELECT &&

+        rd[TX_32X32][1] <= rd[TX_16X16][1] &&

+        rd[TX_32X32][1] <= rd[TX_8X8][1] &&

+        rd[TX_32X32][1] <= rd[TX_4X4][1]))) {

+    mbmi->txfm_size = TX_32X32;

+  } else if (max_txfm_size >= TX_16X16 &&

+             (cm->txfm_mode == ALLOW_16X16 ||

+              cm->txfm_mode == ALLOW_32X32 ||

+              (cm->txfm_mode == TX_MODE_SELECT &&

+               rd[TX_16X16][1] <= rd[TX_8X8][1] &&

+               rd[TX_16X16][1] <= rd[TX_4X4][1]))) {

+    mbmi->txfm_size = TX_16X16;

+  } else if (cm->txfm_mode == ALLOW_8X8 ||

+             cm->txfm_mode == ALLOW_16X16 ||

+             cm->txfm_mode == ALLOW_32X32 ||

+           (cm->txfm_mode == TX_MODE_SELECT &&

+            rd[TX_8X8][1] <= rd[TX_4X4][1])) {

+    mbmi->txfm_size = TX_8X8;

+  } else {

+    mbmi->txfm_size = TX_4X4;

+  }

+  if (model_used[mbmi->txfm_size]) {

+    // Actually encode using the chosen mode if a model was used, but do not

+    // update the r, d costs

+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip,

+                             &sse[mbmi->txfm_size], bs, mbmi->txfm_size);

+  } else {

+    // The reported rate uses column 1 of r only under TX_MODE_SELECT.

+    *distortion = d[mbmi->txfm_size];

+    *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];

+    *skip       = s[mbmi->txfm_size];

+  }

+  // Count how many sizes below max_txfm_size the rd-preferred size lies.

+  // This follows the scaled rd comparison only and does not look at

+  // cm->txfm_mode.

+  if (max_txfm_size == TX_32X32 &&

+      rd[TX_32X32][1] <= rd[TX_16X16][1] &&

+      rd[TX_32X32][1] <= rd[TX_8X8][1] &&

+      rd[TX_32X32][1] <= rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[0]++;

+  } else if (max_txfm_size >= TX_16X16 &&

+             rd[TX_16X16][1] <= rd[TX_8X8][1] &&

+             rd[TX_16X16][1] <= rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;

+  } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;

+  } else {

+    cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;

+  }

+}

 static void super_block_yrd(VP9_COMP *cpi,

                             MACROBLOCK *x, int *rate, int64_t *distortion,

                             int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,

@@ -649,38 +1057,67 @@

   if (mbmi->ref_frame[0] > INTRA_FRAME)

     vp9_subtract_sby(x, bs);

-  if (cpi->sf.use_largest_txform) {

-    if (bs >= BLOCK_SIZE_SB32X32) {

-      mbmi->txfm_size = TX_32X32;

-    } else if (bs >= BLOCK_SIZE_MB16X16) {

-      mbmi->txfm_size = TX_16X16;

-    } else if (bs >= BLOCK_SIZE_SB8X8) {

-      mbmi->txfm_size = TX_8X8;

-    } else {

-      mbmi->txfm_size = TX_4X4;

-    }

+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||

+      (cpi->sf.tx_size_search_method != USE_FULL_RD &&

+       mbmi->ref_frame[0] == INTRA_FRAME)) {

     vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));

-    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,

-                             mbmi->txfm_size);

+    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, bs);

     if (psse)

-      *psse = sse[0];

+      *psse = sse[mbmi->txfm_size];

     return;

-  if (bs >= BLOCK_SIZE_SB32X32)

-    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],

-                             &sse[TX_32X32], bs, TX_32X32);

-  if (bs >= BLOCK_SIZE_MB16X16)

-    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],

-                             &sse[TX_16X16], bs, TX_16X16);

-  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],

-                           &sse[TX_8X8], bs, TX_8X8);

-  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],

-                           &sse[TX_4X4], bs, TX_4X4);

-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,

-                           skip, txfm_cache,

-                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)

-                           - (bs < BLOCK_SIZE_MB16X16));

+  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&

+      mbmi->ref_frame[0] > INTRA_FRAME) {

+    int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1};

+    if (bs >= BLOCK_SIZE_SB32X32) {

+      if (model_used[TX_32X32]) {

+        model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,

+                             &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);

+      } else {

+        super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],

+                                 &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32);

+      }

+    }

+    if (bs >= BLOCK_SIZE_MB16X16) {

+      if (model_used[TX_16X16]) {

+        model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,

+                             &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);

+      } else {

+        super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],

+                                 &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16);

+      }

+    }

+    if (model_used[TX_8X8]) {

+      model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,

+                           &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);

+    } else {

+      super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],

+                               &sse[TX_8X8], bs, TX_8X8);

+    }

+    if (model_used[TX_4X4]) {

+      model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,

+                           &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);

+    } else {

+      super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],

+                               &sse[TX_4X4], bs, TX_4X4);

+    }

+    choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,

+                                  skip, sse, bs, model_used);

+  } else {

+    if (bs >= BLOCK_SIZE_SB32X32)

+      super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],

+                               &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32);

+    if (bs >= BLOCK_SIZE_MB16X16)

+      super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],

+                               &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16);

+    super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],

+                             &sse[TX_8X8], bs, TX_8X8);

+    super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],

+                             &sse[TX_4X4], bs, TX_4X4);

+    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,

+                             skip, txfm_cache, bs);

+  }

   if (psse)

     *psse = sse[mbmi->txfm_size];

@@ -909,8 +1346,10 @@

     return best_rd;

-  for (i = 0; i < NB_TXFM_MODES; i++)

-    txfm_cache[i] = INT64_MAX;

+  if (cpi->sf.tx_size_search_method == USE_FULL_RD) {

+    for (i = 0; i < NB_TXFM_MODES; i++)

+      txfm_cache[i] = INT64_MAX;

+  }

   /* Y Search for 32x32 intra prediction mode */

   for (mode = DC_PRED; mode <= TM_PRED; mode++) {

@@ -943,11 +1382,13 @@

       *skippable      = s;

-    for (i = 0; i < NB_TXFM_MODES; i++) {

-      int64_t adj_rd = this_rd + local_txfm_cache[i] -

-                       local_txfm_cache[cpi->common.txfm_mode];

-      if (adj_rd < txfm_cache[i]) {

-        txfm_cache[i] = adj_rd;

+    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {

+      for (i = 0; i < NB_TXFM_MODES; i++) {

+        int64_t adj_rd = this_rd + local_txfm_cache[i] -

+            local_txfm_cache[cpi->common.txfm_mode];

+        if (adj_rd < txfm_cache[i]) {

+          txfm_cache[i] = adj_rd;

+        }

@@ -1246,50 +1687,6 @@

   return r;

-static enum BlockSize get_block_size(int bw, int bh) {

-  if (bw == 4 && bh == 4)

-    return BLOCK_4X4;

-  if (bw == 4 && bh == 8)

-    return BLOCK_4X8;

-  if (bw == 8 && bh == 4)

-    return BLOCK_8X4;

-  if (bw == 8 && bh == 8)

-    return BLOCK_8X8;

-  if (bw == 8 && bh == 16)

-    return BLOCK_8X16;

-  if (bw == 16 && bh == 8)

-    return BLOCK_16X8;

-  if (bw == 16 && bh == 16)

-    return BLOCK_16X16;

-  if (bw == 32 && bh == 32)

-    return BLOCK_32X32;

-  if (bw == 32 && bh == 16)

-    return BLOCK_32X16;

-  if (bw == 16 && bh == 32)

-    return BLOCK_16X32;

-  if (bw == 64 && bh == 32)

-    return BLOCK_64X32;

-  if (bw == 32 && bh == 64)

-    return BLOCK_32X64;

-  if (bw == 64 && bh == 64)

-    return BLOCK_64X64;

-  assert(0);

-  return -1;

-}

 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {

   MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;

   x->plane[0].src.buf =

@@ -1837,195 +2234,6 @@

   return scaled_ref_frame;

-static double linear_interpolate(double x, int ntab, double step,

-                                 const double *tab) {

-  double y = x / step;

-  int d = (int) y;

-  double a = y - d;

-  if (d >= ntab - 1)

-    return tab[ntab - 1];

-  else

-    return tab[d] * (1 - a) + tab[d + 1] * a;

-}

-static double model_rate_norm(double x) {

-  // Normalized rate

-  // This function models the rate for a Laplacian source

-  // source with given variance when quantized with a uniform quantizer

-  // with given stepsize. The closed form expressions are in:

-  // Hang and Chen, "Source Model for transform video coder and its

-  // application - Part I: Fundamental Theory", IEEE Trans. Circ.

-  // Sys. for Video Tech., April 1997.

-  static const double rate_tab_step = 0.125;

-  static const double rate_tab[] = {

-    256.0000, 4.944453, 3.949276, 3.371593,

-    2.965771, 2.654550, 2.403348, 2.193612,

-    2.014208, 1.857921, 1.719813, 1.596364,

-    1.484979, 1.383702, 1.291025, 1.205767,

-    1.126990, 1.053937, 0.985991, 0.922644,

-    0.863472, 0.808114, 0.756265, 0.707661,

-    0.662070, 0.619287, 0.579129, 0.541431,

-    0.506043, 0.472828, 0.441656, 0.412411,

-    0.384980, 0.359260, 0.335152, 0.312563,

-    0.291407, 0.271600, 0.253064, 0.235723,

-    0.219508, 0.204351, 0.190189, 0.176961,

-    0.164611, 0.153083, 0.142329, 0.132298,

-    0.122945, 0.114228, 0.106106, 0.098541,

-    0.091496, 0.084937, 0.078833, 0.073154,

-    0.067872, 0.062959, 0.058392, 0.054147,

-    0.050202, 0.046537, 0.043133, 0.039971,

-    0.037036, 0.034312, 0.031783, 0.029436,

-    0.027259, 0.025240, 0.023367, 0.021631,

-    0.020021, 0.018528, 0.017145, 0.015863,

-    0.014676, 0.013575, 0.012556, 0.011612,

-    0.010738, 0.009929, 0.009180, 0.008487,

-    0.007845, 0.007251, 0.006701, 0.006193,

-    0.005722, 0.005287, 0.004884, 0.004512,

-    0.004168, 0.003850, 0.003556, 0.003284,

-    0.003032, 0.002800, 0.002585, 0.002386,

-    0.002203, 0.002034, 0.001877, 0.001732,

-    0.001599, 0.001476, 0.001362, 0.001256,

-    0.001159, 0.001069, 0.000987, 0.000910,

-    0.000840, 0.000774, 0.000714, 0.000659,

-    0.000608, 0.000560, 0.000517, 0.000476,

-    0.000439, 0.000405, 0.000373, 0.000344,

-    0.000317, 0.000292, 0.000270, 0.000248,

-    0.000229, 0.000211, 0.000195, 0.000179,

-    0.000165, 0.000152, 0.000140, 0.000129,

-    0.000119, 0.000110, 0.000101, 0.000093,

-    0.000086, 0.000079, 0.000073, 0.000067,

-    0.000062, 0.000057, 0.000052, 0.000048,

-    0.000044, 0.000041, 0.000038, 0.000035,

-    0.000032, 0.000029, 0.000027, 0.000025,

-    0.000023, 0.000021, 0.000019, 0.000018,

-    0.000016, 0.000015, 0.000014, 0.000013,

-    0.000012, 0.000011, 0.000010, 0.000009,

-    0.000008, 0.000008, 0.000007, 0.000007,

-    0.000006, 0.000006, 0.000005, 0.000005,

-    0.000004, 0.000004, 0.000004, 0.000003,

-    0.000003, 0.000003, 0.000003, 0.000002,

-    0.000002, 0.000002, 0.000002, 0.000002,

-    0.000002, 0.000001, 0.000001, 0.000001,

-    0.000001, 0.000001, 0.000001, 0.000001,

-    0.000001, 0.000001, 0.000001, 0.000001,

-    0.000001, 0.000001, 0.000000, 0.000000,

-  };

-  const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);

-  assert(x >= 0.0);

-  return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab);

-}

-static double model_dist_norm(double x) {

-  // Normalized distortion

-  // This function models the normalized distortion for a Laplacian source

-  // source with given variance when quantized with a uniform quantizer

-  // with given stepsize. The closed form expression is:

-  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))

-  // where x = qpstep / sqrt(variance)

-  // Note the actual distortion is Dn * variance.

-  static const double dist_tab_step = 0.25;

-  static const double dist_tab[] = {

-    0.000000, 0.005189, 0.020533, 0.045381,

-    0.078716, 0.119246, 0.165508, 0.215979,

-    0.269166, 0.323686, 0.378318, 0.432034,

-    0.484006, 0.533607, 0.580389, 0.624063,

-    0.664475, 0.701581, 0.735418, 0.766092,

-    0.793751, 0.818575, 0.840761, 0.860515,

-    0.878045, 0.893554, 0.907238, 0.919281,

-    0.929857, 0.939124, 0.947229, 0.954306,

-    0.960475, 0.965845, 0.970512, 0.974563,

-    0.978076, 0.981118, 0.983750, 0.986024,

-    0.987989, 0.989683, 0.991144, 0.992402,

-    0.993485, 0.994417, 0.995218, 0.995905,

-    0.996496, 0.997002, 0.997437, 0.997809,

-    0.998128, 0.998401, 0.998635, 0.998835,

-    0.999006, 0.999152, 0.999277, 0.999384,

-    0.999475, 0.999553, 0.999619, 0.999676,

-    0.999724, 0.999765, 0.999800, 0.999830,

-    0.999855, 0.999877, 0.999895, 0.999911,

-    0.999924, 0.999936, 0.999945, 0.999954,

-    0.999961, 0.999967, 0.999972, 0.999976,

-    0.999980, 0.999983, 0.999985, 0.999988,

-    0.999989, 0.999991, 0.999992, 0.999994,

-    0.999995, 0.999995, 0.999996, 0.999997,

-    0.999997, 0.999998, 0.999998, 0.999998,

-    0.999999, 0.999999, 0.999999, 0.999999,

-    0.999999, 0.999999, 0.999999, 1.000000,

-  };

-  const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);

-  assert(x >= 0.0);

-  return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab);

-}

-static void model_rd_from_var_lapndz(int var, int n, int qstep,

-                                     int *rate, int64_t *dist) {

-  // This function models the rate and distortion for a Laplacian

-  // source with given variance when quantized with a uniform quantizer

-  // with given stepsize. The closed form expression is:

-  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],

-  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance)

-  vp9_clear_system_state();

-  if (var == 0 || n == 0) {

-    *rate = 0;

-    *dist = 0;

-  } else {

-    double D, R;

-    double s2 = (double) var / n;

-    double x = qstep / sqrt(s2);

-    // TODO(debargha): Make the modeling functions take (qstep^2 / s2)

-    // as argument rather than qstep / sqrt(s2) to obviate the need for

-    // the sqrt() operation.

-    D = model_dist_norm(x);

-    R = model_rate_norm(x);

-    if (R < 0) {

-      R = 0;

-      D = var;

-    }

-    *rate = (n * R * 256 + 0.5);

-    *dist = (n * D * s2 + 0.5);

-  }

-  vp9_clear_system_state();

-}

-static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,

-                                           struct macroblockd_plane *pd) {

-  return get_block_size(plane_block_width(bsize, pd),

-                        plane_block_height(bsize, pd));

-}

-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,

-                            MACROBLOCK *x, MACROBLOCKD *xd,

-                            int *out_rate_sum, int64_t *out_dist_sum) {

-  // Note our transform coeffs are 8 times an orthogonal transform.

-  // Hence quantizer step is also 8 times. To get effective quantizer

-  // we need to divide by 8 before sending to modeling function.

-  unsigned int sse;

-  int i, rate_sum = 0;

-  int64_t dist_sum = 0;

-  for (i = 0; i < MAX_MB_PLANE; ++i) {

-    struct macroblock_plane *const p = &x->plane[i];

-    struct macroblockd_plane *const pd = &xd->plane[i];

-    // TODO(dkovalev) the same code in get_plane_block_size

-    const int bw = plane_block_width(bsize, pd);

-    const int bh = plane_block_height(bsize, pd);

-    const enum BlockSize bs = get_block_size(bw, bh);

-    int rate;

-    int64_t dist;

-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,

-                       pd->dst.buf, pd->dst.stride, &sse);

-    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);

-    rate_sum += rate;

-    dist_sum += dist;

-  }

-  *out_rate_sum = rate_sum;

-  *out_dist_sum = dist_sum << 4;

-}

 static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {

   MACROBLOCKD *xd = &x->e_mbd;

   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

@@ -2564,7 +2772,6 @@

   int rate4x4_y, rate4x4_y_tokenonly;

   int64_t dist4x4_y;

   int64_t err4x4 = INT64_MAX;

-  int i;

   vpx_memset(&txfm_cache,0,sizeof(txfm_cache));

   ctx->skip = 0;

@@ -2597,11 +2804,14 @@

     vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));

     xd->mode_info_context->mbmi.txfm_size = TX_4X4;

   } else {

+    int i;

     *returnrate = rate_y + rate_uv +

         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);

     *returndist = dist_y + (dist_uv >> 2);

-    for (i = 0; i < NB_TXFM_MODES; i++) {

-      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];

+    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {

+      for (i = 0; i < NB_TXFM_MODES; i++) {

+        ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];

+      }

     xd->mode_info_context->mbmi.txfm_size = txfm_size;

     xd->mode_info_context->mbmi.mode = mode;

--

⑨