shithub: libvpx

Download patch

ref: faff6ed0fbb01ece1331021b749ec2f9114332ff
parent: 1f14bbb6248faf223159082be30dbb1d57a7fbcd
author: Jingning Han <jingning@google.com>
date: Mon Jul 8 12:48:47 EDT 2013

Skip duplicate block encoding in the rd loop

This speed feature allows the encoder to largely remove the spatial
dependency between blocks inside a 64x64 superblock, thereby removing
the need to repeatedly encode superblocks per partition type in the
rate-distortion optimization loop.

A major challenge lies in the intra modes tested in the rate-distortion
optimization loop. Subsequent blocks do not have access to the
reconstructed boundary pixels unless the intermediate coding steps are
performed. This is resolved by using the original (source) pixels for
intra prediction in the rd loop, together with a distortion model based
on the quantization parameters. Experiments also suggested that the
performance impact is more discernible at lower bit-rate/PSNR settings.
Hence a quantizer-dependent threshold is applied: block encoding is
skipped only when the quantizer index is below QIDX_SKIP_THRESH.

For bus_cif at 2000 kbps,
speed 0: runtime 269854ms -> 237774ms (12% speed-up) at 0.05dB
         performance loss.

speed 1: runtime 65312ms  -> 61536ms (7% speed-up) at 0.04dB
         performance loss.

This feature is currently turned on in the speed 1, 2 and 3 settings.

Change-Id: Ib689741dfff8dd38365d8c1b92860a3e176f56ec

--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -141,6 +141,7 @@
 
   // indicate if it is in the rd search loop or encoding process
   int rd_search;
+  int skip_encode;
 
   // TODO(jingning): Need to refactor the structure arrays that buffers the
   // coding mode decisions of each partition type.
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1866,6 +1866,20 @@
     cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
   }
 
+  if (cpi->sf.skip_encode_sb) {
+    int j;
+    unsigned int intra_count = 0, inter_count = 0;
+    for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+      intra_count += cpi->intra_inter_count[j][0];
+      inter_count += cpi->intra_inter_count[j][1];
+    }
+    cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count);
+    cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME);
+    cpi->sf.skip_encode_frame &= cm->show_frame;
+  } else {
+    cpi->sf.skip_encode_frame = 0;
+  }
+
   // 256 rate units to the bit,
   // projected_frame_size in units of BYTES
   cpi->projected_frame_size = totalrate >> 8;
@@ -2276,6 +2290,10 @@
   const int bwl = mi_width_log2(bsize);
   const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
   x->rd_search = 0;
+  x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
+                    xd->q_index < QIDX_SKIP_THRESH);
+  if (x->skip_encode)
+    return;
 
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -18,6 +18,7 @@
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
+  x->skip_encode = 0;
   mbmi->mode = DC_PRED;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ?
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -610,7 +610,8 @@
 
   plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
   vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
-                          dst, pd->dst.stride,
+                          x->skip_encode ? src : dst,
+                          x->skip_encode ? p->src.stride : pd->dst.stride,
                           dst, pd->dst.stride);
   vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
                      src, p->src.stride, dst, pd->dst.stride);
@@ -617,6 +618,9 @@
 
   xform_quant(plane, block, bsize, ss_txfrm_size, arg);
 
+
+  if (x->skip_encode)
+    return;
 
   // if (x->optimize)
   // vp9_optimize_b(plane, block, bsize, ss_txfrm_size,
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -721,6 +721,7 @@
   sf->mode_search_skip_flags = 0;
   sf->last_chroma_intra_mode = TM_PRED;
   sf->use_rd_breakout = 0;
+  sf->skip_encode_sb = 0;
 
   // Skip any mode not chosen at size < X for all sizes > X
   // Hence BLOCK_SIZE_SB64X64 (skip is off)
@@ -769,6 +770,7 @@
                                      FLAG_SKIP_COMP_BESTINTRA;
         sf->last_chroma_intra_mode = H_PRED;
         sf->use_rd_breakout = 1;
+        sf->skip_encode_sb = 1;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -790,6 +792,7 @@
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->last_chroma_intra_mode = DC_PRED;
         sf->use_rd_breakout = 1;
+        sf->skip_encode_sb = 1;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -804,6 +807,7 @@
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->use_rd_breakout = 1;
+        sf->skip_encode_sb = 1;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -247,6 +247,7 @@
   int comp_inter_joint_search_thresh;
   int adaptive_rd_thresh;
   int skip_encode_sb;
+  int skip_encode_frame;
   int use_lastframe_partitioning;
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
   int use_8tap_always;
@@ -277,7 +278,6 @@
 } SPEED_FEATURES;
 
 typedef struct VP9_COMP {
-
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -685,6 +685,15 @@
   args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                 &this_sse) >> shift;
   args->sse += this_sse >> shift;
+
+  if (x->skip_encode &&
+      xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
+    // TODO(jingning): tune the model to better capture the distortion.
+    int64_t p = (pd->dequant[1] * pd->dequant[1] *
+                    (1 << ss_txfrm_size)) >> shift;
+    args->dist += p;
+    args->sse  += p;
+  }
 }
 
 static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -1169,6 +1178,7 @@
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
   uint8_t *src, *dst;
   int16_t *src_diff, *coeff;
 
@@ -1215,15 +1225,15 @@
                                              p->src_diff);
         coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
         dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        pd->dst.buf,
-                                        pd->dst.stride);
+                                        pd->dst.buf, dst_stride);
         vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
                                 TX_4X4, mode,
-                                dst, pd->dst.stride,
-                                dst, pd->dst.stride);
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride);
         vp9_subtract_block(4, 4, src_diff, 8,
                            src, src_stride,
-                           dst, pd->dst.stride);
+                           dst, dst_stride);
 
         tx_type = get_tx_type_4x4(xd, block);
         if (tx_type != DCT_DCT) {
@@ -1272,24 +1282,30 @@
     }
   }
 
+  if (x->skip_encode)
+    return best_rd;
+
   for (idy = 0; idy < bh; ++idy) {
     for (idx = 0; idx < bw; ++idx) {
       block = ib + idy * 2 + idx;
       xd->mode_info_context->bmi[block].as_mode = *best_mode;
+      src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                      p->src.buf, src_stride);
       dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                      pd->dst.buf,
-                                      pd->dst.stride);
+                                      pd->dst.buf, dst_stride);
 
       vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
-                              *best_mode, dst, pd->dst.stride,
-                              dst, pd->dst.stride);
+                              *best_mode,
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride);
       // inverse transform
       if (best_tx_type != DCT_DCT)
         vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                            pd->dst.stride, best_tx_type);
+                             dst_stride, best_tx_type);
       else
         xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                           pd->dst.stride);
+                           dst_stride);
     }
   }
 
@@ -2897,6 +2913,7 @@
   int64_t dist4x4_y;
   int64_t err4x4 = INT64_MAX;
 
+  x->skip_encode = 0;
   vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
   ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
@@ -3006,9 +3023,11 @@
   int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
   int best_skip2 = 0;
 
+  x->skip_encode = (cpi->sf.skip_encode_frame &&
+                    xd->q_index < QIDX_SKIP_THRESH);
+
   for (i = 0; i < 4; i++) {
     int j;
-
     for (j = 0; j < MAX_REF_FRAMES; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
   }
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -15,6 +15,8 @@
 #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
 #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
 
+#define QIDX_SKIP_THRESH     115
+
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);