shithub: libvpx

Download patch

ref: 72033fcff8e1ce2e94f578910a1cfa77cd1bf533
parent: ea77b03479668d6c9336b582f43b1b58f1ab1e7f
author: Jingning Han <jingning@google.com>
date: Wed Oct 16 08:43:03 EDT 2013

Make memory alloc in pick_mode_context bsize aware

This commit makes the buffer allocation of zcoeff_blk array in
pick_mode_context block size aware. It calculates the number of
4x4 blocks in the partition and assigns the memory space accordingly.
This process (and the uninitialization) is done once for each encoding
pass. It allows memory copy of smaller buffer when possible.

For football at 600kbps, the runtimes improve by about 1%:
speed 1, 45961ms -> 45472ms
speed 2, 23863ms -> 23598ms

Change-Id: Id2ca24906fa89f46fa5fe742ec4b8efc2a61f877

--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -26,7 +26,8 @@
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
-  uint8_t zcoeff_blk[256];
+  uint8_t *zcoeff_blk;
+  int num_4x4_blk;
   int skip;
   int_mv best_ref_mv;
   int_mv second_best_ref_mv;
@@ -176,6 +177,45 @@
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
 };
+
+// TODO(jingning): the variables used here are little complicated. need further
+// refactoring on organizing the temporary buffers, when recursive
+// partition down to 4x4 block size is enabled.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      return &x->sb64_context;
+    case BLOCK_64X32:
+      return &x->sb64x32_context[xd->sb_index];
+    case BLOCK_32X64:
+      return &x->sb32x64_context[xd->sb_index];
+    case BLOCK_32X32:
+      return &x->sb32_context[xd->sb_index];
+    case BLOCK_32X16:
+      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X32:
+      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X16:
+      return &x->mb_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X8:
+      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X4:
+      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_4X8:
+      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_4X4:
+      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);
+      return NULL;
+  }
+}
 
 struct rdcost_block_args {
   MACROBLOCK *x;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -419,7 +419,7 @@
 
   x->skip = ctx->skip;
   vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
-             sizeof(ctx->zcoeff_blk));
+             sizeof(uint8_t) * ctx->num_4x4_blk);
 
   if (!output_enabled)
     return;
@@ -696,45 +696,6 @@
     // Count of last ref frame 0,0 usage
     if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
       cpi->inter_zz_count++;
-  }
-}
-
-// TODO(jingning): the variables used here are little complicated. need further
-// refactoring on organizing the temporary buffers, when recursive
-// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  switch (bsize) {
-    case BLOCK_64X64:
-      return &x->sb64_context;
-    case BLOCK_64X32:
-      return &x->sb64x32_context[xd->sb_index];
-    case BLOCK_32X64:
-      return &x->sb32x64_context[xd->sb_index];
-    case BLOCK_32X32:
-      return &x->sb32_context[xd->sb_index];
-    case BLOCK_32X16:
-      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X32:
-      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X16:
-      return &x->mb_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X8:
-      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X16:
-      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X8:
-      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X4:
-      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X8:
-      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X4:
-      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    default:
-      assert(0);
-      return NULL;
   }
 }
 
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1414,6 +1414,94 @@
   } while (++i <= MV_MAX);
 }
 
+static void init_pick_mode_context(VP9_COMP *cpi) {
+  int i;
+  MACROBLOCK  *x  = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON  *cm = &cpi->common;
+
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+    if (i < BLOCK_16X16) {
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+            ctx->num_4x4_blk = num_4x4_blk;
+            CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                            vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+          }
+        }
+      }
+    } else if (i < BLOCK_32X32) {
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+                               ++xd->mb_index) {
+          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+          ctx->num_4x4_blk = num_4x4_blk;
+          CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                          vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+        }
+      }
+    } else if (i < BLOCK_64X64) {
+      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+        ctx->num_4x4_blk = num_4x4_blk;
+        CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                        vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+      }
+    } else {
+      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+      ctx->num_4x4_blk = num_4x4_blk;
+      CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                      vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+    }
+  }
+}
+
+static void free_pick_mode_context(MACROBLOCK *x) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+    if (i < BLOCK_16X16) {
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+            vpx_free(ctx->zcoeff_blk);
+            ctx->zcoeff_blk = 0;
+          }
+        }
+      }
+    } else if (i < BLOCK_32X32) {
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+                               ++xd->mb_index) {
+          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+          vpx_free(ctx->zcoeff_blk);
+          ctx->zcoeff_blk = 0;
+        }
+      }
+    } else if (i < BLOCK_64X64) {
+      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+        vpx_free(ctx->zcoeff_blk);
+        ctx->zcoeff_blk = 0;
+      }
+    } else {
+      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+      vpx_free(ctx->zcoeff_blk);
+      ctx->zcoeff_blk = 0;
+    }
+  }
+}
+
 VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   int i, j;
   volatile union {
@@ -1450,6 +1538,8 @@
 
   init_config((VP9_PTR)cpi, oxcf);
 
+  init_pick_mode_context(cpi);
+
   cm->current_video_frame   = 0;
   cpi->kf_overspend_bits            = 0;
   cpi->kf_bitrate_adjustment        = 0;
@@ -1913,6 +2003,7 @@
 #endif
   }
 
+  free_pick_mode_context(&cpi->mb);
   dealloc_compressor_data(cpi);
   vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3587,7 +3587,7 @@
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
-                   sizeof(ctx->zcoeff_blk));
+                   sizeof(uint8_t) * ctx->num_4x4_blk);
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -4327,7 +4327,7 @@
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
-                   sizeof(ctx->zcoeff_blk));
+                   sizeof(uint8_t) * ctx->num_4x4_blk);
 
         for (i = 0; i < 4; i++)
           best_bmodes[i] = xd->mi_8x8[0]->bmi[i];