shithub: libvpx

Download patch

ref: c34d91a84eaca1df605e92f1854677cac5b6a3f5
parent: aabae97e57378356cdc8c94af5e369f8363b9831
parent: 52cf4dcaea10f97d25d8a3585704a1e47b384751
author: Scott LaVarnway <slavarnway@google.com>
date: Thu Mar 1 01:20:02 EST 2012

Merge "Packing bitstream on-the-fly with delayed context updates"

--- a/configure
+++ b/configure
@@ -39,6 +39,7 @@
   ${toggle_multithread}           multithreaded encoding and decoding
   ${toggle_spatial_resampling}    spatial sampling (scaling) support
   ${toggle_realtime_only}         enable this option while building for real-time encoding
+  ${toggle_onthefly_bitpacking}   enable on-the-fly bitpacking in real-time encoding
   ${toggle_error_concealment}     enable this option to get a decoder which is able to conceal losses
   ${toggle_runtime_cpu_detect}    runtime cpu detection
   ${toggle_shared}                shared library support
@@ -253,6 +254,7 @@
     static_msvcrt
     spatial_resampling
     realtime_only
+    onthefly_bitpacking
     error_concealment
     shared
     static
@@ -297,6 +299,7 @@
     mem_tracker
     spatial_resampling
     realtime_only
+    onthefly_bitpacking
     error_concealment
     shared
     static
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -24,6 +24,7 @@
 #include "bitstream.h"
 
 #include "defaultcoefcounts.h"
+#include "vp8/common/common.h"
 
 const int vp8cx_base_skip_false_prob[128] =
 {
@@ -159,7 +160,7 @@
     );
 }
 
-static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 {
     const TOKENEXTRA *const stop = p + xcount;
     unsigned int split;
@@ -398,7 +399,7 @@
             const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
             int tokens = stop - p;
 
-            pack_tokens_c(w, p, tokens);
+            vp8_pack_tokens_c(w, p, tokens);
         }
 
         vp8_stop_encode(w);
@@ -417,7 +418,7 @@
         const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
         int tokens = stop - p;
 
-        pack_tokens_c(w, p, tokens);
+        vp8_pack_tokens_c(w, p, tokens);
     }
 
 }
@@ -783,6 +784,7 @@
     }
 }
 
+#if 0
 /* This function is used for debugging probability trees. */
 static void print_prob_tree(vp8_prob
      coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
@@ -814,6 +816,7 @@
     fprintf(f, "}\n");
     fclose(f);
 }
+#endif
 
 static void sum_probs_over_prev_coef_context(
         const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
@@ -943,7 +946,6 @@
 
                 int t = 0;      /* token/prob index */
 
-
                 vp8_tree_probs_from_distribution(
                     MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
                     cpi->frame_coef_probs [i][j][k],
@@ -1048,10 +1050,33 @@
     return savings;
 }
 
-static void update_coef_probs(VP8_COMP *cpi)
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+int vp8_update_coef_context(VP8_COMP *cpi)
 {
+    int savings = 0;
+    /* Resets coef_counts on key frames, then returns the savings reported by
+     * the helper chosen via the VPX_ERROR_RESILIENT_PARTITIONS flag. */
+    if (cpi->common.frame_type == KEY_FRAME)
+    {
+        /* Reset to default counts/probabilities at key frames */
+        vp8_copy(cpi->coef_counts, default_coef_counts);
+    }
+
+    if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+        savings += independent_coef_context_savings(cpi);
+    else
+        savings += default_coef_context_savings(cpi);
+
+    return savings;
+}
+#endif
+
+void vp8_update_coef_probs(VP8_COMP *cpi)
+{
     int i = 0;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
     vp8_writer *const w = cpi->bc;
+#endif
     int savings = 0;
 
     vp8_clear_system_state(); //__asm emms;
@@ -1131,7 +1156,11 @@
                         cpi->common.frame_type == KEY_FRAME && newp != *Pold)
                         u = 1;
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    cpi->update_probs[i][j][k][t] = u;
+#else
                     vp8_write(w, u, upd);
+#endif
 
 
 #ifdef ENTROPY_STATS
@@ -1143,7 +1172,9 @@
                         /* send/use new probability */
 
                         *Pold = newp;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
                         vp8_write_literal(w, newp, 8);
+#endif
 
                         savings += s;
 
@@ -1172,6 +1203,50 @@
     while (++i < BLOCK_TYPES);
 
 }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+static void pack_coef_probs(VP8_COMP *cpi)
+{
+    int i = 0;
+    vp8_writer *const w = cpi->bc;
+    /* Write each saved update flag, plus the 8-bit probability when set. */
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                int t = 0;      /* token/prob index */
+
+                do
+                {
+                    const vp8_prob newp = cpi->common.fc.coef_probs [i][j][k][t];
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+                    const char u = cpi->update_probs[i][j][k][t] ;
+
+                    vp8_write(w, u, upd);
+
+                    if (u)
+                    {
+                        /* flag set: send the new probability as an 8-bit literal */
+                        vp8_write_literal(w, newp, 8);
+                    }
+                }
+                while (++t < ENTROPY_NODES);
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+}
+#endif
+
 #ifdef PACKET_TESTING
 FILE *vpxlogc = 0;
 #endif
@@ -1434,6 +1509,7 @@
         vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
     }
 
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
     if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
     {
         if (pc->frame_type == KEY_FRAME)
@@ -1441,6 +1517,7 @@
         else
             pc->refresh_entropy_probs = 0;
     }
+#endif
 
     vp8_write_bit(bc, pc->refresh_entropy_probs);
 
@@ -1458,6 +1535,9 @@
 
     vp8_clear_system_state();  //__asm emms;
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    pack_coef_probs(cpi);
+#else
     if (pc->refresh_entropy_probs == 0)
     {
         // save a copy for later refresh
@@ -1464,7 +1544,8 @@
         vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
     }
 
-    update_coef_probs(cpi);
+    vp8_update_coef_probs(cpi);
+#endif
 
 #ifdef ENTROPY_STATS
     active_section = 2;
@@ -1512,6 +1593,45 @@
 
     cpi->partition_sz[0] = *size;
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    {
+        const int num_part = (1 << pc->multi_token_partition);
+        unsigned char * dp = cpi->partition_d[0] + cpi->partition_sz[0];
+
+        if (num_part > 1)
+        {
+            /* write token part sizes (all but last) if more than 1 */
+            validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0],
+                            &pc->error);
+
+            cpi->partition_sz[0] += 3*(num_part-1);
+
+            for(i = 1; i < num_part; i++)
+            {
+                write_partition_size(dp, cpi->partition_sz[i]);
+                dp += 3;
+            }
+        }
+
+        if (!cpi->output_partition)
+        {
+            /* concatenate partition buffers */
+            for(i = 0; i < num_part; i++)
+            {
+                vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+                cpi->partition_d[i+1] = dp;
+                dp += cpi->partition_sz[i+1];
+            }
+        }
+
+        /* update total size */
+        *size = 0;
+        for(i = 0; i < num_part+1; i++)
+        {
+            *size += cpi->partition_sz[i];
+        }
+    }
+#else
     if (pc->multi_token_partition != ONE_PARTITION)
     {
         int num_part = 1 << pc->multi_token_partition;
@@ -1561,6 +1681,7 @@
         *size += cpi->bc[1].pos;
         cpi->partition_sz[1] = cpi->bc[1].pos;
     }
+#endif
 }
 
 #ifdef ENTROPY_STATS
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -35,7 +35,10 @@
 # define pack_mb_row_tokens(a,b)               \
     vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 #else
-# define pack_tokens(a,b,c)                    pack_tokens_c(a,b,c)
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+
+# define pack_tokens(a,b,c)                    vp8_pack_tokens_c(a,b,c)
 # define pack_tokens_into_partitions(a,b,c,d)  pack_tokens_into_partitions_c(a,b,c,d)
 # define pack_mb_row_tokens(a,b)               pack_mb_row_tokens_c(a,b)
 #endif
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -28,6 +28,9 @@
 #include <limits.h>
 #include "vp8/common/invtrans.h"
 #include "vpx_ports/vpx_timer.h"
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+#include "bitstream.h"
+#endif
 
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
 extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
@@ -373,10 +376,17 @@
     int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
     int map_index = (mb_row * cpi->common.mb_cols);
 
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    const int num_part = (1 << cm->multi_token_partition);
+    TOKENEXTRA * tp_start = cpi->tok;
+    vp8_writer *w;
+#endif
+
 #if CONFIG_MULTITHREAD
     const int nsync = cpi->mt_sync_range;
-    const int rightmost_col = cm->mb_cols - 1;
+    const int rightmost_col = cm->mb_cols + nsync;
     volatile const int *last_row_current_mb_col;
+    volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
     if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
         last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
@@ -384,6 +394,13 @@
         last_row_current_mb_col = &rightmost_col;
 #endif
 
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if(num_part > 1)
+        w= &cpi->bc[1 + (mb_row % num_part)];
+    else
+        w = &cpi->bc[1];
+#endif
+
     // reset above block coeffs
     xd->above_context = cm->above_context;
 
@@ -411,6 +428,10 @@
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
     {
+
+#if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+        *tp = cpi->tok;
+#endif
         // Distance of Mb to the left & right edges, specified in
         // 1/8th pel units as they are always compared to values
         // that are in 1/8th pel units
@@ -435,12 +456,13 @@
         vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
 #if CONFIG_MULTITHREAD
-        if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        if (cpi->b_multi_threaded != 0)
         {
+            *current_mb_col = mb_col - 1; // set previous MB done
+
             if ((mb_col & (nsync - 1)) == 0)
             {
-                while (mb_col > (*last_row_current_mb_col - nsync)
-                        && (*last_row_current_mb_col) != (cm->mb_cols - 1))
+                while (mb_col > (*last_row_current_mb_col - nsync))
                 {
                     x86_pause_hint();
                     thread_sleep(0);
@@ -495,13 +517,13 @@
 
 #endif
 
-            // Count of last ref frame 0,0 useage
+            // Count of last ref frame 0,0 usage
             if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                 cpi->inter_zz_count ++;
 
             // Special case code for cyclic refresh
             // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-            // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
+            // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
             if ((cpi->current_layer == 0) &&
                 (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
             {
@@ -525,7 +547,14 @@
 
         cpi->tplist[mb_row].stop = *tp;
 
-        // Increment pointer into gf useage flags structure.
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        /* pack tokens for this MB */
+        {
+            int tok_count = *tp - tp_start;
+            pack_tokens(w, tp_start, tok_count);
+        }
+#endif
+        // Increment pointer into gf usage flags structure.
         x->gf_active_ptr++;
 
         // Increment the activity mask pointers.
@@ -539,39 +568,29 @@
         recon_yoffset += 16;
         recon_uvoffset += 8;
 
-        // Keep track of segment useage
+        // Keep track of segment usage
         segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
 
         // skip to next mb
         xd->mode_info_context++;
         x->partition_info++;
-
         xd->above_context++;
-#if CONFIG_MULTITHREAD
-        if (cpi->b_multi_threaded != 0)
-        {
-            cpi->mt_current_mb_col[mb_row] = mb_col;
-        }
-#endif
     }
 
     //extend the recon for intra prediction
-    vp8_extend_mb_row(
-        &cm->yv12_fb[dst_fb_idx],
-        xd->dst.y_buffer + 16,
-        xd->dst.u_buffer + 8,
-        xd->dst.v_buffer + 8);
+    vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+                        xd->dst.y_buffer + 16,
+                        xd->dst.u_buffer + 8,
+                        xd->dst.v_buffer + 8);
 
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded != 0)
+        *current_mb_col = rightmost_col;
+#endif
+
     // this is to account for the border
     xd->mode_info_context++;
     x->partition_info++;
-
-#if CONFIG_MULTITHREAD
-    if ((cpi->b_multi_threaded != 0) && (mb_row == cm->mb_rows - 1))
-    {
-        sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
-    }
-#endif
 }
 
 void init_encode_frame_mb_context(VP8_COMP *cpi)
@@ -599,7 +618,7 @@
     if (cm->frame_type == KEY_FRAME)
         vp8_init_mbmode_probs(cm);
 
-    // Copy data over into macro block data sturctures.
+    // Copy data over into macro block data structures.
     x->src = * cpi->Source;
     xd->pre = cm->yv12_fb[cm->lst_fb_idx];
     xd->dst = cm->yv12_fb[cm->new_fb_idx];
@@ -656,10 +675,13 @@
     MACROBLOCK *const x = & cpi->mb;
     VP8_COMMON *const cm = & cpi->common;
     MACROBLOCKD *const xd = & x->e_mbd;
-
     TOKENEXTRA *tp = cpi->tok;
     int segment_counts[MAX_MB_SEGMENTS];
     int totalrate;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    BOOL_CODER * bc = &cpi->bc[1]; // bc[0] is for control partition
+    const int num_part = (1 << cm->multi_token_partition);
+#endif
 
     vpx_memset(segment_counts, 0, sizeof(segment_counts));
     totalrate = 0;
@@ -694,6 +716,7 @@
     cpi->prediction_error = 0;
     cpi->intra_error = 0;
     cpi->skip_true_count = 0;
+    cpi->tok_count = 0;
 
 #if 0
     // Experimental code
@@ -704,6 +727,7 @@
     xd->mode_info_context = cm->mi;
 
     vp8_zero(cpi->MVcount);
+
     vp8_zero(cpi->coef_counts);
 
     vp8cx_frame_init_quantizer(cpi);
@@ -722,10 +746,23 @@
         build_activity_map(cpi);
     }
 
-    // re-initencode frame context.
+    // re-init encode frame context.
     init_encode_frame_mb_context(cpi);
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
     {
+        int i;
+        for(i = 0; i < num_part; i++)
+        {
+            vp8_start_encode(&bc[i], cpi->partition_d[i + 1],
+                    cpi->partition_d_end[i + 1]);
+            bc[i].error = &cm->error;
+        }
+    }
+
+#endif
+
+    {
         struct vpx_usec_timer  emr_timer;
         vpx_usec_timer_start(&emr_timer);
 
@@ -748,7 +785,11 @@
             {
                 vp8_zero(cm->left_context)
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                tp = cpi->tok;
+#else
                 tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+#endif
 
                 encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
 
@@ -761,12 +802,14 @@
                 x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
                 x->gf_active_ptr   += cm->mb_cols * cpi->encoding_thread_count;
 
+                if(mb_row == cm->mb_rows - 1)
+                {
+                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+                }
             }
 
             sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
 
-            cpi->tok_count = 0;
-
             for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
             {
                 cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
@@ -799,9 +842,12 @@
             // for each macroblock row in image
             for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
             {
-
                 vp8_zero(cm->left_context)
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                tp = cpi->tok;
+#endif
+
                 encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
 
                 // adjust to the next row of mbs
@@ -811,16 +857,25 @@
             }
 
             cpi->tok_count = tp - cpi->tok;
+        }
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        {
+            int i;
+            for(i = 0; i < num_part; i++)
+            {
+                vp8_stop_encode(&bc[i]);
+                cpi->partition_sz[i+1] = bc[i].pos;
+            }
         }
+#endif
 
         vpx_usec_timer_mark(&emr_timer);
         cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
     }
 
 
-    // Work out the segment probabilites if segmentation is enabled
+    // Work out the segment probabilities if segmentation is enabled
     if (xd->segmentation_enabled)
     {
         int tot_count;
@@ -908,8 +963,9 @@
     }
 #endif
 
-    // Adjust the projected reference frame useage probability numbers to reflect
-    // what we have just seen. This may be usefull when we make multiple itterations
+#if ! CONFIG_REALTIME_ONLY
+    // Adjust the projected reference frame usage probability numbers to reflect
+    // what we have just seen. This may be useful when we make multiple iterations
     // of the recode loop rather than continuing to use values from the previous frame.
     if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
         (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
@@ -916,12 +972,7 @@
     {
       vp8_convert_rfct_to_prob(cpi);
     }
-
-#if 0
-    // Keep record of the total distortion this time around for future use
-    cpi->last_frame_distortion = cpi->frame_distortion;
 #endif
-
 }
 void vp8_setup_block_ptrs(MACROBLOCK *x)
 {
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -13,6 +13,8 @@
 #include "vp8/common/common.h"
 #include "vp8/common/extend.h"
 
+#include "bitstream.h"
+
 #if CONFIG_MULTITHREAD
 
 extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
@@ -74,6 +76,10 @@
             MACROBLOCK *x = &mbri->mb;
             MACROBLOCKD *xd = &x->e_mbd;
             TOKENEXTRA *tp ;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+            TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24);
+            const int num_part = (1 << cm->multi_token_partition);
+#endif
 
             int *segment_counts = mbri->segment_counts;
             int *totalrate = &mbri->totalrate;
@@ -91,9 +97,15 @@
                 int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                 int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                 int map_index = (mb_row * cm->mb_cols);
-                volatile int *last_row_current_mb_col;
+                volatile const int *last_row_current_mb_col;
+                volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
+#if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+                vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
+#else
                 tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+                cpi->tplist[mb_row].start = tp;
+#endif
 
                 last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
 
@@ -107,10 +119,6 @@
                 recon_yoffset = (mb_row * recon_y_stride * 16);
                 recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
-                cpi->tplist[mb_row].start = tp;
-
-                //printf("Thread mb_row = %d\n", mb_row);
-
                 // Set the mb activity pointer to the start of the row.
                 x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
 
@@ -117,9 +125,11 @@
                 // for each macroblock col in image
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                 {
+                    *current_mb_col = mb_col - 1;
+
                     if ((mb_col & (nsync - 1)) == 0)
                     {
-                        while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
+                        while (mb_col > (*last_row_current_mb_col - nsync))
                         {
                             x86_pause_hint();
                             thread_sleep(0);
@@ -126,6 +136,10 @@
                         }
                     }
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    tp = tp_start;
+#endif
+
                     // Distance of Mb to the various image edges.
                     // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
                     xd->mb_to_left_edge = -((mb_col * 16) << 3);
@@ -154,7 +168,7 @@
                         vp8_activity_masking(cpi, x);
 
                     // Is segmentation enabled
-                    // MB level adjutment to quantizer
+                    // MB level adjustment to quantizer
                     if (xd->segmentation_enabled)
                     {
                         // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
@@ -196,13 +210,13 @@
 
 #endif
 
-                        // Count of last ref frame 0,0 useage
+                        // Count of last ref frame 0,0 usage
                         if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                             cpi->inter_zz_count++;
 
                         // Special case code for cyclic refresh
                         // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-                        // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
+                        // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
                         if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
                         {
                             const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
@@ -223,9 +237,17 @@
 
                         }
                     }
-                    cpi->tplist[mb_row].stop = tp;
 
-                    // Increment pointer into gf useage flags structure.
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    /* pack tokens for this MB */
+                    {
+                        int tok_count = tp - tp_start;
+                        pack_tokens(w, tp_start, tok_count);
+                    }
+#else
+                    cpi->tplist[mb_row].stop = tp;
+#endif
+                    // Increment pointer into gf usage flags structure.
                     x->gf_active_ptr++;
 
                     // Increment the activity mask pointers.
@@ -239,7 +261,7 @@
                     recon_yoffset += 16;
                     recon_uvoffset += 8;
 
-                    // Keep track of segment useage
+                    // Keep track of segment usage
                     segment_counts[xd->mode_info_context->mbmi.segment_id]++;
 
                     // skip to next mb
@@ -246,17 +268,15 @@
                     xd->mode_info_context++;
                     x->partition_info++;
                     xd->above_context++;
-
-                    cpi->mt_current_mb_col[mb_row] = mb_col;
                 }
 
-                //extend the recon for intra prediction
-                vp8_extend_mb_row(
-                    &cm->yv12_fb[dst_fb_idx],
-                    xd->dst.y_buffer + 16,
-                    xd->dst.u_buffer + 8,
-                    xd->dst.v_buffer + 8);
+                vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+                                    xd->dst.y_buffer + 16,
+                                    xd->dst.u_buffer + 8,
+                                    xd->dst.v_buffer + 8);
 
+                *current_mb_col = mb_col + nsync;
+
                 // this is to account for the border
                 xd->mode_info_context++;
                 x->partition_info++;
@@ -271,7 +291,6 @@
 
                 if (mb_row == cm->mb_rows - 1)
                 {
-                    //SetEvent(cpi->h_event_main);
                     sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
                 }
             }
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -42,6 +42,11 @@
 #include <stdio.h>
 #include <limits.h>
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+extern int vp8_update_coef_context(VP8_COMP *cpi);
+extern void vp8_update_coef_probs(VP8_COMP *cpi);
+#endif
+
 extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
 extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
 extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
@@ -1106,8 +1111,11 @@
         vpx_free(cpi->tok);
 
     {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */
+#else
         unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
-
+#endif
         CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
     }
 
@@ -1514,6 +1522,10 @@
     cm->refresh_last_frame = 1;
     cm->refresh_entropy_probs = 1;
 
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    cpi->oxcf.token_partitions = 3;
+#endif
+
     if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
         cm->multi_token_partition =
             (TOKEN_PARTITION) cpi->oxcf.token_partitions;
@@ -3725,12 +3737,40 @@
         }
 #endif
 
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        {
+            if(cpi->oxcf.error_resilient_mode)
+                cm->refresh_entropy_probs = 0;
+
+            if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+            {
+                if (cm->frame_type == KEY_FRAME)
+                    cm->refresh_entropy_probs = 1;
+            }
+
+            if (cm->refresh_entropy_probs == 0)
+            {
+                // save a copy for later refresh
+                vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+            }
+
+            vp8_update_coef_context(cpi);
+
+            vp8_update_coef_probs(cpi);
+
+            // transform / motion compensation build reconstruction frame
+            // +pack coef partitions
+            vp8_encode_frame(cpi);
+
+            /* cpi->projected_frame_size is not needed for RT mode */
+        }
+#else
         // transform / motion compensation build reconstruction frame
         vp8_encode_frame(cpi);
 
         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
-
+#endif
         vp8_clear_system_state();  //__asm emms;
 
         // Test to see if the stats generated for this frame indicate that we should have coded a key frame
@@ -4093,10 +4133,12 @@
 
     update_reference_frames(cm);
 
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
     if (cpi->oxcf.error_resilient_mode)
     {
         cm->refresh_entropy_probs = 0;
     }
+#endif
 
 #if CONFIG_MULTITHREAD
     /* wait that filter_level is picked so that we can continue with stream packing */
@@ -4818,6 +4860,29 @@
         vpx_usec_timer_start(&tsctimer);
         vpx_usec_timer_start(&ticktimer);
     }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    {
+        int i;
+        const int num_part = (1 << cm->multi_token_partition);
+        /* the available bytes in dest */
+        const unsigned long dest_size = dest_end - dest;
+        const int tok_part_buff_size = (dest_size * 9) / (10 * num_part);
+
+        unsigned char *dp = dest;
+
+        cpi->partition_d[0] = dp;
+        dp += dest_size/10;         /* reserve 1/10 for control partition */
+        cpi->partition_d_end[0] = dp;
+
+        for(i = 0; i < num_part; i++)
+        {
+            cpi->partition_d[i + 1] = dp;
+            dp += tok_part_buff_size;
+            cpi->partition_d_end[i + 1] = dp;
+        }
+    }
+#endif
 
     // start with a 0 size frame
     *size = 0;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -443,9 +443,12 @@
     unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
 
     unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+
     //DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]);   //not used any more
     //save vp8_tree_probs_from_distribution result for each frame to avoid repeat calculation
     vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+    char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
     unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
 
     int gfu_boost;
@@ -543,6 +546,8 @@
 
     TOKENLIST *tplist;
     unsigned int partition_sz[MAX_PARTITIONS];
+    unsigned char *partition_d[MAX_PARTITIONS];
+    unsigned char *partition_d_end[MAX_PARTITIONS];
     // end of multithread data
 
 
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -66,7 +66,11 @@
             0,                          /* noise_sensitivity */
             0,                          /* Sharpness */
             0,                          /* static_thresh */
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+            VP8_EIGHT_TOKENPARTITION,
+#else
             VP8_ONE_TOKENPARTITION,     /* token_partitions */
+#endif
             0,                          /* arnr_max_frames */
             3,                          /* arnr_strength */
             3,                          /* arnr_type*/
@@ -241,6 +245,11 @@
         RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
     }
 
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if(cfg->g_threads > (1 << vp8_cfg->token_partitions))
+        ERROR("g_threads cannot be bigger than number of token partitions");
+#endif
+
     return VPX_CODEC_OK;
 }
 
@@ -919,7 +928,13 @@
 
                     for (i = 0; i < num_partitions; ++i)
                     {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                        pkt.data.frame.buf = cpi->partition_d[i];
+#else
                         pkt.data.frame.buf = cx_data;
+                        cx_data += cpi->partition_sz[i];
+                        cx_data_sz -= cpi->partition_sz[i];
+#endif
                         pkt.data.frame.sz = cpi->partition_sz[i];
                         pkt.data.frame.partition_id = i;
                         /* don't set the fragment bit for the last partition */
@@ -926,9 +941,15 @@
                         if (i == (num_partitions - 1))
                             pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
                         vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-                        cx_data += cpi->partition_sz[i];
-                        cx_data_sz -= cpi->partition_sz[i];
                     }
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    /* In lagged mode the encoder can buffer multiple frames.
+                     * We don't want this in partitioned output because
+                     * partitions are spread all over the output buffer.
+                     * So, force an exit!
+                     */
+                    cx_data_sz -= ctx->cx_data_sz / 2;
+#endif
                 }
                 else
                 {
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -489,7 +489,14 @@
     if(fwrite(header, 1, 12, outfile));
 }
 
+static void write_ivf_frame_size(FILE *outfile, size_t size)
+{   /* Write the 4-byte field mem_put_le32 builds from size at outfile's current position. */
+    char             header[4];
+    mem_put_le32(header, size); /* NOTE(review): presumably keeps only the low 32 bits of size -- confirm */
+    if(fwrite(header, 1, 4, outfile)); /* empty if: result dropped, same idiom as the fwrite call just above this function */
+}
 
+
 typedef off_t EbmlLoc;
 
 
@@ -945,7 +952,6 @@
 
 
 #include "args.h"
-
 static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
         "Debug mode (makes output deterministic)");
 static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
@@ -980,6 +986,8 @@
         "Stream frame rate (rate/scale)");
 static const arg_def_t use_ivf          = ARG_DEF(NULL, "ivf", 0,
         "Output IVF (default is WebM)");
+static const arg_def_t out_part = ARG_DEF("P", "output-partitions", 0,
+        "Makes encoder output partitions. Requires IVF output!");
 static const arg_def_t q_hist_n         = ARG_DEF(NULL, "q-hist", 1,
         "Show quantizer histogram (n-buckets)");
 static const arg_def_t rate_hist_n         = ARG_DEF(NULL, "rate-hist", 1,
@@ -989,7 +997,7 @@
     &debugmode,
     &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline,
     &best_dl, &good_dl, &rt_dl,
-    &verbosearg, &psnrarg, &use_ivf, &q_hist_n, &rate_hist_n,
+    &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n, &rate_hist_n,
     NULL
 };
 
@@ -1492,6 +1500,7 @@
     int                       show_psnr;
     int                       have_framerate;
     struct vpx_rational       framerate;
+    int                       out_part;
     int                       debug;
     int                       show_q_hist_buckets;
     int                       show_rate_hist_buckets;
@@ -1603,6 +1612,8 @@
             global->framerate = arg_parse_rational(&arg);
             global->have_framerate = 1;
         }
+        else if (arg_match(&arg,&out_part, argi))
+            global->out_part = 1;
         else if (arg_match(&arg, &debugmode, argi))
             global->debug = 1;
         else if (arg_match(&arg, &q_hist_n, argi))
@@ -2081,11 +2092,14 @@
                                struct global_config *global)
 {
     int i;
+    int flags = 0;
 
+    flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
+    flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+
     /* Construct Encoder Context */
     vpx_codec_enc_init(&stream->encoder, global->codec->iface,
-                       &stream->config.cfg,
-                       global->show_psnr ? VPX_CODEC_USE_PSNR : 0);
+                        &stream->config.cfg, flags);
     ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
 
     /* Note that we bypass the vpx_codec_control wrapper macro because
@@ -2154,12 +2168,18 @@
 
     while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter)))
     {
+        static size_t fsize = 0;
+        static off_t ivf_header_pos = 0;
+
         *got_data = 1;
 
         switch (pkt->kind)
         {
         case VPX_CODEC_CX_FRAME_PKT:
-            stream->frames_out++;
+            if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT))
+            {
+                stream->frames_out++;
+            }
             fprintf(stderr, " %6luF",
                     (unsigned long)pkt->data.frame.sz);
 
@@ -2175,9 +2195,28 @@
             }
             else
             {
-                write_ivf_frame_header(stream->file, pkt);
-                if(fwrite(pkt->data.frame.buf, 1,
-                          pkt->data.frame.sz, stream->file));
+                if (pkt->data.frame.partition_id <= 0)
+                {
+                    ivf_header_pos = ftello(stream->file);
+                    fsize = pkt->data.frame.sz;
+
+                    write_ivf_frame_header(stream->file, pkt);
+                }
+                else
+                {
+                    fsize += pkt->data.frame.sz;
+
+                    if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT))
+                    {
+                        off_t currpos = ftello(stream->file);
+                        fseeko(stream->file, ivf_header_pos, SEEK_SET);
+                        write_ivf_frame_size(stream->file, fsize);
+                        fseeko(stream->file, currpos, SEEK_SET);
+                    }
+                }
+
+                fwrite(pkt->data.frame.buf, 1,
+                       pkt->data.frame.sz, stream->file);
             }
             stream->nbytes += pkt->data.raw.sz;
             break;