shithub: libvpx

Download patch

ref: bbc24a65c41b1a5a460967a2f5719ddb5489c1d5
parent: 308e31a3ef97fa7a5bf9a232b15587955e5ec89f
parent: d5c46bdfc0d7bcc25e3ff549a0dc03d759050c19
author: John Koleszar <jkoleszar@google.com>
date: Tue Apr 26 04:27:39 EDT 2011

Merge remote branch 'internal/upstream' into HEAD

Conflicts:
	vp8/common/alloccommon.c
	vp8/encoder/rdopt.c

Change-Id: Ic34b33577423031e277235ffa6bcaff7b252e5cb

--- a/libmkv/EbmlIDs.h
+++ b/libmkv/EbmlIDs.h
@@ -120,7 +120,7 @@
     //video
     Video = 0xE0,
     FlagInterlaced = 0x9A,
-//  StereoMode = 0x53B8,
+    StereoMode = 0x53B8,
     PixelWidth = 0xB0,
     PixelHeight = 0xBA,
     PixelCropBottom = 0x54AA,
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -138,25 +138,25 @@
     {
     case 0:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 0;
         cm->full_pixel = 0;
         break;
     case 1:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 2:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 3:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 1;
         break;
@@ -171,7 +171,7 @@
 
     oci->mb_no_coeff_skip = 1;
     oci->no_lpf = 0;
-    oci->simpler_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
     oci->use_bilinear_mc_filter = 0;
     oci->full_pixel = 0;
     oci->multi_token_partition = ONE_PARTITION;
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -38,9 +38,8 @@
 /*ARMV6 loopfilter functions*/
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -51,20 +50,18 @@
 }
 
 void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -75,20 +72,18 @@
 }
 
 void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -101,12 +96,11 @@
 }
 
 void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -114,9 +108,8 @@
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -129,12 +122,11 @@
 }
 
 void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -145,9 +137,8 @@
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -155,20 +146,18 @@
 }
 
 void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -176,20 +165,18 @@
 }
 
 void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -199,12 +186,11 @@
 }
 
 void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -212,9 +198,8 @@
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -224,12 +209,11 @@
 }
 
 void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -171,9 +171,7 @@
 
     unsigned char partitioning;
     unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
-    unsigned char dc_diff;
     unsigned char need_to_clamp_mvs;
-
     unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
 } MB_MODE_INFO;
 
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -25,9 +25,8 @@
 
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -38,20 +37,18 @@
 }
 
 void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -62,20 +59,18 @@
 }
 
 void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -88,12 +83,11 @@
 }
 
 void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -101,9 +95,8 @@
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -116,12 +109,11 @@
 }
 
 void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -353,6 +345,9 @@
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
 
             filter_level = baseline_filter_level[Segment];
 
@@ -365,17 +360,17 @@
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
                 /* don't apply across umv border */
                 if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
             }
 
             y_ptr += 16;
@@ -457,6 +452,10 @@
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+
             filter_level = baseline_filter_level[Segment];
 
             /* Apply any context driven MB level adjustment */
@@ -465,17 +464,17 @@
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
                 /* don't apply across umv border */
                 if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
             }
 
             y_ptr += 16;
@@ -565,20 +564,24 @@
         for (mb_col = 0; mb_col < mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+
             filter_level = baseline_filter_level[Segment];
 
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
             }
 
             y_ptr += 16;
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -41,7 +41,7 @@
 
 #define prototype_loopfilter_block(sym) \
     void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
-             int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+             int ystride, int uv_stride, loop_filter_info *lfi)
 
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/loopfilter_x86.h"
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -109,6 +109,7 @@
         int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0
         int Sharpness;          // parameter used for sharpening output: recommendation 0:
         int cpu_used;
+        unsigned int rc_max_intra_bitrate_pct;
 
         // mode ->
         //(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
@@ -139,8 +140,9 @@
 
         int end_usage; // vbr or cbr
 
-        // shoot to keep buffer full at all times by undershooting a bit 95 recommended
+        // buffer targeting aggressiveness
         int under_shoot_pct;
+        int over_shoot_pct;
 
         // buffering parameters
         int starting_buffer_level;  // in seconds
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -120,7 +120,6 @@
     int experimental;
     int mb_no_coeff_skip;
     int no_lpf;
-    int simpler_lpf;
     int use_bilinear_mc_filter;
     int full_pixel;
 
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -804,11 +804,14 @@
             for (j = 0; j < mb_cols; j++)
             {
                 char zz[4];
+                int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                              mi[mb_index].mbmi.mode != SPLITMV &&
+                              mi[mb_index].mbmi.mb_skip_coeff); /* negation of the skip_lf test used in loopfilter.c */
 
                 if (oci->frame_type == KEY_FRAME)
                     sprintf(zz, "a");
                 else
-                    sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
+                    sprintf(zz, "%c", dc_diff + '0');
 
                 vp8_blit_text(zz, y_ptr, post->y_stride);
                 mb_index ++;
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -53,9 +53,8 @@
 
 // Horizontal MB filtering
 void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -63,9 +62,8 @@
 }
 
 void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -74,9 +72,8 @@
 
 // Vertical MB Filtering
 void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -84,9 +81,8 @@
 }
 
 void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -95,9 +91,8 @@
 
 // Horizontal B Filtering
 void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                        int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     // These should all be done at once with one call, instead of 3
     loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
     loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
@@ -108,9 +103,8 @@
 }
 
 void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -121,9 +115,8 @@
 
 // Vertical B Filtering
 void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                        int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -131,9 +124,8 @@
 }
 
 void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -42,9 +42,8 @@
 #if HAVE_MMX
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -56,12 +55,11 @@
 
 
 void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
@@ -68,9 +66,8 @@
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -82,12 +79,11 @@
 
 
 void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
@@ -94,9 +90,8 @@
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -110,12 +105,11 @@
 
 
 void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -124,9 +118,8 @@
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -140,12 +133,11 @@
 
 
 void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -156,9 +148,8 @@
 /* Horizontal MB filtering */
 #if HAVE_SSE2
 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -167,12 +158,11 @@
 
 
 void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
@@ -179,9 +169,8 @@
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -190,12 +179,11 @@
 
 
 void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
@@ -202,9 +190,8 @@
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -215,12 +202,11 @@
 
 
 void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -229,9 +215,8 @@
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -242,12 +227,11 @@
 
 
 void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -111,9 +111,8 @@
  */
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-
         vp8_build_intra_predictors_mbuv_s(xd);
         RECON_INVOKE(&pbi->common.rtcd.recon,
                      build_intra_predictors_mby_s)(xd);
@@ -195,11 +194,10 @@
         clamp_mvs(xd);
     }
 
-    xd->mode_info_context->mbmi.dc_diff = 1;
-
-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV);
+    if (!eobtotal)
     {
-        xd->mode_info_context->mbmi.dc_diff = 0;
         skip_recon_mb(pbi, xd);
         return;
     }
@@ -208,7 +206,7 @@
         mb_init_dequantizer(pbi, xd);
 
     /* do prediction */
-    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
         vp8_build_intra_predictors_mbuv(xd);
 
@@ -255,7 +253,7 @@
                          xd->predictor, xd->dst.y_buffer,
                          xd->dst.y_stride, xd->eobs, xd->block[24].diff);
     }
-    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+    else if (xd->mode_info_context->mbmi.mode == B_PRED)
     {
         for (i = 0; i < 16; i++)
         {
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -108,12 +108,10 @@
         clamp_mvs(xd);
     }
 
-    xd->mode_info_context->mbmi.dc_diff = 1;
-
-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV);
+    if (!eobtotal)
     {
-        xd->mode_info_context->mbmi.dc_diff = 0;
-
         /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/
         if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
         {
@@ -322,6 +320,7 @@
 
                         if (pbi->common.filter_level)
                         {
+                            int skip_lf;
                             if( mb_row != pc->mb_rows-1 )
                             {
                                 /* Save decoded MB last row data for next-row decoding */
@@ -349,6 +348,10 @@
 
                             /* update loopfilter info */
                             Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                            skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                            xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                            xd->mode_info_context->mbmi.mb_skip_coeff);
+
                             filter_level = pbi->mt_baseline_filter_level[Segment];
                             /* Distance of Mb to the various image edges.
                              * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
@@ -360,17 +363,17 @@
                             if (filter_level)
                             {
                                 if (mb_col > 0)
-                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                                if (xd->mode_info_context->mbmi.dc_diff > 0)
-                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                if (!skip_lf)
+                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
                                 /* don't apply across umv border */
                                 if (mb_row > 0)
-                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                                if (xd->mode_info_context->mbmi.dc_diff > 0)
-                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                if (!skip_lf)
+                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
                             }
                         }
 
@@ -810,6 +813,7 @@
 
                 if (pbi->common.filter_level)
                 {
+                    int skip_lf;
                     /* Save decoded MB last row data for next-row decoding */
                     if(mb_row != pc->mb_rows-1)
                     {
@@ -837,6 +841,9 @@
 
                     /* update loopfilter info */
                     Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                    skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                    xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                    xd->mode_info_context->mbmi.mb_skip_coeff);
                     filter_level = pbi->mt_baseline_filter_level[Segment];
                     /* Distance of Mb to the various image edges.
                      * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
@@ -848,17 +855,17 @@
                     if (filter_level)
                     {
                         if (mb_col > 0)
-                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                        if (xd->mode_info_context->mbmi.dc_diff > 0)
-                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                        if (!skip_lf)
+                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
                         /* don't apply across umv border */
                         if (mb_row > 0)
-                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                        if (xd->mode_info_context->mbmi.dc_diff > 0)
-                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                        if (!skip_lf)
+                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
                     }
                 }
 
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1538,11 +1538,6 @@
     {
         if (cpi->common.mb_no_coeff_skip)
         {
-            if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
-                xd->mode_info_context->mbmi.dc_diff = 0;
-            else
-                xd->mode_info_context->mbmi.dc_diff = 1;
-
             xd->mode_info_context->mbmi.mb_skip_coeff = 1;
             cpi->skip_true_count ++;
             vp8_fix_contexts(xd);
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1525,10 +1525,6 @@
     cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
     // Initialise the starting buffer levels
-    cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
-                cpi->oxcf.target_bandwidth, 1000);
-
     cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
@@ -1701,6 +1697,10 @@
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
 
+    cpi->oxcf.starting_buffer_level =
+        rescale(cpi->oxcf.starting_buffer_level,
+                cpi->oxcf.target_bandwidth, 1000);
+
     // Set or reset optimal and maximum buffer levels.
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
@@ -1750,8 +1750,6 @@
     // Only allow dropped frames in buffered mode
     cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
 
-    cm->filter_type          = (LOOPFILTERTYPE) cpi->filter_type;
-
     if (!cm->use_bilinear_mc_filter)
         cm->mcomp_filter_type = SIXTAP;
     else
@@ -2726,16 +2724,17 @@
         if (cpi->pass == 2)
             vp8_calc_auto_iframe_target_size(cpi);
 
-        // 1 Pass there is no information on which to base size so use bandwidth per second * fixed fraction
         else
 #endif
-            cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2;
-
-        // in error resilient mode the first frame is bigger since it likely contains
-        // all the static background
-        if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2))
         {
-            cpi->this_frame_target *= 3;      // 5;
+            /* In one-pass mode there is no information on which to base the
+             * size, so use half of the initial buffer level, capped at
+             * 1.5x the per-second target bandwidth.
+             */
+            cpi->this_frame_target = cpi->oxcf.starting_buffer_level / 2;
+
+            if(cpi->this_frame_target > cpi->oxcf.target_bandwidth * 3 / 2)
+                cpi->this_frame_target = cpi->oxcf.target_bandwidth * 3 / 2;
         }
 
         // Key frame from VFW/auto-keyframe/first frame
@@ -2769,6 +2768,19 @@
         }
     }
 
+    /* Apply limits on keyframe target.
+     *
+     * TODO: move this after consolidating
+     * vp8_calc_iframe_target_size() and vp8_calc_auto_iframe_target_size()
+     */
+    if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_max_intra_bitrate_pct)
+    {
+        unsigned int max_rate = cpi->av_per_frame_bandwidth
+                                * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+        if (cpi->this_frame_target > max_rate)
+            cpi->this_frame_target = max_rate;
+    }
     return 1;
 }
 
@@ -5264,35 +5276,6 @@
         {
             unsigned int sse;
             Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
-        }
-
-        src += 16 * source->y_stride;
-        dst += 16 * dest->y_stride;
-    }
-
-    return Total;
-}
-
-
-static int calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
-{
-    int i, j;
-    int Total = 0;
-
-    unsigned char *src = source->y_buffer;
-    unsigned char *dst = dest->y_buffer;
-    (void)rtcd;
-
-    // Loop through the Y plane raw and reconstruction data summing (square differences)
-    for (i = 0; i < source->y_height; i += 16)
-    {
-        for (j = 0; j < source->y_width; j += 16)
-        {
-            unsigned int sse;
-            VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
-
-            if (sse < 8096)
-                Total += sse;
         }
 
         src += 16 * source->y_stride;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -510,7 +510,6 @@
     int auto_adjust_key_quantizer;
     int keyquantizer;
     int auto_worst_q;
-    int filter_type;
     int cpu_used;
     int chroma_boost;
     int horiz_scale;
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -938,7 +938,6 @@
         best_mbmode.uv_mode = 0;
         best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
         best_mbmode.partitioning = 0;
-        best_mbmode.dc_diff = 0;
 
         vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
         vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -707,8 +707,6 @@
     int min_frame_target;
     int Adjustment;
 
-    // Set the min frame bandwidth.
-    //min_frame_target = estimate_min_frame_size( cpi );
     min_frame_target = 0;
 
     if (cpi->pass == 2)
@@ -862,11 +860,6 @@
         }
     }
 
-    // Set a reduced data rate target for our initial Q calculation.
-    // This should help to save bits during earier sections.
-    if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100))
-        cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100;
-
     // Sanity check that the total sum of adjustments is not above the maximum allowed
     // That is that having allowed for KF and GF penalties we have not pushed the
     // current interframe target to low. If the adjustment we apply here is not capable of recovering
@@ -903,11 +896,6 @@
                     percent_low =
                         (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
                         one_percent_bits;
-
-                    if (percent_low > 100)
-                        percent_low = 100;
-                    else if (percent_low < 0)
-                        percent_low = 0;
                 }
                 // Are we overshooting the long term clip data rate...
                 else if (cpi->bits_off_target < 0)
@@ -915,16 +903,16 @@
                     // Adjust per frame data target downwards to compensate.
                     percent_low = (int)(100 * -cpi->bits_off_target /
                                        (cpi->total_byte_count * 8));
-
-                    if (percent_low > 100)
-                        percent_low = 100;
-                    else if (percent_low < 0)
-                        percent_low = 0;
                 }
 
+                if (percent_low > cpi->oxcf.under_shoot_pct)
+                    percent_low = cpi->oxcf.under_shoot_pct;
+                else if (percent_low < 0)
+                    percent_low = 0;
+
                 // lower the target bandwidth for this frame.
-                cpi->this_frame_target =
-                    (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+                cpi->this_frame_target -= (cpi->this_frame_target * percent_low)
+                                          / 200;
 
                 // Are we using allowing control of active_worst_allowed_q
                 // according to buffer level.
@@ -995,20 +983,29 @@
             }
             else
             {
-                int percent_high;
+                int percent_high = 0;
 
-                if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                     && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
                 {
-                    percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8));
+                    percent_high = (cpi->buffer_level
+                                    - cpi->oxcf.optimal_buffer_level)
+                                   / one_percent_bits;
+                }
+                else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+                {
+                    percent_high = (int)((100 * cpi->bits_off_target)
+                                         / (cpi->total_byte_count * 8));
+                }
 
-                    if (percent_high > 100)
-                        percent_high = 100;
-                    else if (percent_high < 0)
-                        percent_high = 0;
+                if (percent_high > cpi->oxcf.over_shoot_pct)
+                    percent_high = cpi->oxcf.over_shoot_pct;
+                else if (percent_high < 0)
+                    percent_high = 0;
 
-                    cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100;
+                cpi->this_frame_target += (cpi->this_frame_target *
+                                           percent_high) / 200;
 
-                }
 
                 // Are we allowing control of active_worst_allowed_q according to bufferl level.
                 if (cpi->auto_worst_q)
@@ -1464,40 +1461,7 @@
     return Q;
 }
 
-static int estimate_min_frame_size(VP8_COMP *cpi)
-{
-    double correction_factor;
-    int bits_per_mb_at_max_q;
 
-    // This funtion returns a default value for the first few frames untill the correction factor has had time to adapt.
-    if (cpi->common.current_video_frame < 10)
-    {
-        if (cpi->pass == 2)
-            return (cpi->min_frame_bandwidth);
-        else
-            return cpi->per_frame_bandwidth / 3;
-    }
-
-    /*  // Select the appropriate correction factor based upon type of frame.
-        if ( cpi->common.frame_type == KEY_FRAME )
-            correction_factor = cpi->key_frame_rate_correction_factor;
-        else
-        {
-            if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame )
-                correction_factor = cpi->gf_rate_correction_factor;
-            else
-                correction_factor = cpi->rate_correction_factor;
-        }*/
-
-    // We estimate at half the value we get from vp8_bits_per_mb
-    correction_factor = cpi->rate_correction_factor / 2.0;
-
-    bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]);
-
-    return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS;
-}
-
-
 static int estimate_keyframe_frequency(VP8_COMP *cpi)
 {
     int i;
@@ -1513,8 +1477,11 @@
         /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
          * whichever is smaller.
          */
+        /* Clamp to >= 1 so a zero/negative oxcf.key_freq is never used as the estimate. */
+        int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
         av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
-        if (av_key_frame_frequency > cpi->oxcf.key_freq)
+
+        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-            av_key_frame_frequency = cpi->oxcf.key_freq;
+            av_key_frame_frequency = key_freq;
 
         cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -298,9 +298,6 @@
 #endif
     vp8_set_speed_features(cpi);
 
-    if (cpi->common.simpler_lpf)
-        cpi->common.filter_type = SIMPLE_LOOPFILTER;
-
     q = (int)pow(vp8_dc_quant(QIndex,0), 1.25);
 
     if (q < 8)
@@ -2526,7 +2523,6 @@
         best_mbmode.uv_mode = 0;
         best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
         best_mbmode.partitioning = 0;
-        best_mbmode.dc_diff = 0;
 
         vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
         vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -224,18 +224,9 @@
     int plane_type;
     int b;
 
-    TOKENEXTRA *start = *t;
-    TOKENEXTRA *tp = *t;
-
-    x->mode_info_context->mbmi.dc_diff = 1;
-
-
-#if 1
-
     x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x);
     if (x->mode_info_context->mbmi.mb_skip_coeff)
     {
-
         cpi->skip_true_count++;
 
         if (!cpi->common.mb_no_coeff_skip)
@@ -245,17 +236,11 @@
             vp8_fix_contexts(x);
         }
 
-        if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-            x->mode_info_context->mbmi.dc_diff = 0;
-        else
-            x->mode_info_context->mbmi.dc_diff = 1;
-
-
         return;
     }
 
     cpi->skip_false_count++;
-#endif
+
 #if 0
     vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
 #endif
@@ -282,42 +267,6 @@
                             A + vp8_block2above[b],
                             L + vp8_block2left[b], cpi);
 
-#if 0
-
-    if (cpi->common.mb_no_coeff_skip)
-    {
-        int skip = 1;
-
-        while ((tp != *t) && skip)
-        {
-            skip = (skip && (tp->Token == DCT_EOB_TOKEN));
-            tp ++;
-        }
-
-        if (skip != x->mbmi.mb_skip_coeff)
-            skip += 0;
-
-        x->mbmi.mb_skip_coeff = skip;
-
-        if (x->mbmi.mb_skip_coeff == 1)
-        {
-            x->mbmi.dc_diff = 0;
-            //redo the coutnts
-            vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
-
-            *t = start;
-            cpi->skip_true_count++;
-            //skip_true_count++;
-        }
-        else
-        {
-
-            cpi->skip_false_count++;
-            //skip_false_count++;
-        }
-    }
-
-#endif
 }
 
 
@@ -499,13 +448,6 @@
     stuff2nd_order_b(x->block + 24, t, 1, x->frame_type,
                      A + vp8_block2above[24], L + vp8_block2left[24], cpi);
     plane_type = 0;
-
-
-    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-        x->mode_info_context->mbmi.dc_diff = 0;
-    else
-        x->mode_info_context->mbmi.dc_diff = 1;
-
 
     for (b = 0; b < 16; b++)
         stuff1st_order_b(x->block + b, t, plane_type, x->frame_type,
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -1,0 +1,254 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse4 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp8_regular_quantize_b_sse4)
+sym(vp8_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rdi
+    push        rsi
+
+    ALIGN_STACK 16, rax
+    %define qcoeff      0 ; 32
+    %define stack_size 32
+    sub         rsp, stack_size
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 8, u
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp8_block_coeff]
+    mov         rcx, [rdi + vp8_block_zbin]
+    mov         rdx, [rdi + vp8_block_round]
+    movd        xmm7, [rdi + vp8_block_zbin_extra]
+
+    ; z
+    movdqa      xmm0, [rax]
+    movdqa      xmm1, [rax + 16]
+
+    ; duplicate zbin_oq_value
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm1, 15
+
+    ; (z ^ sz)
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm1
+
+    ; x = abs(z)
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm1
+
+    ; zbin
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm3
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm6, xmm4
+    psubw       xmm7, xmm5
+
+    ; round
+    movdqa      xmm4, [rdx]
+    movdqa      xmm5, [rdx + 16]
+
+    mov         rax, [rdi + vp8_block_quant_shift]
+    mov         rcx, [rdi + vp8_block_quant]
+    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
+
+    ; x + round
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    ; quant
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm4, xmm2
+    pmulhw      xmm5, xmm3
+
+    ; y += x
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    pxor        xmm4, xmm4
+%if ABI_IS_32BIT
+    movdqa      [rsp + qcoeff], xmm4
+    movdqa      [rsp + qcoeff + 16], xmm4
+%else
+    pxor        xmm8, xmm8
+%endif
+
+    ; quant_shift
+    movdqa      xmm5, [rax]
+
+    ; zrun_zbin_boost
+    mov         rax, rdx
+
+%macro ZIGZAG_LOOP 5
+    ; args: 1 = rc, 2 = word lane of rc, 3 = y reg, 4 = (x - (zbin + zbin_extra)) reg, 5 = qcoeff reg (64-bit builds)
+    pextrw      ecx, %4, %2
+
+    ; if (x >= zbin + zbin_extra + *zbin_boost_ptr)
+    sub         cx, WORD PTR[rdx]           ; subtract the current zero-run boost
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ (lea does not modify the flags set by sub)
+    jl          rq_zigzag_loop_%1           ; below the threshold: qcoeff[rc] is left at 0
+
+    pextrw      edi, %3, %2                 ; y
+
+    ; downshift by quant_shift[rc]
+    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
+    sar         edi, cl                     ; also sets Z bit; NOTE(review): flags are unchanged when cl == 0 - confirm quant_shift[rc] is never 0
+    je          rq_zigzag_loop_%1           ; !y
+%if ABI_IS_32BIT
+    mov         WORD PTR[rsp + qcoeff + %1 *2], di
+%else
+    pinsrw      %5, edi, %2                 ; qcoeff[rc]
+%endif
+    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
+rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+    mov         rcx, [rsi + vp8_blockd_dequant]
+    mov         rdi, [rsi + vp8_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+    movdqa      xmm4, [rsp + qcoeff]
+    movdqa      xmm5, [rsp + qcoeff + 16]
+%else
+    %define     xmm5 xmm8
+%endif
+
+    ; y ^ sz
+    pxor        xmm4, xmm0
+    pxor        xmm5, xmm1
+    ; x = (y ^ sz) - sz
+    psubw       xmm4, xmm0
+    psubw       xmm5, xmm1
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp8_blockd_qcoeff]
+
+    pmullw      xmm0, xmm4
+    pmullw      xmm1, xmm5
+
+    ; store qcoeff
+    movdqa      [rcx], xmm4
+    movdqa      [rcx + 16], xmm5
+
+    ; store dqcoeff
+    movdqa      [rdi], xmm0
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pxor        xmm6, xmm6
+    pcmpeqw     xmm4, xmm6
+    pcmpeqw     xmm5, xmm6
+
+    packsswb    xmm4, xmm5
+    pshufb      xmm4, [GLOBAL(zig_zag1d)]
+    pmovmskb    edx, xmm4
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax
+    bsr         eax, edx
+    sub         edi, edx
+    sar         edi, 31
+    add         eax, 1
+    and         eax, edi
+
+    mov         [rsi + vp8_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    add         rsp, stack_size
+    pop         rsp
+
+    pop         rsi
+    pop         rdi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %undef xmm5
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+    RESTORE_XMM
+  %endif
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+; vp8/common/entropy.c: vp8_default_zig_zag1d
+zig_zag1d:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -51,4 +51,17 @@
 
 #endif /* HAVE_SSSE3 */
 
+
+#if HAVE_SSE4_1
+extern prototype_quantize_block(vp8_regular_quantize_b_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+/* No runtime dispatch: bind vp8_quantize_quantb to the SSE4.1 routine at compile time. */
+#undef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE4_1 */
+
 #endif /* QUANTIZE_X86_H */
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -313,6 +313,8 @@
         cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
         cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
         cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
+
+        cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse4;
     }
 #endif
 
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -153,7 +153,8 @@
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
 #endif
     RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
-    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
+    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
+    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
     RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
     RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
     //RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
@@ -307,6 +308,7 @@
     }
 
     oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+    oxcf->rc_max_intra_bitrate_pct = cfg.rc_max_intra_bitrate_pct;
 
     oxcf->best_allowed_q          = cfg.rc_min_quantizer;
     oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
@@ -314,7 +316,7 @@
     oxcf->fixed_q = -1;
 
     oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
-    //oxcf->over_shoot_pct        = cfg.rc_overshoot_pct;
+    oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
 
     oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
     oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
@@ -360,6 +362,7 @@
         printf("key_freq: %d\n", oxcf->key_freq);
         printf("end_usage: %d\n", oxcf->end_usage);
         printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+        printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
         printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
         printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
         printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
@@ -1105,11 +1108,11 @@
         {0},                /* rc_twopass_stats_in */
 #endif
         256,                /* rc_target_bandwidth */
-
+        0,                  /* rc_max_intra_bitrate_pct */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
-        95,                 /* rc_undershoot_pct */
-        200,                /* rc_overshoot_pct */
+        100,                /* rc_undershoot_pct */
+        100,                /* rc_overshoot_pct */
 
         6000,               /* rc_max_buffer_size */
         4000,               /* rc_buffer_initial_size; */
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -117,6 +117,7 @@
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -398,6 +398,21 @@
         unsigned int           rc_target_bitrate;
 
 
+        /*!\brief Max data rate for Intra frames
+         *
+         * This value controls additional clamping on the maximum size of a
+         * keyframe. It is expressed as a percentage of the average
+         * per-frame bitrate, with the special (and default) value 0 meaning
+         * unlimited, or no additional clamping beyond the codec's built-in
+         * algorithm.
+         *
+         * For example, to allocate no more than 4.5 frames worth of bitrate
+         * to a keyframe, set this to 450.
+         *
+         */
+        unsigned int           rc_max_intra_bitrate_pct;
+
+
         /*
          * quantizer settings
          */
@@ -430,20 +445,28 @@
          */
 
 
-        /*!\brief Rate control undershoot tolerance
+        /*!\brief Rate control adaptation undershoot control
          *
-         * This value, expressed as a percentage of the target bitrate, describes
-         * the target bitrate for easier frames, allowing bits to be saved for
-         * harder frames. Set to zero to use the codec default.
+         * This value, expressed as a percentage of the target bitrate,
+         * controls the maximum allowed adaptation speed of the codec.
+         * This factor controls the maximum number of bits that can
+         * be subtracted from the target bitrate in order to compensate
+         * for prior overshoot.
+         *
+         * Valid values are in the range 0-1000.
          */
         unsigned int           rc_undershoot_pct;
 
 
-        /*!\brief Rate control overshoot tolerance
+        /*!\brief Rate control adaptation overshoot control
          *
-         * This value, expressed as a percentage of the target bitrate, describes
-         * the maximum allowed bitrate for a given frame.  Set to zero to use the
-         * codec default.
+         * This value, expressed as a percentage of the target bitrate,
+         * controls the maximum allowed adaptation speed of the codec.
+         * This factor controls the maximum number of bits that can
+         * be added to the target bitrate in order to compensate for
+         * prior undershoot.
+         *
+         * Valid values are in the range 0-1000.
          */
         unsigned int           rc_overshoot_pct;
 
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -260,6 +260,16 @@
     return stats->buf;
 }
 
+/* Stereo 3D packed frame format (serialized as the WebM StereoMode element) */
+typedef enum stereo_format
+{
+    STEREO_FORMAT_MONO       = 0,
+    STEREO_FORMAT_LEFT_RIGHT = 1,
+    STEREO_FORMAT_BOTTOM_TOP = 2,
+    STEREO_FORMAT_TOP_BOTTOM = 3,
+    STEREO_FORMAT_RIGHT_LEFT = 11 /* NOTE(review): gap after 3 presumably follows container-defined IDs - confirm */
+} stereo_format_t;
+
 enum video_file_type
 {
     FILE_TYPE_RAW,
@@ -610,7 +620,8 @@
 static void
 write_webm_file_header(EbmlGlobal                *glob,
                        const vpx_codec_enc_cfg_t *cfg,
-                       const struct vpx_rational *fps)
+                       const struct vpx_rational *fps,
+                       stereo_format_t            stereo_fmt)
 {
     {
         EbmlLoc start;
@@ -654,6 +665,7 @@
                     Ebml_StartSubElement(glob, &videoStart, Video);
                     Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth);
                     Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
+                    Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt);
                     Ebml_SerializeFloat(glob, FrameRate, frameRate);
                     Ebml_EndSubElement(glob, &videoStart); //Video
                 }
@@ -920,6 +932,16 @@
         "Frame width");
 static const arg_def_t height           = ARG_DEF("h", "height", 1,
         "Frame height");
+static const struct arg_enum_list stereo_mode_enum[] = { /* names accepted by the "stereo-mode" option */
+    {"mono"      , STEREO_FORMAT_MONO},
+    {"left-right", STEREO_FORMAT_LEFT_RIGHT},
+    {"bottom-top", STEREO_FORMAT_BOTTOM_TOP},
+    {"top-bottom", STEREO_FORMAT_TOP_BOTTOM},
+    {"right-left", STEREO_FORMAT_RIGHT_LEFT},
+    {NULL, 0} /* end-of-list marker (presumably what the argument parser stops on - confirm) */
+};
+static const arg_def_t stereo_mode      = ARG_DEF_ENUM(NULL, "stereo-mode", 1,
+        "Stereo 3D video format", stereo_mode_enum);
 static const arg_def_t timebase         = ARG_DEF(NULL, "timebase", 1,
         "Stream timebase (frame duration)");
 static const arg_def_t error_resilient  = ARG_DEF(NULL, "error-resilient", 1,
@@ -930,7 +952,7 @@
 static const arg_def_t *global_args[] =
 {
     &use_yv12, &use_i420, &usage, &threads, &profile,
-    &width, &height, &timebase, &framerate, &error_resilient,
+    &width, &height, &stereo_mode, &timebase, &framerate, &error_resilient,
     &lag_in_frames, NULL
 };
 
@@ -966,11 +988,14 @@
         "Client initial buffer size (ms)");
 static const arg_def_t buf_optimal_sz     = ARG_DEF(NULL, "buf-optimal-sz", 1,
         "Client optimal buffer size (ms)");
+static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
+        "Max I-frame bitrate (pct)"); /* parsed into cfg.rc_max_intra_bitrate_pct */
 static const arg_def_t *rc_args[] =
 {
     &dropframe_thresh, &resize_allowed, &resize_up_thresh, &resize_down_thresh,
     &end_usage, &target_bitrate, &min_quantizer, &max_quantizer,
     &undershoot_pct, &overshoot_pct, &buf_sz, &buf_initial_sz, &buf_optimal_sz,
+    &max_intra_rate_pct,
     NULL
 };
 
@@ -1088,7 +1113,6 @@
 
 #define ARG_CTRL_CNT_MAX 10
 
-
 int main(int argc, const char **argv_)
 {
     vpx_codec_ctx_t        encoder;
@@ -1124,6 +1148,7 @@
     uint64_t                 psnr_samples_total = 0;
     double                   psnr_totals[4] = {0, 0, 0, 0};
     int                      psnr_count = 0;
+    stereo_format_t          stereo_fmt = STEREO_FORMAT_MONO;
 
     exec_name = argv_[0];
     ebml.last_pts_ms = -1;
@@ -1263,6 +1288,8 @@
             cfg.g_w = arg_parse_uint(&arg);
         else if (arg_match(&arg, &height, argi))
             cfg.g_h = arg_parse_uint(&arg);
+        else if (arg_match(&arg, &stereo_mode, argi))
+            stereo_fmt = arg_parse_enum_or_int(&arg);
         else if (arg_match(&arg, &timebase, argi))
             cfg.g_timebase = arg_parse_rational(&arg);
         else if (arg_match(&arg, &error_resilient, argi))
@@ -1283,6 +1310,8 @@
             cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
         else if (arg_match(&arg, &target_bitrate, argi))
             cfg.rc_target_bitrate = arg_parse_uint(&arg);
+        else if (arg_match(&arg, &max_intra_rate_pct, argi))
+            cfg.rc_max_intra_bitrate_pct = arg_parse_uint(&arg);
         else if (arg_match(&arg, &min_quantizer, argi))
             cfg.rc_min_quantizer = arg_parse_uint(&arg);
         else if (arg_match(&arg, &max_quantizer, argi))
@@ -1565,7 +1594,7 @@
         if(write_webm)
         {
             ebml.stream = outfile;
-            write_webm_file_header(&ebml, &cfg, &arg_framerate);
+            write_webm_file_header(&ebml, &cfg, &arg_framerate, stereo_fmt);
         }
         else
             write_ivf_file_header(outfile, &cfg, codec->fourcc, 0);