shithub: libvpx

Download patch

ref: e1b90ce862b6989d8ad755ffbbd9d59849d93a4f
parent: 9d325df746cd79f434cea45768b38dc49b61e3aa
parent: 2e102855f4f69148d17771f584c26e1498ec82e2
author: John Koleszar <jkoleszar@google.com>
date: Wed Apr 27 20:05:07 EDT 2011

Merge remote branch 'internal/upstream' into HEAD

--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -43,6 +43,12 @@
         vp8_build_intra_predictors_mby;
     rtcd->recon.build_intra_predictors_mby_s =
         vp8_build_intra_predictors_mby_s;
+    rtcd->recon.build_intra_predictors_mbuv =
+        vp8_build_intra_predictors_mbuv;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+        vp8_build_intra_predictors_mbuv_s;
+    rtcd->recon.intra4x4_predict =
+        vp8_intra4x4_predict;
 
     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -26,6 +26,9 @@
 #define prototype_build_intra_predictors(sym) \
     void sym(MACROBLOCKD *x)
 
+#define prototype_intra4x4_predict(sym) \
+    void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
+
 struct vp8_recon_rtcd_vtable;
 
 #if ARCH_X86 || ARCH_X86_64
@@ -88,11 +91,30 @@
 extern prototype_build_intra_predictors\
     (vp8_recon_build_intra_predictors_mby_s);
 
+#ifndef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mbuv);
 
+#ifndef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mbuv_s);
+
+#ifndef vp8_recon_intra4x4_predict
+#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
+#endif
+extern prototype_intra4x4_predict\
+    (vp8_recon_intra4x4_predict);
+
+
 typedef prototype_copy_block((*vp8_copy_block_fn_t));
 typedef prototype_recon_block((*vp8_recon_fn_t));
 typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
 typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
+typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
 typedef struct vp8_recon_rtcd_vtable
 {
     vp8_copy_block_fn_t  copy16x16;
@@ -105,6 +127,9 @@
     vp8_recon_mb_fn_t    recon_mby;
     vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
     vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv_s;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv;
+    vp8_intra4x4_pred_fn_t intra4x4_predict;
 } vp8_recon_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -543,107 +543,3 @@
         RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
     }
 }
-void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
-{
-    unsigned char *dst_ptr = x->dst.y_buffer;
-
-    if (x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        vp8_build_inter16x16_predictors_mb_s(x);
-    }
-    else
-    {
-        /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
-         * if sth is wrong, go back to what it is in build_inter_predictors_mb.
-         */
-        int i;
-
-        if (x->mode_info_context->mbmi.partitioning < 3)
-        {
-            for (i = 0; i < 4; i++)
-            {
-                unsigned char *ptr_base;
-                unsigned char *ptr;
-                BLOCKD *d = &x->block[bbb[i]];
-
-                ptr_base = *(d->base_pre);
-                ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-
-                if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
-                {
-                    x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-                }
-                else
-                {
-                    RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-                }
-            }
-        }
-        else
-        {
-            for (i = 0; i < 16; i += 2)
-            {
-                BLOCKD *d0 = &x->block[i];
-                BLOCKD *d1 = &x->block[i+1];
-
-                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                {
-                    /*build_inter_predictors2b(x, d0, 16);*/
-                    unsigned char *ptr_base;
-                    unsigned char *ptr;
-
-                    ptr_base = *(d0->base_pre);
-                    ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
-                    if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
-                    {
-                        x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
-                    }
-                    else
-                    {
-                        RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
-                    }
-                }
-                else
-                {
-                    vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
-                    vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
-                }
-            }
-        }
-
-        for (i = 16; i < 24; i += 2)
-        {
-            BLOCKD *d0 = &x->block[i];
-            BLOCKD *d1 = &x->block[i+1];
-
-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            {
-                /*build_inter_predictors2b(x, d0, 8);*/
-                unsigned char *ptr_base;
-                unsigned char *ptr;
-
-                ptr_base = *(d0->base_pre);
-                ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
-                if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
-                {
-                    x->subpixel_predict8x4(ptr, d0->pre_stride,
-                        d0->bmi.mv.as_mv.col & 7,
-                        d0->bmi.mv.as_mv.row & 7,
-                        dst_ptr, x->dst.uv_stride);
-                }
-                else
-                {
-                    RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
-                        d0->pre_stride, dst_ptr, x->dst.uv_stride);
-                }
-            }
-            else
-            {
-                vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
-                vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
-            }
-        }
-    }
-}
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -13,7 +13,6 @@
 #define __INC_RECONINTER_H
 
 extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
 extern void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x);
 
 extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -14,9 +14,4 @@
 
 extern void init_intra_left_above_pixels(MACROBLOCKD *x);
 
-extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
-
-extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
-
 #endif
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -14,7 +14,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "reconintra.h"
 
-void vp8_predict_intra4x4(BLOCKD *x,
+void vp8_intra4x4_predict(BLOCKD *x,
                           int b_mode,
                           unsigned char *predictor)
 {
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -229,3 +229,411 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_dc_mmx2)
+sym(vp8_intra_pred_uv_dc_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movd        mm1,        [rsi]
+    movd        mm2,        [rsi+4]
+    punpcklbw   mm1,        mm0
+    punpcklbw   mm2,        mm0
+    paddw       mm1,        mm2
+    pshufw      mm2,        mm1, 0x0e
+    paddw       mm1,        mm2
+    pshufw      mm2,        mm1, 0x01
+    paddw       mm1,        mm2
+
+    ; from left
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi+rax]
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]
+    add         ecx,        edx
+
+    ; add up
+    pextrw      edx,        mm1, 0x0
+    lea         edx,        [edx+ecx+8]
+    sar         edx,        4
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dctop_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_dctop_mmx2)
+sym(vp8_intra_pred_uv_dctop_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movd        mm1,        [rsi]
+    movd        mm2,        [rsi+4]
+    punpcklbw   mm1,        mm0
+    punpcklbw   mm2,        mm0
+    paddw       mm1,        mm2
+    pshufw      mm2,        mm1, 0x0e
+    paddw       mm1,        mm2
+    pshufw      mm2,        mm1, 0x01
+    paddw       mm1,        mm2
+
+    ; add up
+    paddw       mm1,        [GLOBAL(dc_4)]
+    psraw       mm1,        3
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_dcleft_mmx2)
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    lea         edx,        [ecx+edx+4]
+
+    ; add up
+    shr         edx,        3
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_dc128_mmx)
+sym(vp8_intra_pred_uv_dc128_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; write out
+    movq        mm1,        [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1)
+sym(vp8_intra_pred_uv_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read top row
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm2,       [GLOBAL(dc_1024)]
+%endif
+    movq        xmm1,       [rsi]
+    punpcklbw   xmm1,       xmm0
+
+    ; set up left ptrs and subtract topleft
+    movd        xmm3,       [rsi-1]
+    lea         rsi,        [rsi+rax-1]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    punpcklqdq  xmm3,       xmm3
+%else
+    pshufb      xmm3,       xmm2
+%endif
+    psubw       xmm1,       xmm3
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_uv_tm_%1_loop:
+    movd        xmm3,       [rsi]
+    movd        xmm5,       [rsi+rax]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm3,       xmm3
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm3,       xmm2
+    pshufb      xmm5,       xmm2
+%endif
+    paddw       xmm3,       xmm1
+    paddw       xmm5,       xmm1
+    packuswb    xmm3,       xmm5
+    movq  [rdi    ],        xmm3
+    movhps[rdi+rcx],        xmm3
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_uv_tm_%1_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
+
+;void vp8_intra_pred_uv_ve_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_ve_mmx)
+sym(vp8_intra_pred_uv_ve_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx
+    movq        mm1,        [rax]
+
+    ; write out
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp8_intra_pred_uv_ho_mmx2)
+sym(vp8_intra_pred_uv_ho_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read from left and write out
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    dec         rsi
+vp8_intra_pred_uv_ho_mmx2_loop:
+    movd        mm0,        [rsi]
+    movd        mm1,        [rsi+rax]
+    punpcklbw   mm0,        mm0
+    punpcklbw   mm1,        mm1
+    pshufw      mm0,        mm0, 0x0
+    pshufw      mm1,        mm1, 0x0
+    movq  [rdi    ],        mm0
+    movq  [rdi+rcx],        mm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_uv_ho_mmx2_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+dc_128:
+    times 8 db 128
+dc_4:
+    times 4 dw 4
+align 16
+dc_1024:
+    times 8 dw 0x400
--- /dev/null
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp8/common/recon.h"
+#include "recon_x86.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+    void sym(unsigned char *dst, int dst_stride, \
+             const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static inline void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+                                                       unsigned char *dst_u,
+                                                       unsigned char *dst_v,
+                                                       int dst_stride,
+                                                       build_intra_predictors_mbuv_fn_t tm_func)
+{
+    int mode = x->mode_info_context->mbmi.uv_mode;
+    build_intra_predictors_mbuv_fn_t fn;
+    int src_stride = x->dst.uv_stride;
+
+    switch (mode) {
+        case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+        case  H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+        case TM_PRED: fn = tm_func; break;
+        case DC_PRED:
+            if (x->up_available) {
+                if (x->left_available) {
+                    fn = vp8_intra_pred_uv_dc_mmx2; break;
+                } else {
+                    fn = vp8_intra_pred_uv_dctop_mmx2; break;
+                }
+            } else if (x->left_available) {
+                fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+            } else {
+                fn = vp8_intra_pred_uv_dc128_mmx; break;
+            }
+            break;
+        default: return;
+    }
+
+    fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
+    fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+                                        &x->predictor[320], 8,
+                                        vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+                                        &x->predictor[320], 8,
+                                        vp8_intra_pred_uv_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+                                        x->dst.v_buffer, x->dst.uv_stride,
+                                        vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+                                        x->dst.v_buffer, x->dst.uv_stride,
+                                        vp8_intra_pred_uv_tm_ssse3);
+}
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -46,6 +46,8 @@
 extern prototype_recon_block(vp8_recon2b_sse2);
 extern prototype_recon_block(vp8_recon4b_sse2);
 extern prototype_copy_block(vp8_copy_mem16x16_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon2
@@ -56,6 +58,26 @@
 
 #undef  vp8_recon_copy16x16
 #define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
+
+#undef  vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
+
+#undef  vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3
+
+#undef  vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
 
 #endif
 #endif
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -88,6 +88,10 @@
         rtcd->recon.recon2      = vp8_recon2b_sse2;
         rtcd->recon.recon4      = vp8_recon4b_sse2;
         rtcd->recon.copy16x16   = vp8_copy_mem16x16_sse2;
+        rtcd->recon.build_intra_predictors_mbuv =
+            vp8_build_intra_predictors_mbuv_sse2;
+        rtcd->recon.build_intra_predictors_mbuv_s =
+            vp8_build_intra_predictors_mbuv_s_sse2;
 
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;
 
@@ -126,6 +130,11 @@
         rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_ssse3;
         rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3;
         rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_ssse3;
+
+        rtcd->recon.build_intra_predictors_mbuv =
+            vp8_build_intra_predictors_mbuv_ssse3;
+        rtcd->recon.build_intra_predictors_mbuv_s =
+            vp8_build_intra_predictors_mbuv_s_ssse3;
     }
 #endif
 
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -51,19 +51,26 @@
 #define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
     do \
     { \
-        int shift; \
-        for(shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); shift >= 0; ) \
+        int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+        int loop_end, x; \
+        size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+        \
+        x = shift + CHAR_BIT - bits_left; \
+        loop_end = 0; \
+        if(x >= 0) \
         { \
-            if((_bufptr) >= (_bufend)) { \
-                (_count) = VP8_LOTS_OF_BITS; \
-                break; \
-            } \
-            (_count) += 8; \
+            (_count) += VP8_LOTS_OF_BITS; \
+            loop_end = x; \
+            if(!bits_left) break; \
+        } \
+        while(shift >= loop_end) \
+        { \
+            (_count) += CHAR_BIT; \
             (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
-            shift -= 8; \
+            shift -= CHAR_BIT; \
         } \
     } \
-    while(0)
+    while(0) \
 
 
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
@@ -119,18 +126,19 @@
 
 static int vp8dx_bool_error(BOOL_DECODER *br)
 {
-  /* Check if we have reached the end of the buffer.
-   *
-   * Variable 'count' stores the number of bits in the 'value' buffer,
-   * minus 8. So if count == 8, there are 16 bits available to be read.
-   * Normally, count is filled with 8 and one byte is filled into the
-   * value buffer. When we reach the end of the buffer, count is instead
-   * filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real
-   * bits from the bitstream. So the last bit in the bitstream will be
-   * represented by count == VP8_LOTS_OF_BITS - 16.
-   */
-    if ((br->count > VP8_BD_VALUE_SIZE)
-        && (br->count <= VP8_LOTS_OF_BITS - 16))
+    /* Check if we have reached the end of the buffer.
+     *
+     * Variable 'count' stores the number of bits in the 'value' buffer, minus
+     * 8. The top byte is part of the algorithm, and the remainder is buffered
+     * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+     * occupied, 8 for the algorithm and 8 in the buffer.
+     *
+     * When reading a byte from the user's buffer, count is filled with 8 and
+     * one byte is filled into the value buffer. When we reach the end of the
+     * data, count is additionally filled with VP8_LOTS_OF_BITS. So when
+     * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted.
+     */
+    if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
     {
        /* We have tried to decode bits after the end of
         * stream was encountered.
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -113,7 +113,7 @@
 {
     if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-        vp8_build_intra_predictors_mbuv_s(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
         RECON_INVOKE(&pbi->common.rtcd.recon,
                      build_intra_predictors_mby_s)(xd);
     }
@@ -213,7 +213,7 @@
     /* do prediction */
     if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-        vp8_build_intra_predictors_mbuv(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
 
         if (xd->mode_info_context->mbmi.mode != B_PRED)
         {
@@ -264,7 +264,8 @@
         {
 
             BLOCKD *b = &xd->block[i];
-            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+            RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
+                         (b, b->bmi.mode, b->predictor);
 
             if (xd->eobs[i] > 1)
             {
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1529,7 +1529,7 @@
 
         }
         else
-            vp8_build_inter_predictors_mb_s(xd);
+            vp8_build_inter16x16_predictors_mb_s(xd);
     }
 
     if (!x->skip)
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -32,7 +32,8 @@
 #endif
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
 {
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
+    RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
+                 (b, best_mode, b->predictor);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
 
@@ -113,7 +114,7 @@
 
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    vp8_build_intra_predictors_mbuv(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -56,7 +56,6 @@
 extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val, int sharpness_lvl);
 extern void vp8_dmachine_specific_config(VP8_COMP *cpi);
 extern void vp8_cmachine_specific_config(VP8_COMP *cpi);
-extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi);
 extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
 extern void print_parms(VP8_CONFIG *ocf, char *filenam);
 extern unsigned int vp8_get_processor_freq();
@@ -1508,10 +1507,7 @@
 
     cpi->auto_gold = 1;
     cpi->auto_adjust_gold_quantizer = 1;
-    cpi->goldquantizer = 1;
     cpi->goldfreq = 7;
-    cpi->auto_adjust_key_quantizer = 1;
-    cpi->keyquantizer = 1;
 
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
@@ -2711,79 +2707,8 @@
 
 #endif
 }
-// return of 0 means drop frame
-static int pick_frame_size(VP8_COMP *cpi)
-{
-    VP8_COMMON *cm = &cpi->common;
 
-    // First Frame is a special case
-    if (cm->current_video_frame == 0)
-    {
-#if !(CONFIG_REALTIME_ONLY)
 
-        if (cpi->pass == 2)
-            vp8_calc_auto_iframe_target_size(cpi);
-
-        else
-#endif
-        {
-            /* 1 Pass there is no information on which to base size so use
-             * bandwidth per second * fraction of the initial buffer
-             * level
-             */
-            cpi->this_frame_target = cpi->oxcf.starting_buffer_level / 2;
-
-            if(cpi->this_frame_target > cpi->oxcf.target_bandwidth * 3 / 2)
-                cpi->this_frame_target = cpi->oxcf.target_bandwidth * 3 / 2;
-        }
-
-        // Key frame from VFW/auto-keyframe/first frame
-        cm->frame_type = KEY_FRAME;
-
-    }
-    // Special case for forced key frames
-    // The frame sizing here is still far from ideal for 2 pass.
-    else if (cm->frame_flags & FRAMEFLAGS_KEY)
-    {
-        cm->frame_type = KEY_FRAME;
-        resize_key_frame(cpi);
-        vp8_calc_iframe_target_size(cpi);
-    }
-    else if (cm->frame_type == KEY_FRAME)
-    {
-        vp8_calc_auto_iframe_target_size(cpi);
-    }
-    else
-    {
-        // INTER frame: compute target frame size
-        cm->frame_type = INTER_FRAME;
-        vp8_calc_pframe_target_size(cpi);
-
-        // Check if we're dropping the frame:
-        if (cpi->drop_frame)
-        {
-            cpi->drop_frame = FALSE;
-            cpi->drop_count++;
-            return 0;
-        }
-    }
-
-    /* Apply limits on keyframe target.
-     *
-     * TODO: move this after consolidating
-     * vp8_calc_iframe_target_size() and vp8_calc_auto_iframe_target_size()
-     */
-    if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_max_intra_bitrate_pct)
-    {
-        unsigned int max_rate = cpi->av_per_frame_bandwidth
-                                * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
-
-        if (cpi->this_frame_target > max_rate)
-            cpi->this_frame_target = max_rate;
-    }
-    return 1;
-}
-
 static void set_quantizer(VP8_COMP *cpi, int Q)
 {
     VP8_COMMON *cm = &cpi->common;
@@ -3581,7 +3506,7 @@
     }
 
     // Decide how big to make the frame
-    if (!pick_frame_size(cpi))
+    if (!vp8_pick_frame_size(cpi))
     {
         cm->current_video_frame++;
         cpi->frames_since_key++;
@@ -3909,7 +3834,10 @@
         }
 
         if (cm->frame_type == KEY_FRAME)
+        {
+            resize_key_frame(cpi);
             vp8_setup_key_frame(cpi);
+        }
 
         // transform / motion compensation build reconstruction frame
         vp8_encode_frame(cpi);
@@ -3944,11 +3872,11 @@
 #else
             if (decide_key_frame(cpi))
             {
-                vp8_calc_auto_iframe_target_size(cpi);
-
                 // Reset all our sizing numbers and recode
                 cm->frame_type = KEY_FRAME;
 
+                vp8_pick_frame_size(cpi);
+
                 // Clear the Alt reference frame active flag when we have a key frame
                 cpi->source_alt_ref_active = FALSE;
 
@@ -3977,7 +3905,6 @@
                 loop_count++;
                 Loop = TRUE;
 
-                resize_key_frame(cpi);
                 continue;
             }
 #endif
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -505,10 +505,7 @@
     int interquantizer;
     int auto_gold;
     int auto_adjust_gold_quantizer;
-    int goldquantizer;
     int goldfreq;
-    int auto_adjust_key_quantizer;
-    int keyquantizer;
     int auto_worst_q;
     int cpu_used;
     int chroma_boost;
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -194,7 +194,8 @@
         int this_rd;
 
         rate = mode_costs[mode];
-        vp8_predict_intra4x4(b, mode, b->predictor);
+        RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
+                     (b, mode, b->predictor);
         distortion = get_prediction_error(be, b, &rtcd->variance);
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -374,10 +374,28 @@
     cpi->common.refresh_alt_ref_frame = TRUE;
 }
 
-void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
+/* Scale vp8_bits_per_mb[frame_kind][Q] by correction_factor and by MBs. */
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+                              double correction_factor)
 {
+    int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]);
+    /* Bpm: corrected table entry; the +.5 rounds before the int cast. */
+    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+     * largest Bpm takes 20 bits. Above the cutoff Bpm is shifted down before
+     * the multiply; at or below it the full product is shifted instead. */
+    if (MBs > (1 << 11))
+        return (Bpm >> BPER_MB_NORMBITS) * MBs;
+    else
+        return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
+
+
+static void calc_iframe_target_size(VP8_COMP *cpi)
+{
     // boost defaults to half second
     int kf_boost;
+    int target;
 
     // Clear down mmx registers to allow floating point in what follows
     vp8_clear_system_state();  //__asm emms;
@@ -384,52 +402,68 @@
 
     if (cpi->oxcf.fixed_q >= 0)
     {
-        vp8_calc_iframe_target_size(cpi);
-        return;
-    }
+        int Q = cpi->oxcf.key_q;
 
-    if (cpi->pass == 2)
+        target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs,
+                                    cpi->key_frame_rate_correction_factor);
+    }
+    else if (cpi->pass == 2)
     {
-        cpi->this_frame_target = cpi->per_frame_bandwidth;      // New Two pass RC
+        // New Two pass RC
+        target = cpi->per_frame_bandwidth;
     }
+    // First Frame is a special case
+    else if (cpi->common.current_video_frame == 0)
+    {
+        /* 1 Pass there is no information on which to base size so use
+         * bandwidth per second * fraction of the initial buffer
+         * level
+         */
+        target = cpi->oxcf.starting_buffer_level / 2;
+
+        if(target > cpi->oxcf.target_bandwidth * 3 / 2)
+            target = cpi->oxcf.target_bandwidth * 3 / 2;
+    }
     else
     {
+        // if this keyframe was forced, use a more recent Q estimate
+        int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
+                ? cpi->avg_frame_qindex : cpi->ni_av_qi;
+
         // Boost depends somewhat on frame rate
         kf_boost = (int)(2 * cpi->output_frame_rate - 16);
 
         // adjustment up based on q
-        kf_boost = kf_boost * kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+        kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
 
         // frame separation adjustment ( down)
         if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
-            kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+            kf_boost = (int)(kf_boost
+                       * cpi->frames_since_key / (cpi->output_frame_rate / 2));
 
         if (kf_boost < 16)
             kf_boost = 16;
 
-        // Reset the active worst quality to the baseline value for key frames.
-        cpi->active_worst_quality = cpi->worst_quality;
-
-        cpi->this_frame_target = ((16 + kf_boost)  * cpi->per_frame_bandwidth) >> 4;
+        target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
     }
 
 
-    // Should the next frame be an altref frame
-    if (cpi->pass != 2)
+    if (cpi->oxcf.rc_max_intra_bitrate_pct)
     {
-        // For now Alt ref is not allowed except in 2 pass modes.
-        cpi->source_alt_ref_pending = FALSE;
+        unsigned int max_rate = cpi->per_frame_bandwidth
+                                * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
 
-        /*if ( cpi->oxcf.fixed_q == -1)
-        {
-            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
-                cpi->source_alt_ref_pending = TRUE;
-            else
-                cpi->source_alt_ref_pending = FALSE;
-        }*/
+        if (target > max_rate)
+            target = max_rate;
     }
 
-    if (0)
+    cpi->this_frame_target = target;
+
+    // TODO: if we separate rate targeting from Q targeting, move this.
+    // Reset the active worst quality to the baseline value for key frames.
+    cpi->active_worst_quality = cpi->worst_quality;
+
+#if 0
     {
         FILE *f;
 
@@ -442,8 +476,10 @@
 
         fclose(f);
     }
+#endif
 }
 
+
 //  Do the best we can to define the parameteres for the next GF based on what information we have available.
 static void calc_gf_params(VP8_COMP *cpi)
 {
@@ -609,101 +645,10 @@
         }*/
     }
 }
-/* This is equvialent to estimate_bits_at_q without the rate_correction_factor. */
-static int baseline_bits_at_q(int frame_kind, int Q, int MBs)
-{
-    int Bpm = vp8_bits_per_mb[frame_kind][Q];
 
-    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-     * largest Bpm takes 20 bits.
-     */
-    if (MBs > (1 << 11))
-        return (Bpm >> BPER_MB_NORMBITS) * MBs;
-    else
-        return (Bpm * MBs) >> BPER_MB_NORMBITS;
-}
 
-void vp8_calc_iframe_target_size(VP8_COMP *cpi)
+static void calc_pframe_target_size(VP8_COMP *cpi)
 {
-    int Q;
-    int Boost = 100;
-
-    Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex;
-
-    if (cpi->auto_adjust_key_quantizer == 1)
-    {
-        // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key-frames.
-        // The enhanced Q is calculated so as to boost the key frame size by a factor
-        // specified in kf_boost_qadjustment. Also, can adjust based on distance
-        // between key frames.
-
-        // Adjust boost based upon ambient Q
-        Boost = kf_boost_qadjustment[Q];
-
-        // Make the Key frame boost less if the seperation from the previous key frame is small
-        if (cpi->frames_since_key < 16)
-            Boost = Boost * kf_boost_seperation_adjustment[cpi->frames_since_key] / 100;
-        else
-            Boost = Boost * kf_boost_seperation_adjustment[15] / 100;
-
-        // Apply limits on boost
-        if (Boost > kf_gf_boost_qlimits[Q])
-            Boost = kf_gf_boost_qlimits[Q];
-        else if (Boost < 120)
-            Boost = 120;
-    }
-
-    // Keep a record of the boost that was used
-    cpi->last_boost = Boost;
-
-    // Should the next frame be an altref frame
-    if (cpi->pass != 2)
-    {
-        // For now Alt ref is not allowed except in 2 pass modes.
-        cpi->source_alt_ref_pending = FALSE;
-
-        /*if ( cpi->oxcf.fixed_q == -1)
-        {
-            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
-                cpi->source_alt_ref_pending = TRUE;
-            else
-                cpi->source_alt_ref_pending = FALSE;
-        }*/
-    }
-
-    if (cpi->oxcf.fixed_q >= 0)
-    {
-        cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100;
-    }
-    else
-    {
-
-        int bits_per_mb_at_this_q ;
-
-        if (cpi->oxcf.error_resilient_mode == 1)
-        {
-            cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth;
-            return;
-        }
-
-        // Rate targetted scenario:
-        // Be careful of 32-bit OVERFLOW if restructuring the caluclation of cpi->this_frame_target
-        bits_per_mb_at_this_q = (int)(.5 +
-                                      cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]);
-
-        cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100;
-
-        // Reset the active worst quality to the baseline value for key frames.
-        if (cpi->pass < 2)
-            cpi->active_worst_quality = cpi->worst_quality;
-    }
-}
-
-
-
-void vp8_calc_pframe_target_size(VP8_COMP *cpi)
-{
     int min_frame_target;
     int Adjustment;
 
@@ -1194,7 +1139,9 @@
                     }
                 }
                 else
-                    cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100;
+                    cpi->this_frame_target =
+                        (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+                         * cpi->last_boost) / 100;
 
             }
             // If there is an active ARF at this location use the minimum
@@ -1316,22 +1263,7 @@
     }
 }
 
-static int estimate_bits_at_q(VP8_COMP *cpi, int Q)
-{
-    int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]);
 
-    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-     * largest Bpm takes 20 bits.
-     */
-    if (cpi->common.MBs > (1 << 11))
-        return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs;
-    else
-        return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS;
-
-}
-
-
 int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
 {
     int Q = cpi->active_worst_quality;
@@ -1613,4 +1545,27 @@
             }
         }
     }
+}
+
+// Run the target-size calculation matching the current frame type.
+// Returns 0 if the frame is to be dropped, 1 otherwise.
+int vp8_pick_frame_size(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+    // Key frames use the iframe calculation; every other type the pframe one.
+    if (cm->frame_type == KEY_FRAME)
+        calc_iframe_target_size(cpi);
+    else
+    {
+        calc_pframe_target_size(cpi);
+
+        // Check if we're dropping the frame; the flag is cleared and counted:
+        if (cpi->drop_frame)
+        {
+            cpi->drop_frame = FALSE;
+            cpi->drop_count++;
+            return 0;
+        }
+    }
+    return 1;
 }
--- a/vp8/encoder/ratectrl.h
+++ b/vp8/encoder/ratectrl.h
@@ -17,11 +17,12 @@
 extern void vp8_restore_coding_context(VP8_COMP *cpi);
 
 extern void vp8_setup_key_frame(VP8_COMP *cpi);
-extern void vp8_calc_iframe_target_size(VP8_COMP *cpi);
-extern void vp8_calc_pframe_target_size(VP8_COMP *cpi);
 extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
 extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
 extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
 extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+// Returns 0 when the frame is to be dropped, 1 otherwise.
+extern int vp8_pick_frame_size(VP8_COMP *cpi);
 
 #endif
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -681,7 +681,8 @@
 
         rate = bmode_costs[mode];
 
-        vp8_predict_intra4x4(b, mode, b->predictor);
+        RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict)
+                     (b, mode, b->predictor);
         ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
         x->quantize_b(be, b);
@@ -870,7 +871,8 @@
         int this_rd;
 
         x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-        vp8_build_intra_predictors_mbuv(&x->e_mbd);
+        RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
+                     (&x->e_mbd);
         ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
                       x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
                       x->src.uv_stride);
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -89,6 +89,7 @@
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -725,6 +725,7 @@
     int                     vp8_dbg_display_mv = 0;
 #endif
     struct input_ctx        input = {0};
+    int                     frames_corrupted = 0;
 
     /* Parse command line */
     exec_name = argv_[0];
@@ -1018,6 +1019,7 @@
         vpx_codec_iter_t  iter = NULL;
         vpx_image_t    *img;
         struct vpx_usec_timer timer;
+        int                   corrupted;
 
         vpx_usec_timer_start(&timer);
 
@@ -1037,6 +1039,14 @@
 
         ++frame_in;
 
+        if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted))
+        {
+            fprintf(stderr, "Failed VP8_GET_FRAME_CORRUPTED: %s\n",
+                    vpx_codec_error(&decoder));
+            goto fail;
+        }
+        frames_corrupted += corrupted;
+
         if ((img = vpx_codec_get_frame(&decoder, &iter)))
             ++frame_out;
 
@@ -1102,6 +1112,9 @@
         fprintf(stderr, "\n");
     }
 
+    if (frames_corrupted)
+        fprintf(stderr, "WARNING: %d frames corrupted.\n",frames_corrupted);
+
 fail:
 
     if (vpx_codec_destroy(&decoder))
@@ -1120,5 +1133,5 @@
     fclose(infile);
     free(argv);
 
-    return EXIT_SUCCESS;
+    return frames_corrupted ? EXIT_FAILURE : EXIT_SUCCESS;
 }
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -918,7 +918,7 @@
     &debugmode,
     &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline,
     &best_dl, &good_dl, &rt_dl,
-    &verbosearg, &psnrarg, &use_ivf, &framerate,
+    &verbosearg, &psnrarg, &use_ivf,
     NULL
 };