shithub: libvpx

Download patch

ref: 0ba3fffc3a3361c4ff37fee13b6c8d88ff80ea06
parent: b4bb910b57dc590b300e28cef0db9a4daa82ac98
parent: d889035fe6802b64567c2ed250c1dff0eb377acf
author: John Koleszar <jkoleszar@google.com>
date: Fri Apr 15 20:05:08 EDT 2011

Merge remote branch 'origin/master' into experimental

Change-Id: I6ee7c49138576326887b32316cffe8d3e48aa044

--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -38,7 +38,7 @@
     dest_ptr1 = d - el;
     dest_ptr2 = d + w;
 
-    for (i = 0; i < h - 0 + 1; i++)
+    for (i = 0; i < h; i++)
     {
         vpx_memset(dest_ptr1, src_ptr1[0], el);
         vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -194,10 +194,6 @@
 
     mov         rdi, arg(2)                     ;output_ptr
 
-;;
-;;    cmp         esi, DWORD PTR [rax]
-;;    je          vp8_filter_block1d16_h4_ssse3
-
     mov         rsi, arg(0)                     ;src_ptr
 
     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
@@ -271,61 +267,7 @@
     pop rdi
     pop rsi
     RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp8_filter_block1d16_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-filter_block1d16_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
-
-    pmaddubsw   xmm2, xmm6
-    movdqa      xmm0, xmm3
-    pshufb      xmm3, [GLOBAL(shuf3b)]
-    pshufb      xmm0, [GLOBAL(shuf2b)]
-
-    paddsw      xmm1, [GLOBAL(rd)]
-    paddsw      xmm1, xmm2
-
-    pmaddubsw   xmm0, xmm5
-    pmaddubsw   xmm3, xmm6
-
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3, xmm0
-    paddsw      xmm3, [GLOBAL(rd)]
-    psraw       xmm3, 7
-    packuswb    xmm3, xmm3
-
-    punpcklqdq  xmm1, xmm3
-
-    movdqa      XMMWORD Ptr [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         filter_block1d16_h4_rowloop_ssse3
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -194,13 +194,13 @@
 #define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
 #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse;}}, v=INT_MAX;)// checks if (r,c) has better score than previous best
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
 
 //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
 
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
 {
     unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
     unsigned char *z = (*(b->base_src) + b->src);
@@ -214,6 +214,7 @@
     unsigned int whichdir;
     unsigned int halfiters = 4;
     unsigned int quarteriters = 4;
+    int thismse;
 
     int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
     int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
@@ -226,6 +227,7 @@
 
     // calculate central point error
     besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    *distortion = besterr;
     besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
@@ -314,7 +316,7 @@
 #undef CHECK_BETTER
 #undef MIN
 #undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -325,6 +327,7 @@
     int left, right, up, down, diag;
     unsigned int sse;
     int whichdir ;
+    int thismse;
 
 
     // Trap uncodable vectors
@@ -332,6 +335,7 @@
     {
         bestmv->row <<= 3;
         bestmv->col <<= 3;
+        *distortion = INT_MAX;
         return INT_MAX;
     }
 
@@ -342,50 +346,55 @@
 
     // calculate central point error
     bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
     }
 
     this_mv.col += 8;
-    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
     }
 
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse =  vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
     }
 
     this_mv.row += 8;
-    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
     }
 
 
@@ -400,32 +409,33 @@
     case 0:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 1:
         this_mv.col += 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 2:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row += 4;
-        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 3:
     default:
         this_mv.col += 4;
         this_mv.row += 4;
-        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
         break;
     }
 
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
 //  }
@@ -448,30 +458,32 @@
     if (startmv.col & 7)
     {
         this_mv.col = startmv.col - 2;
-        left = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.col = (startmv.col - 8) | 6;
-        left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
     }
 
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
     }
 
     this_mv.col += 4;
-    right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
     }
 
     // go up then down and check error
@@ -480,30 +492,32 @@
     if (startmv.row & 7)
     {
         this_mv.row = startmv.row - 2;
-        up = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.row = (startmv.row - 8) | 6;
-        up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+        thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
     }
 
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
     }
 
     this_mv.row += 4;
-    down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
     }
 
 
@@ -525,12 +539,12 @@
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+                thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+                thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
             }
         }
         else
@@ -540,12 +554,12 @@
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+                thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+                thismse = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
             }
         }
 
@@ -556,12 +570,12 @@
         if (startmv.row & 7)
         {
             this_mv.row -= 2;
-            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.row = (startmv.row - 8) | 6;
-            diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+            thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
         }
 
         break;
@@ -571,12 +585,12 @@
         if (startmv.col & 7)
         {
             this_mv.col -= 2;
-            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.col = (startmv.col - 8) | 6;
-            diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+            thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
         }
 
         break;
@@ -583,24 +597,23 @@
     case 3:
         this_mv.col += 2;
         this_mv.row += 2;
-        diag = vfp->svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         break;
     }
 
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
-//  }
-
     return bestmse;
 }
 
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -610,6 +623,7 @@
     unsigned char *z = (*(b->base_src) + b->src);
     int left, right, up, down, diag;
     unsigned int sse;
+    int thismse;
 
     // Trap uncodable vectors
     if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
@@ -616,6 +630,7 @@
     {
         bestmv->row <<= 3;
         bestmv->col <<= 3;
+        *distortion = INT_MAX;
         return INT_MAX;
     }
 
@@ -626,50 +641,55 @@
 
     // calculate central point error
     bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
     }
 
     this_mv.col += 8;
-    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
     }
 
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
     }
 
     this_mv.row += 8;
-    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
     }
 
     // somewhat strangely not doing all the diagonals for half pel is slower than doing them.
@@ -713,44 +733,48 @@
 #else
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = (this_mv.row - 8) | 4;
-    diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
     this_mv.col += 8;
-    diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = startmv.row + 4;
-    diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
     this_mv.col += 8;
-    diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
     }
 
 #endif
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -49,7 +49,7 @@
 
 typedef int (fractional_mv_step_fp)
     (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,
-     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]);
+     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion);
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,7 @@
 extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
 
 
-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)
 {
     (void) b;
     (void) d;
@@ -58,6 +58,7 @@
     (void) error_per_bit;
     (void) vfp;
     (void) mvcost;
+    (void) distortion;
     bestmv->row <<= 3;
     bestmv->col <<= 3;
     return 0;
@@ -459,6 +460,8 @@
 
     int skip_mode[4] = {0, 0, 0, 0};
 
+    int have_subp_search = cpi->sf.half_pixel_search;  /* In real-time mode, when Speed >= 15, no sub-pixel search. */
+
     vpx_memset(mode_mv, 0, sizeof(mode_mv));
     vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
     vpx_memset(near_mv, 0, sizeof(near_mv));
@@ -639,10 +642,10 @@
         switch (this_mode)
         {
         case B_PRED:
-            distortion2 = *returndistortion;                    // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes
-            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
-            rate2 += rate;
-            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+            // Pass best so far to vp8_pick_intra4x4mby_modes to use as breakout
+            distortion2 = *returndistortion;
+            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x,
+                                         &rate, &distortion2);
 
             if (distortion2 == INT_MAX)
             {
@@ -650,6 +653,11 @@
             }
             else
             {
+                rate2 += rate;
+                distortion2 = VARIANCE_INVOKE
+                                (&cpi->rtcd.variance, get16x16prederror)(
+                                    x->src.y_buffer, x->src.y_stride,
+                                    x->e_mbd.predictor, 16, 0x7fffffff);
                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
                 if (this_rd < best_intra_rd)
@@ -788,7 +796,7 @@
             }
 
             if (bestsme < INT_MAX)
-                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2);
 
             mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
             mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -818,7 +826,8 @@
             x->e_mbd.block[0].bmi.mode = this_mode;
             x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;
 
-            distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
+            if((this_mode != NEWMV) || !(have_subp_search))
+                distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
 
             this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1270,12 +1270,14 @@
 
                 if (bestsme < INT_MAX)
                 {
+                    int distortion;
+
                     if (!cpi->common.full_pixel)
                         cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost);
+                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion);
                     else
                         vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost);
+                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion);
                 }
             } /* NEW4X4 */
 
@@ -2253,8 +2255,10 @@
                 x->mv_row_max = tmp_row_max;
 
                 if (bestsme < INT_MAX)
-                    // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost);  // normal mvc=11
-                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost);
+                    {
+                        int dis; /* TODO: use dis in distortion calculation later. */
+                        cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis);
+                }
 
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -208,10 +208,11 @@
     // Try sub-pixel MC?
     //if (bestsme > error_thresh && bestsme < INT_MAX)
     {
+        int distortion;
         bestsme = cpi->find_fractional_mv_step(x, b, d,
                     &d->bmi.mv.as_mv, &best_ref_mv1,
                     x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
-                    mvcost);
+                    mvcost, &distortion);
     }
 #endif