shithub: libvpx

--- a/build/make/configure.sh

+++ b/build/make/configure.sh

@@ -884,6 +884,8 @@

                 link_with_cc=gcc

                 tune_cflags="-march="

             setup_gnu_toolchain

+                # For 32-bit x86 builds, -O3 does not turn on -fomit-frame-pointer, so add it explicitly.

+                enabled optimizations && check_add_cflags -fomit-frame-pointer

;;

         esac

--- a/vp8/common/threading.h

+++ b/vp8/common/threading.h

@@ -12,8 +12,6 @@

 #ifndef _PTHREAD_EMULATION

 #define _PTHREAD_EMULATION

-#define VPXINFINITE 10000       /* 10second. */

 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD

 /* Thread management macros */

@@ -28,7 +26,7 @@

 #define pthread_t HANDLE

 #define pthread_attr_t DWORD

 #define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)

-#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))

+#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))

 #define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)

 #define thread_sleep(nms) Sleep(nms)

 #define pthread_cancel(thread) terminate_thread(thread,0)

@@ -62,7 +60,7 @@

 #define sem_t HANDLE

 #define pause(voidpara) __asm PAUSE

 #define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL)

-#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE))

+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))

 #define sem_post(sem) SetEvent(*sem)

 #define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)

 #define thread_sleep(nms) Sleep(nms)

--- a/vp8/decoder/onyxd_if.c

+++ b/vp8/decoder/onyxd_if.c

@@ -76,7 +76,6 @@

     pbi->common.current_video_frame = 0;

     pbi->ready_for_new_data = 1;

-    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/

 #if CONFIG_MULTITHREAD

     pbi->max_threads = oxcf->max_threads;

     vp8_decoder_create_threads(pbi);

@@ -252,7 +251,6 @@

     VP8D_COMP *pbi = (VP8D_COMP *) ptr;

     VP8_COMMON *cm = &pbi->common;

     int retcode = 0;

-    struct vpx_usec_timer timer;

     /*if(pbi->ready_for_new_data == 0)

         return -1;*/

@@ -317,8 +315,6 @@

     pbi->common.error.setjmp = 1;

-    vpx_usec_timer_start(&timer);

     /*cm->current_video_frame++;*/

     pbi->Source = source;

     pbi->source_sz = size;

@@ -379,15 +375,9 @@

         if(pbi->common.filter_level)

-            struct vpx_usec_timer lpftimer;

-            vpx_usec_timer_start(&lpftimer);

             /* Apply the loop filter if appropriate. */

             vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);

-            vpx_usec_timer_mark(&lpftimer);

-            pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);

             cm->last_frame_type = cm->frame_type;

             cm->last_filter_type = cm->filter_type;

             cm->last_sharpness_level = cm->sharpness_level;

@@ -397,11 +387,6 @@

     vp8_clear_system_state();

-    vpx_usec_timer_mark(&timer);

-    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);

-    pbi->time_decoding += pbi->decode_microseconds;

     /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/

--- a/vp8/decoder/onyxd_int.h

+++ b/vp8/decoder/onyxd_int.h

@@ -81,12 +81,6 @@

     const unsigned char *Source;

     unsigned int   source_sz;

-    unsigned int CPUFreq;

-    unsigned int decode_microseconds;

-    unsigned int time_decoding;

-    unsigned int time_loop_filtering;

 #if CONFIG_MULTITHREAD

     /* variable for threading */

--- a/vp8/encoder/block.h

+++ b/vp8/encoder/block.h

@@ -34,7 +34,7 @@

     // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries

     short *quant;

     short *quant_fast;

-    short *quant_shift;

+    unsigned char *quant_shift;

     short *zbin;

     short *zrun_zbin_boost;

     short *round;

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -147,7 +147,7 @@

 #define EXACT_QUANT

 #ifdef EXACT_QUANT

 static void vp8cx_invert_quant(int improved_quant, short *quant,

-                               short *shift, short d)

+                               unsigned char *shift, short d)

     if(improved_quant)

--- a/vp8/encoder/mcomp.c

+++ b/vp8/encoder/mcomp.c

@@ -194,13 +194,13 @@

 #define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.

 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;

 #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost

-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse;}}, v=INT_MAX;)// checks if (r,c) has better score than previous best

+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best

 #define MIN(x,y) (((x)<(y))?(x):(y))

 #define MAX(x,y) (((x)>(y))?(x):(y))

 //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }

-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)

+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)

     unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;

     unsigned char *z = (*(b->base_src) + b->src);

@@ -226,7 +226,7 @@

     bestmv->col <<= 3;

     // calculate central point error

-    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);

+    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);

     *distortion = besterr;

     besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);

@@ -316,7 +316,7 @@

 #undef CHECK_BETTER

 #undef MIN

 #undef MAX

-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)

+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)

     int bestmse = INT_MAX;

     MV startmv;

@@ -345,7 +345,7 @@

     startmv = *bestmv;

     // calculate central point error

-    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);

+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);

     *distortion = bestmse;

     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);

@@ -360,6 +360,7 @@

         *bestmv = this_mv;

         bestmse = left;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col += 8;

@@ -371,6 +372,7 @@

         *bestmv = this_mv;

         bestmse = right;

         *distortion = thismse;

+        *sse1 = sse;

     // go up then down and check error

@@ -384,6 +386,7 @@

         *bestmv = this_mv;

         bestmse = up;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.row += 8;

@@ -395,6 +398,7 @@

         *bestmv = this_mv;

         bestmse = down;

         *distortion = thismse;

+        *sse1 = sse;

@@ -436,6 +440,7 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

 //  }

@@ -473,6 +478,7 @@

         *bestmv = this_mv;

         bestmse = left;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col += 4;

@@ -484,6 +490,7 @@

         *bestmv = this_mv;

         bestmse = right;

         *distortion = thismse;

+        *sse1 = sse;

     // go up then down and check error

@@ -507,6 +514,7 @@

         *bestmv = this_mv;

         bestmse = up;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.row += 4;

@@ -518,6 +526,7 @@

         *bestmv = this_mv;

         bestmse = down;

         *distortion = thismse;

+        *sse1 = sse;

@@ -608,12 +617,13 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

     return bestmse;

-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)

+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)

     int bestmse = INT_MAX;

     MV startmv;

@@ -640,7 +650,7 @@

     startmv = *bestmv;

     // calculate central point error

-    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);

+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);

     *distortion = bestmse;

     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);

@@ -655,6 +665,7 @@

         *bestmv = this_mv;

         bestmse = left;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col += 8;

@@ -666,6 +677,7 @@

         *bestmv = this_mv;

         bestmse = right;

         *distortion = thismse;

+        *sse1 = sse;

     // go up then down and check error

@@ -679,6 +691,7 @@

         *bestmv = this_mv;

         bestmse = up;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.row += 8;

@@ -690,6 +703,7 @@

         *bestmv = this_mv;

         bestmse = down;

         *distortion = thismse;

+        *sse1 = sse;

     // somewhat strangely not doing all the diagonals for half pel is slower than doing them.

@@ -741,6 +755,7 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col += 8;

@@ -752,6 +767,7 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col = (this_mv.col - 8) | 4;

@@ -764,6 +780,7 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

     this_mv.col += 8;

@@ -775,6 +792,7 @@

         *bestmv = this_mv;

         bestmse = diag;

         *distortion = thismse;

+        *sse1 = sse;

 #endif

--- a/vp8/encoder/mcomp.h

+++ b/vp8/encoder/mcomp.h

@@ -49,7 +49,7 @@

 typedef int (fractional_mv_step_fp)

     (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,

-     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion);

+     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse);

 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;

 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;

 extern fractional_mv_step_fp vp8_find_best_half_pixel_step;

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -244,17 +244,17 @@

     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);

-    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);

-    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);

-    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);

--- a/vp8/encoder/pickinter.c

+++ b/vp8/encoder/pickinter.c

@@ -50,7 +50,7 @@

 extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);

-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion)

+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse)

     (void) b;

     (void) d;

@@ -59,6 +59,7 @@

     (void) vfp;

     (void) mvcost;

     (void) distortion;

+    (void) sse;

     bestmv->row <<= 3;

     bestmv->col <<= 3;

     return 0;

@@ -443,7 +444,7 @@

     int bestsme;

     //int all_rds[MAX_MODES];         // Experimental debug code.

     int best_mode_index = 0;

-    int sse = INT_MAX;

+    unsigned int sse = INT_MAX;

     MV mvp;

     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};

@@ -796,7 +797,7 @@

             if (bestsme < INT_MAX)

-                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2);

+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse);

             mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;

             mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -827,7 +828,7 @@

             x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;

             if((this_mode != NEWMV) || !(have_subp_search))

-                distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));

+                distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse);

             this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

--- a/vp8/encoder/quantize.c

+++ b/vp8/encoder/quantize.c

@@ -27,7 +27,7 @@

     short *zbin_ptr        = b->zbin;

     short *round_ptr       = b->round;

     short *quant_ptr       = b->quant_fast;

-    short *quant_shift_ptr = b->quant_shift;

+    unsigned char *quant_shift_ptr = b->quant_shift;

     short *qcoeff_ptr      = d->qcoeff;

     short *dqcoeff_ptr     = d->dqcoeff;

     short *dequant_ptr     = d->dequant;

@@ -112,7 +112,7 @@

     short *zbin_ptr        = b->zbin;

     short *round_ptr       = b->round;

     short *quant_ptr       = b->quant;

-    short *quant_shift_ptr = b->quant_shift;

+    unsigned char *quant_shift_ptr = b->quant_shift;

     short *qcoeff_ptr      = d->qcoeff;

     short *dqcoeff_ptr     = d->dqcoeff;

     short *dequant_ptr     = d->dequant;

@@ -166,7 +166,7 @@

     int sz;

     short *coeff_ptr;

     short *quant_ptr;

-    short *quant_shift_ptr;

+    unsigned char *quant_shift_ptr;

     short *qcoeff_ptr;

     short *dqcoeff_ptr;

     short *dequant_ptr;

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -1271,13 +1271,14 @@

                 if (bestsme < INT_MAX)

                     int distortion;

+                    unsigned int sse;

                     if (!cpi->common.full_pixel)

                         cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],

-                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion);

+                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion, &sse);

                     else

                         vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],

-                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion);

+                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion, &sse);

             } /* NEW4X4 */

@@ -2255,9 +2256,10 @@

                 x->mv_row_max = tmp_row_max;

                 if (bestsme < INT_MAX)

-                    {

-                        int dis; /* TODO: use dis in distortion calculation later. */

-                        cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis);

+                {

+                    int dis; /* TODO: use dis in distortion calculation later. */

+                    unsigned int sse;

+                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse);

                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;

@@ -2304,7 +2306,8 @@

             else if (x->encode_breakout)

-                int sum, sse;

+                int sum;

+                unsigned int sse;

                 int threshold = (xd->block[0].dequant[1]

                             * xd->block[0].dequant[1] >>4);

@@ -2313,7 +2316,7 @@

                 VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)

                     (x->src.y_buffer, x->src.y_stride,

-                     x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);

+                     x->e_mbd.predictor, 16, &sse, &sum);

                 if (sse < threshold)

@@ -2337,8 +2340,7 @@

                             distortion_uv = sse2;

                             disable_skip = 1;

-                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2,

-                                             distortion2);

+                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

                             break;

--- a/vp8/encoder/ssim.c

+++ b/vp8/encoder/ssim.c

@@ -290,8 +290,8 @@

-const static long long c1 =  426148; // (256^2*(.01*255)^2

-const static long long c2 = 3835331; //(256^2*(.03*255)^2

+const static long long cc1 =  26634; // 64^2 * (.01*255)^2, truncated to an integer

+const static long long cc2 = 239708; // 64^2 * (.03*255)^2, truncated to an integer

 static double similarity

@@ -303,11 +303,20 @@

     int count

-    long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2);

+    long long ssim_n, ssim_d;

+    long long c1, c2;

-    long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*

-            (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ;

+    // Scale the 64-pixel constants to this window size: c = cc * count^2 / 64^2 (the >>12).

+    c1 = (cc1*count*count)>>12;

+    c2 = (cc2*count*count)>>12;

+    ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr-

+          (long long) 2*sum_s*sum_r+c2);

+    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*

+        ((long long)count*sum_sq_s-(long long)sum_s*sum_s +

+        (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ;

     return ssim_n * 1.0 / ssim_d;

@@ -332,18 +341,33 @@

            const vp8_variance_rtcd_vtable_t *rtcd)

     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;

-    double ssim3;

-    long long ssim_n;

-    long long ssim_d;

+    long long ssim3;

+    long long ssim_n,ssim_n1,ssim_n2;

+    long long ssim_d,ssim_d1,ssim_d2;

+    long long ssim_t1,ssim_t2;

+    long long c1, c2;

+    // Rescale the 64-pixel constants for count = 256: (256/64)^2 = 16.

+    c1 = cc1*16;

+    c2 = cc2*16;

     rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);

-    ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2);

+    ssim_n1 = (2*sum_s*sum_r+ c1);

-    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*

-            (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ;

+    ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2);

-    ssim3 = 256 * (ssim_d-ssim_n) / ssim_d;

-    return (long)( 256*ssim3 * ssim3 );

+    ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1);

+    ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s +

+                    (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ;

+    ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;

+    ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;

+    ssim3 = 256 *ssim_t1 * ssim_t2;

+    if(ssim3 <0 )

+        ssim3=0;

+    return (long)( ssim3  );

 // TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels

 // such that the window regions overlap block boundaries to penalize blocking

@@ -361,18 +385,20 @@

     int i,j;

+    int samples =0;

     double ssim_total=0;

-    // we can sample points as frequently as we like start with 1 per 8x8

-    for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)

+    // We can sample points as frequently as we like; start with one per 4x4.

+    for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)

-        for(j=0; j < width; j+=8 )

+        for(j=0; j < width-8; j+=4 )

-            ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);

+            double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd);

+            ssim_total += v;

+            samples++;

-    ssim_total /= (width/8 * height /8);

+    ssim_total /= samples;

     return ssim_total;

@@ -405,4 +431,4 @@

     *weight = 1;

     return ssimv;

-}

+}

\ No newline at end of file

--- a/vp8/encoder/temporal_filter.c

+++ b/vp8/encoder/temporal_filter.c

@@ -209,10 +209,11 @@

     //if (bestsme > error_thresh && bestsme < INT_MAX)

         int distortion;

+        unsigned int sse;

         bestsme = cpi->find_fractional_mv_step(x, b, d,

                     &d->bmi.mv.as_mv, &best_ref_mv1,

                     x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],

-                    mvcost, &distortion);

+                    mvcost, &distortion, &sse);

 #endif

--- a/vp8/encoder/x86/quantize_sse2.asm

+++ b/vp8/encoder/x86/quantize_sse2.asm

@@ -142,7 +142,7 @@

     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

     ; downshift by quant_shift[rc]

-    movsx       ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc]

+    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]

     sar         edi, cl                     ; also sets Z bit

     je          rq_zigzag_loop_%1           ; !y

     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]

--- a/vp8/encoder/x86/ssim_opt.asm

+++ b/vp8/encoder/x86/ssim_opt.asm

@@ -16,12 +16,12 @@

         paddusw         xmm14, xmm4  ; sum_r

         movdqa          xmm1, xmm3

         pmaddwd         xmm1, xmm1

-        paddq           xmm13, xmm1 ; sum_sq_s

+        paddd           xmm13, xmm1 ; sum_sq_s

         movdqa          xmm2, xmm4

         pmaddwd         xmm2, xmm2

-        paddq           xmm12, xmm2 ; sum_sq_r

+        paddd           xmm12, xmm2 ; sum_sq_r

         pmaddwd         xmm3, xmm4

-        paddq           xmm11, xmm3  ; sum_sxr

+        paddd           xmm11, xmm3  ; sum_sxr

 %endmacro

 ; Sum across the register %1 starting with q words

@@ -66,6 +66,7 @@

     push        rbp

     mov         rbp, rsp

     SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM

     push        rsi

     push        rdi

     ; end prolog

@@ -115,19 +116,20 @@

     SUM_ACROSS_Q    xmm11

     mov             rdi,arg(4)

-    movq            [rdi], xmm15;

+    movd            [rdi], xmm15;

     mov             rdi,arg(5)

-    movq            [rdi], xmm14;

+    movd            [rdi], xmm14;

     mov             rdi,arg(6)

-    movq            [rdi], xmm13;

+    movd            [rdi], xmm13;

     mov             rdi,arg(7)

-    movq            [rdi], xmm12;

+    movd            [rdi], xmm12;

     mov             rdi,arg(8)

-    movq            [rdi], xmm11;

+    movd            [rdi], xmm11;

     ; begin epilog

     pop         rdi

     pop         rsi

+    RESTORE_XMM

     UNSHADOW_ARGS

     pop         rbp

ret

@@ -154,6 +156,7 @@

     push        rbp

     mov         rbp, rsp

     SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM

     push        rsi

     push        rdi

     ; end prolog

@@ -174,11 +177,8 @@

 NextRow2:

     ;grab source and reference pixels

-    movq            xmm5, [rsi]

-    movq            xmm6, [rdi]

-    movdqa          xmm3, xmm5

-    movdqa          xmm4, xmm6

+    movq            xmm3, [rsi]

+    movq            xmm4, [rdi]

     punpcklbw       xmm3, xmm0 ; low_s

     punpcklbw       xmm4, xmm0 ; low_r

@@ -197,19 +197,20 @@

     SUM_ACROSS_Q    xmm11

     mov             rdi,arg(4)

-    movq            [rdi], xmm15;

+    movd            [rdi], xmm15;

     mov             rdi,arg(5)

-    movq            [rdi], xmm14;

+    movd            [rdi], xmm14;

     mov             rdi,arg(6)

-    movq            [rdi], xmm13;

+    movd            [rdi], xmm13;

     mov             rdi,arg(7)

-    movq            [rdi], xmm12;

+    movd            [rdi], xmm12;

     mov             rdi,arg(8)

-    movq            [rdi], xmm11;

+    movd            [rdi], xmm11;

     ; begin epilog

     pop         rdi

     pop         rsi

+    RESTORE_XMM

     UNSHADOW_ARGS

     pop         rbp

ret

--- a/vpx_ports/x86.h

+++ b/vpx_ports/x86.h

@@ -151,8 +151,8 @@

     __asm__ __volatile__ ("pause \n\t")

 #else

 #if ARCH_X86_64

-/* No pause intrinsic for windows x64 */

-#define x86_pause_hint()

+#define x86_pause_hint()\

+    _mm_pause() /* no trailing ';' -- the call site supplies it, as in the other branches */

 #else

 #define x86_pause_hint()\

     __asm pause

--

⑨