shithub: libvpx

--- a/vp8/common/arm/arm_systemdependent.c

+++ b/vp8/common/arm/arm_systemdependent.c

@@ -19,14 +19,6 @@

 #include "vp8/common/idct.h"

 #include "vp8/common/onyxc_int.h"

-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);

-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);

 void vp8_arch_arm_common_init(VP8_COMMON *ctx)

 #if CONFIG_RUNTIME_CPU_DETECT

@@ -106,31 +98,12 @@

         rtcd->recon.recon2      = vp8_recon2b_neon;

         rtcd->recon.recon4      = vp8_recon4b_neon;

         rtcd->recon.recon_mb    = vp8_recon_mb_neon;

+        rtcd->recon.build_intra_predictors_mby =

+            vp8_build_intra_predictors_mby_neon;

+        rtcd->recon.build_intra_predictors_mby_s =

+            vp8_build_intra_predictors_mby_s_neon;

 #endif

-#endif

-#if HAVE_ARMV6

-#if CONFIG_RUNTIME_CPU_DETECT

-    if (has_media)

-#endif

-    {

-        vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;

-        vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;

-    }

-#endif

-#if HAVE_ARMV7

-#if CONFIG_RUNTIME_CPU_DETECT

-    if (has_neon)

-#endif

-    {

-        vp8_build_intra_predictors_mby_ptr =

-         vp8_build_intra_predictors_mby_neon;

-        vp8_build_intra_predictors_mby_s_ptr =

-         vp8_build_intra_predictors_mby_s_neon;

-    }

 #endif

--- a/vp8/common/arm/recon_arm.h

+++ b/vp8/common/arm/recon_arm.h

@@ -53,6 +53,9 @@

 extern prototype_recon_macroblock(vp8_recon_mb_neon);

+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);

+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);

 #if !CONFIG_RUNTIME_CPU_DETECT

 #undef  vp8_recon_recon

 #define vp8_recon_recon vp8_recon_b_neon

@@ -74,6 +77,13 @@

 #undef  vp8_recon_recon_mb

 #define vp8_recon_recon_mb vp8_recon_mb_neon

+#undef  vp8_recon_build_intra_predictors_mby

+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon

+#undef  vp8_recon_build_intra_predictors_mby_s

+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon

 #endif

 #endif

--- a/vp8/common/generic/systemdependent.c

+++ b/vp8/common/generic/systemdependent.c

@@ -20,12 +20,6 @@

 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);

 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);

-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);

-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);

 void vp8_machine_specific_config(VP8_COMMON *ctx)

 #if CONFIG_RUNTIME_CPU_DETECT

@@ -45,6 +39,10 @@

     rtcd->recon.recon4      = vp8_recon4b_c;

     rtcd->recon.recon_mb    = vp8_recon_mb_c;

     rtcd->recon.recon_mby   = vp8_recon_mby_c;

+    rtcd->recon.build_intra_predictors_mby =

+        vp8_build_intra_predictors_mby;

+    rtcd->recon.build_intra_predictors_mby_s =

+        vp8_build_intra_predictors_mby_s;

     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;

     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;

@@ -75,9 +73,6 @@

 #endif

 #endif

-    /* Pure C: */

-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;

-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;

 #if ARCH_X86 || ARCH_X86_64

     vp8_arch_x86_common_init(ctx);

--- a/vp8/common/recon.h

+++ b/vp8/common/recon.h

@@ -23,6 +23,9 @@

 #define prototype_recon_macroblock(sym) \

     void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)

+#define prototype_build_intra_predictors(sym) \

+    void sym(MACROBLOCKD *x)

 struct vp8_recon_rtcd_vtable;

 #if ARCH_X86 || ARCH_X86_64

@@ -73,9 +76,23 @@

 #endif

 extern prototype_recon_macroblock(vp8_recon_recon_mby);

+#ifndef vp8_recon_build_intra_predictors_mby

+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby

+#endif

+extern prototype_build_intra_predictors\

+    (vp8_recon_build_intra_predictors_mby);

+#ifndef vp8_recon_build_intra_predictors_mby_s

+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s

+#endif

+extern prototype_build_intra_predictors\

+    (vp8_recon_build_intra_predictors_mby_s);

 typedef prototype_copy_block((*vp8_copy_block_fn_t));

 typedef prototype_recon_block((*vp8_recon_fn_t));

 typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));

+typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));

 typedef struct vp8_recon_rtcd_vtable

     vp8_copy_block_fn_t  copy16x16;

@@ -86,6 +103,8 @@

     vp8_recon_fn_t       recon4;

     vp8_recon_mb_fn_t    recon_mb;

     vp8_recon_mb_fn_t    recon_mby;

+    vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;

+    vp8_build_intra_pred_fn_t  build_intra_predictors_mby;

 } vp8_recon_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT

--- a/vp8/common/reconintra.h

+++ b/vp8/common/reconintra.h

@@ -14,13 +14,6 @@

 extern void init_intra_left_above_pixels(MACROBLOCKD *x);

-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);

-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);

-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);

 extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);

 extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -115,8 +115,8 @@

         vp8_build_intra_predictors_mbuv_s(xd);

-        vp8_build_intra_predictors_mby_s_ptr(xd);

+        RECON_INVOKE(&pbi->common.rtcd.recon,

+                     build_intra_predictors_mby_s)(xd);

     else

@@ -214,7 +214,8 @@

         if (xd->mode_info_context->mbmi.mode != B_PRED)

-            vp8_build_intra_predictors_mby_ptr(xd);

+            RECON_INVOKE(&pbi->common.rtcd.recon,

+                         build_intra_predictors_mby)(xd);

         } else {

             vp8_intra_prediction_down_copy(xd);

--- a/vp8/encoder/arm/arm_csystemdependent.c

+++ b/vp8/encoder/arm/arm_csystemdependent.c

@@ -71,8 +71,8 @@

         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;

         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;*/

-        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;

-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;*/

+        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/

+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;

 #endif

--- /dev/null

+++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm

@@ -1,0 +1,224 @@

+;

+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    EXPORT  |vp8_fast_quantize_b_armv6|

+    INCLUDE asm_enc_offsets.asm

+    ARM

+    REQUIRE8

+    PRESERVE8

+    AREA ||.text||, CODE, READONLY, ALIGN=2

+; r0    BLOCK *b

+; r1    BLOCKD *d

+; vp8_fast_quantize_b_armv6: r0 = BLOCK *b, r1 = BLOCKD *d

+;

+; PART 1 quantizes and dequantizes four 16-bit coefficients per pass (two

+; packed pairs), for four passes. For each coefficient z:

+;   x = abs(z) + round;  y = (x * quant_fast) >> 16;  qcoeff = y with the

+;   sign of z restored;  dqcoeff = qcoeff * dequant

+; r1 collects one flag bit per coefficient pair: bit k is set when the packed

+; word holding qcoeff[2k] and qcoeff[2k+1] is nonzero.

+;

+; PART 2 uses those flags to choose where to start testing qcoeff entries,

+; walking from high scan position to low (rc = index into qcoeff, i = scan

+; position, per the comments on each load), and stores the result in d->eob.

+|vp8_fast_quantize_b_armv6| PROC

+    stmfd   sp!, {r1, r4-r11, lr}           ; r1 (BLOCKD *d) is saved at [sp, #0] for PART 2

+    ldr     r3, [r0, #vp8_block_coeff]      ; coeff

+    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast

+    ldr     r5, [r0, #vp8_block_round]      ; round

+    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff

+    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff

+    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant

+    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction

+                                    ; is used to update the counter so that

+                                    ; it can be used to mark nonzero

+                                    ; quantized coefficient pairs.

+    mov     r1, #0                  ; flags for quantized coeffs

+    ; PART 1: quantization and dequantization loop

+loop                                ; one pass = 4 coefficients; runs 4 times

+    ldr     r9, [r3], #4            ; [z1 | z0]

+    ldr     r10, [r5], #4           ; [r1 | r0]

+    ldr     r11, [r4], #4           ; [q1 | q0]

+    ssat16  lr, #1, r9              ; [sz1 | sz0]

+    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]

+    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz

+    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]

+    ldr     r12, [r3], #4           ; [z3 | z2]

+    smulbb  r0, r9, r11             ; [(x0+r0)*q0]

+    smultt  r9, r9, r11             ; [(x1+r1)*q1]

+    ldr     r10, [r5], #4           ; [r3 | r2]

+    ssat16  r11, #1, r12            ; [sz3 | sz2]

+    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]

+    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]

+    ldr     r9, [r4], #4            ; [q3 | q2]

+    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz

+    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]

+    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]

+    smulbb  r10, r12, r9            ; [(x2+r2)*q2]

+    smultt  r12, r12, r9            ; [(x3+r3)*q3]

+    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz

+    cmp     r0, #0                  ; check if zero

+    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs (first pair of this pass)

+    str     r0, [r6], #4            ; *qcoeff++ = x

+    ldr     r9, [r8], #4            ; [dq1 | dq0]

+    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]

+    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]

+    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz

+    cmp     r10, #0                 ; check if zero

+    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs (second pair of this pass)

+    str     r10, [r6], #4           ; *qcoeff++ = x

+    ldr     r11, [r8], #4           ; [dq3 | dq2]

+    smulbb  r12, r0, r9             ; [x0*dq0]

+    smultt  r0, r0, r9              ; [x1*dq1]

+    smulbb  r9, r10, r11            ; [x2*dq2]

+    smultt  r10, r10, r11           ; [x3*dq3]

+    lsls    r2, r2, #2              ; update loop counter; r2 becomes 0 (Z set) after
                                    ; the 4th pass, and nothing below touches the
                                    ; flags before the 'bne'

+    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]

+    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]

+    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]

+    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]

+    add     r7, r7, #8              ; advance dqcoeff by 8 bytes (4 coefficients)

+    bne     loop

+    ; PART 2: check position for eob...

+    mov     lr, #0                  ; init eob

+    cmp     r1, #0                  ; coeffs after quantization?

+    ldr     r11, [sp, #0]           ; restore BLOCKD pointer

+    beq     end                     ; skip eob calculations if all zero

+    ldr     r0, [r11, #vp8_blockd_qcoeff]

+    ; check shortcut for nonzero qcoeffs

+    ; (flag bit k covers qcoeff[2k] and qcoeff[2k+1]; the bits are tested in

+    ; the order that matches the descending scan positions handled below)

+    tst    r1, #0x80                ; qcoeff[14], qcoeff[15]

+    bne    quant_coeff_15_14

+    tst    r1, #0x20                ; qcoeff[10], qcoeff[11]

+    bne    quant_coeff_13_11

+    tst    r1, #0x8                 ; qcoeff[6], qcoeff[7]

+    bne    quant_coeff_12_7

+    tst    r1, #0x40                ; qcoeff[12], qcoeff[13]

+    bne    quant_coeff_10_9

+    tst    r1, #0x10                ; qcoeff[8], qcoeff[9]

+    bne    quant_coeff_8_3

+    tst    r1, #0x2                 ; qcoeff[2], qcoeff[3]

+    bne    quant_coeff_6_5

+    tst    r1, #0x4                 ; qcoeff[4], qcoeff[5]

+    bne    quant_coeff_4_2

+    b      quant_coeff_1_0

+    ; Each step below loads one qcoeff entry, sets lr = i + 1 as the candidate

+    ; eob, and exits on the first nonzero entry; otherwise it falls through to

+    ; the next lower scan position.

+quant_coeff_15_14

+    ldrh    r2, [r0, #30]       ; rc=15, i=15

+    mov     lr, #16

+    cmp     r2, #0

+    bne     end

+    ldrh    r3, [r0, #28]       ; rc=14, i=14

+    mov     lr, #15

+    cmp     r3, #0

+    bne     end

+quant_coeff_13_11

+    ldrh    r2, [r0, #22]       ; rc=11, i=13

+    mov     lr, #14

+    cmp     r2, #0

+    bne     end

+quant_coeff_12_7

+    ldrh    r3, [r0, #14]       ; rc=7,  i=12

+    mov     lr, #13

+    cmp     r3, #0

+    bne     end

+    ldrh    r2, [r0, #20]       ; rc=10, i=11

+    mov     lr, #12

+    cmp     r2, #0

+    bne     end

+quant_coeff_10_9

+    ldrh    r3, [r0, #26]       ; rc=13, i=10

+    mov     lr, #11

+    cmp     r3, #0

+    bne     end

+    ldrh    r2, [r0, #24]       ; rc=12, i=9

+    mov     lr, #10

+    cmp     r2, #0

+    bne     end

+quant_coeff_8_3

+    ldrh    r3, [r0, #18]       ; rc=9,  i=8

+    mov     lr, #9

+    cmp     r3, #0

+    bne     end

+    ldrh    r2, [r0, #12]       ; rc=6,  i=7

+    mov     lr, #8

+    cmp     r2, #0

+    bne     end

+quant_coeff_6_5

+    ldrh    r3, [r0, #6]        ; rc=3,  i=6

+    mov     lr, #7

+    cmp     r3, #0

+    bne     end

+    ldrh    r2, [r0, #4]        ; rc=2,  i=5

+    mov     lr, #6

+    cmp     r2, #0

+    bne     end

+quant_coeff_4_2

+    ldrh    r3, [r0, #10]       ; rc=5,  i=4

+    mov     lr, #5

+    cmp     r3, #0

+    bne     end

+    ldrh    r2, [r0, #16]       ; rc=8,  i=3

+    mov     lr, #4

+    cmp     r2, #0

+    bne     end

+    ldrh    r3, [r0, #8]        ; rc=4,  i=2

+    mov     lr, #3

+    cmp     r3, #0

+    bne     end

+quant_coeff_1_0

+    ldrh    r2, [r0, #2]        ; rc=1,  i=1

+    mov     lr, #2

+    cmp     r2, #0

+    bne     end

+    mov     lr, #1              ; rc=0,  i=0

+end

+    str     lr, [r11, #vp8_blockd_eob]      ; d->eob = lr (0 when every coefficient quantized to zero)

+    ldmfd   sp!, {r1, r4-r11, pc}           ; restore saved registers and return

+    ENDP

+loop_count

+    DCD     0x1000000

+    END

--- a/vp8/encoder/arm/quantize_arm.h

+++ b/vp8/encoder/arm/quantize_arm.h

@@ -12,6 +12,16 @@

 #ifndef QUANTIZE_ARM_H

 #define QUANTIZE_ARM_H

+#if HAVE_ARMV6

+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);

+#undef  vp8_quantize_fastquantb

+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6

+#endif /* HAVE_ARMV6 */

 #if HAVE_ARMV7

 extern prototype_quantize_block(vp8_fast_quantize_b_neon);

--- a/vp8/encoder/asm_enc_offsets.c

+++ b/vp8/encoder/asm_enc_offsets.c

@@ -65,6 +65,17 @@

 DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));

+// offsets from BLOCK structure

+DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));

+DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));

+DEFINE(vp8_block_round,                         offsetof(BLOCK, round));

+// offsets from BLOCKD structure

+DEFINE(vp8_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));

+DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));

+DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));

+DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));

 // These two sizes are used in vp8cx_pack_tokens.  They are hard coded

 // so if the size changes this will have to be adjusted.

 #if HAVE_ARMV5TE

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -1322,7 +1322,8 @@

             int distortion2;

             x->e_mbd.mode_info_context->mbmi.mode = mode;

-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+                (&x->e_mbd);

             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);

             rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];

             this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);

--- a/vp8/encoder/encodeintra.c

+++ b/vp8/encoder/encodeintra.c

@@ -80,7 +80,7 @@

     int b;

-    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);

     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);

--- a/vp8/encoder/ethreading.c

+++ b/vp8/encoder/ethreading.c

@@ -24,6 +24,35 @@

 extern void vp8_build_block_offsets(MACROBLOCK *x);

 extern void vp8_setup_block_ptrs(MACROBLOCK *x);

+#if CONFIG_MULTITHREAD

+extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);

+/* Worker thread that runs loopfilter_frame() each time h_event_start_lpf is

+ * posted. p_data points to an LPFTHREAD_DATA whose ptr1 holds the VP8_COMP.

+ * The thread exits once cpi->b_multi_threaded has been cleared.

+ */

+static THREAD_FUNCTION loopfilter_thread(void *p_data)

+{

+    LPFTHREAD_DATA *lpf_data = (LPFTHREAD_DATA *)p_data;

+    VP8_COMP *cpi = (VP8_COMP *)(lpf_data->ptr1);

+    VP8_COMMON *cm = &cpi->common;

+    while (cpi->b_multi_threaded != 0)

+    {

+        /* A failed wait is simply retried. */

+        if (sem_wait(&cpi->h_event_start_lpf) != 0)

+            continue;

+        /* Woken up only so that we can shut down. */

+        if (cpi->b_multi_threaded == FALSE)

+            break;

+        loopfilter_frame(cpi, cm);

+        /* Post again now that the whole filtering step has finished. */

+        sem_post(&cpi->h_event_end_lpf);

+    }

+    return 0;

+}

+#endif

 static

 THREAD_FUNCTION thread_encoding_proc(void *p_data)

@@ -479,6 +508,15 @@

             pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);

+        {

+            LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;

+            sem_init(&cpi->h_event_start_lpf, 0, 0);

+            sem_init(&cpi->h_event_end_lpf, 0, 0);

+            lpfthd->ptr1 = (void *)cpi;

+            pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);

+        }

@@ -500,9 +538,14 @@

                 sem_destroy(&cpi->h_event_start_encoding[i]);

+            sem_post(&cpi->h_event_start_lpf);

+            pthread_join(cpi->h_filter_thread, 0);

         sem_destroy(&cpi->h_event_end_encoding);

+        sem_destroy(&cpi->h_event_end_lpf);

+        sem_destroy(&cpi->h_event_start_lpf);

         //free thread related resources

         vpx_free(cpi->h_event_start_encoding);

--- a/vp8/encoder/firstpass.c

+++ b/vp8/encoder/firstpass.c

@@ -841,10 +841,10 @@

 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];

 #define BASE_ERRPERMB   150

-static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)

+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)

     int Q;

-    int num_mbs = ((Height * Width) / (16 * 16));

+    int num_mbs = cpi->common.MBs;

     int target_norm_bits_per_mb;

     double err_per_mb = section_err / num_mbs;

@@ -941,10 +941,10 @@

     return Q;

-static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)

+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)

     int Q;

-    int num_mbs = ((Height * Width) / (16 * 16));

+    int num_mbs = cpi->common.MBs;

     int target_norm_bits_per_mb;

     double err_per_mb = section_err / num_mbs;

@@ -992,10 +992,10 @@

 // Estimate a worst case Q for a KF group

-static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)

+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)

     int Q;

-    int num_mbs = ((Height * Width) / (16 * 16));

+    int num_mbs = cpi->common.MBs;

     int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;

     int bits_per_mb_at_this_q;

@@ -1090,11 +1090,10 @@

 // For cq mode estimate a cq level that matches the observed

 // complexity and data rate.

-static int estimate_cq(VP8_COMP *cpi, double section_err,

-                       int section_target_bandwitdh, int Height, int Width)

+static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)

     int Q;

-    int num_mbs = ((Height * Width) / (16 * 16));

+    int num_mbs = cpi->common.MBs;

     int target_norm_bits_per_mb;

     double err_per_mb = section_err / num_mbs;

@@ -1608,7 +1607,7 @@

         arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));

         // Estimate if there are enough bits available to make worthwhile use of an arf.

-        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);

+        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);

         // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.

         if (tmp_q < cpi->worst_quality)

@@ -2112,8 +2111,7 @@

             est_cq =

                 estimate_cq( cpi,

                              (cpi->total_coded_error_left / frames_left),

-                             (int)(cpi->bits_left / frames_left),

-                             cpi->common.Height, cpi->common.Width);

+                             (int)(cpi->bits_left / frames_left));

             cpi->cq_target_quality = cpi->oxcf.cq_level;

             if ( est_cq > cpi->cq_target_quality )

@@ -2125,9 +2123,7 @@

         cpi->maxq_min_limit = cpi->best_quality;

         tmp_q = estimate_max_q( cpi,

                                 (cpi->total_coded_error_left / frames_left),

-                                (int)(cpi->bits_left / frames_left),

-                                cpi->common.Height,

-                                cpi->common.Width);

+                                (int)(cpi->bits_left / frames_left));

         // Limit the maxq value returned subsequently.

         // This increases the risk of overspend or underspend if the initial

@@ -2155,7 +2151,7 @@

         if (frames_left < 1)

             frames_left = 1;

-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);

+        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left));

         // Move active_worst_quality but in a damped way

         if (tmp_q > cpi->active_worst_quality)

@@ -2764,7 +2760,7 @@

             bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);

         // Work out if spatial resampling is necessary

-        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);

+        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);

         // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section

         projected_bits_perframe = bits_per_frame;

@@ -2835,7 +2831,7 @@

                 effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;

                 // Now try again and see what Q we get with the smaller image size

-                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);

+                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);

                 if (0)

--- a/vp8/encoder/generic/csystemdependent.c

+++ b/vp8/encoder/generic/csystemdependent.c

@@ -103,6 +103,10 @@

     // Pure C:

     vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;

+#if CONFIG_PSNR

+    cpi->rtcd.variance.ssimpf_8x8            = ssim_parms_8x8_c;

+    cpi->rtcd.variance.ssimpf                = ssim_parms_c;

+#endif

 #if ARCH_X86 || ARCH_X86_64

     vp8_arch_x86_encoder_init(cpi);

--- a/vp8/encoder/mcomp.c

+++ b/vp8/encoder/mcomp.c

@@ -1415,7 +1415,7 @@

     int col_min = ref_col - distance;

     int col_max = ref_col + distance;

-    unsigned short sad_array8[8];

+    DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);

     unsigned int sad_array[3];

     // Work out the mid point for the search

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -86,9 +86,11 @@

     YV12_BUFFER_CONFIG *source,

     YV12_BUFFER_CONFIG *dest,

     int lumamask,

-    double *weight

+    double *weight,

+    const vp8_variance_rtcd_vtable_t *rtcd

);

 extern double vp8_calc_ssimg

     YV12_BUFFER_CONFIG *source,

@@ -1522,8 +1524,7 @@

     VP8_COMP *cpi = (VP8_COMP *)(ptr);

     VP8_COMMON *cm = &cpi->common;

-    if (!cpi)

-        return;

+    cpi->oxcf = *oxcf;

     cpi->auto_gold = 1;

     cpi->auto_adjust_gold_quantizer = 1;

@@ -1535,50 +1536,15 @@

     cm->version = oxcf->Version;

     vp8_setup_version(cm);

-    if (oxcf == 0)

-    {

-        cpi->pass                     = 0;

+    // change includes all joint functionality

+    vp8_change_config(ptr, oxcf);

-        cpi->auto_worst_q              = 0;

-        cpi->oxcf.best_allowed_q            = MINQ;

-        cpi->oxcf.worst_allowed_q           = MAXQ;

-        cpi->oxcf.cq_level = MINQ;

+    // Initialize active best and worst q and average q values.

+    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;

+    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;

+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;

-        cpi->oxcf.end_usage                = USAGE_STREAM_FROM_SERVER;

-        cpi->oxcf.starting_buffer_level     =   4000;

-        cpi->oxcf.optimal_buffer_level      =   5000;

-        cpi->oxcf.maximum_buffer_size       =   6000;

-        cpi->oxcf.under_shoot_pct           =  90;

-        cpi->oxcf.allow_df                 =   0;

-        cpi->oxcf.drop_frames_water_mark     =  20;

-        cpi->oxcf.allow_spatial_resampling  = 0;

-        cpi->oxcf.resample_down_water_mark   = 40;

-        cpi->oxcf.resample_up_water_mark     = 60;

-        cpi->oxcf.fixed_q = cpi->interquantizer;

-        cpi->filter_type = NORMAL_LOOPFILTER;

-        if (cm->simpler_lpf)

-            cpi->filter_type = SIMPLE_LOOPFILTER;

-        cpi->compressor_speed = 1;

-        cpi->horiz_scale = 0;

-        cpi->vert_scale = 0;

-        cpi->oxcf.two_pass_vbrbias = 50;

-        cpi->oxcf.two_pass_vbrmax_section = 400;

-        cpi->oxcf.two_pass_vbrmin_section = 0;

-        cpi->oxcf.Sharpness = 0;

-        cpi->oxcf.noise_sensitivity = 0;

-    }

-    else

-        cpi->oxcf = *oxcf;

-    // Convert target bandwidth from Kbit/s to Bit/s

-    cpi->oxcf.target_bandwidth       *= 1000;

+    // Initialise the starting buffer levels

     cpi->oxcf.starting_buffer_level =

         rescale(cpi->oxcf.starting_buffer_level,

                 cpi->oxcf.target_bandwidth, 1000);

@@ -1586,10 +1552,6 @@

     cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;

     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;

-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;

-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;

-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;

     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;

     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;

     cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;

@@ -1598,11 +1560,7 @@

     cpi->total_actual_bits            = 0;

     cpi->total_target_vs_actual       = 0;

-    // change includes all joint functionality

-   vp8_change_config(ptr, oxcf);

 #if VP8_TEMPORAL_ALT_REF

         int i;

@@ -1726,7 +1684,8 @@

-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;

+    cpi->baseline_gf_interval =

+        cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;

     cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;

@@ -1737,7 +1696,8 @@

     cm->refresh_entropy_probs = 1;

     if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)

-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;

+        cm->multi_token_partition =

+            (TOKEN_PARTITION) cpi->oxcf.token_partitions;

     setup_features(cpi);

@@ -1758,12 +1718,12 @@

         cpi->oxcf.starting_buffer_level   = 60000;

         cpi->oxcf.optimal_buffer_level    = 60000;

         cpi->oxcf.maximum_buffer_size     = 240000;

     // Convert target bandwidth from Kbit/s to Bit/s

     cpi->oxcf.target_bandwidth       *= 1000;

+    // Set or reset optimal and maximum buffer levels.

     if (cpi->oxcf.optimal_buffer_level == 0)

         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;

     else

@@ -1778,7 +1738,10 @@

             rescale(cpi->oxcf.maximum_buffer_size,

                     cpi->oxcf.target_bandwidth, 1000);

+    // Set up the frame rate and the related rate control parameters.

     vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);

+    // Set absolute upper and lower quality limits

     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;

     cpi->best_quality                = cpi->oxcf.best_allowed_q;

@@ -1807,9 +1770,9 @@

     cpi->cq_target_quality = cpi->oxcf.cq_level;

     // Only allow dropped frames in buffered mode

-    cpi->drop_frames_allowed         = cpi->oxcf.allow_df && cpi->buffered_mode;

+    cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;

-    cm->filter_type                  = (LOOPFILTERTYPE) cpi->filter_type;

+    cm->filter_type          = (LOOPFILTERTYPE) cpi->filter_type;

     if (!cm->use_bilinear_mc_filter)

         cm->mcomp_filter_type = SIXTAP;

@@ -1824,7 +1787,8 @@

     cm->horiz_scale  = cpi->horiz_scale;

     cm->vert_scale   = cpi->vert_scale ;

-    cpi->intra_frame_target           = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8

+    // As per VP8

+    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;

     // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)

     if (cpi->oxcf.Sharpness > 7)

@@ -1845,8 +1809,10 @@

         cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;

-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||

-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||

+    if (((cm->Width + 15) & 0xfffffff0) !=

+          cm->yv12_fb[cm->lst_fb_idx].y_width ||

+        ((cm->Height + 15) & 0xfffffff0) !=

+          cm->yv12_fb[cm->lst_fb_idx].y_height ||

         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)

         alloc_raw_frame_buffers(cpi);

@@ -3340,6 +3306,89 @@

     return force_recode;

+/* Pick and apply the loop filter for the frame that has just been encoded,

+ * extend the borders of cm->frame_to_show, and then perform any signaled

+ * copies between the golden / alt-ref reference buffers.

+ *

+ * cpi - encoder instance (source frame, speed features, timing counters).

+ * cm  - common codec state; the function is called with &cpi->common.

+ *

+ * When running on the loop filter thread, h_event_end_lpf is posted as soon

+ * as the filter level has been chosen, before the filtering work itself.

+ */

+void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)

+{

+    if (cm->no_lpf)

+    {

+        cm->filter_level = 0;

+    }

+    else

+    {

+        struct vpx_usec_timer timer;

+        vp8_clear_system_state();

+        /* Time the filter level search; it is accumulated in time_pick_lpf. */

+        vpx_usec_timer_start(&timer);

+        if (cpi->sf.auto_filter == 0)

+            vp8cx_pick_filter_level_fast(cpi->Source, cpi);

+        else

+            vp8cx_pick_filter_level(cpi->Source, cpi);

+        vpx_usec_timer_mark(&timer);

+        cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);

+    }

+#if CONFIG_MULTITHREAD

+    /* Signal that we have set filter_level. Only post when the encoder is

+     * actually running multi-threaded: otherwise nobody waits on the

+     * semaphore, and it may never have been initialised.

+     */

+    if (cpi->b_multi_threaded)

+        sem_post(&cpi->h_event_end_lpf);

+#endif

+    if (cm->filter_level > 0)

+    {

+        vp8cx_set_alt_lf_level(cpi, cm->filter_level);

+        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);

+        cm->last_filter_type = cm->filter_type;

+        cm->last_sharpness_level = cm->sharpness_level;

+    }

+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);

+    {

+        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];

+        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];

+        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];

+        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];

+        // At this point the new frame has been encoded.

+        // If any buffer copy / swapping is signaled it should be done here.

+        if (cm->frame_type == KEY_FRAME)

+        {

+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);

+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);

+        }

+        else    // For non key frames

+        {

+            // Code to copy between reference buffers

+            if (cm->copy_buffer_to_arf)

+            {

+                if (cm->copy_buffer_to_arf == 1)

+                {

+                    // We copy new_frame when refresh_last_frame is set

+                    // because last and new buffers will already have been

+                    // swapped in that case.

+                    if (cm->refresh_last_frame)

+                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);

+                    else

+                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);

+                }

+                else if (cm->copy_buffer_to_arf == 2)

+                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);

+            }

+            if (cm->copy_buffer_to_gf)

+            {

+                if (cm->copy_buffer_to_gf == 1)

+                {

+                    // We copy new_frame when refresh_last_frame is set

+                    // because last and new buffers will already have been

+                    // swapped in that case.

+                    if (cm->refresh_last_frame)

+                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);

+                    else

+                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);

+                }

+                else if (cm->copy_buffer_to_gf == 2)

+                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);

+            }

+        }

+    }

+}

 static void encode_frame_to_data_rate

     VP8_COMP *cpi,

@@ -3698,11 +3747,12 @@

-        // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames

-        // to prevent bits just going to waste.

+        // If CBR and the buffer is as full then it is reasonable to allow

+        // higher quality on the frames to prevent bits just going to waste.

         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)

-            // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause

+            // Note that the use of >= here eliminates the risk of a divide

+            // by 0 error in the else if clause

             if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)

                 cpi->active_best_quality = cpi->best_quality;

@@ -3715,6 +3765,20 @@

+    // Make sure constrained quality mode limits are adhered to for the first

+    // few frames of one pass encodes

+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)

+    {

+        if ( (cm->frame_type == KEY_FRAME) ||

+             cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )

+        {

+             cpi->active_best_quality = cpi->best_quality;

+        }

+        else if (cpi->active_best_quality < cpi->cq_target_quality)

+        {

+            cpi->active_best_quality = cpi->cq_target_quality;

+        }

+    }

     // Clip the active best and worst quality values to limits

     if (cpi->active_worst_quality > cpi->worst_quality)

@@ -3895,6 +3959,7 @@

         // transform / motion compensation build reconstruction frame

         vp8_encode_frame(cpi);

         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);

         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;

@@ -4254,93 +4319,44 @@

     else

         cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];

-    if (cm->no_lpf)

+#if CONFIG_MULTITHREAD

+    if (cpi->b_multi_threaded)

-        cm->filter_level = 0;

+        sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */

     else

+#endif

-        struct vpx_usec_timer timer;

-        vpx_usec_timer_start(&timer);

-        if (cpi->sf.auto_filter == 0)

-            vp8cx_pick_filter_level_fast(cpi->Source, cpi);

-        else

-            vp8cx_pick_filter_level(cpi->Source, cpi);

-        vpx_usec_timer_mark(&timer);

-        cpi->time_pick_lpf +=  vpx_usec_timer_elapsed(&timer);

+        loopfilter_frame(cpi, cm);

-    if (cm->filter_level > 0)

-    {

-        vp8cx_set_alt_lf_level(cpi, cm->filter_level);

-        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);

-        cm->last_filter_type = cm->filter_type;

-        cm->last_sharpness_level = cm->sharpness_level;

-    }

-    /* Move storing frame_type out of the above loop since it is also

-     * needed in motion search besides loopfilter */

-    cm->last_frame_type = cm->frame_type;

-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);

     if (cpi->oxcf.error_resilient_mode == 1)

         cm->refresh_entropy_probs = 0;

+#if CONFIG_MULTITHREAD

+    /* wait until filter_level is picked so that we can continue with stream packing */

+    if (cpi->b_multi_threaded)

+        sem_wait(&cpi->h_event_end_lpf);

+#endif

     // build the bitstream

     vp8_pack_bitstream(cpi, dest, size);

+#if CONFIG_MULTITHREAD

+    /* wait for the loopfilter thread to finish */

+    if (cpi->b_multi_threaded)

-        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];

-        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];

-        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];

-        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];

-        // At this point the new frame has been encoded coded.

-        // If any buffer copy / swaping is signalled it should be done here.

-        if (cm->frame_type == KEY_FRAME)

-        {

-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);

-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);

-        }

-        else    // For non key frames

-        {

-            // Code to copy between reference buffers

-            if (cm->copy_buffer_to_arf)

-            {

-                if (cm->copy_buffer_to_arf == 1)

-                {

-                    if (cm->refresh_last_frame)

-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.

-                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);

-                    else

-                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);

-                }

-                else if (cm->copy_buffer_to_arf == 2)

-                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);

-            }

-            if (cm->copy_buffer_to_gf)

-            {

-                if (cm->copy_buffer_to_gf == 1)

-                {

-                    if (cm->refresh_last_frame)

-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.

-                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);

-                    else

-                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);

-                }

-                else if (cm->copy_buffer_to_gf == 2)

-                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);

-            }

-        }

+        sem_wait(&cpi->h_event_end_lpf);

+#endif

+    /* Move storing frame_type out of the above loop since it is also

+     * needed in motion search besides loopfilter */

+      cm->last_frame_type = cm->frame_type;

     // Update rate control heuristics

     cpi->total_byte_count += (*size);

     cpi->projected_frame_size = (*size) << 3;

@@ -5179,7 +5195,9 @@

     cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);

     if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)

+    {

         generate_psnr_packet(cpi);

+    }

 #if CONFIG_PSNR

@@ -5195,12 +5213,35 @@

             if (cpi->b_calculate_psnr)

                 double y, u, v;

-                double sq_error;

-                double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);

+                double ye,ue,ve;

+                double frame_psnr;

+                YV12_BUFFER_CONFIG      *orig = cpi->Source;

+                YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;

+                YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;

+                int y_samples = orig->y_height * orig->y_width ;

+                int uv_samples = orig->uv_height * orig->uv_width ;

+                int t_samples = y_samples + 2 * uv_samples;

+                long long sq_error;

-                cpi->total_y += y;

-                cpi->total_u += u;

-                cpi->total_v += v;

+                ye = calc_plane_error(orig->y_buffer, orig->y_stride,

+                  recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,

+                  IF_RTCD(&cpi->rtcd.variance));

+                ue = calc_plane_error(orig->u_buffer, orig->uv_stride,

+                  recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,

+                  IF_RTCD(&cpi->rtcd.variance));

+                ve = calc_plane_error(orig->v_buffer, orig->uv_stride,

+                  recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,

+                  IF_RTCD(&cpi->rtcd.variance));

+                sq_error = ye + ue + ve;

+                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);

+                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);

+                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);

+                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);

                 cpi->total_sq_error += sq_error;

                 cpi->total  += frame_psnr;

@@ -5209,17 +5250,35 @@

                     vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));

                     vp8_clear_system_state();

-                    frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);

-                    frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);

-                    cpi->summed_quality += frame_ssim2 * weight;

-                    cpi->summed_weights += weight;

+                    ye = calc_plane_error(orig->y_buffer, orig->y_stride,

+                      pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height,

+                      IF_RTCD(&cpi->rtcd.variance));

-                    cpi->totalp_y += y2;

-                    cpi->totalp_u += u2;

-                    cpi->totalp_v += v2;

-                    cpi->totalp  += frame_psnr2;

+                    ue = calc_plane_error(orig->u_buffer, orig->uv_stride,

+                      pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,

+                      IF_RTCD(&cpi->rtcd.variance));

+                    ve = calc_plane_error(orig->v_buffer, orig->uv_stride,

+                      pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,

+                      IF_RTCD(&cpi->rtcd.variance));

+                    sq_error = ye + ue + ve;

+                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);

+                    cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);

+                    cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);

+                    cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);

                     cpi->total_sq_error2 += sq_error;

+                    cpi->totalp  += frame_psnr2;

+                    frame_ssim2 = vp8_calc_ssim(cpi->Source,

+                      &cm->post_proc_buffer, 1, &weight,

+                      IF_RTCD(&cpi->rtcd.variance));

+                    cpi->summed_quality += frame_ssim2 * weight;

+                    cpi->summed_weights += weight;

 #if 0

                         FILE *f = fopen("q_used.stt", "a");

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -603,12 +603,17 @@

     int encoding_thread_count;

     pthread_t *h_encoding_thread;

+    pthread_t h_filter_thread;

     MB_ROW_COMP *mb_row_ei;

     ENCODETHREAD_DATA *en_thread_data;

+    LPFTHREAD_DATA lpf_thread_data;

     //events

     sem_t *h_event_start_encoding;

     sem_t h_event_end_encoding;

+    sem_t h_event_start_lpf;

+    sem_t h_event_end_lpf;

 #endif

     TOKENLIST *tplist;

--- a/vp8/encoder/pickinter.c

+++ b/vp8/encoder/pickinter.c

@@ -664,7 +664,8 @@

         case V_PRED:

         case H_PRED:

         case TM_PRED:

-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+                (&x->e_mbd);

             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);

             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];

             this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);

--- a/vp8/encoder/ratectrl.c

+++ b/vp8/encoder/ratectrl.c

@@ -887,7 +887,8 @@

             int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;

-            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))

+            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||

+                (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))

                 int percent_low = 0;

@@ -896,9 +897,12 @@

                 // If we are are below the optimal buffer fullness level and adherence

                 // to buffering contraints is important to the end useage then adjust

                 // the per frame target.

-                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))

+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&

+                    (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))

-                    percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;

+                    percent_low =

+                        (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /

+                        one_percent_bits;

                     if (percent_low > 100)

                         percent_low = 100;

@@ -909,7 +913,8 @@

                 else if (cpi->bits_off_target < 0)

                     // Adjust per frame data target downwards to compensate.

-                    percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));

+                    percent_low = (int)(100 * -cpi->bits_off_target /

+                                       (cpi->total_byte_count * 8));

                     if (percent_low > 100)

                         percent_low = 100;

@@ -918,39 +923,60 @@

                 // lower the target bandwidth for this frame.

-                cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;

+                cpi->this_frame_target =

+                    (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;

-                // Are we using allowing control of active_worst_allowed_q according to buffer level.

+                // Are we allowing control of active_worst_allowed_q

+                // according to buffer level.

                 if (cpi->auto_worst_q)

                     int critical_buffer_level;

-                    // For streaming applications the most important factor is cpi->buffer_level as this takes

-                    // into account the specified short term buffering constraints. However, hitting the long

-                    // term clip data rate target is also important.

+                    // For streaming applications the most important factor is

+                    // cpi->buffer_level as this takes into account the

+                    // specified short term buffering constraints. However,

+                    // hitting the long term clip data rate target is also

+                    // important.

                     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)

-                        // Take the smaller of cpi->buffer_level and cpi->bits_off_target

-                        critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;

+                        // Take the smaller of cpi->buffer_level and

+                        // cpi->bits_off_target

+                        critical_buffer_level =

+                            (cpi->buffer_level < cpi->bits_off_target)

+                            ? cpi->buffer_level : cpi->bits_off_target;

-                    // For local file playback short term buffering contraints are less of an issue

+                    // For local file playback short term buffering constraints

+                    // are less of an issue

                     else

-                        // Consider only how we are doing for the clip as a whole

+                        // Consider only how we are doing for the clip as a

+                        // whole

                         critical_buffer_level = cpi->bits_off_target;

-                    // Set the active worst quality based upon the selected buffer fullness number.

+                    // Set the active worst quality based upon the selected

+                    // buffer fullness number.

                     if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)

-                        if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))

+                        if ( critical_buffer_level >

+                             (cpi->oxcf.optimal_buffer_level >> 2) )

-                            int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;

-                            int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));

+                            INT64 qadjustment_range =

+                                      cpi->worst_quality - cpi->ni_av_qi;

+                            INT64 above_base =

+                                      (critical_buffer_level -

+                                       (cpi->oxcf.optimal_buffer_level >> 2));

-                            // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level)

-                            // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4)

-                            cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));

+                            // Step active worst quality down from

+                            // cpi->ni_av_qi when (critical_buffer_level ==

+                            // cpi->optimal_buffer_level) to

+                            // cpi->worst_quality when

+                            // (critical_buffer_level ==

+                            //     cpi->optimal_buffer_level >> 2)

+                            cpi->active_worst_quality =

+                                cpi->worst_quality -

+                                ((qadjustment_range * above_base) /

+                                 (cpi->oxcf.optimal_buffer_level*3>>2));

                         else

@@ -1009,6 +1035,15 @@

             // Set the active worst quality

             cpi->active_worst_quality = cpi->worst_quality;

+        }

+        // Special trap for constrained quality mode

+        // "active_worst_quality" may never drop below cq level

+        // for any frame type.

+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&

+             cpi->active_worst_quality < cpi->cq_target_quality)

+        {

+            cpi->active_worst_quality = cpi->cq_target_quality;

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -806,7 +806,8 @@

         x->e_mbd.mode_info_context->mbmi.mode = mode;

-        vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

+        RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+            (&x->e_mbd);

         macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb));

         rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]

@@ -2103,7 +2104,8 @@

         case H_PRED:

         case TM_PRED:

             x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+                (&x->e_mbd);

             macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;

             rate2 += rate_y;

             distortion2 += distortion;

--- a/vp8/encoder/ssim.c

+++ b/vp8/encoder/ssim.c

@@ -11,298 +11,13 @@

 #include "vpx_scale/yv12config.h"

 #include "math.h"

+#include "onyx_int.h"

-#define C1 (float)(64 * 64 * 0.01*255*0.01*255)

-#define C2 (float)(64 * 64 * 0.03*255*0.03*255)

-static int width_y;

-static int height_y;

-static int height_uv;

-static int width_uv;

-static int stride_uv;

-static int stride;

-static int lumimask;

-static int luminance;

-static double plane_summed_weights = 0;

-static short img12_sum_block[8*4096*4096*2] ;

-static short img1_sum[8*4096*2];

-static short img2_sum[8*4096*2];

-static int   img1_sq_sum[8*4096*2];

-static int   img2_sq_sum[8*4096*2];

-static int   img12_mul_sum[8*4096*2];

-double vp8_similarity

-(

-    int mu_x,

-    int mu_y,

-    int pre_mu_x2,

-    int pre_mu_y2,

-    int pre_mu_xy2

-)

-{

-    int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;

-    mu_x2 = mu_x * mu_x;

-    mu_y2 = mu_y * mu_y;

-    mu_xy = mu_x * mu_y;

-    theta_x2 = 64 * pre_mu_x2 - mu_x2;

-    theta_y2 = 64 * pre_mu_y2 - mu_y2;

-    theta_xy = 64 * pre_mu_xy2 - mu_xy;

-    return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));

-}

-double vp8_ssim

-(

-    const unsigned char *img1,

-    const unsigned char *img2,

-    int stride_img1,

-    int stride_img2,

-    int width,

-    int height

-)

-{

-    int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;

-    double plane_quality, weight, mean;

-    short *img1_sum_ptr1, *img1_sum_ptr2;

-    short *img2_sum_ptr1, *img2_sum_ptr2;

-    int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;

-    int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;

-    int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;

-    plane_quality = 0;

-    if (lumimask)

-        plane_summed_weights = 0.0f;

-    else

-        plane_summed_weights = (height - 7) * (width - 7);

-    //some prologue for the main loop

-    temp = 8 * width;

-    img1_sum_ptr1      = img1_sum + temp;

-    img2_sum_ptr1      = img2_sum + temp;

-    img1_sq_sum_ptr1   = img1_sq_sum + temp;

-    img2_sq_sum_ptr1   = img2_sq_sum + temp;

-    img12_mul_sum_ptr1 = img12_mul_sum + temp;

-    for (x = 0; x < width; x++)

-    {

-        img1_sum[x]      = img1[x];

-        img2_sum[x]      = img2[x];

-        img1_sq_sum[x]   = img1[x] * img1[x];

-        img2_sq_sum[x]   = img2[x] * img2[x];

-        img12_mul_sum[x] = img1[x] * img2[x];

-        img1_sum_ptr1[x]      = 0;

-        img2_sum_ptr1[x]      = 0;

-        img1_sq_sum_ptr1[x]   = 0;

-        img2_sq_sum_ptr1[x]   = 0;

-        img12_mul_sum_ptr1[x] = 0;

-    }

-    //the main loop

-    for (y = 1; y < height; y++)

-    {

-        img1 += stride_img1;

-        img2 += stride_img2;

-        temp = (y - 1) % 9 * width;

-        img1_sum_ptr1      = img1_sum + temp;

-        img2_sum_ptr1      = img2_sum + temp;

-        img1_sq_sum_ptr1   = img1_sq_sum + temp;

-        img2_sq_sum_ptr1   = img2_sq_sum + temp;

-        img12_mul_sum_ptr1 = img12_mul_sum + temp;

-        temp = y % 9 * width;

-        img1_sum_ptr2      = img1_sum + temp;

-        img2_sum_ptr2      = img2_sum + temp;

-        img1_sq_sum_ptr2   = img1_sq_sum + temp;

-        img2_sq_sum_ptr2   = img2_sq_sum + temp;

-        img12_mul_sum_ptr2 = img12_mul_sum + temp;

-        for (x = 0; x < width; x++)

-        {

-            img1_sum_ptr2[x]      = img1_sum_ptr1[x] + img1[x];

-            img2_sum_ptr2[x]      = img2_sum_ptr1[x] + img2[x];

-            img1_sq_sum_ptr2[x]   = img1_sq_sum_ptr1[x] + img1[x] * img1[x];

-            img2_sq_sum_ptr2[x]   = img2_sq_sum_ptr1[x] + img2[x] * img2[x];

-            img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];

-        }

-        if (y > 6)

-        {

-            //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum

-            temp = (y + 1) % 9 * width;

-            img1_sum_ptr1      = img1_sum + temp;

-            img2_sum_ptr1      = img2_sum + temp;

-            img1_sq_sum_ptr1   = img1_sq_sum + temp;

-            img2_sq_sum_ptr1   = img2_sq_sum + temp;

-            img12_mul_sum_ptr1 = img12_mul_sum + temp;

-            for (x = 0; x < width; x++)

-            {

-                img1_sum_ptr1[x]      = img1_sum_ptr2[x] - img1_sum_ptr1[x];

-                img2_sum_ptr1[x]      = img2_sum_ptr2[x] - img2_sum_ptr1[x];

-                img1_sq_sum_ptr1[x]   = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];

-                img2_sq_sum_ptr1[x]   = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];

-                img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];

-            }

-            //here we calculate the sum over the 8x8 block of pixels

-            //this is done by sliding a window across the column sums for the last 8 lines

-            //each time adding the new column sum, and subtracting the one which fell out of the window

-            img1_block      = 0;

-            img2_block      = 0;

-            img1_sq_block   = 0;

-            img2_sq_block   = 0;

-            img12_mul_block = 0;

-            //prologue, and calculation of simularity measure from the first 8 column sums

-            for (x = 0; x < 8; x++)

-            {

-                img1_block      += img1_sum_ptr1[x];

-                img2_block      += img2_sum_ptr1[x];

-                img1_sq_block   += img1_sq_sum_ptr1[x];

-                img2_sq_block   += img2_sq_sum_ptr1[x];

-                img12_mul_block += img12_mul_sum_ptr1[x];

-            }

-            if (lumimask)

-            {

-                y2 = y - 7;

-                x2 = 0;

-                if (luminance)

-                {

-                    mean = (img2_block + img1_block) / 128.0f;

-                    if (!(y2 % 2 || x2 % 2))

-                        *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;

-                }

-                else

-                {

-                    mean = *(img12_sum_block + y2 * width_uv + x2);

-                    mean += *(img12_sum_block + y2 * width_uv + x2 + 4);

-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);

-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);

-                    mean /= 512.0f;

-                }

-                weight = mean < 40 ? 0.0f :

-                         (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);

-                plane_summed_weights += weight;

-                plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);

-            }

-            else

-                plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);

-            //and for the rest

-            for (x = 8; x < width; x++)

-            {

-                img1_block      = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];

-                img2_block      = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];

-                img1_sq_block   = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];

-                img2_sq_block   = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];

-                img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];

-                if (lumimask)

-                {

-                    y2 = y - 7;

-                    x2 = x - 7;

-                    if (luminance)

-                    {

-                        mean = (img2_block + img1_block) / 128.0f;

-                        if (!(y2 % 2 || x2 % 2))

-                            *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;

-                    }

-                    else

-                    {

-                        mean = *(img12_sum_block + y2 * width_uv + x2);

-                        mean += *(img12_sum_block + y2 * width_uv + x2 + 4);

-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);

-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);

-                        mean /= 512.0f;

-                    }

-                    weight = mean < 40 ? 0.0f :

-                             (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);

-                    plane_summed_weights += weight;

-                    plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);

-                }

-                else

-                    plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);

-            }

-        }

-    }

-    if (plane_summed_weights == 0)

-        return 1.0f;

-    else

-        return plane_quality / plane_summed_weights;

-}

-double vp8_calc_ssim

-(

-    YV12_BUFFER_CONFIG *source,

-    YV12_BUFFER_CONFIG *dest,

-    int lumamask,

-    double *weight

-)

-{

-    double a, b, c;

-    double frame_weight;

-    double ssimv;

-    width_y = source->y_width;

-    height_y = source->y_height;

-    height_uv = source->uv_height;

-    width_uv = source->uv_width;

-    stride_uv = dest->uv_stride;

-    stride = dest->y_stride;

-    lumimask = lumamask;

-    luminance = 1;

-    a = vp8_ssim(source->y_buffer, dest->y_buffer,

-                 source->y_stride, dest->y_stride, source->y_width, source->y_height);

-    luminance = 0;

-    frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));

-    if (frame_weight == 0)

-        a = b = c = 1.0f;

-    else

-    {

-        b = vp8_ssim(source->u_buffer, dest->u_buffer,

-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);

-        c = vp8_ssim(source->v_buffer, dest->v_buffer,

-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);

-    }

-    ssimv = a * .8 + .1 * (b + c);

-    *weight = frame_weight;

-    return ssimv;

-}

+#if CONFIG_RUNTIME_CPU_DETECT

+#define IF_RTCD(x)  (x)

+#else

+#define IF_RTCD(x)  NULL

+#endif

 // Google version of SSIM

 // SSIM

 #define KERNEL 3

@@ -519,4 +234,175 @@

     *ssim_u /= uvsize;

     *ssim_v /= uvsize;

     return ssim_all;

+}

+void ssim_parms_c

+(

+    unsigned char *s,

+    int sp,

+    unsigned char *r,

+    int rp,

+    unsigned long *sum_s,

+    unsigned long *sum_r,

+    unsigned long *sum_sq_s,

+    unsigned long *sum_sq_r,

+    unsigned long *sum_sxr

+)

+{

+    int i,j;

+    for(i=0;i<16;i++,s+=sp,r+=rp)

+     {

+         for(j=0;j<16;j++)

+         {

+             *sum_s += s[j];

+             *sum_r += r[j];

+             *sum_sq_s += s[j] * s[j];

+             *sum_sq_r += r[j] * r[j];

+             *sum_sxr += s[j] * r[j];

+         }

+     }

+}

+void ssim_parms_8x8_c

+(

+    unsigned char *s,

+    int sp,

+    unsigned char *r,

+    int rp,

+    unsigned long *sum_s,

+    unsigned long *sum_r,

+    unsigned long *sum_sq_s,

+    unsigned long *sum_sq_r,

+    unsigned long *sum_sxr

+)

+{

+    int i,j;

+    for(i=0;i<8;i++,s+=sp,r+=rp)

+     {

+         for(j=0;j<8;j++)

+         {

+             *sum_s += s[j];

+             *sum_r += r[j];

+             *sum_sq_s += s[j] * s[j];

+             *sum_sq_r += r[j] * r[j];

+             *sum_sxr += s[j] * r[j];

+         }

+     }

+}

+const static long long c1 =  426148; // 256^2*(.01*255)^2

+const static long long c2 = 3835331; // 256^2*(.03*255)^2

+static double similarity

+(

+    unsigned long sum_s,

+    unsigned long sum_r,

+    unsigned long sum_sq_s,

+    unsigned long sum_sq_r,

+    unsigned long sum_sxr,

+    int count

+)

+{

+    long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2);

+    long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*

+            (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ;

+    return ssim_n * 1.0 / ssim_d;

+}

+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,

+            const vp8_variance_rtcd_vtable_t *rtcd)

+{

+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;

+    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);

+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);

+}

+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,

+                const vp8_variance_rtcd_vtable_t *rtcd)

+{

+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;

+    rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);

+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);

+}

+// TODO: (jbb) tried to scale this function such that we may be able to use it

+// for distortion metric in mode selection code ( provided we do a reconstruction)

+long dssim(unsigned char *s,int sp, unsigned char *r,int rp,

+           const vp8_variance_rtcd_vtable_t *rtcd)

+{

+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;

+    double ssim3;

+    long long ssim_n;

+    long long ssim_d;

+    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);

+    ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2);

+    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*

+            (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ;

+    ssim3 = 256 * (ssim_d-ssim_n) / ssim_d;

+    return (long)( 256*ssim3 * ssim3 );

+}

+// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels

+// such that the window regions overlap block boundaries to penalize blocking

+// artifacts.

+double vp8_ssim2

+(

+    unsigned char *img1,

+    unsigned char *img2,

+    int stride_img1,

+    int stride_img2,

+    int width,

+    int height,

+    const vp8_variance_rtcd_vtable_t *rtcd

+)

+{

+    int i,j;

+    double ssim_total=0;

+    // we can sample points as frequently as we like; start with 1 per 8x8

+    for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)

+    {

+        for(j=0; j < width; j+=8 )

+        {

+            ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);

+        }

+    }

+    ssim_total /= (width/8 * height /8);

+    return ssim_total;

+}

+double vp8_calc_ssim

+(

+    YV12_BUFFER_CONFIG *source,

+    YV12_BUFFER_CONFIG *dest,

+    int lumamask,

+    double *weight,

+    const vp8_variance_rtcd_vtable_t *rtcd

+)

+{

+    double a, b, c;

+    double ssimv;

+    a = vp8_ssim2(source->y_buffer, dest->y_buffer,

+                 source->y_stride, dest->y_stride, source->y_width,

+                 source->y_height, rtcd);

+    b = vp8_ssim2(source->u_buffer, dest->u_buffer,

+                 source->uv_stride, dest->uv_stride, source->uv_width,

+                 source->uv_height, rtcd);

+    c = vp8_ssim2(source->v_buffer, dest->v_buffer,

+                 source->uv_stride, dest->uv_stride, source->uv_width,

+                 source->uv_height, rtcd);

+    ssimv = a * .8 + .1 * (b + c);

+    *weight = 1;

+    return ssimv;

--- a/vp8/encoder/variance.h

+++ b/vp8/encoder/variance.h

@@ -85,6 +85,19 @@

       unsigned int *sse \

);

+/* Declares (sym) as a void function taking two pixel pointers with their row

+ * pitches (s/sp and r/rp) and five unsigned long pointers through which the

+ * sums sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr are returned.

+ * The expansion already ends in ';', so uses of the macro add none. */

+#define prototype_ssimpf(sym) \

+    void (sym) \

+      ( \

+        unsigned char *s, \

+        int sp, \

+        unsigned char *r, \

+        int rp, \

+        unsigned long *sum_s, \

+        unsigned long *sum_r, \

+        unsigned long *sum_sq_s, \

+        unsigned long *sum_sq_r, \

+        unsigned long *sum_sxr \

+      );

 #define prototype_getmbss(sym) unsigned int (sym)(const short *)

@@ -306,7 +319,16 @@

 #endif

 extern prototype_sad(vp8_variance_get4x4sse_cs);

+/* Default binding for vp8_ssimpf when nothing has defined it earlier.

+ * No ';' after the extern line: prototype_ssimpf() supplies it. */

+#ifndef vp8_ssimpf

+#define vp8_ssimpf ssim_parms_c

+#endif

+extern prototype_ssimpf(vp8_ssimpf)

+/* Default binding for vp8_ssimpf_8x8, same pattern as above. */

+#ifndef vp8_ssimpf_8x8

+#define vp8_ssimpf_8x8 ssim_parms_8x8_c

+#endif

+extern prototype_ssimpf(vp8_ssimpf_8x8)

 typedef prototype_sad(*vp8_sad_fn_t);

 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);

 typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);

@@ -315,6 +337,10 @@

 typedef prototype_variance2(*vp8_variance2_fn_t);

 typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);

 typedef prototype_getmbss(*vp8_getmbss_fn_t);

+typedef prototype_ssimpf(*vp8_ssimpf_fn_t)

 typedef struct

     vp8_sad_fn_t             sad4x4;

@@ -365,6 +391,11 @@

     vp8_sad_multi_d_fn_t     sad8x8x4d;

     vp8_sad_multi_d_fn_t     sad4x4x4d;

+#if CONFIG_PSNR

+    vp8_ssimpf_fn_t          ssimpf_8x8;

+    vp8_ssimpf_fn_t          ssimpf;

+#endif

 } vp8_variance_rtcd_vtable_t;

 typedef struct

@@ -378,6 +409,7 @@

     vp8_sad_multi_fn_t      sdx3f;

     vp8_sad_multi1_fn_t     sdx8f;

     vp8_sad_multi_d_fn_t    sdx4df;

 } vp8_variance_fn_ptr_t;

 #if CONFIG_RUNTIME_CPU_DETECT

--- a/vp8/encoder/x86/sad_sse4.asm

+++ b/vp8/encoder/x86/sad_sse4.asm

@@ -186,7 +186,7 @@

         PROCESS_16X2X8 0

         mov             rdi,        arg(4)           ;Results

-        movdqu          XMMWORD PTR [rdi],    xmm1

+        movdqa          XMMWORD PTR [rdi],    xmm1

     ; begin epilog

     pop         rdi

@@ -224,7 +224,7 @@

         PROCESS_16X2X8 0

         mov             rdi,        arg(4)           ;Results

-        movdqu          XMMWORD PTR [rdi],    xmm1

+        movdqa          XMMWORD PTR [rdi],    xmm1

     ; begin epilog

     pop         rdi

@@ -262,7 +262,7 @@

         PROCESS_8X2X8 0

         mov             rdi,        arg(4)           ;Results

-        movdqu          XMMWORD PTR [rdi],    xmm1

+        movdqa          XMMWORD PTR [rdi],    xmm1

     ; begin epilog

     pop         rdi

@@ -303,7 +303,7 @@

         PROCESS_8X2X8 0

         PROCESS_8X2X8 0

         mov             rdi,        arg(4)           ;Results

-        movdqu          XMMWORD PTR [rdi],    xmm1

+        movdqa          XMMWORD PTR [rdi],    xmm1

     ; begin epilog

     pop         rdi

@@ -339,7 +339,7 @@

         PROCESS_4X2X8 0

         mov             rdi,        arg(4)           ;Results

-        movdqu          XMMWORD PTR [rdi],    xmm1

+        movdqa          XMMWORD PTR [rdi],    xmm1

     ; begin epilog

     pop         rdi

--- /dev/null

+++ b/vp8/encoder/x86/ssim_opt.asm

@@ -1,0 +1,215 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr

+; In:  xmm3 = s pixels, xmm4 = r pixels, each as 8 words (the routines in

+;      this file widen bytes against the zeroed xmm0 before invoking this).

+; Out: accumulators xmm15 (sum_s), xmm14 (sum_r), xmm13 (sum_sq_s),

+;      xmm12 (sum_sq_r) and xmm11 (sum_sxr) updated in place.

+; Overwrites xmm1, xmm2 and xmm3.

+%macro TABULATE_SSIM 0

+        paddusw         xmm15, xmm3  ; sum_s (unsigned saturating word add)

+        paddusw         xmm14, xmm4  ; sum_r (unsigned saturating word add)

+        movdqa          xmm1, xmm3

+        pmaddwd         xmm1, xmm1   ; s*s, adjacent pairs summed into dwords

+        paddq           xmm13, xmm1 ; sum_sq_s

+        movdqa          xmm2, xmm4

+        pmaddwd         xmm2, xmm2   ; r*r, adjacent pairs summed into dwords

+        paddq           xmm12, xmm2 ; sum_sq_r

+        pmaddwd         xmm3, xmm4   ; s*r, adjacent pairs summed into dwords

+        paddq           xmm11, xmm3  ; sum_sxr

+%endmacro

+; Sum across the register %1 starting with d words: the four dwords are

+; interleaved with xmm0 and added as q words, then the two q word halves are

+; added, leaving the total in the low q word of %1.

+; Expects xmm0 to be zero (both routines in this file pxor it first), which

+; makes each interleave a zero-extension. Overwrites xmm2.

+%macro SUM_ACROSS_Q 1

+        movdqa          xmm2,%1

+        punpckldq       %1,xmm0      ; dwords 0,1 -> two q words

+        punpckhdq       xmm2,xmm0    ; dwords 2,3 -> two q words

+        paddq           %1,xmm2

+        movdqa          xmm2,%1

+        punpcklqdq      %1,xmm0      ; keep low q word, high becomes xmm0's

+        punpckhqdq      xmm2,xmm0    ; move high q word down

+        paddq           %1,xmm2

+%endmacro

+; Sum across the register %1 starting with w words: the eight words are

+; interleaved with xmm0 and added as four dwords, then SUM_ACROSS_Q finishes

+; the reduction into the low q word of %1.

+; Expects xmm0 to be zero. Overwrites xmm1 and (via SUM_ACROSS_Q) xmm2.

+%macro SUM_ACROSS_W 1

+        movdqa          xmm1, %1

+        punpcklwd       %1,xmm0      ; words 0-3 -> dwords

+        punpckhwd       xmm1,xmm0    ; words 4-7 -> dwords

+        paddd           %1, xmm1

+        SUM_ACROSS_Q    %1

+%endmacro

+;void vp8_ssim_parms_16x16_sse3(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; Accumulates, over 16 rows of 16 pixels read from s and r (advancing by sp

+; and rp per row), the sums of s, r, s*s, r*r and s*r, and stores each total

+; through the matching pointer with an 8-byte movq.

+;

+; NOTE(review): xmm6 and xmm11-xmm15 are used without being saved in this

+; routine; the Microsoft x64 convention treats xmm6-xmm15 as callee-saved -

+; confirm whether Win64 builds need a save/restore here.

+; NOTE(review): the 8-byte stores assume unsigned long is 64 bits wide -

+; confirm for targets where it is 32 bits.

+;

+; TODO: Use parm passing through structure, probably don't need the pxors

+; ( calling app will initialize to 0 ) could easily fit everything in sse2

+; without too much hassle, and can probably do better estimates with psadbw

+; or pavgb. At this point this is just meant to be first pass for calculating

+; all the parms needed for 16x16 ssim so we can play with dssim as distortion

+; in mode selection code.

+global sym(vp8_ssim_parms_16x16_sse3)

+sym(vp8_ssim_parms_16x16_sse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp

+    ; xmm0 stays zero: used for byte->word widening and by the SUM_ACROSS macros

+    pxor            xmm0, xmm0

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 16      ;row counter

+NextRow:

+    ;grab source and reference pixels

+    movdqu          xmm5, [rsi]

+    movdqu          xmm6, [rdi]

+    ; upper 8 pixels of the row, widened to words

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpckhbw       xmm3, xmm0 ; high_s

+    punpckhbw       xmm4, xmm0 ; high_r

+    TABULATE_SSIM

+    ; lower 8 pixels of the row, widened to words

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz NextRow

+    ; reduce each accumulator to a single total in its low q word

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    ; write the five totals through the output pointers (args 4-8)

+    mov             rdi,arg(4)

+    movq            [rdi], xmm15;

+    mov             rdi,arg(5)

+    movq            [rdi], xmm14;

+    mov             rdi,arg(6)

+    movq            [rdi], xmm13;

+    mov             rdi,arg(7)

+    movq            [rdi], xmm12;

+    mov             rdi,arg(8)

+    movq            [rdi], xmm11;

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp8_ssim_parms_8x8_sse3(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; Accumulates, over 8 rows of 8 pixels read from s and r (advancing by sp

+; and rp per row), the sums of s, r, s*s, r*r and s*r, and stores each total

+; through the matching pointer with an 8-byte movq.

+;

+; NOTE(review): xmm6 and xmm11-xmm15 are used without being saved in this

+; routine; the Microsoft x64 convention treats xmm6-xmm15 as callee-saved -

+; confirm whether Win64 builds need a save/restore here.

+; NOTE(review): the 8-byte stores assume unsigned long is 64 bits wide -

+; confirm for targets where it is 32 bits.

+;

+; TODO: Use parm passing through structure, probably don't need the pxors

+; ( calling app will initialize to 0 ) could easily fit everything in sse2

+; without too much hassle, and can probably do better estimates with psadbw

+; or pavgb. At this point this is just meant to be first pass for calculating

+; all the parms needed for 8x8 ssim so we can play with dssim as distortion

+; in mode selection code.

+global sym(vp8_ssim_parms_8x8_sse3)

+sym(vp8_ssim_parms_8x8_sse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp

+    ; xmm0 stays zero: used for byte->word widening and by the SUM_ACROSS macros

+    pxor            xmm0, xmm0

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 8      ;row counter

+NextRow2:

+    ;grab source and reference pixels

+    movq            xmm5, [rsi]

+    movq            xmm6, [rdi]

+    ; the 8 pixels of the row, widened to words

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz NextRow2

+    ; reduce each accumulator to a single total in its low q word

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    ; write the five totals through the output pointers (args 4-8)

+    mov             rdi,arg(4)

+    movq            [rdi], xmm15;

+    mov             rdi,arg(5)

+    movq            [rdi], xmm14;

+    mov             rdi,arg(6)

+    movq            [rdi], xmm13;

+    mov             rdi,arg(7)

+    movq            [rdi], xmm12;

+    mov             rdi,arg(8)

+    movq            [rdi], xmm11;

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -176,7 +176,26 @@

                     d->dqcoeff

);

+#if CONFIG_PSNR

+#if ARCH_X86_64

+/* Function type of the assembly SSIM parameter routines declared below.

+ * The parameter list repeats the one in prototype_ssimpf (variance.h). */

+typedef void ssimpf

+(

+    unsigned char *s,

+    int sp,

+    unsigned char *r,

+    int rp,

+    unsigned long *sum_s,

+    unsigned long *sum_r,

+    unsigned long *sum_sq_s,

+    unsigned long *sum_sq_r,

+    unsigned long *sum_sxr

+);

+/* Implemented in encoder/x86/ssim_opt.asm. */

+extern ssimpf vp8_ssim_parms_16x16_sse3;

+extern ssimpf vp8_ssim_parms_8x8_sse3;

 #endif

+#endif

+#endif

 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

@@ -280,6 +299,8 @@

         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;

         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;

         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;

         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;

         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;

@@ -339,8 +360,17 @@

         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;

+#if CONFIG_PSNR

+#if ARCH_X86_64

+        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse3;

+        cpi->rtcd.variance.ssimpf                = vp8_ssim_parms_16x16_sse3;

+#endif

+#endif

 #endif

 #if HAVE_SSE4_1

     if (SSE4_1Enabled)

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -116,6 +116,7 @@

 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm

 ifeq ($(CONFIG_REALTIME_ONLY),yes)

 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm

--- a/vp8/vp8cx_arm.mk

+++ b/vp8/vp8cx_arm.mk

@@ -34,6 +34,7 @@

 #File list for armv6

 # encoder

+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)

 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)

 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)

 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)

--

⑨