shithub: libvpx

--- a/vp8/common/arm/arm_systemdependent.c

+++ b/vp8/common/arm/arm_systemdependent.c

@@ -46,7 +46,6 @@

         rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;

         rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;

-        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_v6;

         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_v6;

         rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;

@@ -80,7 +79,6 @@

         rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;

         rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;

-        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;

         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;

         rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;

--- a/vp8/common/arm/armv6/iwalsh_v6.asm

+++ b/vp8/common/arm/armv6/iwalsh_v6.asm

@@ -9,7 +9,6 @@

     EXPORT |vp8_short_inv_walsh4x4_v6|

-    EXPORT |vp8_short_inv_walsh4x4_1_v6|

ARM

     REQUIRE8

@@ -17,19 +16,19 @@

     AREA    |.text|, CODE, READONLY  ; name this block of code

-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)

+;void vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)

 |vp8_short_inv_walsh4x4_v6| PROC

-    stmdb       sp!, {r4 - r11, lr}

+    stmdb       sp!, {r4 - r12, lr}

-    ldr         r2, [r0], #4         ; [1  |  0]

-    ldr         r3, [r0], #4         ; [3  |  2]

-    ldr         r4, [r0], #4         ; [5  |  4]

-    ldr         r5, [r0], #4         ; [7  |  6]

-    ldr         r6, [r0], #4         ; [9  |  8]

-    ldr         r7, [r0], #4         ; [11 | 10]

-    ldr         r8, [r0], #4         ; [13 | 12]

-    ldr         r9, [r0]             ; [15 | 14]

+    ldr         r2, [r0, #0]         ; [1  |  0]

+    ldr         r3, [r0, #4]         ; [3  |  2]

+    ldr         r4, [r0, #8]         ; [5  |  4]

+    ldr         r5, [r0, #12]        ; [7  |  6]

+    ldr         r6, [r0, #16]        ; [9  |  8]

+    ldr         r7, [r0, #20]        ; [11 | 10]

+    ldr         r8, [r0, #24]        ; [13 | 12]

+    ldr         r9, [r0, #28]        ; [15 | 14]

     qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]

     qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]

@@ -69,25 +68,28 @@

     qadd16      r4, r4, r10          ; [b2+3|c2+3]

     qadd16      r5, r5, r10          ; [a2+3|d2+3]

-    asr         r12, r2, #3          ; [1  |  x]

-    pkhtb       r12, r12, r3, asr #19; [1  |  0]

-    lsl         lr, r3, #16          ; [~3 |  x]

-    lsl         r2, r2, #16          ; [~2 |  x]

-    asr         lr, lr, #3           ; [3  |  x]

-    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]

+    asr         r12, r3, #19         ; [0]

+    strh        r12, [r1], #32

+    asr         lr, r2, #19          ; [1]

+    strh        lr, [r1], #32

+    sxth        r2, r2

+    sxth        r3, r3

+    asr         r2, r2, #3           ; [2]

+    strh        r2, [r1], #32

+    asr         r3, r3, #3           ; [3]

+    strh        r3, [r1], #32

-    asr         r2, r4, #3           ; [5  |  x]

-    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]

-    lsl         r3, r5, #16          ; [~7 |  x]

-    lsl         r4, r4, #16          ; [~6 |  x]

-    asr         r3, r3, #3           ; [7  |  x]

-    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]

+    asr         r12, r5, #19         ; [4]

+    strh        r12, [r1], #32

+    asr         lr, r4, #19          ; [5]

+    strh        lr, [r1], #32

+    sxth        r4, r4

+    sxth        r5, r5

+    asr         r4, r4, #3           ; [6]

+    strh        r4, [r1], #32

+    asr         r5, r5, #3           ; [7]

+    strh        r5, [r1], #32

-    str         r12, [r1], #4

-    str         lr, [r1], #4

-    str         r2, [r1], #4

-    str         r3, [r1], #4

     qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]

     qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]

     qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]

@@ -103,49 +105,31 @@

     qadd16      r8, r8, r10          ; [b2+3|c2+3]

     qadd16      r9, r9, r10          ; [a2+3|d2+3]

-    asr         r2, r6, #3           ; [9  |  x]

-    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]

-    lsl         r3, r7, #16          ; [~11|  x]

-    lsl         r4, r6, #16          ; [~10|  x]

-    asr         r3, r3, #3           ; [11 |  x]

-    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]

+    asr         r12, r7, #19         ; [8]

+    strh        r12, [r1], #32

+    asr         lr, r6, #19          ; [9]

+    strh        lr, [r1], #32

+    sxth        r6, r6

+    sxth        r7, r7

+    asr         r6, r6, #3           ; [10]

+    strh        r6, [r1], #32

+    asr         r7, r7, #3           ; [11]

+    strh        r7, [r1], #32

-    asr         r4, r8, #3           ; [13 |  x]

-    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]

-    lsl         r5, r9, #16          ; [~15|  x]

-    lsl         r6, r8, #16          ; [~14|  x]

-    asr         r5, r5, #3           ; [15 |  x]

-    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]

+    asr         r12, r9, #19         ; [12]

+    strh        r12, [r1], #32

+    asr         lr, r8, #19          ; [13]

+    strh        lr, [r1], #32

+    sxth        r8, r8

+    sxth        r9, r9

+    asr         r8, r8, #3           ; [14]

+    strh        r8, [r1], #32

+    asr         r9, r9, #3           ; [15]

+    strh        r9, [r1], #32

-    str         r2, [r1], #4

-    str         r3, [r1], #4

-    str         r4, [r1], #4

-    str         r5, [r1]

-    ldmia       sp!, {r4 - r11, pc}

+    ldmia       sp!, {r4 - r12, pc}

     ENDP        ; |vp8_short_inv_walsh4x4_v6|

-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)

-|vp8_short_inv_walsh4x4_1_v6| PROC

-    ldrsh       r2, [r0]             ; [0]

-    add         r2, r2, #3           ; [0] + 3

-    asr         r2, r2, #3           ; a1 ([0]+3) >> 3

-    lsl         r2, r2, #16          ; [a1 |  x]

-    orr         r2, r2, r2, lsr #16  ; [a1 | a1]

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1], #4

-    str         r2, [r1]

-    bx          lr

-    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|

 ; Constant Pool

 c0x00030003 DCD 0x00030003

--- a/vp8/common/arm/idct_arm.h

+++ b/vp8/common/arm/idct_arm.h

@@ -25,9 +25,6 @@

 #undef  vp8_idct_idct1_scalar_add

 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6

-#undef  vp8_idct_iwalsh1

-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6

 #undef  vp8_idct_iwalsh16

 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6

 #endif

@@ -45,9 +42,6 @@

 #undef  vp8_idct_idct1_scalar_add

 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon

-#undef  vp8_idct_iwalsh1

-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon

 #undef  vp8_idct_iwalsh16

 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon

--- a/vp8/common/arm/neon/iwalsh_neon.asm

+++ b/vp8/common/arm/neon/iwalsh_neon.asm

@@ -8,7 +8,6 @@

 ;  be found in the AUTHORS file in the root of the source tree.

     EXPORT  |vp8_short_inv_walsh4x4_neon|

-    EXPORT  |vp8_short_inv_walsh4x4_1_neon|

ARM

     REQUIRE8

@@ -16,7 +15,7 @@

     AREA    |.text|, CODE, READONLY  ; name this block of code

-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)

+;void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)

 |vp8_short_inv_walsh4x4_neon| PROC

     ; read in all four lines of values: d0->d3

@@ -59,22 +58,30 @@

     vshr.s16 q0, q0, #3 ;e/f >> 3

     vshr.s16 q1, q1, #3 ;g/h >> 3

-    vst4.i16 {d0,d1,d2,d3}, [r1@128]

+    mov      r2, #64

+    add      r3, r1, #32

-    bx lr

-    ENDP    ; |vp8_short_inv_walsh4x4_neon|

+    vst1.i16 d0[0], [r1],r2

+    vst1.i16 d1[0], [r3],r2

+    vst1.i16 d2[0], [r1],r2

+    vst1.i16 d3[0], [r3],r2

+    vst1.i16 d0[1], [r1],r2

+    vst1.i16 d1[1], [r3],r2

+    vst1.i16 d2[1], [r1],r2

+    vst1.i16 d3[1], [r3],r2

-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)

-|vp8_short_inv_walsh4x4_1_neon| PROC

-    ldrsh r2, [r0]          ; load input[0]

-    add r3, r2, #3          ; add 3

-    add r2, r1, #16         ; base for last 8 output

-    asr r0, r3, #3          ; right shift 3

-    vdup.16 q0, r0          ; load and duplicate

-    vst1.16 {q0}, [r1@128]  ; write back 8

-    vst1.16 {q0}, [r2@128]  ; write back last 8

+    vst1.i16 d0[2], [r1],r2

+    vst1.i16 d1[2], [r3],r2

+    vst1.i16 d2[2], [r1],r2

+    vst1.i16 d3[2], [r3],r2

+    vst1.i16 d0[3], [r1],r2

+    vst1.i16 d1[3], [r3],r2

+    vst1.i16 d2[3], [r1]

+    vst1.i16 d3[3], [r3]

     bx lr

-    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|

+    ENDP    ; |vp8_short_inv_walsh4x4_neon|

END

--- a/vp8/common/idct.h

+++ b/vp8/common/idct.h

@@ -37,6 +37,10 @@

 #define vp8_idct_idct16 vp8_short_idct4x4llm_c

 #endif

 extern prototype_idct(vp8_idct_idct16);

+/* Add this prototype to prevent a compiler warning about an implicit

+ * declaration of vp8_short_idct4x4llm_c in dequantize.c when building,

+ * for example, the NEON-optimized version. */

+extern prototype_idct(vp8_short_idct4x4llm_c);

 #ifndef vp8_idct_idct1_scalar_add

 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c

--- a/vp8/common/idctllm.c

+++ b/vp8/common/idctllm.c

@@ -137,8 +137,9 @@

-void vp8_short_inv_walsh4x4_c(short *input, short *output)

+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)

+    short output[16];

     int i;

     int a1, b1, c1, d1;

     int a2, b2, c2, d2;

@@ -183,22 +184,21 @@

         ip += 4;

         op += 4;

+    for(i = 0; i < 16; i++)

+    {

+        mb_dqcoeff[i * 16] = output[i];

+    }

-void vp8_short_inv_walsh4x4_1_c(short *input, short *output)

+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)

     int i;

     int a1;

-    short *op = output;

     a1 = ((input[0] + 3) >> 3);

-    for (i = 0; i < 4; i++)

+    for(i = 0; i < 16; i++)

-        op[0] = a1;

-        op[1] = a1;

-        op[2] = a1;

-        op[3] = a1;

-        op += 4;

+        mb_dqcoeff[i * 16] = a1;

--- a/vp8/common/invtrans.c

+++ b/vp8/common/invtrans.c

@@ -28,18 +28,6 @@

-static void recon_dcblock(MACROBLOCKD *x)

-{

-    BLOCKD *b = &x->block[24];

-    int i;

-    for (i = 0; i < 16; i++)

-    {

-        x->block[i].dqcoeff[0] = b->diff[i];

-    }

-}

 void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)

     int i;

@@ -47,9 +35,7 @@

     if(x->mode_info_context->mbmi.mode != SPLITMV)

         /* do 2nd order transform on the dc block */

-        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);

-        recon_dcblock(x);

+        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);

     for (i = 0; i < 16; i++)

--- a/vp8/common/x86/idct_x86.h

+++ b/vp8/common/x86/idct_x86.h

@@ -24,7 +24,6 @@

 extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);

 extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);

-extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);

 #if !CONFIG_RUNTIME_CPU_DETECT

 #undef  vp8_idct_idct16

@@ -35,9 +34,6 @@

 #undef vp8_idct_iwalsh16

 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx

-#undef vp8_idct_iwalsh1

-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx

 #endif

 #endif

--- a/vp8/common/x86/iwalsh_mmx.asm

+++ b/vp8/common/x86/iwalsh_mmx.asm

@@ -11,42 +11,6 @@

 %include "vpx_ports/x86_abi_support.asm"

-;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)

-global sym(vp8_short_inv_walsh4x4_1_mmx)

-sym(vp8_short_inv_walsh4x4_1_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 2

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov     rsi, arg(0)

-    mov     rax, 3

-    mov     rdi, arg(1)

-    add     rax, [rsi]          ;input[0] + 3

-    movd    mm0, eax

-    punpcklwd mm0, mm0          ;x x val val

-    punpckldq mm0, mm0          ;val val val val

-    psraw   mm0, 3            ;(input[0] + 3) >> 3

-    movq  [rdi + 0], mm0

-    movq  [rdi + 8], mm0

-    movq  [rdi + 16], mm0

-    movq  [rdi + 24], mm0

-    ; begin epilog

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

 ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)

 global sym(vp8_short_inv_walsh4x4_mmx)

 sym(vp8_short_inv_walsh4x4_mmx):

@@ -159,10 +123,50 @@

     psraw   mm2, 3

     psraw   mm3, 3

-    movq  [rdi + 0], mm0

-    movq  [rdi + 8], mm1

-    movq  [rdi + 16], mm2

-    movq  [rdi + 24], mm3

+;    movq  [rdi + 0], mm0

+;    movq  [rdi + 8], mm1

+;    movq  [rdi + 16], mm2

+;    movq  [rdi + 24], mm3

+    movd    eax, mm0

+    psrlq   mm0, 32

+    mov     word ptr[rdi+32*0], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*1], ax

+    movd    eax, mm0

+    mov     word ptr[rdi+32*2], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*3], ax

+    movd    ecx, mm1

+    psrlq   mm1, 32

+    mov     word ptr[rdi+32*4], cx

+    shr     ecx, 16

+    mov     word ptr[rdi+32*5], cx

+    movd    ecx, mm1

+    mov     word ptr[rdi+32*6], cx

+    shr     ecx, 16

+    mov     word ptr[rdi+32*7], cx

+    movd    eax, mm2

+    psrlq   mm2, 32

+    mov     word ptr[rdi+32*8], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*9], ax

+    movd    eax, mm2

+    mov     word ptr[rdi+32*10], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*11], ax

+    movd    ecx, mm3

+    psrlq   mm3, 32

+    mov     word ptr[rdi+32*12], cx

+    shr     ecx, 16

+    mov     word ptr[rdi+32*13], cx

+    movd    ecx, mm3

+    mov     word ptr[rdi+32*14], cx

+    shr     ecx, 16

+    mov     word ptr[rdi+32*15], cx

     ; begin epilog

     pop rdi

--- a/vp8/common/x86/iwalsh_sse2.asm

+++ b/vp8/common/x86/iwalsh_sse2.asm

@@ -96,8 +96,50 @@

     psraw   xmm5, 3

     psraw   xmm1, 3

-    movdqa  [rdi + 0], xmm5

-    movdqa  [rdi + 16], xmm1

+;;    movdqa  [rdi + 0], xmm5

+;;    movdqa  [rdi + 16], xmm1

+    movd    eax, xmm5

+    psrldq   xmm5, 4

+    mov     word ptr[rdi+32*0], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*1], ax

+    movd    eax, xmm5

+    psrldq   xmm5, 4

+    mov     word ptr[rdi+32*2], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*3], ax

+    movd    eax, xmm5

+    psrldq   xmm5, 4

+    mov     word ptr[rdi+32*4], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*5], ax

+    movd    eax, xmm5

+    mov     word ptr[rdi+32*6], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*7], ax

+    movd    eax, xmm1

+    psrldq   xmm1, 4

+    mov     word ptr[rdi+32*8], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*9], ax

+    movd    eax, xmm1

+    psrldq   xmm1, 4

+    mov     word ptr[rdi+32*10], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*11], ax

+    movd    eax, xmm1

+    psrldq   xmm1, 4

+    mov     word ptr[rdi+32*12], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*13], ax

+    movd    eax, xmm1

+    mov     word ptr[rdi+32*14], ax

+    shr     eax, 16

+    mov     word ptr[rdi+32*15], ax

     ; begin epilog

     pop rdi

--- a/vp8/common/x86/x86_systemdependent.c

+++ b/vp8/common/x86/x86_systemdependent.c

@@ -40,9 +40,6 @@

         rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;

         rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;

         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;

-        rtcd->idct.iwalsh1     = vp8_short_inv_walsh4x4_1_mmx;

         rtcd->recon.copy8x8     = vp8_copy_mem8x8_mmx;

         rtcd->recon.copy8x4     = vp8_copy_mem8x4_mmx;

--- a/vp8/decoder/arm/arm_dsystemdependent.c

+++ b/vp8/decoder/arm/arm_dsystemdependent.c

@@ -32,8 +32,6 @@

         pbi->dequant.block               = vp8_dequantize_b_v6;

         pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;

-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_v6;

-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;

         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;

         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;

@@ -44,9 +42,6 @@

         pbi->dequant.block               = vp8_dequantize_b_neon;

         pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;

-        /*This is not used: NEON always dequants two blocks at once.

-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_neon;*/

-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;

         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;

         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;

--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm

+++ /dev/null

@@ -1,213 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-    EXPORT |vp8_dequant_dc_idct_add_v6|

-    AREA |.text|, CODE, READONLY

-;void vp8_dequant_dc_idct_v6(short *input, short *dq,

-;                            unsigned char *dest, int stride, int Dc)

-; r0 = input

-; r1 = dq

-; r2 = dst

-; r3 = stride

-; sp + 36 = Dc

-|vp8_dequant_dc_idct_add_v6| PROC

-    stmdb   sp!, {r4-r11, lr}

-    ldr     r6, [sp, #36]

-    ldr     r4, [r0]                ;input

-    ldr     r5, [r1], #4            ;dq

-    sub     sp, sp, #4

-    str     r3, [sp]

-    smultt  r7, r4, r5

-    ldr     r4, [r0, #4]            ;input

-    ldr     r5, [r1], #4            ;dq

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    smulbb  r6, r4, r5

-    smultt  r7, r4, r5

-    ldr     r4, [r0, #4]            ;input

-    ldr     r5, [r1], #4            ;dq

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    mov     r12, #3

-vp8_dequant_dc_add_loop

-    smulbb  r6, r4, r5

-    smultt  r7, r4, r5

-    ldr     r4, [r0, #4]            ;input

-    ldr     r5, [r1], #4            ;dq

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    smulbb  r6, r4, r5

-    smultt  r7, r4, r5

-    subs    r12, r12, #1

-    ldrne   r4, [r0, #4]

-    ldrne   r5, [r1], #4

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    bne     vp8_dequant_dc_add_loop

-    sub     r0, r0, #32

-    mov     r1, r0

-; short_idct4x4llm_v6_dual

-    ldr     r3, cospi8sqrt2minus1

-    ldr     r4, sinpi8sqrt2

-    ldr     r6, [r0, #8]

-    mov     r5, #2

-vp8_dequant_dc_idct_loop1_v6

-    ldr     r12, [r0, #24]

-    ldr     r14, [r0, #16]

-    smulwt  r9, r3, r6

-    smulwb  r7, r3, r6

-    smulwt  r10, r4, r6

-    smulwb  r8, r4, r6

-    pkhbt   r7, r7, r9, lsl #16

-    smulwt  r11, r3, r12

-    pkhbt   r8, r8, r10, lsl #16

-    uadd16  r6, r6, r7

-    smulwt  r7, r4, r12

-    smulwb  r9, r3, r12

-    smulwb  r10, r4, r12

-    subs    r5, r5, #1

-    pkhbt   r9, r9, r11, lsl #16

-    ldr     r11, [r0], #4

-    pkhbt   r10, r10, r7, lsl #16

-    uadd16  r7, r12, r9

-    usub16  r7, r8, r7

-    uadd16  r6, r6, r10

-    uadd16  r10, r11, r14

-    usub16  r8, r11, r14

-    uadd16  r9, r10, r6

-    usub16  r10, r10, r6

-    uadd16  r6, r8, r7

-    usub16  r7, r8, r7

-    str     r6, [r1, #8]

-    ldrne   r6, [r0, #8]

-    str     r7, [r1, #16]

-    str     r10, [r1, #24]

-    str     r9, [r1], #4

-    bne     vp8_dequant_dc_idct_loop1_v6

-    mov     r5, #2

-    sub     r0, r1, #8

-vp8_dequant_dc_idct_loop2_v6

-    ldr     r6, [r0], #4

-    ldr     r7, [r0], #4

-    ldr     r8, [r0], #4

-    ldr     r9, [r0], #4

-    smulwt  r1, r3, r6

-    smulwt  r12, r4, r6

-    smulwt  lr, r3, r8

-    smulwt  r10, r4, r8

-    pkhbt   r11, r8, r6, lsl #16

-    pkhbt   r1, lr, r1, lsl #16

-    pkhbt   r12, r10, r12, lsl #16

-    pkhtb   r6, r6, r8, asr #16

-    uadd16  r6, r1, r6

-    pkhbt   lr, r9, r7, lsl #16

-    uadd16  r10, r11, lr

-    usub16  lr, r11, lr

-    pkhtb   r8, r7, r9, asr #16

-    subs    r5, r5, #1

-    smulwt  r1, r3, r8

-    smulwb  r7, r3, r8

-    smulwt  r11, r4, r8

-    smulwb  r9, r4, r8

-    pkhbt   r1, r7, r1, lsl #16

-    uadd16  r8, r1, r8

-    pkhbt   r11, r9, r11, lsl #16

-    usub16  r1, r12, r8

-    uadd16  r8, r11, r6

-    ldr     r9, c0x00040004

-    ldr     r12, [sp]               ; get stride from stack

-    uadd16  r6, r10, r8

-    usub16  r7, r10, r8

-    uadd16  r7, r7, r9

-    uadd16  r6, r6, r9

-    uadd16  r10, r14, r1

-    usub16  r1, r14, r1

-    uadd16  r10, r10, r9

-    uadd16  r1, r1, r9

-    ldr     r11, [r2]               ; load input from dst

-    mov     r8, r7, asr #3

-    pkhtb   r9, r8, r10, asr #19

-    mov     r8, r1, asr #3

-    pkhtb   r8, r8, r6, asr #19

-    uxtb16  lr, r11, ror #8

-    qadd16  r9, r9, lr

-    uxtb16  lr, r11

-    qadd16  r8, r8, lr

-    usat16  r9, #8, r9

-    usat16  r8, #8, r8

-    orr     r9, r8, r9, lsl #8

-    ldr     r11, [r2, r12]          ; load input from dst

-    mov     r7, r7, lsl #16

-    mov     r1, r1, lsl #16

-    mov     r10, r10, lsl #16

-    mov     r6, r6, lsl #16

-    mov     r7, r7, asr #3

-    pkhtb   r7, r7, r10, asr #19

-    mov     r1, r1, asr #3

-    pkhtb   r1, r1, r6, asr #19

-    uxtb16  r8, r11, ror #8

-    qadd16  r7, r7, r8

-    uxtb16  r8, r11

-    qadd16  r1, r1, r8

-    usat16  r7, #8, r7

-    usat16  r1, #8, r1

-    orr     r1, r1, r7, lsl #8

-    str     r9, [r2], r12           ; store output to dst

-    str     r1, [r2], r12           ; store output to dst

-    bne     vp8_dequant_dc_idct_loop2_v6

-; vpx_memset

-    sub     r0, r0, #32

-    add     sp, sp, #4

-    mov     r12, #0

-    str     r12, [r0]

-    str     r12, [r0, #4]

-    str     r12, [r0, #8]

-    str     r12, [r0, #12]

-    str     r12, [r0, #16]

-    str     r12, [r0, #20]

-    str     r12, [r0, #24]

-    str     r12, [r0, #28]

-    ldmia   sp!, {r4 - r11, pc}

-    ENDP    ; |vp8_dequant_dc_idct_add_v6|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x00004E7B

-sinpi8sqrt2       DCD 0x00008A8C

-c0x00040004       DCD 0x00040004

-    END

--- a/vp8/decoder/arm/armv6/idct_blk_v6.c

+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c

@@ -13,47 +13,6 @@

 #include "vp8/decoder/dequantize.h"

-void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,

-                                        unsigned char *dst, int stride,

-                                        char *eobs, short *dc)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);

-        else if (eobs[0] == 1)

-            vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);

-        if (eobs[1] > 1)

-        {

-            vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);

-        }

-        else if (eobs[1] == 1)

-            vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);

-        if (eobs[2] > 1)

-        {

-            vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);

-        }

-        else if (eobs[2] == 1)

-            vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);

-        if (eobs[3] > 1)

-        {

-            vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);

-        }

-        else if (eobs[3] == 1)

-            vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);

-        q    += 64;

-        dc   += 4;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,

                                      unsigned char *dst,

                                      int stride, char *eobs)

--- a/vp8/decoder/arm/dequantize_arm.h

+++ b/vp8/decoder/arm/dequantize_arm.h

@@ -15,8 +15,6 @@

 #if HAVE_ARMV6

 extern prototype_dequant_block(vp8_dequantize_b_v6);

 extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);

-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);

-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);

 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);

 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);

@@ -27,12 +25,6 @@

 #undef vp8_dequant_idct_add

 #define vp8_dequant_idct_add vp8_dequant_idct_add_v6

-#undef vp8_dequant_dc_idct_add

-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6

-#undef vp8_dequant_dc_idct_add_y_block

-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6

 #undef vp8_dequant_idct_add_y_block

 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6

@@ -44,8 +36,6 @@

 #if HAVE_ARMV7

 extern prototype_dequant_block(vp8_dequantize_b_neon);

 extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);

-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);

-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);

 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);

 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);

@@ -56,12 +46,6 @@

 #undef vp8_dequant_idct_add

 #define vp8_dequant_idct_add vp8_dequant_idct_add_neon

-#undef vp8_dequant_dc_idct_add

-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon

-#undef vp8_dequant_dc_idct_add_y_block

-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon

 #undef vp8_dequant_idct_add_y_block

 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon

--- a/vp8/decoder/arm/neon/idct_blk_neon.c

+++ b/vp8/decoder/arm/neon/idct_blk_neon.c

@@ -15,46 +15,11 @@

 /* place these declarations here because we don't want to maintain them

  * outside of this scope

*/

-void idct_dequant_dc_full_2x_neon(short *input, short *dq,

-                                  unsigned char *dst,

-                                  int stride, short *dc);

-void idct_dequant_dc_0_2x_neon(short *input, short *dq,

-                               unsigned char *dst,

-                               int stride, short *dc);

 void idct_dequant_full_2x_neon(short *q, short *dq,

                                unsigned char *dst, int stride);

 void idct_dequant_0_2x_neon(short *q, short dq,

                             unsigned char *dst, int stride);

-void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,

-                                          unsigned char *dst,

-                                          int stride, char *eobs, short *dc)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (((short *)(eobs))[0])

-        {

-            if (((short *)eobs)[0] & 0xfefe)

-                idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);

-            else

-                idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);

-        }

-        if (((short *)(eobs))[1])

-        {

-            if (((short *)eobs)[1] & 0xfefe)

-                idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);

-            else

-                idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);

-        }

-        q    += 64;

-        dc   += 4;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,

                                        unsigned char *dst,

--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm

+++ /dev/null

@@ -1,75 +1,0 @@

-;

-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-    EXPORT  |idct_dequant_dc_0_2x_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void idct_dequant_dc_0_2x_neon(short *q, short *dq,

-;                               unsigned char *dst, int stride);

-; r0    *q,

-; r1    *dq,

-; r2    *dst

-; r3    stride

-; sp    *dc

-|idct_dequant_dc_0_2x_neon| PROC

-    ; no q- or dq-coeffs, so r0 and r1 are free to use

-    ldr             r1, [sp]                ; *dc

-    add             r12, r2, #4

-    ldr             r0, [r1]

-    vld1.32         {d2[0]}, [r2], r3       ; lo

-    vld1.32         {d8[0]}, [r12], r3      ; hi

-    vld1.32         {d2[1]}, [r2], r3

-    vld1.32         {d8[1]}, [r12], r3

-    vld1.32         {d4[0]}, [r2], r3

-    vld1.32         {d10[0]}, [r12], r3

-    vld1.32         {d4[1]}, [r2], r3

-    vld1.32         {d10[1]}, [r12]

-    sxth            r1, r0                  ; lo *dc

-    add             r1, r1, #4

-    asr             r1, r1, #3

-    vdup.16         q0, r1

-    sxth            r0, r0, ror #16         ; hi *dc

-    add             r0, r0, #4

-    asr             r0, r0, #3

-    vdup.16         q3, r0

-    vaddw.u8        q1, q0, d2              ; lo

-    vaddw.u8        q2, q0, d4

-    vaddw.u8        q4, q3, d8              ; hi

-    vaddw.u8        q5, q3, d10

-    vqmovun.s16     d2, q1                  ; lo

-    vqmovun.s16     d4, q2

-    vqmovun.s16     d8, q4                  ; hi

-    vqmovun.s16     d10, q5

-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

-    add             r0, r2, #4

-    vst1.32         {d2[0]}, [r2], r3       ; lo

-    vst1.32         {d8[0]}, [r0], r3       ; hi

-    vst1.32         {d2[1]}, [r2], r3

-    vst1.32         {d8[1]}, [r0], r3

-    vst1.32         {d4[0]}, [r2], r3

-    vst1.32         {d10[0]}, [r0], r3

-    vst1.32         {d4[1]}, [r2]

-    vst1.32         {d10[1]}, [r0]

-    bx             lr

-    ENDP           ;|idct_dequant_dc_0_2x_neon|

-    END

--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm

+++ /dev/null

@@ -1,208 +1,0 @@

-;

-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |idct_dequant_dc_full_2x_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void idct_dequant_dc_full_2x_neon(short *q, short *dq,

-;                                  unsigned char *dst, int stride, short *dc);

-; r0    *q,

-; r1    *dq,

-; r2    *dst

-; r3    stride

-; sp    *dc

-|idct_dequant_dc_full_2x_neon| PROC

-    push            {r4}

-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)

-    vld1.16         {q2, q3}, [r0]          ; l q

-    add             r0, r0, #32

-    vld1.16         {q4, q5}, [r0]          ; r q

-    add             r12, r2, #4

-    ; interleave the predictors

-    vld1.32         {d28[0]}, [r2], r3      ; l pre

-    vld1.32         {d28[1]}, [r12], r3     ; r pre

-    vld1.32         {d29[0]}, [r2], r3

-    vld1.32         {d29[1]}, [r12], r3

-    vld1.32         {d30[0]}, [r2], r3

-    vld1.32         {d30[1]}, [r12], r3

-    vld1.32         {d31[0]}, [r2], r3

-    ldr             r1, [sp, #4]            ; *dc

-    vld1.32         {d31[1]}, [r12]

-    adr             r4, cospi8sqrt2minus1   ; pointer to the first constant

-    ldrh            r12, [r1], #2           ; lo *dc

-    ldrh            r1, [r1]                ; hi *dc

-    ; dequant: q[i] = q[i] * dq[i]

-    vmul.i16        q2, q2, q0

-    vmul.i16        q3, q3, q1

-    vmul.i16        q4, q4, q0

-    vmul.i16        q5, q5, q1

-    ; move dc up to neon and overwrite first element

-    vmov.16         d4[0], r12

-    vmov.16         d8[0], r1

-    vld1.16         {d0}, [r4]

-    ; q2: l0r0  q3: l8r8

-    ; q4: l4r4  q5: l12r12

-    vswp            d5, d8

-    vswp            d7, d10

-    ; _CONSTANTS_ * 4,12 >> 16

-    ; q6:  4 * sinpi : c1/temp1

-    ; q7: 12 * sinpi : d1/temp2

-    ; q8:  4 * cospi

-    ; q9: 12 * cospi

-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q7, q5, d0[2]

-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1

-    vqdmulh.s16     q9, q5, d0[0]

-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8

-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

-    ; vqdmulh only accepts signed values. this was a problem because

-    ; our constant had the high bit set, and was treated as a negative value.

-    ; vqdmulh also doubles the value before it shifts by 16. we need to

-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,

-    ; so we can shift the constant without losing precision. this avoids

-    ; shift again afterward, but also avoids the sign issue. win win!

-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we

-    ; pre-shift it

-    vshr.s16        q8, q8, #1

-    vshr.s16        q9, q9, #1

-    ; q4:  4 +  4 * cospi : d1/temp1

-    ; q5: 12 + 12 * cospi : c1/temp2

-    vqadd.s16       q4, q4, q8

-    vqadd.s16       q5, q5, q9

-    ; c1 = temp1 - temp2

-    ; d1 = temp1 + temp2

-    vqsub.s16       q2, q6, q5

-    vqadd.s16       q3, q4, q7

-    ; [0]: a1+d1

-    ; [1]: b1+c1

-    ; [2]: b1-c1

-    ; [3]: a1-d1

-    vqadd.s16       q4, q10, q3

-    vqadd.s16       q5, q11, q2

-    vqsub.s16       q6, q11, q2

-    vqsub.s16       q7, q10, q3

-    ; rotate

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; idct loop 2

-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12

-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13

-    ; q6: l 2, 6,10,14 r 2, 6,10,14

-    ; q7: l 3, 7,11,15 r 3, 7,11,15

-    ; q8:  1 * sinpi : c1/temp1

-    ; q9:  3 * sinpi : d1/temp2

-    ; q10: 1 * cospi

-    ; q11: 3 * cospi

-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q9, q7, d0[2]

-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1

-    vqdmulh.s16     q11, q7, d0[0]

-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2

-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2

-    ; see note on shifting above

-    vshr.s16        q10, q10, #1

-    vshr.s16        q11, q11, #1

-    ; q10: 1 + 1 * cospi : d1/temp1

-    ; q11: 3 + 3 * cospi : c1/temp2

-    vqadd.s16       q10, q5, q10

-    vqadd.s16       q11, q7, q11

-    ; q8: c1 = temp1 - temp2

-    ; q9: d1 = temp1 + temp2

-    vqsub.s16       q8, q8, q11

-    vqadd.s16       q9, q10, q9

-    ; a1+d1

-    ; b1+c1

-    ; b1-c1

-    ; a1-d1

-    vqadd.s16       q4, q2, q9

-    vqadd.s16       q5, q3, q8

-    vqsub.s16       q6, q3, q8

-    vqsub.s16       q7, q2, q9

-    ; +4 >> 3 (rounding)

-    vrshr.s16       q4, q4, #3              ; lo

-    vrshr.s16       q5, q5, #3

-    vrshr.s16       q6, q6, #3              ; hi

-    vrshr.s16       q7, q7, #3

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; adding pre

-    ; input is still packed. pre was read interleaved

-    vaddw.u8        q4, q4, d28

-    vaddw.u8        q5, q5, d29

-    vaddw.u8        q6, q6, d30

-    vaddw.u8        q7, q7, d31

-    vmov.i16        q14, #0

-    vmov            q15, q14

-    vst1.16         {q14, q15}, [r0]        ; write over high input

-    sub             r0, r0, #32

-    vst1.16         {q14, q15}, [r0]        ; write over low input

-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

-    add             r1, r2, #4              ; hi

-    ;saturate and narrow

-    vqmovun.s16     d0, q4                  ; lo

-    vqmovun.s16     d1, q5

-    vqmovun.s16     d2, q6                  ; hi

-    vqmovun.s16     d3, q7

-    vst1.32         {d0[0]}, [r2], r3       ; lo

-    vst1.32         {d0[1]}, [r1], r3       ; hi

-    vst1.32         {d1[0]}, [r2], r3

-    vst1.32         {d1[1]}, [r1], r3

-    vst1.32         {d2[0]}, [r2], r3

-    vst1.32         {d2[1]}, [r1], r3

-    vst1.32         {d3[0]}, [r2]

-    vst1.32         {d3[1]}, [r1]

-    pop             {r4}

-    bx              lr

-    ENDP            ; |idct_dequant_dc_full_2x_neon|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x4e7b

-; because the lowest bit in 0x8a8c is 0, we can pre-shift this

-sinpi8sqrt2       DCD 0x4546

-    END

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -232,45 +232,53 @@

-    else if (mode == SPLITMV)

-    {

-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)

-                        (xd->qcoeff, xd->block[0].dequant,

-                         xd->dst.y_buffer,

-                         xd->dst.y_stride, xd->eobs);

-    }

     else

-        BLOCKD *b = &xd->block[24];

+        short *DQC = xd->block[0].dequant;

-        /* do 2nd order transform on the dc block */

-        if (xd->eobs[24] > 1)

+        /* save the dc dequant constant in case it is overridden */

+        short dc_dequant_temp = DQC[0];

+        if (mode != SPLITMV)

-            DEQUANT_INVOKE(&pbi->dequant, block)(b);

+            BLOCKD *b = &xd->block[24];

-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);

-            ((int *)b->qcoeff)[0] = 0;

-            ((int *)b->qcoeff)[1] = 0;

-            ((int *)b->qcoeff)[2] = 0;

-            ((int *)b->qcoeff)[3] = 0;

-            ((int *)b->qcoeff)[4] = 0;

-            ((int *)b->qcoeff)[5] = 0;

-            ((int *)b->qcoeff)[6] = 0;

-            ((int *)b->qcoeff)[7] = 0;

+            /* do 2nd order transform on the dc block */

+            if (xd->eobs[24] > 1)

+            {

+                DEQUANT_INVOKE(&pbi->dequant, block)(b);

+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],

+                    xd->qcoeff);

+                ((int *)b->qcoeff)[0] = 0;

+                ((int *)b->qcoeff)[1] = 0;

+                ((int *)b->qcoeff)[2] = 0;

+                ((int *)b->qcoeff)[3] = 0;

+                ((int *)b->qcoeff)[4] = 0;

+                ((int *)b->qcoeff)[5] = 0;

+                ((int *)b->qcoeff)[6] = 0;

+                ((int *)b->qcoeff)[7] = 0;

+            }

+            else

+            {

+                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];

+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],

+                    xd->qcoeff);

+                ((int *)b->qcoeff)[0] = 0;

+            }

+            /* override the dc dequant constant */

+            DQC[0] = 1;

-        else

-        {

-            b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];

-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);

-            ((int *)b->qcoeff)[0] = 0;

-        }

-        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)

+        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)

                         (xd->qcoeff, xd->block[0].dequant,

                          xd->dst.y_buffer,

-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);

+                         xd->dst.y_stride, xd->eobs);

+        /* restore the dc dequant constant */

+        DQC[0] = dc_dequant_temp;

     DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)

--- a/vp8/decoder/dequantize.c

+++ b/vp8/decoder/dequantize.c

@@ -42,22 +42,3 @@

     vpx_memset(input, 0, 32);

-void vp8_dequant_dc_idct_add_c(short *input, short *dq,

-                               unsigned char *dest, int stride,

-                               int Dc)

-{

-    int i;

-    input[0] = (short)Dc;

-    for (i = 1; i < 16; i++)

-    {

-        input[i] = dq[i] * input[i];

-    }

-    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);

-    vpx_memset(input, 0, 32);

-}

--- a/vp8/decoder/dequantize.h

+++ b/vp8/decoder/dequantize.h

@@ -21,17 +21,6 @@

              unsigned char *output, \

              int stride)

-#define prototype_dequant_dc_idct_add(sym) \

-    void sym(short *input, short *dq, \

-             unsigned char *dst, \

-             int stride, \

-             int dc)

-#define prototype_dequant_dc_idct_add_y_block(sym) \

-    void sym(short *q, short *dq, \

-             unsigned char *dst, \

-             int stride, char *eobs, short *dc)

 #define prototype_dequant_idct_add_y_block(sym) \

     void sym(short *q, short *dq, \

              unsigned char *dst, \

@@ -60,16 +49,6 @@

 #endif

 extern prototype_dequant_idct_add(vp8_dequant_idct_add);

-#ifndef vp8_dequant_dc_idct_add

-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c

-#endif

-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);

-#ifndef vp8_dequant_dc_idct_add_y_block

-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c

-#endif

-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);

 #ifndef vp8_dequant_idct_add_y_block

 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c

 #endif

@@ -85,10 +64,6 @@

 typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));

-typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));

-typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));

 typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));

 typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));

@@ -97,8 +72,6 @@

     vp8_dequant_block_fn_t               block;

     vp8_dequant_idct_add_fn_t            idct_add;

-    vp8_dequant_dc_idct_add_fn_t         dc_idct_add;

-    vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;

     vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;

     vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;

 } vp8_dequant_rtcd_vtable_t;

--- a/vp8/decoder/generic/dsystemdependent.c

+++ b/vp8/decoder/generic/dsystemdependent.c

@@ -23,8 +23,6 @@

     pbi->mb.rtcd                     = &pbi->common.rtcd;

     pbi->dequant.block               = vp8_dequantize_b_c;

     pbi->dequant.idct_add            = vp8_dequant_idct_add_c;

-    pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_c;

-    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;

     pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;

     pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;

 #endif

--- a/vp8/decoder/idct_blk.c

+++ b/vp8/decoder/idct_blk.c

@@ -12,38 +12,11 @@

 #include "vp8/common/idct.h"

 #include "dequantize.h"

-void vp8_dequant_dc_idct_add_c(short *input, short *dq,

-                               unsigned char *dest, int stride,

-                               int Dc);

 void vp8_dequant_idct_add_c(short *input, short *dq,

                             unsigned char *dest, int stride);

 void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,

                             int pred_stride, unsigned char *dst_ptr,

                             int dst_stride);

-void vp8_dequant_dc_idct_add_y_block_c

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs, short *dc)

-{

-    int i, j;

-    for (i = 0; i < 4; i++)

-    {

-        for (j = 0; j < 4; j++)

-        {

-            if (*eobs++ > 1)

-                vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);

-            else

-                vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);

-            q   += 16;

-            dst += 4;

-            dc  ++;

-        }

-        dst += 4*stride - 16;

-    }

-}

 void vp8_dequant_idct_add_y_block_c

             (short *q, short *dq,

--- a/vp8/decoder/threading.c

+++ b/vp8/decoder/threading.c

@@ -175,37 +175,8 @@

 #endif

     /* dequantization and idct */

-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)

+    if (xd->mode_info_context->mbmi.mode == B_PRED)

-        BLOCKD *b = &xd->block[24];

-        DEQUANT_INVOKE(&pbi->dequant, block)(b);

-        /* do 2nd order transform on the dc block */

-        if (xd->eobs[24] > 1)

-        {

-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);

-            ((int *)b->qcoeff)[0] = 0;

-            ((int *)b->qcoeff)[1] = 0;

-            ((int *)b->qcoeff)[2] = 0;

-            ((int *)b->qcoeff)[3] = 0;

-            ((int *)b->qcoeff)[4] = 0;

-            ((int *)b->qcoeff)[5] = 0;

-            ((int *)b->qcoeff)[6] = 0;

-            ((int *)b->qcoeff)[7] = 0;

-        }

-        else

-        {

-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);

-            ((int *)b->qcoeff)[0] = 0;

-        }

-        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)

-                        (xd->qcoeff, xd->block[0].dequant,

-                         xd->dst.y_buffer,

-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);

-    }

-    else if (xd->mode_info_context->mbmi.mode == B_PRED)

-    {

         for (i = 0; i < 16; i++)

             BLOCKD *b = &xd->block[i];

@@ -214,26 +185,71 @@

             vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,

                                    b->dst_stride, mb_row, mb_col, i);

-            if (xd->eobs[i] > 1)

+            if (xd->eobs[i] )

-                DEQUANT_INVOKE(&pbi->dequant, idct_add)

-                    (b->qcoeff, b->dequant,

-                    *(b->base_dst) + b->dst, b->dst_stride);

+                if (xd->eobs[i] > 1)

+                {

+                    DEQUANT_INVOKE(&pbi->dequant, idct_add)

+                        (b->qcoeff, b->dequant,

+                        *(b->base_dst) + b->dst, b->dst_stride);

+                }

+                else

+                {

+                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)

+                        (b->qcoeff[0] * b->dequant[0],

+                        *(b->base_dst) + b->dst, b->dst_stride,

+                        *(b->base_dst) + b->dst, b->dst_stride);

+                    ((int *)b->qcoeff)[0] = 0;

+                }

+        }

+    }

+    else

+    {

+        short *DQC = xd->block[0].dequant;

+        DECLARE_ALIGNED(16, short, local_dequant[16]);

+        if (xd->mode_info_context->mbmi.mode != SPLITMV)

+        {

+            BLOCKD *b = &xd->block[24];

+            /* do 2nd order transform on the dc block */

+            if (xd->eobs[24] > 1)

+            {

+                DEQUANT_INVOKE(&pbi->dequant, block)(b);

+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],

+                    xd->qcoeff);

+                ((int *)b->qcoeff)[0] = 0;

+                ((int *)b->qcoeff)[1] = 0;

+                ((int *)b->qcoeff)[2] = 0;

+                ((int *)b->qcoeff)[3] = 0;

+                ((int *)b->qcoeff)[4] = 0;

+                ((int *)b->qcoeff)[5] = 0;

+                ((int *)b->qcoeff)[6] = 0;

+                ((int *)b->qcoeff)[7] = 0;

+            }

             else

-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)

-                    (b->qcoeff[0] * b->dequant[0],

-                    *(b->base_dst) + b->dst, b->dst_stride,

-                    *(b->base_dst) + b->dst, b->dst_stride);

+                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];

+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);

                 ((int *)b->qcoeff)[0] = 0;

+            /* make a local copy of the dequant constants */

+            vpx_memcpy(local_dequant, xd->block[0].dequant,

+                       sizeof(local_dequant));

+            /* override the dc dequant constant */

+            local_dequant[0] = 1;

+            /* use the new dequant constants */

+            DQC = local_dequant;

-    }

-    else

-    {

         DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)

-                        (xd->qcoeff, xd->block[0].dequant,

+                        (xd->qcoeff, DQC,

                          xd->dst.y_buffer,

                          xd->dst.y_stride, xd->eobs);

@@ -243,7 +259,6 @@

                      xd->dst.u_buffer, xd->dst.v_buffer,

                      xd->dst.uv_stride, xd->eobs+16);

 static THREAD_FUNCTION thread_decoding_proc(void *p_data)

--- a/vp8/decoder/x86/dequantize_mmx.asm

+++ b/vp8/decoder/x86/dequantize_mmx.asm

@@ -246,207 +246,6 @@

     pop         rbp

     ret

-;void dequant_dc_idct_add_mmx(

-;short *input,          0

-;short *dq,             1

-;unsigned char *dest,   2

-;int stride,            3

-;int Dc)                4

-global sym(vp8_dequant_dc_idct_add_mmx)

-sym(vp8_dequant_dc_idct_add_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    GET_GOT     rbx

-    ; end prolog

-        mov         rax,    arg(0) ;input

-        mov         rdx,    arg(1) ;dq

-        movq        mm0,    [rax   ]

-        pmullw      mm0,    [rdx]

-        movq        mm1,    [rax +8]

-        pmullw      mm1,    [rdx +8]

-        movq        mm2,    [rax+16]

-        pmullw      mm2,    [rdx+16]

-        movq        mm3,    [rax+24]

-        pmullw      mm3,    [rdx+24]

-        mov         rdx,    arg(2) ;pred

-        pxor        mm7,    mm7

-        movq        [rax],   mm7

-        movq        [rax+8], mm7

-        movq        [rax+16],mm7

-        movq        [rax+24],mm7

-        ; move lower word of Dc to lower word of mm0

-        psrlq       mm0,    16

-        movzx       rcx,    word ptr arg(4) ;Dc

-        psllq       mm0,    16

-        movq        mm7,    rcx

-        por         mm0,    mm7

-        movsxd      rax,            dword ptr arg(3) ;stride

-        psubw       mm0,            mm2             ; b1= 0-2

-        paddw       mm2,            mm2             ;

-        movq        mm5,            mm1

-        paddw       mm2,            mm0             ; a1 =0+2

-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

-        movq        mm7,            mm3             ;

-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       mm7,            mm5             ; c1

-        movq        mm5,            mm1

-        movq        mm4,            mm3

-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

-        paddw       mm5,            mm1

-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

-        paddw       mm3,            mm4

-        paddw       mm3,            mm5             ; d1

-        movq        mm6,            mm2             ; a1

-        movq        mm4,            mm0             ; b1

-        paddw       mm2,            mm3             ;0

-        paddw       mm4,            mm7             ;1

-        psubw       mm0,            mm7             ;2

-        psubw       mm6,            mm3             ;3

-        movq        mm1,            mm2             ; 03 02 01 00

-        movq        mm3,            mm4             ; 23 22 21 20

-        punpcklwd   mm1,            mm0             ; 11 01 10 00

-        punpckhwd   mm2,            mm0             ; 13 03 12 02

-        punpcklwd   mm3,            mm6             ; 31 21 30 20

-        punpckhwd   mm4,            mm6             ; 33 23 32 22

-        movq        mm0,            mm1             ; 11 01 10 00

-        movq        mm5,            mm2             ; 13 03 12 02

-        punpckldq   mm0,            mm3             ; 30 20 10 00

-        punpckhdq   mm1,            mm3             ; 31 21 11 01

-        punpckldq   mm2,            mm4             ; 32 22 12 02

-        punpckhdq   mm5,            mm4             ; 33 23 13 03

-        movq        mm3,            mm5             ; 33 23 13 03

-        psubw       mm0,            mm2             ; b1= 0-2

-        paddw       mm2,            mm2             ;

-        movq        mm5,            mm1

-        paddw       mm2,            mm0             ; a1 =0+2

-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

-        movq        mm7,            mm3             ;

-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       mm7,            mm5             ; c1

-        movq        mm5,            mm1

-        movq        mm4,            mm3

-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

-        paddw       mm5,            mm1

-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

-        paddw       mm3,            mm4

-        paddw       mm3,            mm5             ; d1

-        paddw       mm0,            [GLOBAL(fours)]

-        paddw       mm2,            [GLOBAL(fours)]

-        movq        mm6,            mm2             ; a1

-        movq        mm4,            mm0             ; b1

-        paddw       mm2,            mm3             ;0

-        paddw       mm4,            mm7             ;1

-        psubw       mm0,            mm7             ;2

-        psubw       mm6,            mm3             ;3

-        psraw       mm2,            3

-        psraw       mm0,            3

-        psraw       mm4,            3

-        psraw       mm6,            3

-        movq        mm1,            mm2             ; 03 02 01 00

-        movq        mm3,            mm4             ; 23 22 21 20

-        punpcklwd   mm1,            mm0             ; 11 01 10 00

-        punpckhwd   mm2,            mm0             ; 13 03 12 02

-        punpcklwd   mm3,            mm6             ; 31 21 30 20

-        punpckhwd   mm4,            mm6             ; 33 23 32 22

-        movq        mm0,            mm1             ; 11 01 10 00

-        movq        mm5,            mm2             ; 13 03 12 02

-        punpckldq   mm0,            mm3             ; 30 20 10 00

-        punpckhdq   mm1,            mm3             ; 31 21 11 01

-        punpckldq   mm2,            mm4             ; 32 22 12 02

-        punpckhdq   mm5,            mm4             ; 33 23 13 03

-        pxor        mm7,            mm7

-        movd        mm4,            [rdx]

-        punpcklbw   mm4,            mm7

-        paddsw      mm0,            mm4

-        packuswb    mm0,            mm7

-        movd        [rdx],          mm0

-        movd        mm4,            [rdx+rax]

-        punpcklbw   mm4,            mm7

-        paddsw      mm1,            mm4

-        packuswb    mm1,            mm7

-        movd        [rdx+rax],      mm1

-        movd        mm4,            [rdx+2*rax]

-        punpcklbw   mm4,            mm7

-        paddsw      mm2,            mm4

-        packuswb    mm2,            mm7

-        movd        [rdx+rax*2],    mm2

-        add         rdx,            rax

-        movd        mm4,            [rdx+2*rax]

-        punpcklbw   mm4,            mm7

-        paddsw      mm5,            mm4

-        packuswb    mm5,            mm7

-        movd        [rdx+rax*2],    mm5

-    ; begin epilog

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

 SECTION_RODATA

 align 16

 x_s1sqr2:

--- a/vp8/decoder/x86/dequantize_x86.h

+++ b/vp8/decoder/x86/dequantize_x86.h

@@ -22,8 +22,6 @@

 #if HAVE_MMX

 extern prototype_dequant_block(vp8_dequantize_b_mmx);

 extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);

-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);

-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);

 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);

 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);

@@ -34,12 +32,6 @@

 #undef  vp8_dequant_idct_add

 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx

-#undef  vp8_dequant_dc_idct_add

-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx

-#undef vp8_dequant_dc_idct_add_y_block

-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx

 #undef vp8_dequant_idct_add_y_block

 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx

@@ -50,14 +42,10 @@

 #endif

 #if HAVE_SSE2

-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);

 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);

 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT

-#undef vp8_dequant_dc_idct_add_y_block

-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2

 #undef vp8_dequant_idct_add_y_block

 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2

--- a/vp8/decoder/x86/idct_blk_mmx.c

+++ b/vp8/decoder/x86/idct_blk_mmx.c

@@ -12,41 +12,6 @@

 #include "vp8/common/idct.h"

 #include "vp8/decoder/dequantize.h"

-void vp8_dequant_dc_idct_add_y_block_mmx

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs, short *dc)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);

-        else if (eobs[0] == 1)

-            vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);

-        if (eobs[1] > 1)

-            vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);

-        else if (eobs[1] == 1)

-            vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);

-        if (eobs[2] > 1)

-            vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);

-        else if (eobs[2] == 1)

-            vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);

-        if (eobs[3] > 1)

-            vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);

-        else if (eobs[3] == 1)

-            vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);

-        q    += 64;

-        dc   += 4;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

 void vp8_dequant_idct_add_y_block_mmx

             (short *q, short *dq,

              unsigned char *dst, int stride, char *eobs)

--- a/vp8/decoder/x86/idct_blk_sse2.c

+++ b/vp8/decoder/x86/idct_blk_sse2.c

@@ -12,13 +12,6 @@

 #include "vp8/common/idct.h"

 #include "vp8/decoder/dequantize.h"

-void vp8_idct_dequant_dc_0_2x_sse2

-            (short *q, short *dq,

-             unsigned char *dst, int dst_stride, short *dc);

-void vp8_idct_dequant_dc_full_2x_sse2

-            (short *q, short *dq,

-             unsigned char *dst, int dst_stride, short *dc);

 void vp8_idct_dequant_0_2x_sse2

             (short *q, short *dq ,

              unsigned char *dst, int dst_stride);

@@ -25,36 +18,6 @@

 void vp8_idct_dequant_full_2x_sse2

             (short *q, short *dq ,

              unsigned char *dst, int dst_stride);

-void vp8_dequant_dc_idct_add_y_block_sse2

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs, short *dc)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (((short *)(eobs))[0])

-        {

-            if (((short *)(eobs))[0] & 0xfefe)

-                vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);

-            else

-                vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);

-        }

-        if (((short *)(eobs))[1])

-        {

-            if (((short *)(eobs))[1] & 0xfefe)

-                vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);

-            else

-                vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);

-        }

-        q    += 64;

-        dc   += 4;

-        dst  += stride*4;

-        eobs += 4;

-    }

-}

 void vp8_dequant_idct_add_y_block_sse2

             (short *q, short *dq,

--- a/vp8/decoder/x86/x86_dsystemdependent.c

+++ b/vp8/decoder/x86/x86_dsystemdependent.c

@@ -43,8 +43,6 @@

         pbi->dequant.block               = vp8_dequantize_b_mmx;

         pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;

-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;

-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;

         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;

         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;

@@ -52,8 +50,6 @@

 #if HAVE_SSE2

     if (flags & HAS_SSE2)

-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;

-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;

         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;

 #endif

--- a/vp8/vp8dx_arm.mk

+++ b/vp8/vp8dx_arm.mk

@@ -16,14 +16,11 @@

 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.h

 #File list for armv6

-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c

 #File list for neon

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)

 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)

--

⑨