shithub: libvpx

--- a/vp8/common/arm/arm_systemdependent.c

+++ b/vp8/common/arm/arm_systemdependent.c

@@ -63,6 +63,12 @@

         rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;

         rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;

         rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;

+        rtcd->dequant.block               = vp8_dequantize_b_v6;

+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_v6;

+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;

+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;

 #endif

@@ -97,6 +103,12 @@

             vp8_build_intra_predictors_mby_neon;

         rtcd->recon.build_intra_predictors_mby_s =

             vp8_build_intra_predictors_mby_s_neon;

+        rtcd->dequant.block               = vp8_dequantize_b_neon;

+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_neon;

+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;

+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;

 #endif

--- /dev/null

+++ b/vp8/common/arm/armv6/dequant_idct_v6.asm

@@ -1,0 +1,190 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license and patent

+;  grant that can be found in the LICENSE file in the root of the source

+;  tree. All contributing project authors may be found in the AUTHORS

+;  file in the root of the source tree.

+;

+    EXPORT |vp8_dequant_idct_add_v6|

+    AREA |.text|, CODE, READONLY

+;void vp8_dequant_idct_add_v6(short *input, short *dq,

+;                             unsigned char *dest, int stride)

+;

+; Dequantizes the 16 coefficients at input in place (input[i] *= dq[i]),

+; runs a two-pass 4x4 inverse transform on them, adds the rounded result to

+; the 4x4 pixel block at dest (saturating to 8 bits), and finally zeroes the

+; 32-byte input buffer.

+;

+; r0 = q

+; r1 = dq

+; r2 = dst

+; r3 = stride

+|vp8_dequant_idct_add_v6| PROC

+    stmdb   sp!, {r4-r11, lr}

+    ldr     r4, [r0]                ;input

+    ldr     r5, [r1], #4            ;dq

+    sub     sp, sp, #4

+    str     r3, [sp]                ; spill stride: r3 is reused for a constant below

+    mov     r12, #4                 ; 4 iterations x 4 coefficients

+; Dequantize in place. Each word holds two 16-bit coefficients; the next

+; pair is loaded before the current products are stored.

+vp8_dequant_add_loop

+    smulbb  r6, r4, r5              ; low halfword product

+    smultt  r7, r4, r5              ; high halfword product

+    ldr     r4, [r0, #4]            ;input

+    ldr     r5, [r1], #4            ;dq

+    strh    r6, [r0], #2

+    strh    r7, [r0], #2

+    smulbb  r6, r4, r5

+    smultt  r7, r4, r5

+    subs    r12, r12, #1

+    ldrne   r4, [r0, #4]            ; skipped on the last iteration

+    ldrne   r5, [r1], #4

+    strh    r6, [r0], #2

+    strh    r7, [r0], #2

+    bne     vp8_dequant_add_loop

+    sub     r0, r0, #32             ; rewind to the start of input

+    mov     r1, r0                  ; first pass writes back into the same buffer

+; short_idct4x4llm_v6_dual

+    ldr     r3, cospi8sqrt2minus1

+    ldr     r4, sinpi8sqrt2

+    ldr     r6, [r0, #8]

+    mov     r5, #2

+; First pass: each iteration handles two 16-bit lanes packed in one word,

+; reading the words at byte offsets 0, 8, 16 and 24 from r0.

+vp8_dequant_idct_loop1_v6

+    ldr     r12, [r0, #24]

+    ldr     r14, [r0, #16]

+    smulwt  r9, r3, r6

+    smulwb  r7, r3, r6

+    smulwt  r10, r4, r6

+    smulwb  r8, r4, r6

+    pkhbt   r7, r7, r9, lsl #16     ; repack both lanes of the cospi product

+    smulwt  r11, r3, r12

+    pkhbt   r8, r8, r10, lsl #16    ; repack both lanes of the sinpi product

+    uadd16  r6, r6, r7

+    smulwt  r7, r4, r12

+    smulwb  r9, r3, r12

+    smulwb  r10, r4, r12

+    subs    r5, r5, #1              ; flags consumed by ldrne/bne below

+    pkhbt   r9, r9, r11, lsl #16

+    ldr     r11, [r0], #4

+    pkhbt   r10, r10, r7, lsl #16

+    uadd16  r7, r12, r9

+    usub16  r7, r8, r7

+    uadd16  r6, r6, r10

+    uadd16  r10, r11, r14

+    usub16  r8, r11, r14

+    uadd16  r9, r10, r6

+    usub16  r10, r10, r6

+    uadd16  r6, r8, r7

+    usub16  r7, r8, r7

+    str     r6, [r1, #8]

+    ldrne   r6, [r0, #8]            ; preload for the next iteration

+    str     r7, [r1, #16]

+    str     r10, [r1, #24]

+    str     r9, [r1], #4

+    bne     vp8_dequant_idct_loop1_v6

+    mov     r5, #2

+    sub     r0, r1, #8              ; back to the start of the buffer

+; Second pass: each iteration consumes four words (eight coefficients) and

+; produces two 4-pixel output rows, at [r2] and [r2 + stride].

+vp8_dequant_idct_loop2_v6

+    ldr     r6, [r0], #4

+    ldr     r7, [r0], #4

+    ldr     r8, [r0], #4

+    ldr     r9, [r0], #4

+    smulwt  r1, r3, r6

+    smulwt  r12, r4, r6

+    smulwt  lr, r3, r8

+    smulwt  r10, r4, r8

+    pkhbt   r11, r8, r6, lsl #16

+    pkhbt   r1, lr, r1, lsl #16

+    pkhbt   r12, r10, r12, lsl #16

+    pkhtb   r6, r6, r8, asr #16

+    uadd16  r6, r1, r6

+    pkhbt   lr, r9, r7, lsl #16

+    uadd16  r10, r11, lr

+    usub16  lr, r11, lr

+    pkhtb   r8, r7, r9, asr #16

+    subs    r5, r5, #1              ; flags consumed by the bne at the loop end

+    smulwt  r1, r3, r8

+    smulwb  r7, r3, r8

+    smulwt  r11, r4, r8

+    smulwb  r9, r4, r8

+    pkhbt   r1, r7, r1, lsl #16

+    uadd16  r8, r1, r8

+    pkhbt   r11, r9, r11, lsl #16

+    usub16  r1, r12, r8

+    uadd16  r8, r11, r6

+    ldr     r9, c0x00040004         ; +4 in each halfword lane: rounding for the >>3 below

+    ldr     r12, [sp]               ; get stride from stack

+    uadd16  r6, r10, r8

+    usub16  r7, r10, r8

+    uadd16  r7, r7, r9

+    uadd16  r6, r6, r9

+    uadd16  r10, r14, r1

+    usub16  r1, r14, r1

+    uadd16  r10, r10, r9

+    uadd16  r1, r1, r9

+    ldr     r11, [r2]               ; load input from dst

+; High halfword lanes: shift right by 3 and repack for the row at [r2].

+    mov     r8, r7, asr #3

+    pkhtb   r9, r8, r10, asr #19

+    mov     r8, r1, asr #3

+    pkhtb   r8, r8, r6, asr #19

+    uxtb16  lr, r11, ror #8         ; widen dst bytes 1 and 3 to halfwords

+    qadd16  r9, r9, lr

+    uxtb16  lr, r11                 ; widen dst bytes 0 and 2 to halfwords

+    qadd16  r8, r8, lr

+    usat16  r9, #8, r9              ; saturate each lane to 0..255

+    usat16  r8, #8, r8

+    orr     r9, r8, r9, lsl #8      ; re-interleave into four bytes

+    ldr     r11, [r2, r12]          ; load input from dst

+; Low halfword lanes: move them up, then repeat for the row at [r2 + stride].

+    mov     r7, r7, lsl #16

+    mov     r1, r1, lsl #16

+    mov     r10, r10, lsl #16

+    mov     r6, r6, lsl #16

+    mov     r7, r7, asr #3

+    pkhtb   r7, r7, r10, asr #19

+    mov     r1, r1, asr #3

+    pkhtb   r1, r1, r6, asr #19

+    uxtb16  r8, r11, ror #8

+    qadd16  r7, r7, r8

+    uxtb16  r8, r11

+    qadd16  r1, r1, r8

+    usat16  r7, #8, r7

+    usat16  r1, #8, r1

+    orr     r1, r1, r7, lsl #8

+    str     r9, [r2], r12           ; store output to dst

+    str     r1, [r2], r12           ; store output to dst

+    bne     vp8_dequant_idct_loop2_v6

+; vpx_memset

+    sub     r0, r0, #32             ; r0 advanced 32 bytes in loop2; rewind to input

+    add     sp, sp, #4              ; release the stride slot

+    mov     r12, #0

+    str     r12, [r0]

+    str     r12, [r0, #4]

+    str     r12, [r0, #8]

+    str     r12, [r0, #12]

+    str     r12, [r0, #16]

+    str     r12, [r0, #20]

+    str     r12, [r0, #24]

+    str     r12, [r0, #28]

+    ldmia   sp!, {r4 - r11, pc}

+    ENDP    ; |vp8_dequant_idct_add_v6|

+; Constant Pool

+cospi8sqrt2minus1 DCD 0x00004E7B

+sinpi8sqrt2       DCD 0x00008A8C

+c0x00040004       DCD 0x00040004

+    END

--- /dev/null

+++ b/vp8/common/arm/armv6/dequantize_v6.asm

@@ -1,0 +1,69 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    EXPORT  |vp8_dequantize_b_loop_v6|

+    AREA    |.text|, CODE, READONLY  ; name this block of code

+;-------------------------------

+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

+;

+; Writes DQ[i] = Q[i] * DQC[i] for 16 halfword coefficients. The loop runs

+; twice; each pass multiplies eight coefficients, with the loads for the

+; next group issued between the multiplies and stores of the current one.

+;

+; r0    short *Q,

+; r1    short *DQC

+; r2    short *DQ

+|vp8_dequantize_b_loop_v6| PROC

+    stmdb   sp!, {r4-r9, lr}

+    ldr     r3, [r0]                ;load Q

+    ldr     r4, [r1]                ;load DQC

+    ldr     r5, [r0, #4]

+    ldr     r6, [r1, #4]

+    mov     r12, #2                 ;loop counter

+dequant_loop

+    smulbb  r7, r3, r4              ;multiply

+    smultt  r8, r3, r4

+    smulbb  r9, r5, r6

+    smultt  lr, r5, r6

+    ldr     r3, [r0, #8]            ; second group of four coefficients

+    ldr     r4, [r1, #8]

+    ldr     r5, [r0, #12]

+    ldr     r6, [r1, #12]

+    strh    r7, [r2], #2            ;store result

+    smulbb  r7, r3, r4              ;multiply

+    strh    r8, [r2], #2

+    smultt  r8, r3, r4

+    strh    r9, [r2], #2

+    smulbb  r9, r5, r6

+    strh    lr, [r2], #2

+    smultt  lr, r5, r6

+    subs    r12, r12, #1            ; sets the flags used by ldrne/bne below

+    add     r0, r0, #16             ; plain add: leaves the flags intact

+    add     r1, r1, #16

+    ldrne       r3, [r0]            ; preload only if another pass follows

+    strh    r7, [r2], #2            ;store result

+    ldrne       r4, [r1]

+    strh    r8, [r2], #2

+    ldrne       r5, [r0, #4]

+    strh    r9, [r2], #2

+    ldrne       r6, [r1, #4]

+    strh    lr, [r2], #2

+    bne     dequant_loop

+    ldmia   sp!, {r4-r9, pc}

+    ENDP    ;|vp8_dequantize_b_loop_v6|

+    END

--- /dev/null

+++ b/vp8/common/arm/armv6/idct_blk_v6.c

@@ -1,0 +1,116 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/idct.h"

+#include "vp8/common/dequantize.h"

+/* Walks 16 consecutive 16-coefficient blocks laid out 4 across by 4 down.

+ * For each block the matching eobs entry selects the work:

+ *   > 1  : full dequantize + inverse transform + add

+ *   == 1 : DC-only add, then the first int-sized word of the block is cleared

+ *   else : nothing is done

+ */

+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,

+                                     unsigned char *dst,

+                                     int stride, char *eobs)

+{

+    int row, col;

+    for (row = 0; row < 4; row++)

+    {

+        for (col = 0; col < 4; col++)

+        {

+            short *coeffs = q + 16 * col;

+            unsigned char *out = dst + 4 * col;

+            if (eobs[col] > 1)

+            {

+                vp8_dequant_idct_add_v6 (coeffs, dq, out, stride);

+            }

+            else if (eobs[col] == 1)

+            {

+                vp8_dc_only_idct_add_v6 (coeffs[0]*dq[0], out, stride,

+                                         out, stride);

+                ((int *)coeffs)[0] = 0;

+            }

+        }

+        /* advance to the next row of four blocks */

+        eobs += 4;

+        dst  += 4*stride;

+        q    += 64;

+    }

+}

+/* Processes the dstu plane and then the dstv plane. Each plane is a 2x2

+ * grid of 16-coefficient blocks; q and eobs run on continuously from the

+ * first plane into the second. Per block: eob > 1 takes the full

+ * dequant/IDCT/add path, eob == 1 takes the DC-only path and clears the

+ * first int-sized word of the block, anything else is skipped.

+ */

+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,

+                                      unsigned char *dstu,

+                                      unsigned char *dstv,

+                                      int stride, char *eobs)

+{

+    unsigned char *plane[2];

+    int p, row, col;

+    plane[0] = dstu;

+    plane[1] = dstv;

+    for (p = 0; p < 2; p++)

+    {

+        unsigned char *dst = plane[p];

+        for (row = 0; row < 2; row++)

+        {

+            for (col = 0; col < 2; col++)

+            {

+                short *coeffs = q + 16 * col;

+                unsigned char *out = dst + 4 * col;

+                if (eobs[col] > 1)

+                {

+                    vp8_dequant_idct_add_v6 (coeffs, dq, out, stride);

+                }

+                else if (eobs[col] == 1)

+                {

+                    vp8_dc_only_idct_add_v6 (coeffs[0]*dq[0], out, stride,

+                                             out, stride);

+                    ((int *)coeffs)[0] = 0;

+                }

+            }

+            /* advance to the next row of two blocks */

+            eobs += 2;

+            dst  += 4*stride;

+            q    += 32;

+        }

+    }

+}

--- /dev/null

+++ b/vp8/common/arm/dequantize_arm.c

@@ -1,0 +1,45 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/dequantize.h"

+#include "vp8/common/idct.h"

+#if HAVE_ARMV7

+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);

+#endif

+#if HAVE_ARMV6

+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

+#endif

+#if HAVE_ARMV7

+/* Hands the block's quantized coefficients, dequantization factors and

+ * output buffer to the NEON assembly routine.

+ */

+void vp8_dequantize_b_neon(BLOCKD *d)

+{

+    vp8_dequantize_b_loop_neon(d->qcoeff, d->dequant, d->dqcoeff);

+}

+#endif

+#if HAVE_ARMV6

+/* Hands the block's quantized coefficients, dequantization factors and

+ * output buffer to the ARMv6 assembly routine.

+ */

+void vp8_dequantize_b_v6(BLOCKD *d)

+{

+    vp8_dequantize_b_loop_v6(d->qcoeff, d->dequant, d->dqcoeff);

+}

+#endif

--- /dev/null

+++ b/vp8/common/arm/dequantize_arm.h

@@ -1,0 +1,59 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef DEQUANTIZE_ARM_H

+#define DEQUANTIZE_ARM_H

+/* ARMv6 implementations of the dequantize entry points. */

+#if HAVE_ARMV6

+extern prototype_dequant_block(vp8_dequantize_b_v6);

+extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);

+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);

+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);

+/* Without runtime CPU detection the generic names are bound at compile

+ * time to the ARMv6 versions.

+ */

+#if !CONFIG_RUNTIME_CPU_DETECT

+#undef  vp8_dequant_block

+#define vp8_dequant_block vp8_dequantize_b_v6

+#undef  vp8_dequant_idct_add

+#define vp8_dequant_idct_add vp8_dequant_idct_add_v6

+#undef  vp8_dequant_idct_add_y_block

+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6

+#undef  vp8_dequant_idct_add_uv_block

+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6

+#endif

+#endif

+/* NEON implementations. This section comes after the ARMv6 one, so when

+ * both HAVE_ARMV6 and HAVE_ARMV7 are set without runtime detection the

+ * #undef/#define pairs below replace the ARMv6 bindings made above.

+ */

+#if HAVE_ARMV7

+extern prototype_dequant_block(vp8_dequantize_b_neon);

+extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);

+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);

+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);

+#if !CONFIG_RUNTIME_CPU_DETECT

+#undef  vp8_dequant_block

+#define vp8_dequant_block vp8_dequantize_b_neon

+#undef  vp8_dequant_idct_add

+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon

+#undef  vp8_dequant_idct_add_y_block

+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon

+#undef  vp8_dequant_idct_add_uv_block

+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon

+#endif

+#endif

+#endif

--- /dev/null

+++ b/vp8/common/arm/neon/dequant_idct_neon.asm

@@ -1,0 +1,131 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    EXPORT  |vp8_dequant_idct_add_neon|

+    ARM

+    REQUIRE8

+    PRESERVE8

+    AREA ||.text||, CODE, READONLY, ALIGN=2

+;void vp8_dequant_idct_add_neon(short *input, short *dq,

+;                           unsigned char *dest, int stride)

+;

+; Dequantizes the 16 coefficients at input (input[i] * dq[i]), runs the two

+; 4x4 inverse-transform passes, adds the rounded result to the 4x4 pixel

+; block at dest with unsigned saturation, and zeroes the 32-byte input.

+;

+; The routine uses q3-q7. d8-d15 (q4-q7) are callee-saved under the ARM

+; procedure call standard, so they are pushed on entry and popped on exit.

+;

+; r0    short *input,

+; r1    short *dq,

+; r2    unsigned char *dest

+; r3    int stride

+|vp8_dequant_idct_add_neon| PROC

+    vpush           {d8-d15}                ; preserve callee-saved NEON registers

+    vld1.16         {q3, q4}, [r0]

+    vld1.16         {q5, q6}, [r1]

+    add             r1, r2, r3              ; r1 = dest + stride

+    lsl             r3, #1                  ; 2x stride

+    ; r2 walks rows 0 and 2, r1 walks rows 1 and 3 of the destination

+    vld1.32         {d14[0]}, [r2], r3

+    vld1.32         {d14[1]}, [r1], r3

+    vld1.32         {d15[0]}, [r2]

+    vld1.32         {d15[1]}, [r1]

+    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant

+    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon

+    vmul.i16        q2, q4, q6

+;|short_idct4x4llm_neon| PROC

+    vld1.16         {d0}, [r12]             ; d0[0] = cospi8sqrt2minus1, d0[2] = sinpi8sqrt2

+    vswp            d3, d4                  ;q2(vp[4] vp[12])

+    vqdmulh.s16     q3, q2, d0[2]

+    vqdmulh.s16     q4, q2, d0[0]

+    vqadd.s16       d12, d2, d3             ;a1

+    vqsub.s16       d13, d2, d3             ;b1

+    vshr.s16        q3, q3, #1              ; undo the doubling done by vqdmulh

+    vshr.s16        q4, q4, #1

+    vqadd.s16       q3, q3, q2

+    vqadd.s16       q4, q4, q2

+    vqsub.s16       d10, d6, d9             ;c1

+    vqadd.s16       d11, d7, d8             ;d1

+    vqadd.s16       d2, d12, d11

+    vqadd.s16       d3, d13, d10

+    vqsub.s16       d4, d13, d10

+    vqsub.s16       d5, d12, d11

+    ; transpose between the two passes

+    vtrn.32         d2, d4

+    vtrn.32         d3, d5

+    vtrn.16         d2, d3

+    vtrn.16         d4, d5

+; memset(input, 0, 32) -- 32bytes

+    vmov.i16        q14, #0

+    vswp            d3, d4

+    vqdmulh.s16     q3, q2, d0[2]

+    vqdmulh.s16     q4, q2, d0[0]

+    vqadd.s16       d12, d2, d3             ;a1

+    vqsub.s16       d13, d2, d3             ;b1

+    vmov            q15, q14

+    vshr.s16        q3, q3, #1

+    vshr.s16        q4, q4, #1

+    vqadd.s16       q3, q3, q2

+    vqadd.s16       q4, q4, q2

+    vqsub.s16       d10, d6, d9             ;c1

+    vqadd.s16       d11, d7, d8             ;d1

+    vqadd.s16       d2, d12, d11

+    vqadd.s16       d3, d13, d10

+    vqsub.s16       d4, d13, d10

+    vqsub.s16       d5, d12, d11

+    vst1.16         {q14, q15}, [r0]        ; zero the input buffer

+    vrshr.s16       d2, d2, #3              ; rounding shift: (x + 4) >> 3

+    vrshr.s16       d3, d3, #3

+    vrshr.s16       d4, d4, #3

+    vrshr.s16       d5, d5, #3

+    vtrn.32         d2, d4

+    vtrn.32         d3, d5

+    vtrn.16         d2, d3

+    vtrn.16         d4, d5

+    vaddw.u8        q1, q1, d14             ; add the destination pixels

+    vaddw.u8        q2, q2, d15

+    sub             r2, r2, r3              ; back to row 0

+    sub             r1, r1, r3              ; back to row 1

+    vqmovun.s16     d0, q1                  ; saturate to unsigned 8 bits

+    vqmovun.s16     d1, q2

+    vst1.32         {d0[0]}, [r2], r3

+    vst1.32         {d0[1]}, [r1], r3

+    vst1.32         {d1[0]}, [r2]

+    vst1.32         {d1[1]}, [r1]

+    vpop            {d8-d15}                ; restore callee-saved NEON registers

+    bx             lr

+    ENDP           ; |vp8_dequant_idct_add_neon|

+; Constant Pool

+cospi8sqrt2minus1 DCD 0x4e7b4e7b

+sinpi8sqrt2       DCD 0x8a8c8a8c

+    END

--- /dev/null

+++ b/vp8/common/arm/neon/dequantizeb_neon.asm

@@ -1,0 +1,34 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    EXPORT  |vp8_dequantize_b_loop_neon|

+    ARM

+    REQUIRE8

+    PRESERVE8

+    AREA ||.text||, CODE, READONLY, ALIGN=2

+;void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);

+;

+; Writes DQ[i] = Q[i] * DQC[i] for 16 halfword coefficients.

+; Only q0-q3 are used: the products overwrite q0/q1 so that no callee-saved

+; NEON register (d8-d15) is touched and nothing has to be spilled.

+;

+; r0    short *Q,

+; r1    short *DQC

+; r2    short *DQ

+|vp8_dequantize_b_loop_neon| PROC

+    vld1.16         {q0, q1}, [r0]

+    vld1.16         {q2, q3}, [r1]

+    vmul.i16        q0, q0, q2

+    vmul.i16        q1, q1, q3

+    vst1.16         {q0, q1}, [r2]

+    bx             lr

+    ENDP

+    END

--- /dev/null

+++ b/vp8/common/arm/neon/idct_blk_neon.c

@@ -1,0 +1,97 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/idct.h"

+#include "vp8/common/dequantize.h"

+/* place these declarations here because we don't want to maintain them

+ * outside of this scope

+ */

+void idct_dequant_full_2x_neon(short *q, short *dq,

+                               unsigned char *dst, int stride);

+void idct_dequant_0_2x_neon(short *q, short dq,

+                            unsigned char *dst, int stride);

+/* Handles a 4x4 grid of blocks two at a time. eobs is read as 16-bit words

+ * so that one word covers the eob bytes of two horizontally adjacent

+ * blocks:

+ *   word == 0          : both blocks are skipped

+ *   word & 0xfefe != 0 : some eob byte is neither 0 nor 1, so the full

+ *                        2x dequant/IDCT routine is used

+ *   otherwise          : the DC-only 2x routine is used

+ */

+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,

+                                       unsigned char *dst,

+                                       int stride, char *eobs)

+{

+    int row, half;

+    for (row = 0; row < 4; row++)

+    {

+        for (half = 0; half < 2; half++)

+        {

+            const short pair = ((short *)eobs)[half];

+            short *coeffs = q + 32 * half;

+            unsigned char *out = dst + 8 * half;

+            if (!pair)

+                continue;

+            if (pair & 0xfefe)

+                idct_dequant_full_2x_neon (coeffs, dq, out, stride);

+            else

+                idct_dequant_0_2x_neon (coeffs, dq[0], out, stride);

+        }

+        /* advance to the next row of four blocks */

+        eobs += 4;

+        dst  += 4*stride;

+        q    += 64;

+    }

+}

+/* Handles four pairs of blocks: the top and bottom pair of the dstu plane,

+ * then the top and bottom pair of the dstv plane. Each pair uses 32

+ * coefficients of q and one 16-bit word of eobs:

+ *   word == 0          : pair is skipped

+ *   word & 0xfefe != 0 : full 2x dequant/IDCT routine

+ *   otherwise          : DC-only 2x routine

+ */

+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,

+                                        unsigned char *dstu,

+                                        unsigned char *dstv,

+                                        int stride, char *eobs)

+{

+    int k;

+    for (k = 0; k < 4; k++)

+    {

+        const short pair = ((short *)eobs)[k];

+        /* pairs 0,1 belong to the U plane, pairs 2,3 to the V plane */

+        unsigned char *out = (k < 2) ? dstu : dstv;

+        /* odd pairs are the lower row of their plane */

+        if (k & 1)

+            out += 4*stride;

+        if (pair)

+        {

+            if (pair & 0xfefe)

+                idct_dequant_full_2x_neon (q, dq, out, stride);

+            else

+                idct_dequant_0_2x_neon (q, dq[0], out, stride);

+        }

+        q += 32;

+    }

+}

--- /dev/null

+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm

@@ -1,0 +1,79 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license and patent

+;  grant that can be found in the LICENSE file in the root of the source

+;  tree. All contributing project authors may be found in the AUTHORS

+;  file in the root of the source tree.

+;

+    EXPORT  |idct_dequant_0_2x_neon|

+    ARM

+    REQUIRE8

+    PRESERVE8

+    AREA ||.text||, CODE, READONLY, ALIGN=2

+;void idct_dequant_0_2x_neon(short *q, short dq,

+;                            unsigned char *dst, int stride);

+;

+; DC-only path for two horizontally adjacent 4x4 blocks. For each block the

+; value (q[0] * dq + 4) >> 3 is added to all 16 destination pixels with

+; unsigned saturation, and q[0] is cleared. The second ("hi") block's

+; coefficients start 32 bytes after the first and its pixels 4 bytes to the

+; right.

+;

+; The "hi" block is kept in q8/q9 (d16-d19), which are caller-saved, so no

+; callee-saved NEON register (d8-d15) is modified.

+;

+; r0   *q

+; r1   dq

+; r2   *dst

+; r3   stride

+|idct_dequant_0_2x_neon| PROC

+    push            {r4, r5}

+    add             r12, r2, #4             ; r12 = dst of the hi block

+    vld1.32         {d2[0]}, [r2], r3

+    vld1.32         {d16[0]}, [r12], r3

+    vld1.32         {d2[1]}, [r2], r3

+    vld1.32         {d16[1]}, [r12], r3

+    vld1.32         {d4[0]}, [r2], r3

+    vld1.32         {d18[0]}, [r12], r3

+    vld1.32         {d4[1]}, [r2], r3

+    vld1.32         {d18[1]}, [r12], r3

+    ldrh            r12, [r0]               ; lo q

+    ldrh            r4, [r0, #32]           ; hi q

+    mov             r5, #0

+    strh            r5, [r0]                ; clear both DC coefficients

+    strh            r5, [r0, #32]

+    sxth            r12, r12                ; lo

+    mul             r0, r12, r1

+    add             r0, r0, #4              ; round

+    asr             r0, r0, #3

+    vdup.16         q0, r0

+    sxth            r4, r4                  ; hi

+    mul             r0, r4, r1

+    add             r0, r0, #4              ; round

+    asr             r0, r0, #3

+    vdup.16         q3, r0

+    vaddw.u8        q1, q0, d2              ; lo

+    vaddw.u8        q2, q0, d4

+    vaddw.u8        q8, q3, d16             ; hi

+    vaddw.u8        q9, q3, d18

+    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

+    add             r0, r2, #4

+    vqmovun.s16     d2, q1                  ; lo

+    vqmovun.s16     d4, q2

+    vqmovun.s16     d16, q8                 ; hi

+    vqmovun.s16     d18, q9

+    vst1.32         {d2[0]}, [r2], r3       ; lo

+    vst1.32         {d16[0]}, [r0], r3      ; hi

+    vst1.32         {d2[1]}, [r2], r3

+    vst1.32         {d16[1]}, [r0], r3

+    vst1.32         {d4[0]}, [r2], r3

+    vst1.32         {d18[0]}, [r0], r3

+    vst1.32         {d4[1]}, [r2]

+    vst1.32         {d18[1]}, [r0]

+    pop             {r4, r5}

+    bx              lr

+    ENDP            ; |idct_dequant_0_2x_neon|

+    END

--- /dev/null

+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm

@@ -1,0 +1,196 @@

+;

+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    EXPORT  |idct_dequant_full_2x_neon|

+    ARM

+    REQUIRE8

+    PRESERVE8

+    AREA ||.text||, CODE, READONLY, ALIGN=2

+;void idct_dequant_full_2x_neon(short *q, short *dq,

+;                               unsigned char *dst, int stride);

+;

+; Full dequantize + 4x4 inverse transform + add for two horizontally

+; adjacent blocks at once. The left block's coefficients are at q and the

+; right block's at q + 32 bytes; both use the same dq table. The right

+; block's pixels start 4 bytes after dst. Both 32-byte coefficient buffers

+; are zeroed before returning.

+;

+; The routine needs q4-q7 as working registers. d8-d15 are callee-saved

+; under the ARM procedure call standard, so they are pushed on entry and

+; popped on exit.

+;

+; r0    *q,

+; r1    *dq,

+; r2    *dst

+; r3    stride

+|idct_dequant_full_2x_neon| PROC

+    vpush           {d8-d15}                ; preserve callee-saved NEON registers

+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)

+    vld1.16         {q2, q3}, [r0]          ; l q

+    add             r0, r0, #32

+    vld1.16         {q4, q5}, [r0]          ; r q

+    add             r12, r2, #4

+    ; interleave the predictors

+    vld1.32         {d28[0]}, [r2],  r3     ; l pre

+    vld1.32         {d28[1]}, [r12], r3     ; r pre

+    vld1.32         {d29[0]}, [r2],  r3

+    vld1.32         {d29[1]}, [r12], r3

+    vld1.32         {d30[0]}, [r2],  r3

+    vld1.32         {d30[1]}, [r12], r3

+    vld1.32         {d31[0]}, [r2],  r3

+    vld1.32         {d31[1]}, [r12]

+    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant

+    ; dequant: q[i] = q[i] * dq[i]

+    vmul.i16        q2, q2, q0

+    vmul.i16        q3, q3, q1

+    vmul.i16        q4, q4, q0

+    vmul.i16        q5, q5, q1

+    vld1.16         {d0}, [r1]

+    ; q2: l0r0  q3: l8r8

+    ; q4: l4r4  q5: l12r12

+    vswp            d5, d8

+    vswp            d7, d10

+    ; _CONSTANTS_ * 4,12 >> 16

+    ; q6:  4 * sinpi : c1/temp1

+    ; q7: 12 * sinpi : d1/temp2

+    ; q8:  4 * cospi

+    ; q9: 12 * cospi

+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2

+    vqdmulh.s16     q7, q5, d0[2]

+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1

+    vqdmulh.s16     q9, q5, d0[0]

+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8

+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

+    ; vqdmulh only accepts signed values. this was a problem because

+    ; our constant had the high bit set, and was treated as a negative value.

+    ; vqdmulh also doubles the value before it shifts by 16. we need to

+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,

+    ; so we can shift the constant without losing precision. this avoids

+    ; shift again afterward, but also avoids the sign issue. win win!

+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we

+    ; pre-shift it

+    vshr.s16        q8, q8, #1

+    vshr.s16        q9, q9, #1

+    ; q4:  4 +  4 * cospi : d1/temp1

+    ; q5: 12 + 12 * cospi : c1/temp2

+    vqadd.s16       q4, q4, q8

+    vqadd.s16       q5, q5, q9

+    ; c1 = temp1 - temp2

+    ; d1 = temp1 + temp2

+    vqsub.s16       q2, q6, q5

+    vqadd.s16       q3, q4, q7

+    ; [0]: a1+d1

+    ; [1]: b1+c1

+    ; [2]: b1-c1

+    ; [3]: a1-d1

+    vqadd.s16       q4, q10, q3

+    vqadd.s16       q5, q11, q2

+    vqsub.s16       q6, q11, q2

+    vqsub.s16       q7, q10, q3

+    ; rotate

+    vtrn.32         q4, q6

+    vtrn.32         q5, q7

+    vtrn.16         q4, q5

+    vtrn.16         q6, q7

+    ; idct loop 2

+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12

+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13

+    ; q6: l 2, 6,10,14 r 2, 6,10,14

+    ; q7: l 3, 7,11,15 r 3, 7,11,15

+    ; q8:  1 * sinpi : c1/temp1

+    ; q9:  3 * sinpi : d1/temp2

+    ; q10: 1 * cospi

+    ; q11: 3 * cospi

+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2

+    vqdmulh.s16     q9, q7, d0[2]

+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1

+    vqdmulh.s16     q11, q7, d0[0]

+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2

+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2

+    ; see note on shifting above

+    vshr.s16        q10, q10, #1

+    vshr.s16        q11, q11, #1

+    ; q10: 1 + 1 * cospi : d1/temp1

+    ; q11: 3 + 3 * cospi : c1/temp2

+    vqadd.s16       q10, q5, q10

+    vqadd.s16       q11, q7, q11

+    ; q8: c1 = temp1 - temp2

+    ; q9: d1 = temp1 + temp2

+    vqsub.s16       q8, q8, q11

+    vqadd.s16       q9, q10, q9

+    ; a1+d1

+    ; b1+c1

+    ; b1-c1

+    ; a1-d1

+    vqadd.s16       q4, q2, q9

+    vqadd.s16       q5, q3, q8

+    vqsub.s16       q6, q3, q8

+    vqsub.s16       q7, q2, q9

+    ; +4 >> 3 (rounding)

+    vrshr.s16       q4, q4, #3              ; lo

+    vrshr.s16       q5, q5, #3

+    vrshr.s16       q6, q6, #3              ; hi

+    vrshr.s16       q7, q7, #3

+    vtrn.32         q4, q6

+    vtrn.32         q5, q7

+    vtrn.16         q4, q5

+    vtrn.16         q6, q7

+    ; adding pre

+    ; input is still packed. pre was read interleaved

+    vaddw.u8        q4, q4, d28

+    vaddw.u8        q5, q5, d29

+    vaddw.u8        q6, q6, d30

+    vaddw.u8        q7, q7, d31

+    vmov.i16        q14, #0

+    vmov            q15, q14

+    vst1.16         {q14, q15}, [r0]        ; write over high input

+    sub             r0, r0, #32

+    vst1.16         {q14, q15}, [r0]        ; write over low input

+    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

+    add             r1, r2, #4              ; hi

+    ;saturate and narrow

+    vqmovun.s16     d0, q4                  ; lo

+    vqmovun.s16     d1, q5

+    vqmovun.s16     d2, q6                  ; hi

+    vqmovun.s16     d3, q7

+    vst1.32         {d0[0]}, [r2], r3       ; lo

+    vst1.32         {d0[1]}, [r1], r3       ; hi

+    vst1.32         {d1[0]}, [r2], r3

+    vst1.32         {d1[1]}, [r1], r3

+    vst1.32         {d2[0]}, [r2], r3

+    vst1.32         {d2[1]}, [r1], r3

+    vst1.32         {d3[0]}, [r2]

+    vst1.32         {d3[1]}, [r1]

+    vpop            {d8-d15}                ; restore callee-saved NEON registers

+    bx             lr

+    ENDP           ; |idct_dequant_full_2x_neon|

+; Constant Pool

+cospi8sqrt2minus1 DCD 0x4e7b

+; because the lowest bit in 0x8a8c is 0, we can pre-shift this

+sinpi8sqrt2       DCD 0x4546

+    END

--- /dev/null

+++ b/vp8/common/dequantize.c

@@ -1,0 +1,44 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "dequantize.h"

+#include "vp8/common/idct.h"

+#include "vpx_mem/vpx_mem.h"

+/* Dequantize one block: each of the 16 quantized coefficients in d->qcoeff
+ * is multiplied by the matching entry of d->dequant and the product is
+ * stored in d->dqcoeff.
+ */
+void vp8_dequantize_b_c(BLOCKD *d)
+{
+    short *out             = d->dqcoeff;
+    const short *quantized = d->qcoeff;
+    const short *factors   = d->dequant;
+    int idx = 0;
+
+    while (idx < 16)
+    {
+        out[idx] = quantized[idx] * factors[idx];
+        idx++;
+    }
+}

+/* Dequantize a 4x4 coefficient block in place, inverse-transform it with
+ * dest supplied as both the prediction and the output buffer, then zero the
+ * coefficient buffer.
+ *
+ *   input   16 quantized coefficients; overwritten, zeroed before returning
+ *   dq      16 dequantization factors
+ *   dest    pixels passed to the IDCT as predictor and as destination
+ *   stride  row pitch handed to the IDCT for both predictor and destination
+ */
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride)
+{
+    short *coeff        = input;
+    const short *factor = dq;
+    short *const end    = input + 16;
+
+    /* coefficient-wise scale, written back over the input */
+    for (; coeff != end; ++coeff, ++factor)
+        *coeff = *factor * *coeff;
+
+    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+
+    /* 32 bytes: the 16 coefficients above when short is two bytes wide */
+    vpx_memset(input, 0, 32);
+}

--- /dev/null

+++ b/vp8/common/dequantize.h

@@ -1,0 +1,85 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+/* Dequantization interface: prototype macros, per-architecture overrides,
+ * C fallbacks and the runtime dispatch table for the dequant / IDCT-add
+ * helpers.
+ */
+#ifndef DEQUANTIZE_H

+#define DEQUANTIZE_H

+#include "vp8/common/blockd.h"

+/* Signature of a routine that dequantizes one block descriptor. */
+#define prototype_dequant_block(sym) \

+    void sym(BLOCKD *x)

+/* Signature of a routine that dequantizes one coefficient block,
+ * inverse-transforms it and adds the result into output.
+ */
+#define prototype_dequant_idct_add(sym) \

+    void sym(short *input, short *dq, \

+             unsigned char *output, \

+             int stride)

+/* Signature of the luma variant: one destination pointer plus one eob entry
+ * per block.
+ */
+#define prototype_dequant_idct_add_y_block(sym) \

+    void sym(short *q, short *dq, \

+             unsigned char *dst, \

+             int stride, char *eobs)

+/* Signature of the chroma variant: separate U and V destinations sharing
+ * one stride.
+ */
+#define prototype_dequant_idct_add_uv_block(sym) \

+    void sym(short *q, short *dq, \

+             unsigned char *dst_u, \

+             unsigned char *dst_v, int stride, char *eobs)

+/* Architecture headers may pre-define the vp8_dequant_* names below to
+ * select an optimized implementation at compile time.
+ */
+#if ARCH_X86 || ARCH_X86_64

+#include "x86/dequantize_x86.h"

+#endif

+#if ARCH_ARM

+#include "arm/dequantize_arm.h"

+#endif

+/* Any name not claimed by an architecture header falls back to the C
+ * implementation.
+ */
+#ifndef vp8_dequant_block

+#define vp8_dequant_block vp8_dequantize_b_c

+#endif

+extern prototype_dequant_block(vp8_dequant_block);

+#ifndef vp8_dequant_idct_add

+#define vp8_dequant_idct_add vp8_dequant_idct_add_c

+#endif

+extern prototype_dequant_idct_add(vp8_dequant_idct_add);

+#ifndef vp8_dequant_idct_add_y_block

+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c

+#endif

+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);

+#ifndef vp8_dequant_idct_add_uv_block

+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c

+#endif

+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);

+/* Function-pointer types built from the same prototype macros. */
+typedef prototype_dequant_block((*vp8_dequant_block_fn_t));

+typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));

+typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));

+typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));

+/* Dispatch table; member names match the suffixes used with DEQUANT_INVOKE. */
+typedef struct

+{

+    vp8_dequant_block_fn_t               block;

+    vp8_dequant_idct_add_fn_t            idct_add;

+    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;

+    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;

+} vp8_dequant_rtcd_vtable_t;

+/* With runtime CPU detection the call goes through the table member;
+ * otherwise it resolves at compile time to the vp8_dequant_<fn> name
+ * selected above and ctx is not evaluated.
+ */
+#if CONFIG_RUNTIME_CPU_DETECT

+#define DEQUANT_INVOKE(ctx,fn) (ctx)->fn

+#else

+#define DEQUANT_INVOKE(ctx,fn) vp8_dequant_##fn

+#endif

+#endif

--- a/vp8/common/generic/systemdependent.c

+++ b/vp8/common/generic/systemdependent.c

@@ -70,6 +70,14 @@

 #if CONFIG_RUNTIME_CPU_DETECT

     VP8_COMMON_RTCD *rtcd = &ctx->rtcd;

+    rtcd->dequant.block             = vp8_dequantize_b_c;

+    rtcd->dequant.idct_add          = vp8_dequant_idct_add_c;

+    rtcd->dequant.idct_add_y_block  = vp8_dequant_idct_add_y_block_c;

+    rtcd->dequant.idct_add_uv_block =

+        vp8_dequant_idct_add_uv_block_c;

     rtcd->idct.idct16       = vp8_short_idct4x4llm_c;

     rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;

     rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;

--- /dev/null

+++ b/vp8/common/idct_blk.c

@@ -1,0 +1,90 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/idct.h"

+#include "dequantize.h"

+void vp8_dequant_idct_add_c(short *input, short *dq,

+                            unsigned char *dest, int stride);

+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,

+                            int pred_stride, unsigned char *dst_ptr,

+                            int dst_stride);

+/* Dequantize, inverse-transform and add the 16 luma 4x4 blocks of one
+ * macroblock into dst.
+ *
+ *   q       coefficients, 16 shorts per block, blocks in raster order
+ *   dq      dequantization factors, shared by every block
+ *   dst     top-left pixel of the 16x16 destination; it also supplies the
+ *           prediction the residual is added to
+ *   stride  distance in bytes between destination rows
+ *   eobs    one end-of-block count per 4x4 block (16 entries are consumed)
+ *
+ * Blocks with eob > 1 take the full dequant + IDCT path; every other block
+ * is reconstructed from its DC coefficient alone.
+ */
+void vp8_dequant_idct_add_y_block_c
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dst, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+                /* Clear the first two coefficients with ordinary short
+                 * stores.  A single int-sized store through a cast pointer
+                 * would clear the same pair but violates the strict-aliasing
+                 * rule and assumes int is exactly two shorts wide.
+                 */
+                q[0] = 0;
+                q[1] = 0;
+            }
+
+            q   += 16;
+            dst += 4;
+        }
+
+        /* step down four rows and back to the left edge */
+        dst += 4*stride - 16;
+    }
+}

+/* Dequantize, inverse-transform and add the chroma 4x4 blocks of one
+ * macroblock: four blocks into dstu followed by four blocks into dstv.
+ *
+ *   q       coefficients, 16 shorts per block; the U blocks come first
+ *   dq      dequantization factors, shared by every block
+ *   dstu    top-left pixel of the 8x8 U destination (also the prediction)
+ *   dstv    top-left pixel of the 8x8 V destination (also the prediction)
+ *   stride  distance in bytes between rows, used for both planes
+ *   eobs    one end-of-block count per block (8 entries are consumed)
+ *
+ * Blocks with eob > 1 take the full dequant + IDCT path; every other block
+ * is reconstructed from its DC coefficient alone.
+ */
+void vp8_dequant_idct_add_uv_block_c
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    /* U plane */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstu, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+                /* Clear the first two coefficients with ordinary short
+                 * stores; an int-sized store through a cast pointer would
+                 * violate the strict-aliasing rule.
+                 */
+                q[0] = 0;
+                q[1] = 0;
+            }
+
+            q    += 16;
+            dstu += 4;
+        }
+
+        /* step down four rows and back to the left edge */
+        dstu += 4*stride - 8;
+    }
+
+    /* V plane */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstv, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+                /* same two-coefficient clear as for the U plane */
+                q[0] = 0;
+                q[1] = 0;
+            }
+
+            q    += 16;
+            dstv += 4;
+        }
+
+        dstv += 4*stride - 8;
+    }
+}

--- a/vp8/common/invtrans.c

+++ /dev/null

@@ -1,56 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "invtrans.h"

-void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,

-                             int pitch)

-{

-    if (*b->eob > 1)

-    {

-        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,

-              *(b->base_dst) + b->dst, b->dst_stride);

-    }

-    else

-    {

-        IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,

-                         *(b->base_dst) + b->dst, b->dst_stride);

-    }

-}

-void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)

-{

-    int i;

-    if(x->mode_info_context->mbmi.mode != SPLITMV)

-    {

-        /* do 2nd order transform on the dc block */

-        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);

-    }

-    for (i = 0; i < 16; i++)

-    {

-        vp8_inverse_transform_b(rtcd, &x->block[i], 16);

-    }

-}

-void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)

-{

-    int i;

-    for (i = 16; i < 24; i++)

-    {

-        vp8_inverse_transform_b(rtcd, &x->block[i], 8);

-    }

-}

--- a/vp8/common/invtrans.h

+++ b/vp8/common/invtrans.h

@@ -15,9 +15,51 @@

 #include "vpx_config.h"

 #include "idct.h"

 #include "blockd.h"

-extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);

-extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

-extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

-extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

+#include "onyxc_int.h"

+/* For each of 16 blocks, raise an end-of-block count of 0 to 1 when the
+ * block's first coefficient is non-zero.
+ *
+ *   eobs  16 end-of-block counts, updated in place
+ *   diff  coefficients, 16 shorts per block; only the first of each block
+ *         is examined, and only when that block's eob is 0
+ */
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* eob adjust.... the idct can only skip if both the dc and eob are zero */
+    char *eob           = eobs;
+    char *const eob_end = eobs + 16;
+    const short *dc     = diff;
+
+    while (eob != eob_end)
+    {
+        if (*eob == 0 && *dc != 0)
+            *eob = 1;
+
+        eob += 1;
+        dc  += 16;
+    }
+}

+/* Reconstruct the luma plane of one macroblock in place in xd->dst.
+ *
+ * For every mode except SPLITMV the second-order block (index 24) is
+ * inverse Walsh transformed into xd->qcoeff, the eob counts are adjusted
+ * and the first dequant factor is temporarily forced to 1 for the block
+ * pass.  The factor is always restored before returning.
+ */
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
+                                      const VP8_COMMON_RTCD *rtcd)
+{
+    short *y_dequant = xd->block[0].dequant;
+
+    /* remember the dc factor; it may be overridden below */
+    const short saved_dc_factor = y_dequant[0];
+
+    if (xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        short *y2_coeffs = xd->block[24].dqcoeff;
+
+        /* 2nd order transform on the dc block: full version when more than
+         * one coefficient is present, single-coefficient version otherwise
+         */
+        if (xd->eobs[24] > 1)
+            IDCT_INVOKE(&rtcd->idct, iwalsh16)(y2_coeffs, xd->qcoeff);
+        else
+            IDCT_INVOKE(&rtcd->idct, iwalsh1)(y2_coeffs, xd->qcoeff);
+
+        eob_adjust(xd->eobs, xd->qcoeff);
+
+        /* override the dc dequant constant for the block pass below */
+        y_dequant[0] = 1;
+    }
+
+    DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
+                    (xd->qcoeff, y_dequant,
+                     xd->dst.y_buffer,
+                     xd->dst.y_stride, xd->eobs);
+
+    /* put the original dc dequant constant back */
+    y_dequant[0] = saved_dc_factor;
+}

 #endif

--- a/vp8/common/onyxc_int.h

+++ b/vp8/common/onyxc_int.h

@@ -22,6 +22,7 @@

 #if CONFIG_POSTPROC

 #include "postproc.h"

 #endif

+#include "dequantize.h"

 /*#ifdef PACKET_TESTING*/

 #include "header.h"

@@ -73,6 +74,7 @@

 typedef struct VP8_COMMON_RTCD

 #if CONFIG_RUNTIME_CPU_DETECT

+    vp8_dequant_rtcd_vtable_t        dequant;

     vp8_idct_rtcd_vtable_t        idct;

     vp8_recon_rtcd_vtable_t       recon;

     vp8_subpix_rtcd_vtable_t      subpix;

--- a/vp8/common/reconinter.c

+++ b/vp8/common/reconinter.c

@@ -334,11 +334,12 @@

 /*encoder only*/

-void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)

+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,

+                                         unsigned char *dst_y,

+                                         int dst_ystride)

     unsigned char *ptr_base;

     unsigned char *ptr;

-    unsigned char *pred_ptr = x->predictor;

     int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;

     int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;

     int pre_stride = x->block[0].pre_stride;

@@ -348,11 +349,13 @@

     if ((mv_row | mv_col) & 7)

-        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);

+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,

+                                 dst_y, dst_ystride);

     else

-        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);

+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,

+            dst_ystride);

@@ -594,71 +597,5 @@

         build_4x4uvmvs(xd);

         build_inter4x4_predictors_mb(xd);

-    }

-}

-/* encoder only*/

-static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)

-{

-    int i;

-    if (x->mode_info_context->mbmi.partitioning < 3)

-    {

-        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];

-        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];

-        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];

-        x->block[10].bmi = x->mode_info_context->bmi[10];

-        build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);

-        build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);

-        build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);

-        build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);

-    }

-    else

-    {

-        for (i = 0; i < 16; i += 2)

-        {

-            BLOCKD *d0 = &x->block[i];

-            BLOCKD *d1 = &x->block[i+1];

-            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];

-            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];

-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)

-                build_inter_predictors2b(x, d0, d0->predictor, 16);

-            else

-            {

-                build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);

-                build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);

-            }

-        }

-    }

-    for (i = 16; i < 24; i += 2)

-    {

-        BLOCKD *d0 = &x->block[i];

-        BLOCKD *d1 = &x->block[i+1];

-        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)

-            build_inter_predictors2b(x, d0, d0->predictor, 8);

-        else

-        {

-            build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);

-            build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);

-        }

-    }

-}

-void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)

-{

-    if (xd->mode_info_context->mbmi.mode != SPLITMV)

-    {

-        vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],

-                                           &xd->predictor[320], 16, 8);

-    }

-    else

-    {

-        build_4x4uvmvs(xd);

-        build_inter4x4_predictors_mb_e(xd);

--- a/vp8/common/reconinter.h

+++ b/vp8/common/reconinter.h

@@ -21,11 +21,13 @@

                                                int dst_uvstride);

-extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);

-extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);

+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,

+                                                unsigned char *dst_y,

+                                                int dst_ystride);

+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,

+                                         vp8_subpix_fn_t sppf);

 extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);

 extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);

-extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);

 #endif

--- /dev/null

+++ b/vp8/common/x86/dequantize_mmx.asm

@@ -1,0 +1,258 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+;-----------------------------------------------------------------------
+; Multiplies 16 quantized coefficients by 16 dequantization factors.
+;   arg(0) sq : source coefficients, 32 bytes read
+;   arg(1) dq : destination, 32 bytes written
+;   arg(2) q  : dequantization factors, 32 bytes read
+; Only the low 16 bits of each product are kept (pmullw).
+; rsi and rdi are saved and restored; rax and mm1 are overwritten.
+;-----------------------------------------------------------------------
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)

+global sym(vp8_dequantize_b_impl_mmx)

+sym(vp8_dequantize_b_impl_mmx):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 3

+    push        rsi

+    push        rdi

+    ; end prolog

+        mov       rsi, arg(0) ;sq

+        mov       rdi, arg(1) ;dq

+        mov       rax, arg(2) ;q

+        movq      mm1, [rsi]

+        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]

+        movq      [rdi], mm1

+        movq      mm1, [rsi+8]

+        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]

+        movq      [rdi+8], mm1

+        movq      mm1, [rsi+16]

+        pmullw    mm1, [rax+16]            ; mm1 = sq[8..11] * q[8..11]

+        movq      [rdi+16], mm1

+        movq      mm1, [rsi+24]

+        pmullw    mm1, [rax+24]            ; mm1 = sq[12..15] * q[12..15]

+        movq      [rdi+24], mm1

+    ; begin epilog

+    pop rdi

+    pop rsi

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;-----------------------------------------------------------------------
+; Dequantizes one 4x4 coefficient block, runs a two-pass inverse DCT on
+; it and adds the result to four rows of dest with unsigned saturation.
+;   arg(0) input  : 16 coefficients; read, then zeroed in memory
+;   arg(1) dq     : 16 dequantization factors
+;   arg(2) dest   : 4 bytes per row are read and written back
+;   arg(3) stride : row pitch of dest (sign-extended from 32 bits)
+; rdi is saved and restored here; rax, rdx and mm0-mm7 are overwritten.
+; No emms is issued in this routine.
+; NOTE(review): callers presumably clear MMX state afterwards - confirm.
+;-----------------------------------------------------------------------
+;void dequant_idct_add_mmx(

+;short *input,            0

+;short *dq,               1

+;unsigned char *dest,     2

+;int stride)              3

+global sym(vp8_dequant_idct_add_mmx)

+sym(vp8_dequant_idct_add_mmx):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 4

+    GET_GOT     rbx

+    push        rdi

+    ; end prolog

+        mov         rax,    arg(0) ;input

+        mov         rdx,    arg(1) ;dq

+        ; dequantize: mm0..mm3 = the four input rows times their factors
+        movq        mm0,    [rax   ]

+        pmullw      mm0,    [rdx]

+        movq        mm1,    [rax +8]

+        pmullw      mm1,    [rdx +8]

+        movq        mm2,    [rax+16]

+        pmullw      mm2,    [rdx+16]

+        movq        mm3,    [rax+24]

+        pmullw      mm3,    [rdx+24]

+        mov         rdx,    arg(2) ;dest

+        ; zero the 32-byte input buffer now that it is held in registers
+        pxor        mm7,    mm7

+        movq        [rax],   mm7

+        movq        [rax+8], mm7

+        movq        [rax+16],mm7

+        movq        [rax+24],mm7

+        movsxd      rdi,            dword ptr arg(3) ;stride

+        ; ---- first 1-D pass (four lanes processed in parallel) ----
+        psubw       mm0,            mm2             ; b1= 0-2

+        paddw       mm2,            mm2             ;

+        movq        mm5,            mm1

+        paddw       mm2,            mm0             ; a1 =0+2

+        ; pmulhw is signed and 0x8A8C is negative as a word, so the product
+        ; is x*(35468-65536)>>16; the paddw of x that follows restores the
+        ; intended x*35468>>16
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

+        movq        mm7,            mm3             ;

+        ; the table holds the factor minus one; adding x back completes it
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       mm7,            mm5             ; c1

+        movq        mm5,            mm1

+        movq        mm4,            mm3

+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

+        paddw       mm5,            mm1

+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

+        paddw       mm3,            mm4

+        paddw       mm3,            mm5             ; d1

+        movq        mm6,            mm2             ; a1

+        movq        mm4,            mm0             ; b1

+        paddw       mm2,            mm3             ;0

+        paddw       mm4,            mm7             ;1

+        psubw       mm0,            mm7             ;2

+        psubw       mm6,            mm3             ;3

+        ; ---- 4x4 transpose via word, then dword interleaves ----
+        movq        mm1,            mm2             ; 03 02 01 00

+        movq        mm3,            mm4             ; 23 22 21 20

+        punpcklwd   mm1,            mm0             ; 11 01 10 00

+        punpckhwd   mm2,            mm0             ; 13 03 12 02

+        punpcklwd   mm3,            mm6             ; 31 21 30 20

+        punpckhwd   mm4,            mm6             ; 33 23 32 22

+        movq        mm0,            mm1             ; 11 01 10 00

+        movq        mm5,            mm2             ; 13 03 12 02

+        punpckldq   mm0,            mm3             ; 30 20 10 00

+        punpckhdq   mm1,            mm3             ; 31 21 11 01

+        punpckldq   mm2,            mm4             ; 32 22 12 02

+        punpckhdq   mm5,            mm4             ; 33 23 13 03

+        movq        mm3,            mm5             ; 33 23 13 03

+        ; ---- second 1-D pass, same arithmetic as the first ----
+        psubw       mm0,            mm2             ; b1= 0-2

+        paddw       mm2,            mm2             ;

+        movq        mm5,            mm1

+        paddw       mm2,            mm0             ; a1 =0+2

+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

+        movq        mm7,            mm3             ;

+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       mm7,            mm5             ; c1

+        movq        mm5,            mm1

+        movq        mm4,            mm3

+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

+        paddw       mm5,            mm1

+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

+        paddw       mm3,            mm4

+        paddw       mm3,            mm5             ; d1

+        ; rounding: +4 goes into a1 and b1 before the butterflies, so all
+        ; four outputs carry it into the >>3 below
+        paddw       mm0,            [GLOBAL(fours)]

+        paddw       mm2,            [GLOBAL(fours)]

+        movq        mm6,            mm2             ; a1

+        movq        mm4,            mm0             ; b1

+        paddw       mm2,            mm3             ;0

+        paddw       mm4,            mm7             ;1

+        psubw       mm0,            mm7             ;2

+        psubw       mm6,            mm3             ;3

+        ; arithmetic shift completes the (x + 4) >> 3 rounding
+        psraw       mm2,            3

+        psraw       mm0,            3

+        psraw       mm4,            3

+        psraw       mm6,            3

+        ; ---- transpose back ----
+        movq        mm1,            mm2             ; 03 02 01 00

+        movq        mm3,            mm4             ; 23 22 21 20

+        punpcklwd   mm1,            mm0             ; 11 01 10 00

+        punpckhwd   mm2,            mm0             ; 13 03 12 02

+        punpcklwd   mm3,            mm6             ; 31 21 30 20

+        punpckhwd   mm4,            mm6             ; 33 23 32 22

+        movq        mm0,            mm1             ; 11 01 10 00

+        movq        mm5,            mm2             ; 13 03 12 02

+        punpckldq   mm0,            mm3             ; 30 20 10 00

+        punpckhdq   mm1,            mm3             ; 31 21 11 01

+        punpckldq   mm2,            mm4             ; 32 22 12 02

+        punpckhdq   mm5,            mm4             ; 33 23 13 03

+        ; ---- add to dest: widen 4 bytes to words, add with signed
+        ; saturation, pack back to bytes with unsigned saturation ----
+        pxor        mm7,            mm7

+        ; row 0
+        movd        mm4,            [rdx]

+        punpcklbw   mm4,            mm7

+        paddsw      mm0,            mm4

+        packuswb    mm0,            mm7

+        movd        [rdx],          mm0

+        ; row 1
+        movd        mm4,            [rdx+rdi]

+        punpcklbw   mm4,            mm7

+        paddsw      mm1,            mm4

+        packuswb    mm1,            mm7

+        movd        [rdx+rdi],      mm1

+        ; row 2
+        movd        mm4,            [rdx+2*rdi]

+        punpcklbw   mm4,            mm7

+        paddsw      mm2,            mm4

+        packuswb    mm2,            mm7

+        movd        [rdx+rdi*2],    mm2

+        ; row 3: after rdx += stride, [rdx+2*rdi] is dest + 3*stride
+        add         rdx,            rdi

+        movd        mm4,            [rdx+2*rdi]

+        punpcklbw   mm4,            mm7

+        paddsw      mm5,            mm4

+        packuswb    mm5,            mm7

+        movd        [rdx+rdi*2],    mm5

+    ; begin epilog

+    pop rdi

+    RESTORE_GOT

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+; Read-only constants, each replicated across the four word lanes of an
+; MMX register.
+SECTION_RODATA

+align 16

+; 0x8A8C = 35468 ~= sqrt(2) * sin(pi/8) * 65536.  Negative when read as a
+; signed word, so pmulhw users add the multiplicand back afterwards.
+x_s1sqr2:

+    times 4 dw 0x8A8C

+align 16

+; 0x4E7B = 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536
+x_c1sqr2less1:

+    times 4 dw 0x4E7B

+align 16

+; rounding term added before the final arithmetic shift right by 3
+fours:

+    times 4 dw 0x0004

--- /dev/null

+++ b/vp8/common/x86/dequantize_x86.h

@@ -1,0 +1,58 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+/* x86 overrides for the dequantization entry points: declares the MMX and
+ * SSE2 implementations and, for builds without runtime CPU detection,
+ * points the generic vp8_dequant_* names at them.
+ */
+#ifndef DEQUANTIZE_X86_H

+#define DEQUANTIZE_X86_H

+/* Note:

+ *

+ * This platform is commonly built for runtime CPU detection. If you modify

+ * any of the function mappings present in this file, be sure to also update

+ * them in the function pointer initialization code

+ */

+#if HAVE_MMX

+extern prototype_dequant_block(vp8_dequantize_b_mmx);

+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);

+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);

+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);

+/* static dispatch: MMX provides all four entry points */
+#if !CONFIG_RUNTIME_CPU_DETECT

+#undef  vp8_dequant_block

+#define vp8_dequant_block vp8_dequantize_b_mmx

+#undef  vp8_dequant_idct_add

+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx

+#undef vp8_dequant_idct_add_y_block

+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx

+#undef vp8_dequant_idct_add_uv_block

+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx

+#endif

+#endif

+#if HAVE_SSE2

+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);

+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);

+/* static dispatch: SSE2 replaces only the two block-level entry points and,
+ * coming after the MMX section, takes precedence when both are enabled
+ */
+#if !CONFIG_RUNTIME_CPU_DETECT

+#undef vp8_dequant_idct_add_y_block

+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2

+#undef vp8_dequant_idct_add_uv_block

+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2

+#endif

+#endif

+#endif

--- /dev/null

+++ b/vp8/common/x86/idct_blk_mmx.c

@@ -1,0 +1,127 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/idct.h"

+#include "vp8/common/dequantize.h"

+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

+/* C entry point for the MMX block dequantizer: pulls the three coefficient
+ * pointers out of the block descriptor and hands them to the assembly
+ * routine.
+ */
+void vp8_dequantize_b_mmx(BLOCKD *d)
+{
+    /* argument order of the asm routine: source, destination, factors */
+    vp8_dequantize_b_impl_mmx(d->qcoeff, d->dqcoeff, d->dequant);
+}

+/* MMX-backed reconstruction of the 16 luma 4x4 blocks of one macroblock.
+ *
+ *   q       coefficients, 16 shorts per block, blocks in raster order
+ *   dq      dequantization factors, shared by every block
+ *   dst     top-left pixel of the 16x16 destination; it also supplies the
+ *           prediction the residual is added to
+ *   stride  distance in bytes between destination rows
+ *   eobs    one end-of-block count per 4x4 block (16 entries are read)
+ *
+ * eob > 1 takes the full dequant + IDCT path, eob == 1 the DC-only path,
+ * and a block with eob == 0 is left untouched.
+ */
+void vp8_dequant_idct_add_y_block_mmx
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        /* the four blocks of this row, left to right */
+        for (j = 0; j < 4; j++)
+        {
+            short *blk         = q + 16*j;
+            unsigned char *out = dst + 4*j;
+
+            if (eobs[j] > 1)
+                vp8_dequant_idct_add_mmx (blk, dq, out, stride);
+            else if (eobs[j] == 1)
+            {
+                vp8_dc_only_idct_add_mmx (blk[0]*dq[0], out, stride,
+                                          out, stride);
+                /* Clear the first two coefficients with ordinary short
+                 * stores; an int-sized store through a cast pointer would
+                 * violate the strict-aliasing rule.
+                 */
+                blk[0] = 0;
+                blk[1] = 0;
+            }
+        }
+
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}

+/* MMX-backed reconstruction of the chroma 4x4 blocks of one macroblock:
+ * four blocks into dstu followed by four blocks into dstv.
+ *
+ *   q       coefficients, 16 shorts per block; the U blocks come first
+ *   dq      dequantization factors, shared by every block
+ *   dstu    top-left pixel of the 8x8 U destination (also the prediction)
+ *   dstv    top-left pixel of the 8x8 V destination (also the prediction)
+ *   stride  distance in bytes between rows, used for both planes
+ *   eobs    one end-of-block count per block (8 entries are read)
+ *
+ * eob > 1 takes the full dequant + IDCT path, eob == 1 the DC-only path,
+ * and a block with eob == 0 is left untouched.
+ */
+void vp8_dequant_idct_add_uv_block_mmx
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    /* U plane: two rows of two blocks */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            short *blk         = q + 16*j;
+            unsigned char *out = dstu + 4*j;
+
+            if (eobs[j] > 1)
+                vp8_dequant_idct_add_mmx (blk, dq, out, stride);
+            else if (eobs[j] == 1)
+            {
+                vp8_dc_only_idct_add_mmx (blk[0]*dq[0], out, stride,
+                                          out, stride);
+                /* Clear the first two coefficients with ordinary short
+                 * stores; an int-sized store through a cast pointer would
+                 * violate the strict-aliasing rule.
+                 */
+                blk[0] = 0;
+                blk[1] = 0;
+            }
+        }
+
+        q    += 32;
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    /* V plane: same walk, continuing through q and eobs */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            short *blk         = q + 16*j;
+            unsigned char *out = dstv + 4*j;
+
+            if (eobs[j] > 1)
+                vp8_dequant_idct_add_mmx (blk, dq, out, stride);
+            else if (eobs[j] == 1)
+            {
+                vp8_dc_only_idct_add_mmx (blk[0]*dq[0], out, stride,
+                                          out, stride);
+                /* same two-coefficient clear as for the U plane */
+                blk[0] = 0;
+                blk[1] = 0;
+            }
+        }
+
+        q    += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}

--- /dev/null

+++ b/vp8/common/x86/idct_blk_sse2.c

@@ -1,0 +1,90 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8/common/idct.h"

+#include "vp8/common/dequantize.h"

+void vp8_idct_dequant_0_2x_sse2

+            (short *q, short *dq ,

+             unsigned char *dst, int dst_stride);

+void vp8_idct_dequant_full_2x_sse2

+            (short *q, short *dq ,

+             unsigned char *dst, int dst_stride);

+/* SSE2-backed reconstruction of the 16 luma 4x4 blocks of one macroblock,
+ * handled as horizontally adjacent pairs.
+ *
+ *   q       coefficients, 16 shorts per block, blocks in raster order
+ *   dq      dequantization factors
+ *   dst     top-left pixel of the 16x16 destination
+ *   stride  distance in bytes between destination rows
+ *   eobs    one end-of-block count per 4x4 block (16 entries are read)
+ *
+ * For each pair: both eobs zero -> nothing is done; any eob with a bit
+ * other than bit 0 set (eob > 1 for non-negative counts) -> the full 2x
+ * routine; otherwise -> the "0_2x" routine.
+ *
+ * The two counts of a pair are combined by OR-ing the individual chars.
+ * This gives the same decisions as testing them through a short-sized load,
+ * without reading the char array through an incompatible pointer type.
+ */
+void vp8_dequant_idct_add_y_block_sse2
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        const int left_pair  = eobs[0] | eobs[1];
+        const int right_pair = eobs[2] | eobs[3];
+
+        if (left_pair)
+        {
+            /* 0xfe: any bit other than bit 0 set in either count */
+            if (left_pair & 0xfe)
+                vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+            else
+                vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+        }
+
+        if (right_pair)
+        {
+            if (right_pair & 0xfe)
+                vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+            else
+                vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+        }
+
+        q    += 64;
+        dst  += stride*4;
+        eobs += 4;
+    }
+}

+/* SSE2-backed reconstruction of the chroma 4x4 blocks of one macroblock:
+ * two pairs into dstu followed by two pairs into dstv.
+ *
+ *   q       coefficients, 16 shorts per block; the U blocks come first
+ *   dq      dequantization factors
+ *   dstu    top-left pixel of the 8x8 U destination
+ *   dstv    top-left pixel of the 8x8 V destination
+ *   stride  distance in bytes between rows, used for both planes
+ *   eobs    one end-of-block count per block (8 entries are read)
+ *
+ * For each pair: both eobs zero -> nothing is done; any eob with a bit
+ * other than bit 0 set (eob > 1 for non-negative counts) -> the full 2x
+ * routine; otherwise -> the "0_2x" routine.
+ *
+ * The two counts of a pair are combined by OR-ing the individual chars.
+ * This gives the same decisions as testing them through a short-sized load,
+ * without reading the char array through an incompatible pointer type.
+ */
+void vp8_dequant_idct_add_uv_block_sse2
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    const int u_top    = eobs[0] | eobs[1];
+    const int u_bottom = eobs[2] | eobs[3];
+    const int v_top    = eobs[4] | eobs[5];
+    const int v_bottom = eobs[6] | eobs[7];
+
+    /* U plane, upper pair */
+    if (u_top)
+    {
+        /* 0xfe: any bit other than bit 0 set in either count */
+        if (u_top & 0xfe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+    }
+    q    += 32;
+    dstu += stride*4;
+
+    /* U plane, lower pair */
+    if (u_bottom)
+    {
+        if (u_bottom & 0xfe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+    }
+    q    += 32;
+
+    /* V plane, upper pair */
+    if (v_top)
+    {
+        if (v_top & 0xfe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+    }
+    q    += 32;
+    dstv += stride*4;
+
+    /* V plane, lower pair */
+    if (v_bottom)
+    {
+        if (v_bottom & 0xfe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+    }
+}

--- a/vp8/common/x86/x86_systemdependent.c

+++ b/vp8/common/x86/x86_systemdependent.c

@@ -37,6 +37,11 @@

     if (flags & HAS_MMX)

+        rtcd->dequant.block               = vp8_dequantize_b_mmx;

+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_mmx;

+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;

+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;

         rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;

         rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;

         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;

@@ -81,6 +86,9 @@

             vp8_build_intra_predictors_mbuv_sse2;

         rtcd->recon.build_intra_predictors_mbuv_s =

             vp8_build_intra_predictors_mbuv_s_sse2;

+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;

+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;

         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;

--- a/vp8/decoder/arm/arm_dsystemdependent.c

+++ b/vp8/decoder/arm/arm_dsystemdependent.c

@@ -11,9 +11,6 @@

 #include "vpx_config.h"

 #include "vpx_ports/arm.h"

-#include "vp8/common/blockd.h"

-#include "vp8/common/pragmas.h"

-#include "vp8/decoder/dequantize.h"

 #include "vp8/decoder/onyxd_int.h"

 void vp8_arch_arm_decode_init(VP8D_COMP *pbi)

@@ -30,10 +27,6 @@

 #if HAVE_ARMV6

     if (flags & HAS_MEDIA)

-        pbi->dequant.block               = vp8_dequantize_b_v6;

-        pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;

-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;

-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;

 #endif

@@ -40,10 +33,6 @@

 #if HAVE_ARMV7

     if (flags & HAS_NEON)

-        pbi->dequant.block               = vp8_dequantize_b_neon;

-        pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;

-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;

-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;

 #endif

 #endif

--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm

+++ /dev/null

@@ -1,190 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-    EXPORT |vp8_dequant_idct_add_v6|

-    AREA |.text|, CODE, READONLY

-;void vp8_dequant_idct_v6(short *input, short *dq,

-;                         unsigned char *dest, int stride)

-; r0 = q

-; r1 = dq

-; r2 = dst

-; r3 = stride

-|vp8_dequant_idct_add_v6| PROC

-    stmdb   sp!, {r4-r11, lr}

-    ldr     r4, [r0]                ;input

-    ldr     r5, [r1], #4            ;dq

-    sub     sp, sp, #4

-    str     r3, [sp]

-    mov     r12, #4

-vp8_dequant_add_loop

-    smulbb  r6, r4, r5

-    smultt  r7, r4, r5

-    ldr     r4, [r0, #4]            ;input

-    ldr     r5, [r1], #4            ;dq

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    smulbb  r6, r4, r5

-    smultt  r7, r4, r5

-    subs    r12, r12, #1

-    ldrne   r4, [r0, #4]

-    ldrne   r5, [r1], #4

-    strh    r6, [r0], #2

-    strh    r7, [r0], #2

-    bne     vp8_dequant_add_loop

-    sub     r0, r0, #32

-    mov     r1, r0

-; short_idct4x4llm_v6_dual

-    ldr     r3, cospi8sqrt2minus1

-    ldr     r4, sinpi8sqrt2

-    ldr     r6, [r0, #8]

-    mov     r5, #2

-vp8_dequant_idct_loop1_v6

-    ldr     r12, [r0, #24]

-    ldr     r14, [r0, #16]

-    smulwt  r9, r3, r6

-    smulwb  r7, r3, r6

-    smulwt  r10, r4, r6

-    smulwb  r8, r4, r6

-    pkhbt   r7, r7, r9, lsl #16

-    smulwt  r11, r3, r12

-    pkhbt   r8, r8, r10, lsl #16

-    uadd16  r6, r6, r7

-    smulwt  r7, r4, r12

-    smulwb  r9, r3, r12

-    smulwb  r10, r4, r12

-    subs    r5, r5, #1

-    pkhbt   r9, r9, r11, lsl #16

-    ldr     r11, [r0], #4

-    pkhbt   r10, r10, r7, lsl #16

-    uadd16  r7, r12, r9

-    usub16  r7, r8, r7

-    uadd16  r6, r6, r10

-    uadd16  r10, r11, r14

-    usub16  r8, r11, r14

-    uadd16  r9, r10, r6

-    usub16  r10, r10, r6

-    uadd16  r6, r8, r7

-    usub16  r7, r8, r7

-    str     r6, [r1, #8]

-    ldrne   r6, [r0, #8]

-    str     r7, [r1, #16]

-    str     r10, [r1, #24]

-    str     r9, [r1], #4

-    bne     vp8_dequant_idct_loop1_v6

-    mov     r5, #2

-    sub     r0, r1, #8

-vp8_dequant_idct_loop2_v6

-    ldr     r6, [r0], #4

-    ldr     r7, [r0], #4

-    ldr     r8, [r0], #4

-    ldr     r9, [r0], #4

-    smulwt  r1, r3, r6

-    smulwt  r12, r4, r6

-    smulwt  lr, r3, r8

-    smulwt  r10, r4, r8

-    pkhbt   r11, r8, r6, lsl #16

-    pkhbt   r1, lr, r1, lsl #16

-    pkhbt   r12, r10, r12, lsl #16

-    pkhtb   r6, r6, r8, asr #16

-    uadd16  r6, r1, r6

-    pkhbt   lr, r9, r7, lsl #16

-    uadd16  r10, r11, lr

-    usub16  lr, r11, lr

-    pkhtb   r8, r7, r9, asr #16

-    subs    r5, r5, #1

-    smulwt  r1, r3, r8

-    smulwb  r7, r3, r8

-    smulwt  r11, r4, r8

-    smulwb  r9, r4, r8

-    pkhbt   r1, r7, r1, lsl #16

-    uadd16  r8, r1, r8

-    pkhbt   r11, r9, r11, lsl #16

-    usub16  r1, r12, r8

-    uadd16  r8, r11, r6

-    ldr     r9, c0x00040004

-    ldr     r12, [sp]               ; get stride from stack

-    uadd16  r6, r10, r8

-    usub16  r7, r10, r8

-    uadd16  r7, r7, r9

-    uadd16  r6, r6, r9

-    uadd16  r10, r14, r1

-    usub16  r1, r14, r1

-    uadd16  r10, r10, r9

-    uadd16  r1, r1, r9

-    ldr     r11, [r2]               ; load input from dst

-    mov     r8, r7, asr #3

-    pkhtb   r9, r8, r10, asr #19

-    mov     r8, r1, asr #3

-    pkhtb   r8, r8, r6, asr #19

-    uxtb16  lr, r11, ror #8

-    qadd16  r9, r9, lr

-    uxtb16  lr, r11

-    qadd16  r8, r8, lr

-    usat16  r9, #8, r9

-    usat16  r8, #8, r8

-    orr     r9, r8, r9, lsl #8

-    ldr     r11, [r2, r12]          ; load input from dst

-    mov     r7, r7, lsl #16

-    mov     r1, r1, lsl #16

-    mov     r10, r10, lsl #16

-    mov     r6, r6, lsl #16

-    mov     r7, r7, asr #3

-    pkhtb   r7, r7, r10, asr #19

-    mov     r1, r1, asr #3

-    pkhtb   r1, r1, r6, asr #19

-    uxtb16  r8, r11, ror #8

-    qadd16  r7, r7, r8

-    uxtb16  r8, r11

-    qadd16  r1, r1, r8

-    usat16  r7, #8, r7

-    usat16  r1, #8, r1

-    orr     r1, r1, r7, lsl #8

-    str     r9, [r2], r12           ; store output to dst

-    str     r1, [r2], r12           ; store output to dst

-    bne     vp8_dequant_idct_loop2_v6

-; vpx_memset

-    sub     r0, r0, #32

-    add     sp, sp, #4

-    mov     r12, #0

-    str     r12, [r0]

-    str     r12, [r0, #4]

-    str     r12, [r0, #8]

-    str     r12, [r0, #12]

-    str     r12, [r0, #16]

-    str     r12, [r0, #20]

-    str     r12, [r0, #24]

-    str     r12, [r0, #28]

-    ldmia   sp!, {r4 - r11, pc}

-    ENDP    ; |vp8_dequant_idct_add_v6|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x00004E7B

-sinpi8sqrt2       DCD 0x00008A8C

-c0x00040004       DCD 0x00040004

-    END

--- a/vp8/decoder/arm/armv6/dequantize_v6.asm

+++ /dev/null

@@ -1,69 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |vp8_dequantize_b_loop_v6|

-    AREA    |.text|, CODE, READONLY  ; name this block of code

-;-------------------------------

-;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

-; r0    short *Q,

-; r1    short *DQC

-; r2    short *DQ

-|vp8_dequantize_b_loop_v6| PROC

-    stmdb   sp!, {r4-r9, lr}

-    ldr     r3, [r0]                ;load Q

-    ldr     r4, [r1]                ;load DQC

-    ldr     r5, [r0, #4]

-    ldr     r6, [r1, #4]

-    mov     r12, #2                 ;loop counter

-dequant_loop

-    smulbb  r7, r3, r4              ;multiply

-    smultt  r8, r3, r4

-    smulbb  r9, r5, r6

-    smultt  lr, r5, r6

-    ldr     r3, [r0, #8]

-    ldr     r4, [r1, #8]

-    ldr     r5, [r0, #12]

-    ldr     r6, [r1, #12]

-    strh    r7, [r2], #2            ;store result

-    smulbb  r7, r3, r4              ;multiply

-    strh    r8, [r2], #2

-    smultt  r8, r3, r4

-    strh    r9, [r2], #2

-    smulbb  r9, r5, r6

-    strh    lr, [r2], #2

-    smultt  lr, r5, r6

-    subs    r12, r12, #1

-    add     r0, r0, #16

-    add     r1, r1, #16

-    ldrne       r3, [r0]

-    strh    r7, [r2], #2            ;store result

-    ldrne       r4, [r1]

-    strh    r8, [r2], #2

-    ldrne       r5, [r0, #4]

-    strh    r9, [r2], #2

-    ldrne       r6, [r1, #4]

-    strh    lr, [r2], #2

-    bne     dequant_loop

-    ldmia   sp!, {r4-r9, pc}

-    ENDP    ;|vp8_dequantize_b_loop_v6|

-    END

--- a/vp8/decoder/arm/armv6/idct_blk_v6.c

+++ /dev/null

@@ -1,116 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/common/idct.h"

-#include "vp8/decoder/dequantize.h"

-void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,

-                                     unsigned char *dst,

-                                     int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_v6 (q, dq, dst, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        if (eobs[2] > 1)

-            vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);

-        else if (eobs[2] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);

-            ((int *)(q+32))[0] = 0;

-        }

-        if (eobs[3] > 1)

-            vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);

-        else if (eobs[3] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);

-            ((int *)(q+48))[0] = 0;

-        }

-        q    += 64;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

-void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,

-                                      unsigned char *dstu,

-                                      unsigned char *dstv,

-                                      int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 2; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_v6 (q, dq, dstu, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,

-                                                  dstu+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        q    += 32;

-        dstu += 4*stride;

-        eobs += 2;

-    }

-    for (i = 0; i < 2; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_v6 (q, dq, dstv, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,

-                                                  dstv+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        q    += 32;

-        dstv += 4*stride;

-        eobs += 2;

-    }

-}

--- a/vp8/decoder/arm/dequantize_arm.c

+++ /dev/null

@@ -1,46 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/decoder/dequantize.h"

-#include "vp8/common/idct.h"

-#include "vpx_mem/vpx_mem.h"

-#if HAVE_ARMV7

-extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);

-#endif

-#if HAVE_ARMV6

-extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

-#endif

-#if HAVE_ARMV7

-void vp8_dequantize_b_neon(BLOCKD *d)

-{

-    short *DQ  = d->dqcoeff;

-    short *Q   = d->qcoeff;

-    short *DQC = d->dequant;

-    vp8_dequantize_b_loop_neon(Q, DQC, DQ);

-}

-#endif

-#if HAVE_ARMV6

-void vp8_dequantize_b_v6(BLOCKD *d)

-{

-    short *DQ  = d->dqcoeff;

-    short *Q   = d->qcoeff;

-    short *DQC = d->dequant;

-    vp8_dequantize_b_loop_v6(Q, DQC, DQ);

-}

-#endif

--- a/vp8/decoder/arm/dequantize_arm.h

+++ /dev/null

@@ -1,59 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef DEQUANTIZE_ARM_H

-#define DEQUANTIZE_ARM_H

-#if HAVE_ARMV6

-extern prototype_dequant_block(vp8_dequantize_b_v6);

-extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);

-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);

-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp8_dequant_block

-#define vp8_dequant_block vp8_dequantize_b_v6

-#undef vp8_dequant_idct_add

-#define vp8_dequant_idct_add vp8_dequant_idct_add_v6

-#undef vp8_dequant_idct_add_y_block

-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6

-#undef vp8_dequant_idct_add_uv_block

-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6

-#endif

-#endif

-#if HAVE_ARMV7

-extern prototype_dequant_block(vp8_dequantize_b_neon);

-extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);

-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);

-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp8_dequant_block

-#define vp8_dequant_block vp8_dequantize_b_neon

-#undef vp8_dequant_idct_add

-#define vp8_dequant_idct_add vp8_dequant_idct_add_neon

-#undef vp8_dequant_idct_add_y_block

-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon

-#undef vp8_dequant_idct_add_uv_block

-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon

-#endif

-#endif

-#endif

--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm

+++ /dev/null

@@ -1,131 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |vp8_dequant_idct_add_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void vp8_dequant_idct_add_neon(short *input, short *dq,

-;                           unsigned char *dest, int stride)

-; r0    short *input,

-; r1    short *dq,

-; r2    unsigned char *dest

-; r3    int stride

-|vp8_dequant_idct_add_neon| PROC

-    vld1.16         {q3, q4}, [r0]

-    vld1.16         {q5, q6}, [r1]

-    add             r1, r2, r3              ; r1 = dest + stride

-    lsl             r3, #1                  ; 2x stride

-    vld1.32         {d14[0]}, [r2], r3

-    vld1.32         {d14[1]}, [r1], r3

-    vld1.32         {d15[0]}, [r2]

-    vld1.32         {d15[1]}, [r1]

-    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant

-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon

-    vmul.i16        q2, q4, q6

-;|short_idct4x4llm_neon| PROC

-    vld1.16         {d0}, [r12]

-    vswp            d3, d4                  ;q2(vp[4] vp[12])

-    vqdmulh.s16     q3, q2, d0[2]

-    vqdmulh.s16     q4, q2, d0[0]

-    vqadd.s16       d12, d2, d3             ;a1

-    vqsub.s16       d13, d2, d3             ;b1

-    vshr.s16        q3, q3, #1

-    vshr.s16        q4, q4, #1

-    vqadd.s16       q3, q3, q2

-    vqadd.s16       q4, q4, q2

-    vqsub.s16       d10, d6, d9             ;c1

-    vqadd.s16       d11, d7, d8             ;d1

-    vqadd.s16       d2, d12, d11

-    vqadd.s16       d3, d13, d10

-    vqsub.s16       d4, d13, d10

-    vqsub.s16       d5, d12, d11

-    vtrn.32         d2, d4

-    vtrn.32         d3, d5

-    vtrn.16         d2, d3

-    vtrn.16         d4, d5

-; memset(input, 0, 32) -- 32bytes

-    vmov.i16        q14, #0

-    vswp            d3, d4

-    vqdmulh.s16     q3, q2, d0[2]

-    vqdmulh.s16     q4, q2, d0[0]

-    vqadd.s16       d12, d2, d3             ;a1

-    vqsub.s16       d13, d2, d3             ;b1

-    vmov            q15, q14

-    vshr.s16        q3, q3, #1

-    vshr.s16        q4, q4, #1

-    vqadd.s16       q3, q3, q2

-    vqadd.s16       q4, q4, q2

-    vqsub.s16       d10, d6, d9             ;c1

-    vqadd.s16       d11, d7, d8             ;d1

-    vqadd.s16       d2, d12, d11

-    vqadd.s16       d3, d13, d10

-    vqsub.s16       d4, d13, d10

-    vqsub.s16       d5, d12, d11

-    vst1.16         {q14, q15}, [r0]

-    vrshr.s16       d2, d2, #3

-    vrshr.s16       d3, d3, #3

-    vrshr.s16       d4, d4, #3

-    vrshr.s16       d5, d5, #3

-    vtrn.32         d2, d4

-    vtrn.32         d3, d5

-    vtrn.16         d2, d3

-    vtrn.16         d4, d5

-    vaddw.u8        q1, q1, d14

-    vaddw.u8        q2, q2, d15

-    sub             r2, r2, r3

-    sub             r1, r1, r3

-    vqmovun.s16     d0, q1

-    vqmovun.s16     d1, q2

-    vst1.32         {d0[0]}, [r2], r3

-    vst1.32         {d0[1]}, [r1], r3

-    vst1.32         {d1[0]}, [r2]

-    vst1.32         {d1[1]}, [r1]

-    bx             lr

-    ENDP           ; |vp8_dequant_idct_add_neon|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x4e7b4e7b

-sinpi8sqrt2       DCD 0x8a8c8a8c

-    END

--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm

+++ /dev/null

@@ -1,34 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |vp8_dequantize_b_loop_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-; r0    short *Q,

-; r1    short *DQC

-; r2    short *DQ

-|vp8_dequantize_b_loop_neon| PROC

-    vld1.16         {q0, q1}, [r0]

-    vld1.16         {q2, q3}, [r1]

-    vmul.i16        q4, q0, q2

-    vmul.i16        q5, q1, q3

-    vst1.16         {q4, q5}, [r2]

-    bx             lr

-    ENDP

-    END

--- a/vp8/decoder/arm/neon/idct_blk_neon.c

+++ /dev/null

@@ -1,97 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/common/idct.h"

-#include "vp8/decoder/dequantize.h"

-/* place these declarations here because we don't want to maintain them

- * outside of this scope

- */

-void idct_dequant_full_2x_neon(short *q, short *dq,

-                               unsigned char *dst, int stride);

-void idct_dequant_0_2x_neon(short *q, short dq,

-                            unsigned char *dst, int stride);

-void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,

-                                       unsigned char *dst,

-                                       int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (((short *)(eobs))[0])

-        {

-            if (((short *)eobs)[0] & 0xfefe)

-                idct_dequant_full_2x_neon (q, dq, dst, stride);

-            else

-                idct_dequant_0_2x_neon (q, dq[0], dst, stride);

-        }

-        if (((short *)(eobs))[1])

-        {

-            if (((short *)eobs)[1] & 0xfefe)

-                idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);

-            else

-                idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);

-        }

-        q    += 64;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

-void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,

-                                        unsigned char *dstu,

-                                        unsigned char *dstv,

-                                        int stride, char *eobs)

-{

-    if (((short *)(eobs))[0])

-    {

-        if (((short *)eobs)[0] & 0xfefe)

-            idct_dequant_full_2x_neon (q, dq, dstu, stride);

-        else

-            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);

-    }

-    q    += 32;

-    dstu += 4*stride;

-    if (((short *)(eobs))[1])

-    {

-        if (((short *)eobs)[1] & 0xfefe)

-            idct_dequant_full_2x_neon (q, dq, dstu, stride);

-        else

-            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);

-    }

-    q += 32;

-    if (((short *)(eobs))[2])

-    {

-        if (((short *)eobs)[2] & 0xfefe)

-            idct_dequant_full_2x_neon (q, dq, dstv, stride);

-        else

-            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);

-    }

-    q    += 32;

-    dstv += 4*stride;

-    if (((short *)(eobs))[3])

-    {

-        if (((short *)eobs)[3] & 0xfefe)

-            idct_dequant_full_2x_neon (q, dq, dstv, stride);

-        else

-            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);

-    }

-}

--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm

+++ /dev/null

@@ -1,79 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-    EXPORT  |idct_dequant_0_2x_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void idct_dequant_0_2x_neon(short *q, short dq,

-;                            unsigned char *dst, int stride);

-; r0   *q

-; r1   dq

-; r2   *dst

-; r3   stride

-|idct_dequant_0_2x_neon| PROC

-    push            {r4, r5}

-    add             r12, r2, #4

-    vld1.32         {d2[0]}, [r2], r3

-    vld1.32         {d8[0]}, [r12], r3

-    vld1.32         {d2[1]}, [r2], r3

-    vld1.32         {d8[1]}, [r12], r3

-    vld1.32         {d4[0]}, [r2], r3

-    vld1.32         {d10[0]}, [r12], r3

-    vld1.32         {d4[1]}, [r2], r3

-    vld1.32         {d10[1]}, [r12], r3

-    ldrh            r12, [r0]               ; lo q

-    ldrh            r4, [r0, #32]           ; hi q

-    mov             r5, #0

-    strh            r5, [r0]

-    strh            r5, [r0, #32]

-    sxth            r12, r12                ; lo

-    mul             r0, r12, r1

-    add             r0, r0, #4

-    asr             r0, r0, #3

-    vdup.16         q0, r0

-    sxth            r4, r4                  ; hi

-    mul             r0, r4, r1

-    add             r0, r0, #4

-    asr             r0, r0, #3

-    vdup.16         q3, r0

-    vaddw.u8        q1, q0, d2              ; lo

-    vaddw.u8        q2, q0, d4

-    vaddw.u8        q4, q3, d8              ; hi

-    vaddw.u8        q5, q3, d10

-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

-    add             r0, r2, #4

-    vqmovun.s16     d2, q1                  ; lo

-    vqmovun.s16     d4, q2

-    vqmovun.s16     d8, q4                  ; hi

-    vqmovun.s16     d10, q5

-    vst1.32         {d2[0]}, [r2], r3       ; lo

-    vst1.32         {d8[0]}, [r0], r3       ; hi

-    vst1.32         {d2[1]}, [r2], r3

-    vst1.32         {d8[1]}, [r0], r3

-    vst1.32         {d4[0]}, [r2], r3

-    vst1.32         {d10[0]}, [r0], r3

-    vst1.32         {d4[1]}, [r2]

-    vst1.32         {d10[1]}, [r0]

-    pop             {r4, r5}

-    bx              lr

-    ENDP            ; |idct_dequant_0_2x_neon|

-    END

--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm

+++ /dev/null

@@ -1,196 +1,0 @@

-;

-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |idct_dequant_full_2x_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void idct_dequant_full_2x_neon(short *q, short *dq,

-;                               unsigned char *dst, int stride);

-; r0    *q,

-; r1    *dq,

-; r2    *dst

-; r3    stride

-|idct_dequant_full_2x_neon| PROC

-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)

-    vld1.16         {q2, q3}, [r0]          ; l q

-    add             r0, r0, #32

-    vld1.16         {q4, q5}, [r0]          ; r q

-    add             r12, r2, #4

-    ; interleave the predictors

-    vld1.32         {d28[0]}, [r2],  r3     ; l pre

-    vld1.32         {d28[1]}, [r12], r3     ; r pre

-    vld1.32         {d29[0]}, [r2],  r3

-    vld1.32         {d29[1]}, [r12], r3

-    vld1.32         {d30[0]}, [r2],  r3

-    vld1.32         {d30[1]}, [r12], r3

-    vld1.32         {d31[0]}, [r2],  r3

-    vld1.32         {d31[1]}, [r12]

-    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant

-    ; dequant: q[i] = q[i] * dq[i]

-    vmul.i16        q2, q2, q0

-    vmul.i16        q3, q3, q1

-    vmul.i16        q4, q4, q0

-    vmul.i16        q5, q5, q1

-    vld1.16         {d0}, [r1]

-    ; q2: l0r0  q3: l8r8

-    ; q4: l4r4  q5: l12r12

-    vswp            d5, d8

-    vswp            d7, d10

-    ; _CONSTANTS_ * 4,12 >> 16

-    ; q6:  4 * sinpi : c1/temp1

-    ; q7: 12 * sinpi : d1/temp2

-    ; q8:  4 * cospi

-    ; q9: 12 * cospi

-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q7, q5, d0[2]

-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1

-    vqdmulh.s16     q9, q5, d0[0]

-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8

-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

-    ; vqdmulh only accepts signed values. this was a problem because

-    ; our constant had the high bit set, and was treated as a negative value.

-    ; vqdmulh also doubles the value before it shifts by 16. we need to

-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,

-    ; so we can shift the constant without losing precision. this avoids

-    ; shift again afterward, but also avoids the sign issue. win win!

-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we

-    ; pre-shift it

-    vshr.s16        q8, q8, #1

-    vshr.s16        q9, q9, #1

-    ; q4:  4 +  4 * cospi : d1/temp1

-    ; q5: 12 + 12 * cospi : c1/temp2

-    vqadd.s16       q4, q4, q8

-    vqadd.s16       q5, q5, q9

-    ; c1 = temp1 - temp2

-    ; d1 = temp1 + temp2

-    vqsub.s16       q2, q6, q5

-    vqadd.s16       q3, q4, q7

-    ; [0]: a1+d1

-    ; [1]: b1+c1

-    ; [2]: b1-c1

-    ; [3]: a1-d1

-    vqadd.s16       q4, q10, q3

-    vqadd.s16       q5, q11, q2

-    vqsub.s16       q6, q11, q2

-    vqsub.s16       q7, q10, q3

-    ; rotate

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; idct loop 2

-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12

-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13

-    ; q6: l 2, 6,10,14 r 2, 6,10,14

-    ; q7: l 3, 7,11,15 r 3, 7,11,15

-    ; q8:  1 * sinpi : c1/temp1

-    ; q9:  3 * sinpi : d1/temp2

-    ; q10: 1 * cospi

-    ; q11: 3 * cospi

-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q9, q7, d0[2]

-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1

-    vqdmulh.s16     q11, q7, d0[0]

-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2

-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2

-    ; see note on shifting above

-    vshr.s16        q10, q10, #1

-    vshr.s16        q11, q11, #1

-    ; q10: 1 + 1 * cospi : d1/temp1

-    ; q11: 3 + 3 * cospi : c1/temp2

-    vqadd.s16       q10, q5, q10

-    vqadd.s16       q11, q7, q11

-    ; q8: c1 = temp1 - temp2

-    ; q9: d1 = temp1 + temp2

-    vqsub.s16       q8, q8, q11

-    vqadd.s16       q9, q10, q9

-    ; a1+d1

-    ; b1+c1

-    ; b1-c1

-    ; a1-d1

-    vqadd.s16       q4, q2, q9

-    vqadd.s16       q5, q3, q8

-    vqsub.s16       q6, q3, q8

-    vqsub.s16       q7, q2, q9

-    ; +4 >> 3 (rounding)

-    vrshr.s16       q4, q4, #3              ; lo

-    vrshr.s16       q5, q5, #3

-    vrshr.s16       q6, q6, #3              ; hi

-    vrshr.s16       q7, q7, #3

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; adding pre

-    ; input is still packed. pre was read interleaved

-    vaddw.u8        q4, q4, d28

-    vaddw.u8        q5, q5, d29

-    vaddw.u8        q6, q6, d30

-    vaddw.u8        q7, q7, d31

-    vmov.i16        q14, #0

-    vmov            q15, q14

-    vst1.16         {q14, q15}, [r0]        ; write over high input

-    sub             r0, r0, #32

-    vst1.16         {q14, q15}, [r0]        ; write over low input

-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

-    add             r1, r2, #4              ; hi

-    ;saturate and narrow

-    vqmovun.s16     d0, q4                  ; lo

-    vqmovun.s16     d1, q5

-    vqmovun.s16     d2, q6                  ; hi

-    vqmovun.s16     d3, q7

-    vst1.32         {d0[0]}, [r2], r3       ; lo

-    vst1.32         {d0[1]}, [r1], r3       ; hi

-    vst1.32         {d1[0]}, [r2], r3

-    vst1.32         {d1[1]}, [r1], r3

-    vst1.32         {d2[0]}, [r2], r3

-    vst1.32         {d2[1]}, [r1], r3

-    vst1.32         {d3[0]}, [r2]

-    vst1.32         {d3[1]}, [r1]

-    bx             lr

-    ENDP           ; |idct_dequant_full_2x_neon|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x4e7b

-; because the lowest bit in 0x8a8c is 0, we can pre-shift this

-sinpi8sqrt2       DCD 0x4546

-    END

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -15,7 +15,7 @@

 #include "vp8/common/reconintra4x4.h"

 #include "vp8/common/recon.h"

 #include "vp8/common/reconinter.h"

-#include "dequantize.h"

+#include "vp8/common/dequantize.h"

 #include "detokenize.h"

 #include "vp8/common/invtrans.h"

 #include "vp8/common/alloccommon.h"

@@ -32,7 +32,7 @@

 #endif

 #include "vpx_mem/vpx_mem.h"

 #include "vp8/common/idct.h"

-#include "dequantize.h"

 #include "vp8/common/threading.h"

 #include "decoderthreading.h"

 #include "dboolhuff.h"

@@ -218,7 +218,7 @@

                 if (xd->eobs[i] > 1)

-                    DEQUANT_INVOKE(&pbi->dequant, idct_add)

+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)

                         (b->qcoeff, b->dequant,

                         *(b->base_dst) + b->dst, b->dst_stride);

@@ -247,7 +247,7 @@

             /* do 2nd order transform on the dc block */

             if (xd->eobs[24] > 1)

-                DEQUANT_INVOKE(&pbi->dequant, block)(b);

+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);

                 IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],

                     xd->qcoeff);

@@ -272,7 +272,7 @@

             DQC[0] = 1;

-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)

+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)

                         (xd->qcoeff, xd->block[0].dequant,

                          xd->dst.y_buffer,

                          xd->dst.y_stride, xd->eobs);

@@ -281,7 +281,7 @@

         DQC[0] = dc_dequant_temp;

-    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)

+    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)

                     (xd->qcoeff+16*16, xd->block[16].dequant,

                      xd->dst.u_buffer, xd->dst.v_buffer,

                      xd->dst.uv_stride, xd->eobs+16);

--- a/vp8/decoder/dequantize.c

+++ /dev/null

@@ -1,44 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "dequantize.h"

-#include "vp8/common/idct.h"

-#include "vpx_mem/vpx_mem.h"

-void vp8_dequantize_b_c(BLOCKD *d)

-{

-    int i;

-    short *DQ  = d->dqcoeff;

-    short *Q   = d->qcoeff;

-    short *DQC = d->dequant;

-    for (i = 0; i < 16; i++)

-    {

-        DQ[i] = Q[i] * DQC[i];

-    }

-}

-void vp8_dequant_idct_add_c(short *input, short *dq,

-                            unsigned char *dest, int stride)

-{

-    int i;

-    for (i = 0; i < 16; i++)

-    {

-        input[i] = dq[i] * input[i];

-    }

-    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);

-    vpx_memset(input, 0, 32);

-}

--- a/vp8/decoder/dequantize.h

+++ /dev/null

@@ -1,85 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef DEQUANTIZE_H

-#define DEQUANTIZE_H

-#include "vp8/common/blockd.h"

-#define prototype_dequant_block(sym) \

-    void sym(BLOCKD *x)

-#define prototype_dequant_idct_add(sym) \

-    void sym(short *input, short *dq, \

-             unsigned char *output, \

-             int stride)

-#define prototype_dequant_idct_add_y_block(sym) \

-    void sym(short *q, short *dq, \

-             unsigned char *dst, \

-             int stride, char *eobs)

-#define prototype_dequant_idct_add_uv_block(sym) \

-    void sym(short *q, short *dq, \

-             unsigned char *dst_u, \

-             unsigned char *dst_v, int stride, char *eobs)

-#if ARCH_X86 || ARCH_X86_64

-#include "x86/dequantize_x86.h"

-#endif

-#if ARCH_ARM

-#include "arm/dequantize_arm.h"

-#endif

-#ifndef vp8_dequant_block

-#define vp8_dequant_block vp8_dequantize_b_c

-#endif

-extern prototype_dequant_block(vp8_dequant_block);

-#ifndef vp8_dequant_idct_add

-#define vp8_dequant_idct_add vp8_dequant_idct_add_c

-#endif

-extern prototype_dequant_idct_add(vp8_dequant_idct_add);

-#ifndef vp8_dequant_idct_add_y_block

-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c

-#endif

-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);

-#ifndef vp8_dequant_idct_add_uv_block

-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c

-#endif

-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);

-typedef prototype_dequant_block((*vp8_dequant_block_fn_t));

-typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));

-typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));

-typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));

-typedef struct

-{

-    vp8_dequant_block_fn_t               block;

-    vp8_dequant_idct_add_fn_t            idct_add;

-    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;

-    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;

-} vp8_dequant_rtcd_vtable_t;

-#if CONFIG_RUNTIME_CPU_DETECT

-#define DEQUANT_INVOKE(ctx,fn) (ctx)->fn

-#else

-#define DEQUANT_INVOKE(ctx,fn) vp8_dequant_##fn

-#endif

-#endif

--- a/vp8/decoder/generic/dsystemdependent.c

+++ b/vp8/decoder/generic/dsystemdependent.c

@@ -10,7 +10,7 @@

 #include "vpx_config.h"

-#include "vp8/decoder/dequantize.h"

+#include "vp8/common/dequantize.h"

 #include "vp8/decoder/onyxd_int.h"

 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);

@@ -20,11 +20,7 @@

     /* Pure C: */

 #if CONFIG_RUNTIME_CPU_DETECT

-    pbi->mb.rtcd                     = &pbi->common.rtcd;

-    pbi->dequant.block               = vp8_dequantize_b_c;

-    pbi->dequant.idct_add            = vp8_dequant_idct_add_c;

-    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;

-    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;

+    pbi->mb.rtcd                               = &pbi->common.rtcd;

 #endif

 #if ARCH_X86 || ARCH_X86_64

--- a/vp8/decoder/idct_blk.c

+++ /dev/null

@@ -1,90 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/common/idct.h"

-#include "dequantize.h"

-void vp8_dequant_idct_add_c(short *input, short *dq,

-                            unsigned char *dest, int stride);

-void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,

-                            int pred_stride, unsigned char *dst_ptr,

-                            int dst_stride);

-void vp8_dequant_idct_add_y_block_c

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs)

-{

-    int i, j;

-    for (i = 0; i < 4; i++)

-    {

-        for (j = 0; j < 4; j++)

-        {

-            if (*eobs++ > 1)

-                vp8_dequant_idct_add_c (q, dq, dst, stride);

-            else

-            {

-                vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);

-                ((int *)q)[0] = 0;

-            }

-            q   += 16;

-            dst += 4;

-        }

-        dst += 4*stride - 16;

-    }

-}

-void vp8_dequant_idct_add_uv_block_c

-            (short *q, short *dq,

-             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)

-{

-    int i, j;

-    for (i = 0; i < 2; i++)

-    {

-        for (j = 0; j < 2; j++)

-        {

-            if (*eobs++ > 1)

-                vp8_dequant_idct_add_c (q, dq, dstu, stride);

-            else

-            {

-                vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);

-                ((int *)q)[0] = 0;

-            }

-            q    += 16;

-            dstu += 4;

-        }

-        dstu += 4*stride - 8;

-    }

-    for (i = 0; i < 2; i++)

-    {

-        for (j = 0; j < 2; j++)

-        {

-            if (*eobs++ > 1)

-                vp8_dequant_idct_add_c (q, dq, dstv, stride);

-            else

-            {

-                vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);

-                ((int *)q)[0] = 0;

-            }

-            q    += 16;

-            dstv += 4;

-        }

-        dstv += 4*stride - 8;

-    }

-}

--- a/vp8/decoder/onyxd_int.h

+++ b/vp8/decoder/onyxd_int.h

@@ -16,7 +16,8 @@

 #include "treereader.h"

 #include "vp8/common/onyxc_int.h"

 #include "vp8/common/threading.h"

-#include "dequantize.h"

 #if CONFIG_ERROR_CONCEALMENT

 #include "ec_types.h"

 #endif

@@ -92,11 +93,6 @@

     int   ready_for_new_data;

     DATARATE dr[16];

-#if CONFIG_RUNTIME_CPU_DETECT

-    vp8_dequant_rtcd_vtable_t        dequant;

-#endif

     vp8_prob prob_intra;

     vp8_prob prob_last;

--- a/vp8/decoder/threading.c

+++ b/vp8/decoder/threading.c

@@ -189,7 +189,7 @@

                 if (xd->eobs[i] > 1)

-                    DEQUANT_INVOKE(&pbi->dequant, idct_add)

+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)

                         (b->qcoeff, b->dequant,

                         *(b->base_dst) + b->dst, b->dst_stride);

@@ -217,7 +217,7 @@

             /* do 2nd order transform on the dc block */

             if (xd->eobs[24] > 1)

-                DEQUANT_INVOKE(&pbi->dequant, block)(b);

+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);

                 IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],

                     xd->qcoeff);

@@ -248,13 +248,13 @@

             DQC = local_dequant;

-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)

+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)

                         (xd->qcoeff, DQC,

                          xd->dst.y_buffer,

                          xd->dst.y_stride, xd->eobs);

-    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)

+    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)

                     (xd->qcoeff+16*16, xd->block[16].dequant,

                      xd->dst.u_buffer, xd->dst.v_buffer,

                      xd->dst.uv_stride, xd->eobs+16);

--- a/vp8/decoder/x86/dequantize_mmx.asm

+++ /dev/null

@@ -1,258 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)

-global sym(vp8_dequantize_b_impl_mmx)

-sym(vp8_dequantize_b_impl_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 3

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov       rsi, arg(0) ;sq

-        mov       rdi, arg(1) ;dq

-        mov       rax, arg(2) ;q

-        movq      mm1, [rsi]

-        pmullw    mm1, [rax+0]            ; mm4 *= kernel 0 modifiers.

-        movq      [rdi], mm1

-        movq      mm1, [rsi+8]

-        pmullw    mm1, [rax+8]            ; mm4 *= kernel 0 modifiers.

-        movq      [rdi+8], mm1

-        movq      mm1, [rsi+16]

-        pmullw    mm1, [rax+16]            ; mm4 *= kernel 0 modifiers.

-        movq      [rdi+16], mm1

-        movq      mm1, [rsi+24]

-        pmullw    mm1, [rax+24]            ; mm4 *= kernel 0 modifiers.

-        movq      [rdi+24], mm1

-    ; begin epilog

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void dequant_idct_add_mmx(

-;short *input,            0

-;short *dq,               1

-;unsigned char *dest,     2

-;int stride)              3

-global sym(vp8_dequant_idct_add_mmx)

-sym(vp8_dequant_idct_add_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    GET_GOT     rbx

-    push        rdi

-    ; end prolog

-        mov         rax,    arg(0) ;input

-        mov         rdx,    arg(1) ;dq

-        movq        mm0,    [rax   ]

-        pmullw      mm0,    [rdx]

-        movq        mm1,    [rax +8]

-        pmullw      mm1,    [rdx +8]

-        movq        mm2,    [rax+16]

-        pmullw      mm2,    [rdx+16]

-        movq        mm3,    [rax+24]

-        pmullw      mm3,    [rdx+24]

-        mov         rdx,    arg(2) ;dest

-        pxor        mm7,    mm7

-        movq        [rax],   mm7

-        movq        [rax+8], mm7

-        movq        [rax+16],mm7

-        movq        [rax+24],mm7

-        movsxd      rdi,            dword ptr arg(3) ;stride

-        psubw       mm0,            mm2             ; b1= 0-2

-        paddw       mm2,            mm2             ;

-        movq        mm5,            mm1

-        paddw       mm2,            mm0             ; a1 =0+2

-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

-        movq        mm7,            mm3             ;

-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       mm7,            mm5             ; c1

-        movq        mm5,            mm1

-        movq        mm4,            mm3

-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

-        paddw       mm5,            mm1

-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

-        paddw       mm3,            mm4

-        paddw       mm3,            mm5             ; d1

-        movq        mm6,            mm2             ; a1

-        movq        mm4,            mm0             ; b1

-        paddw       mm2,            mm3             ;0

-        paddw       mm4,            mm7             ;1

-        psubw       mm0,            mm7             ;2

-        psubw       mm6,            mm3             ;3

-        movq        mm1,            mm2             ; 03 02 01 00

-        movq        mm3,            mm4             ; 23 22 21 20

-        punpcklwd   mm1,            mm0             ; 11 01 10 00

-        punpckhwd   mm2,            mm0             ; 13 03 12 02

-        punpcklwd   mm3,            mm6             ; 31 21 30 20

-        punpckhwd   mm4,            mm6             ; 33 23 32 22

-        movq        mm0,            mm1             ; 11 01 10 00

-        movq        mm5,            mm2             ; 13 03 12 02

-        punpckldq   mm0,            mm3             ; 30 20 10 00

-        punpckhdq   mm1,            mm3             ; 31 21 11 01

-        punpckldq   mm2,            mm4             ; 32 22 12 02

-        punpckhdq   mm5,            mm4             ; 33 23 13 03

-        movq        mm3,            mm5             ; 33 23 13 03

-        psubw       mm0,            mm2             ; b1= 0-2

-        paddw       mm2,            mm2             ;

-        movq        mm5,            mm1

-        paddw       mm2,            mm0             ; a1 =0+2

-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];

-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

-        movq        mm7,            mm3             ;

-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       mm7,            mm5             ; c1

-        movq        mm5,            mm1

-        movq        mm4,            mm3

-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]

-        paddw       mm5,            mm1

-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]

-        paddw       mm3,            mm4

-        paddw       mm3,            mm5             ; d1

-        paddw       mm0,            [GLOBAL(fours)]

-        paddw       mm2,            [GLOBAL(fours)]

-        movq        mm6,            mm2             ; a1

-        movq        mm4,            mm0             ; b1

-        paddw       mm2,            mm3             ;0

-        paddw       mm4,            mm7             ;1

-        psubw       mm0,            mm7             ;2

-        psubw       mm6,            mm3             ;3

-        psraw       mm2,            3

-        psraw       mm0,            3

-        psraw       mm4,            3

-        psraw       mm6,            3

-        movq        mm1,            mm2             ; 03 02 01 00

-        movq        mm3,            mm4             ; 23 22 21 20

-        punpcklwd   mm1,            mm0             ; 11 01 10 00

-        punpckhwd   mm2,            mm0             ; 13 03 12 02

-        punpcklwd   mm3,            mm6             ; 31 21 30 20

-        punpckhwd   mm4,            mm6             ; 33 23 32 22

-        movq        mm0,            mm1             ; 11 01 10 00

-        movq        mm5,            mm2             ; 13 03 12 02

-        punpckldq   mm0,            mm3             ; 30 20 10 00

-        punpckhdq   mm1,            mm3             ; 31 21 11 01

-        punpckldq   mm2,            mm4             ; 32 22 12 02

-        punpckhdq   mm5,            mm4             ; 33 23 13 03

-        pxor        mm7,            mm7

-        movd        mm4,            [rdx]

-        punpcklbw   mm4,            mm7

-        paddsw      mm0,            mm4

-        packuswb    mm0,            mm7

-        movd        [rdx],          mm0

-        movd        mm4,            [rdx+rdi]

-        punpcklbw   mm4,            mm7

-        paddsw      mm1,            mm4

-        packuswb    mm1,            mm7

-        movd        [rdx+rdi],      mm1

-        movd        mm4,            [rdx+2*rdi]

-        punpcklbw   mm4,            mm7

-        paddsw      mm2,            mm4

-        packuswb    mm2,            mm7

-        movd        [rdx+rdi*2],    mm2

-        add         rdx,            rdi

-        movd        mm4,            [rdx+2*rdi]

-        punpcklbw   mm4,            mm7

-        paddsw      mm5,            mm4

-        packuswb    mm5,            mm7

-        movd        [rdx+rdi*2],    mm5

-    ; begin epilog

-    pop rdi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-x_s1sqr2:

-    times 4 dw 0x8A8C

-align 16

-x_c1sqr2less1:

-    times 4 dw 0x4E7B

-align 16

-fours:

-    times 4 dw 0x0004

--- a/vp8/decoder/x86/dequantize_x86.h

+++ /dev/null

@@ -1,58 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef DEQUANTIZE_X86_H

-#define DEQUANTIZE_X86_H

-/* Note:

- *

- * This platform is commonly built for runtime CPU detection. If you modify

- * any of the function mappings present in this file, be sure to also update

- * them in the function pointer initialization code

- */

-#if HAVE_MMX

-extern prototype_dequant_block(vp8_dequantize_b_mmx);

-extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);

-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);

-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp8_dequant_block

-#define vp8_dequant_block vp8_dequantize_b_mmx

-#undef  vp8_dequant_idct_add

-#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx

-#undef vp8_dequant_idct_add_y_block

-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx

-#undef vp8_dequant_idct_add_uv_block

-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx

-#endif

-#endif

-#if HAVE_SSE2

-extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);

-extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef vp8_dequant_idct_add_y_block

-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2

-#undef vp8_dequant_idct_add_uv_block

-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2

-#endif

-#endif

-#endif

--- a/vp8/decoder/x86/idct_blk_mmx.c

+++ /dev/null

@@ -1,117 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/common/idct.h"

-#include "vp8/decoder/dequantize.h"

-void vp8_dequant_idct_add_y_block_mmx

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_mmx (q, dq, dst, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,

-                                      dst+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        if (eobs[2] > 1)

-            vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);

-        else if (eobs[2] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,

-                                      dst+8, stride);

-            ((int *)(q+32))[0] = 0;

-        }

-        if (eobs[3] > 1)

-            vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);

-        else if (eobs[3] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,

-                                      dst+12, stride);

-            ((int *)(q+48))[0] = 0;

-        }

-        q    += 64;

-        dst  += 4*stride;

-        eobs += 4;

-    }

-}

-void vp8_dequant_idct_add_uv_block_mmx

-            (short *q, short *dq,

-             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 2; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_mmx (q, dq, dstu, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,

-                                      dstu+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        q    += 32;

-        dstu += 4*stride;

-        eobs += 2;

-    }

-    for (i = 0; i < 2; i++)

-    {

-        if (eobs[0] > 1)

-            vp8_dequant_idct_add_mmx (q, dq, dstv, stride);

-        else if (eobs[0] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);

-            ((int *)q)[0] = 0;

-        }

-        if (eobs[1] > 1)

-            vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);

-        else if (eobs[1] == 1)

-        {

-            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,

-                                      dstv+4, stride);

-            ((int *)(q+16))[0] = 0;

-        }

-        q    += 32;

-        dstv += 4*stride;

-        eobs += 2;

-    }

-}

--- a/vp8/decoder/x86/idct_blk_sse2.c

+++ /dev/null

@@ -1,90 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8/common/idct.h"

-#include "vp8/decoder/dequantize.h"

-void vp8_idct_dequant_0_2x_sse2

-            (short *q, short *dq ,

-             unsigned char *dst, int dst_stride);

-void vp8_idct_dequant_full_2x_sse2

-            (short *q, short *dq ,

-             unsigned char *dst, int dst_stride);

-void vp8_dequant_idct_add_y_block_sse2

-            (short *q, short *dq,

-             unsigned char *dst, int stride, char *eobs)

-{

-    int i;

-    for (i = 0; i < 4; i++)

-    {

-        if (((short *)(eobs))[0])

-        {

-            if (((short *)(eobs))[0] & 0xfefe)

-                vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);

-            else

-                vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);

-        }

-        if (((short *)(eobs))[1])

-        {

-            if (((short *)(eobs))[1] & 0xfefe)

-                vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);

-            else

-                vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);

-        }

-        q    += 64;

-        dst  += stride*4;

-        eobs += 4;

-    }

-}

-void vp8_dequant_idct_add_uv_block_sse2

-            (short *q, short *dq,

-             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)

-{

-    if (((short *)(eobs))[0])

-    {

-        if (((short *)(eobs))[0] & 0xfefe)

-            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);

-        else

-            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);

-    }

-    q    += 32;

-    dstu += stride*4;

-    if (((short *)(eobs))[1])

-    {

-        if (((short *)(eobs))[1] & 0xfefe)

-            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);

-        else

-            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);

-    }

-    q    += 32;

-    if (((short *)(eobs))[2])

-    {

-        if (((short *)(eobs))[2] & 0xfefe)

-            vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);

-        else

-            vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);

-    }

-    q    += 32;

-    dstv += stride*4;

-    if (((short *)(eobs))[3])

-    {

-      if (((short *)(eobs))[3] & 0xfefe)

-          vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);

-      else

-          vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);

-    }

-}

--- a/vp8/decoder/x86/x86_dsystemdependent.c

+++ b/vp8/decoder/x86/x86_dsystemdependent.c

@@ -13,47 +13,7 @@

 #include "vpx_ports/x86.h"

 #include "vp8/decoder/onyxd_int.h"

-#if HAVE_MMX

-void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

-void vp8_dequantize_b_mmx(BLOCKD *d)

-{

-    short *sq = (short *) d->qcoeff;

-    short *dq = (short *) d->dqcoeff;

-    short *q = (short *) d->dequant;

-    vp8_dequantize_b_impl_mmx(sq, dq, q);

-}

-#endif

 void vp8_arch_x86_decode_init(VP8D_COMP *pbi)

-#if CONFIG_RUNTIME_CPU_DETECT

-    int flags = x86_simd_caps();

-    /* Note:

-     *

-     * This platform can be built without runtime CPU detection as well. If

-     * you modify any of the function mappings present in this file, be sure

-     * to also update them in static mapings (<arch>/filename_<arch>.h)

-     */

-    /* Override default functions with fastest ones for this CPU. */

-#if HAVE_MMX

-    if (flags & HAS_MMX)

-    {

-        pbi->dequant.block               = vp8_dequantize_b_mmx;

-        pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;

-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;

-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;

-    }

-#endif

-#if HAVE_SSE2

-    if (flags & HAS_SSE2)

-    {

-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;

-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;

-    }

-#endif

-#endif

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -1091,8 +1091,10 @@

 #endif

-int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col)

+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,

+                                   int mb_row, int mb_col)

+    MACROBLOCKD *xd = &x->e_mbd;

     int rate;

     if (cpi->sf.RD && cpi->compressor_speed != 2)

@@ -1112,14 +1114,17 @@

         vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

     vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);

     sum_intra_stats(cpi, x);

     vp8_tokenize_mb(cpi, &x->e_mbd, t);

-    if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)

-        vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);

+    if (xd->mode_info_context->mbmi.mode != B_PRED)

+        vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));

-    vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);

+    DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)

+                    (xd->qcoeff+16*16, xd->block[16].dequant,

+                     xd->dst.u_buffer, xd->dst.v_buffer,

+                     xd->dst.uv_stride, xd->eobs+16);

     return rate;

 #ifdef SPEEDSTATS

@@ -1312,12 +1317,14 @@

     if (!x->skip)

         vp8_tokenize_mb(cpi, xd, t);

-        if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)

-        {

-          vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct),

-                                      &x->e_mbd);

-        }

-        vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);

+        if (xd->mode_info_context->mbmi.mode != B_PRED)

+            vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));

+        DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)

+                        (xd->qcoeff+16*16, xd->block[16].dequant,

+                         xd->dst.u_buffer, xd->dst.v_buffer,

+                         xd->dst.uv_stride, xd->eobs+16);

     else

--- a/vp8/encoder/encodeintra.c

+++ b/vp8/encoder/encodeintra.c

@@ -45,7 +45,7 @@

         vp8_encode_intra16x16mby(rtcd, x);

-        vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

+        vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));

     else

@@ -77,8 +77,17 @@

     x->quantize_b(be, b);

-    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);

+    if (*b->eob > 1)

+    {

+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,

+            b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);

+    }

+    else

+    {

+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)

+            (b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,

+                b->dst_stride);

+    }

 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)

@@ -96,11 +105,12 @@

 void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

     BLOCK *b = &x->block[0];

+    MACROBLOCKD *xd = &x->e_mbd;

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);

+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),

-        b->src_stride, x->e_mbd.predictor, 16);

+    ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),

+        b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);

     vp8_transform_intra_mby(x);

@@ -108,16 +118,17 @@

     if (x->optimize)

         vp8_optimize_mby(x, rtcd);

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);

+    MACROBLOCKD *xd = &x->e_mbd;

+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);

     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,

-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],

-        &x->e_mbd.predictor[320], 8);

+        x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,

+        xd->dst.v_buffer, xd->dst.uv_stride);

     vp8_transform_mbuv(x);

@@ -125,5 +136,4 @@

     if (x->optimize)

         vp8_optimize_mbuv(x, rtcd);

--- a/vp8/encoder/encodemb.c

+++ b/vp8/encoder/encodemb.c

@@ -105,10 +105,10 @@

     BLOCK *b = &x->block[0];

     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),

-        b->src_stride, x->e_mbd.predictor, 16);

+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,

-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],

-        &x->e_mbd.predictor[320], 8);

+        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,

+        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);

 static void build_dcblock(MACROBLOCK *x)

@@ -625,7 +625,7 @@

 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

-    vp8_build_inter_predictors_mb_e(&x->e_mbd);

+    vp8_build_inter_predictors_mb(&x->e_mbd);

     vp8_subtract_mb(rtcd, x);

@@ -635,7 +635,6 @@

     if (x->optimize)

         optimize_mb(x, rtcd);

 /* this funciton is used by first pass only */

@@ -643,15 +642,15 @@

     BLOCK *b = &x->block[0];

-    vp8_build_inter16x16_predictors_mby(&x->e_mbd);

+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,

+                                        x->e_mbd.dst.y_stride);

     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),

-        b->src_stride, x->e_mbd.predictor, 16);

+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

     transform_mby(x);

     vp8_quantize_mby(x);

-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

+    vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));

--- a/vp8/encoder/encodemb.h

+++ b/vp8/encoder/encodemb.h

@@ -12,6 +12,7 @@

 #ifndef __INC_ENCODEMB_H

 #define __INC_ENCODEMB_H

 #include "vpx_config.h"

 #include "block.h"

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -2166,7 +2166,7 @@

                 continue;

             vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);

-            vp8_build_inter16x16_predictors_mby(&x->e_mbd);

+            vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);

             if (cpi->active_map_enabled && x->active_ptr[0] == 0) {

                 x->skip = 1;

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -20,6 +20,8 @@

 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h

 VP8_COMMON_SRCS-yes += common/debugmodes.c

 VP8_COMMON_SRCS-yes += common/default_coef_probs.h

+VP8_COMMON_SRCS-yes += common/dequantize.c

+VP8_COMMON_SRCS-yes += common/dequantize.h

 VP8_COMMON_SRCS-yes += common/entropy.c

 VP8_COMMON_SRCS-yes += common/entropymode.c

 VP8_COMMON_SRCS-yes += common/entropymv.c

@@ -28,6 +30,7 @@

 VP8_COMMON_SRCS-yes += common/filter.h

 VP8_COMMON_SRCS-yes += common/findnearmv.c

 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c

+VP8_COMMON_SRCS-yes += common/idct_blk.c

 VP8_COMMON_SRCS-yes += common/idctllm.c

 VP8_COMMON_SRCS-yes += common/alloccommon.h

 VP8_COMMON_SRCS-yes += common/blockd.h

@@ -57,7 +60,6 @@

 VP8_COMMON_SRCS-yes += common/systemdependent.h

 VP8_COMMON_SRCS-yes += common/threading.h

 VP8_COMMON_SRCS-yes += common/treecoder.h

-VP8_COMMON_SRCS-yes += common/invtrans.c

 VP8_COMMON_SRCS-yes += common/loopfilter.c

 VP8_COMMON_SRCS-yes += common/loopfilter_filters.c

 VP8_COMMON_SRCS-yes += common/mbpitch.c

@@ -69,9 +71,13 @@

 VP8_COMMON_SRCS-yes += common/reconintra4x4.c

 VP8_COMMON_SRCS-yes += common/setupintrarecon.c

 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c

 VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c

 VP8_COMMON_SRCS-yes += common/treecoder.c

+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h

 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c

 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h

 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h

@@ -84,11 +90,14 @@

 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c

 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h

 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c

+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm

+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm

+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c

@@ -115,6 +124,8 @@

 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h

 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c

 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h

+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c

+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.h

 # common (armv6)

 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)

@@ -129,6 +140,9 @@

 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)

 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)

 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/intra4x4_predict_v6$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequant_idct_v6$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequantize_v6$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_blk_v6.c

 # common (neon)

 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)

@@ -151,3 +165,8 @@

 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequant_idct_neon$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequantizeb_neon$(ASM)

+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_blk_neon.c

--- a/vp8/vp8dx.mk

+++ b/vp8/vp8dx.mk

@@ -52,7 +52,6 @@

 VP8_DX_SRCS-yes += decoder/dboolhuff.c

 VP8_DX_SRCS-yes += decoder/decodemv.c

 VP8_DX_SRCS-yes += decoder/decodframe.c

-VP8_DX_SRCS-yes += decoder/dequantize.c

 VP8_DX_SRCS-yes += decoder/detokenize.c

 VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h

 VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h

@@ -61,20 +60,14 @@

 VP8_DX_SRCS-yes += decoder/dboolhuff.h

 VP8_DX_SRCS-yes += decoder/decodemv.h

 VP8_DX_SRCS-yes += decoder/decoderthreading.h

-VP8_DX_SRCS-yes += decoder/dequantize.h

 VP8_DX_SRCS-yes += decoder/detokenize.h

 VP8_DX_SRCS-yes += decoder/onyxd_int.h

 VP8_DX_SRCS-yes += decoder/treereader.h

 VP8_DX_SRCS-yes += decoder/onyxd_if.c

 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c

-VP8_DX_SRCS-yes += decoder/idct_blk.c

 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h

 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c

 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))

-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h

 VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c

-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm

-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c

-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c

--- a/vp8/vp8dx_arm.mk

+++ b/vp8/vp8dx_arm.mk

@@ -12,17 +12,3 @@

 #VP8_DX_SRCS list is modified according to different platforms.

 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/arm_dsystemdependent.c

-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c

-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.h

-#File list for armv6

-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c

-#File list for neon

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)

-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c

--

⑨