shithub: libvpx

ref: 9a82cc7455daf179a92d922e363a9d2aa28b41f8
parent: e84545efa9332135e0e070359b3ffbcc4b622972
parent: 0e43668546010bf0a9ece398effb37beca5dbea5
author: John Koleszar <jkoleszar@google.com>
date: Tue Mar 29 20:05:06 EDT 2011

Merge remote branch 'internal/upstream' into HEAD

--- a/configure
+++ b/configure
@@ -507,7 +507,7 @@
         check_add_cflags -Wpointer-arith
         check_add_cflags -Wtype-limits
         check_add_cflags -Wcast-qual
-        enabled extra_warnings || check_add_cflags -Wno-unused
+        enabled extra_warnings || check_add_cflags -Wno-unused-function
     fi
 
     if enabled icc; then
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,14 +25,14 @@
 |vp8_variance16x16_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
-    mov     r12, #16            ; set loop counter to 16 (=block height)
     mov     r8, #0              ; initialize sum = 0
     mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
 
 loop
     ; 1st 4 pixels
-    ldr     r4, [r0, #0x0]      ; load 4 src pixels
-    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
 
     mov     lr, #0              ; constant zero
 
@@ -55,8 +55,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 2nd 4 pixels
-    ldr     r4, [r0, #0x4]      ; load 4 src pixels
-    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -79,8 +79,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 3rd 4 pixels
-    ldr     r4, [r0, #0x8]      ; load 4 src pixels
-    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -103,8 +103,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 4th 4 pixels
-    ldr     r4, [r0, #0xc]      ; load 4 src pixels
-    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -135,10 +135,10 @@
     bne     loop
 
     ; return stuff
-    ldr     r6, [sp, #0x28]     ; get address of sse
+    ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
@@ -145,3 +145,4 @@
     ENDP
 
     END
+
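For reference, a plain-C sketch of what the ARMv6 routine above computes: accumulate the signed sum and the sum of squared differences over the 16x16 block, then return sse minus the squared-mean term. The function and variable names are illustrative and not part of the patch; a 64-bit intermediate is used for sum * sum where the assembly relies on 32-bit arithmetic.

    #include <stdint.h>

    /* Illustrative C equivalent (not part of the patch) of the value
     * vp8_variance16x16_armv6 produces for a 16x16 block. */
    static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse)
    {
        int sum = 0;
        unsigned int sse_acc = 0;
        int i, j;

        for (i = 0; i < 16; i++)
        {
            for (j = 0; j < 16; j++)
            {
                int diff = src[j] - ref[j];
                sum     += diff;
                sse_acc += (unsigned int)(diff * diff);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sse_acc;
        /* 256 pixels, so (sum * sum) >> 8 is the squared-mean contribution. */
        return sse_acc - (unsigned int)(((int64_t)sum * sum) >> 8);
    }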
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -1,0 +1,176 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
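The MVN/UHSUB8/EOR sequence labelled "bilinear interpolation" above computes a per-byte rounding average: a + b + 1 equals a - (~b) + 256, so (a + b + 1) >> 1 can be formed as a halving subtract of a and ~b followed by flipping the top bit with the 0x80808080 constant. The small self-check below spells the identity out in C; names are illustrative and it does not depend on the patch.

    #include <assert.h>

    /* Straightforward per-byte rounding average. */
    static unsigned char avg_round(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }

    /* Same value computed the way the assembly does it: UHSUB8 keeps bits
     * [8:1] of the 9-bit difference a - ~b, and EOR with 0x80 adds back the
     * halved +256 carry. Names and layout are illustrative. */
    static unsigned char avg_uhsub_eor(unsigned char a, unsigned char b)
    {
        unsigned int diff9 = (unsigned int)(a - (b ^ 0xff)) & 0x1ff;
        return (unsigned char)((diff9 >> 1) ^ 0x80);
    }

    int main(void)
    {
        int a, b;
        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
                assert(avg_round((unsigned char)a, (unsigned char)b) ==
                       avg_uhsub_eor((unsigned char)a, (unsigned char)b));
        return 0;
    }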
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -1,0 +1,216 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
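Per pixel, the hv kernel above averages horizontally on row N and on row N+1, then averages those two results vertically, matching the x/y/z comment lines in the loop. A one-pixel C sketch with illustrative names (src points at row N, stride is the source stride; not part of the patch):

    /* Illustrative per-pixel value the hv kernel compares against the reference. */
    static unsigned char half_pel_hv(const unsigned char *src, int stride, int x)
    {
        unsigned char a = src[x],          b = src[x + 1];          /* row N   */
        unsigned char c = src[x + stride], d = src[x + stride + 1]; /* row N+1 */
        unsigned char h0 = (unsigned char)((a + b + 1) >> 1);       /* x       */
        unsigned char h1 = (unsigned char)((c + d + 1) >> 1);       /* y       */
        return (unsigned char)((h0 + h1 + 1) >> 1);                 /* z       */
    }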
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -1,0 +1,178 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -57,51 +57,38 @@
     unsigned short first_pass[36*16];
     unsigned char  second_pass[20*16];
     const short *HFilter, *VFilter;
+    unsigned int var;
 
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
+    if (xoffset == 4 && yoffset == 0)
+    {
+        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else
+    {
+        HFilter = vp8_bilinear_filters[xoffset];
+        VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
+        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                                src_pixels_per_line,
+                                                17, 16, HFilter);
+        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                                 16, 16, 16, VFilter);
 
-    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                   dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
-                                         ref_ptr, recon_stride, sse);
+        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                       dst_pixels_per_line, sse);
+    }
+    return var;
 }
 
 #endif /* HAVE_ARMV6 */
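In the dispatcher above, xoffset and yoffset index vp8_bilinear_filters in eighth-pel steps, so a value of 4 selects the half-pixel phase where both filter taps carry equal weight; only those cases are routed to the new ARMv6 half-pel kernels, everything else still takes the two-pass bilinear path. A minimal sketch of why the half-pel phase reduces to a rounding average; the tap weight of 64, rounding constant of 64, and shift of 7 follow the VP8 C reference and should be treated as assumptions here.

    /* Assumed VP8 reference constants: two-tap weights summing to 128, shift 7.
     * At the half-pel phase both taps are 64, so the filter collapses to a
     * rounding average, which is exactly what the ARMv6 kernels compute. */
    static unsigned char bilinear_halfpel(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a * 64 + b * 64 + 64) >> 7);  /* == (a + b + 1) >> 1 */
    }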
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -130,7 +130,7 @@
     mov         [rsp + zrun_zbin_boost], rsi
 
 %macro ZIGZAG_LOOP 1
-    movsx       edx, WORD PTR[GLOBAL(zig_zag) + (%1 * 2)] ; rc
+    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
 
     ; x
     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
@@ -209,7 +209,7 @@
     pxor        xmm3, xmm6
     ; mask inv_zig_zag
     pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag) + 16]
+    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
     ; select the max value
     pmaxsw      xmm2, xmm3
     pshufd      xmm3, xmm2, 00001110b
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -38,7 +38,6 @@
     GET_GOT     rbx
     push rsi
     push rdi
-    push rbx
     ; end prolog
 
         pxor            xmm6,           xmm6
@@ -81,10 +80,12 @@
 
         packuswb        xmm0,           xmm2
 
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        lea             rsi,            [rsi + r8]
 %endif
 
 filter_block2d_bil_var_ssse3_loop:
@@ -132,10 +133,11 @@
         paddd           xmm7,           xmm2
         paddd           xmm7,           xmm3
 
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 %if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 %else
+        lea             rsi,            [rsi + r8]
         lea             rdi,            [rdi + r9]
 %endif
 
@@ -161,7 +163,10 @@
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqa          xmm0,           xmm1
 
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
         lea             rsi,            [rsi + rax]
 
 filter_block2d_bil_sp_only_loop:
@@ -196,8 +201,13 @@
 
         movdqa          xmm1,           xmm0
         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
         sub             rcx,            1
         jnz             filter_block2d_bil_sp_only_loop
 
@@ -208,7 +218,7 @@
         mov             rdi,            arg(2)               ;src_ptr
         movsxd          rcx,            dword ptr arg(4)     ;Height
         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
         pxor            xmm0,           xmm0
 
 filter_block2d_bil_full_pixel_loop:
@@ -232,7 +242,7 @@
         paddd           xmm7,           xmm2
 
         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
         sub             rcx,            1
         jnz             filter_block2d_bil_full_pixel_loop
 
@@ -245,8 +255,11 @@
         movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
 
         pxor            xmm0,           xmm0
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
 filter_block2d_bil_fp_only_loop:
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqu          xmm2,           XMMWORD PTR [rsi+1]
@@ -278,7 +291,11 @@
         paddd           xmm7,           xmm3
 
         lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
 
         sub             rcx,            1
         jnz             filter_block2d_bil_fp_only_loop
@@ -322,7 +339,6 @@
         movd        [rdi],       xmm6
 
     ; begin epilog
-    pop rbx
     pop rdi
     pop rsi
     RESTORE_GOT
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -38,6 +38,9 @@
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)