shithub: libvpx

ref: cb14764fab88b5b28ba09fa9490bd72c017cb7c2
parent: e5aaac24bb9f06ccd44505b0af6116b9ca3bd73f
author: Tero Rintaluoma <teror@google.com>
date: Wed Feb 9 04:34:56 EST 2011

Adds ARMv6-optimized variance calculation

Adds the vp8_sub_pixel_variance16x16_armv6 function to the encoder.
Integrates the ARMv6-optimized bilinear interpolations from
vp8/common/arm/armv6 and adds a new assembly file for the variance16x16
calculation.
 - vp8_filter_block2d_bil_first_pass_armv6   (integrated)
 - vp8_filter_block2d_bil_second_pass_armv6  (integrated)
 - vp8_variance16x16_armv6 (new)
 - bilinearfilter_arm.h (new)
Change-Id: I18a8331ce7d031ceedd6cd415ecacb0c8f3392db
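
For context: sub-pixel variance interpolates the source block at the
requested (xoffset, yoffset) sub-pixel position with a separable two-pass
bilinear filter, then computes the variance of the filtered block against
the reference. The variance step matches the following scalar C (an
illustrative sketch, not part of the patch; the function name is made up):

    /* Reference for what vp8_variance16x16_armv6 computes:
     * var = sse - (sum * sum) / 256, with the divide done as a shift
     * because a 16x16 block has 256 = 2^8 pixels. */
    unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse)
    {
        int sum = 0;
        unsigned int sse_acc = 0;
        int i, j;

        for (i = 0; i < 16; i++)
        {
            for (j = 0; j < 16; j++)
            {
                int diff = src[j] - ref[j];
                sum += diff;
                sse_acc += (unsigned int)(diff * diff);
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = sse_acc;
        return sse_acc - ((unsigned int)(sum * sum) >> 8);
    }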

--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -12,26 +12,7 @@
 #include <math.h>
 #include "filter.h"
 #include "subpixel.h"
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char  *src_ptr,
-    unsigned short *dst_ptr,
-    unsigned int    src_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *dst_ptr,
-    int             dst_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
+#include "arm/bilinearfilter_arm.h"
 
 void vp8_filter_block2d_bil_armv6
 (
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -1,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -38,14 +38,14 @@
         /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
         cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
         cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;*/
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;
 
         /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
         cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
         cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;*/
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;
 
         /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
         cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
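
The cpi->rtcd table touched above is VP8's runtime-CPU-detection dispatch:
each entry is a function pointer the encoder calls through, so this hunk
uncomments the two 16x16 entries and routes them to the ARMv6
implementations when an ARMv6 core is detected. The pattern, greatly
simplified (types and names illustrative, not the real table layout):

    /* Sketch of the function-pointer dispatch behind cpi->rtcd.variance;
     * the real table carries entries for every block size and operation. */
    typedef unsigned int (*variance_fn_t)(const unsigned char *src,
                                          int src_stride,
                                          const unsigned char *ref,
                                          int ref_stride,
                                          unsigned int *sse);

    struct variance_rtcd_sketch {
        variance_fn_t var16x16;
        variance_fn_t subpixvar16x16; /* extra sub-pixel args omitted */
    };

    /* filled in at init time, after probing CPU features, e.g.:
     *   rtcd.var16x16 = have_armv6 ? vp8_variance16x16_armv6
     *                              : vp8_variance16x16_c;          */
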
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -1,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return result and store sse
+    ldr     r6, [sp, #0x28]     ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
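
The loop leans on ARMv6 SIMD-within-register instructions to process four
pixels per 32-bit load: usub8 subtracts per byte and sets the GE flags,
sel then keeps the bytes where the subtraction did not borrow (run twice
with swapped operands this yields the absolute difference in each byte
lane), usad8 folds the byte lanes into the running sum, and uxtb16 plus
smlad widen pairs of absolute differences to halfwords and accumulate
their squares. Per 4-pixel group this is equivalent to the scalar C below
(a sketch for exposition only):

    /* Scalar equivalent of one 4-pixel group of the loop above. */
    int k;
    for (k = 0; k < 4; k++)
    {
        int diff = src[k] - ref[k];          /* usub8/sel +/- byte lanes */
        sum += diff;                         /* usad8, then adds/subs    */
        sse += (unsigned int)(diff * diff);  /* uxtb16 + smlad           */
    }
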
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -10,6 +10,40 @@
 
 #include "vpx_config.h"
 #include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[36*16];
+    unsigned char  second_pass[20*16];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+#endif
 
 #if HAVE_ARMV7
 
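Two details of the C wrapper are worth noting. The horizontal first pass
filters 17 rows rather than 16 (the height argument in the call above)
because the vertical second pass blends each output row from two adjacent
intermediate rows, so it needs height + 1 rows of input; and the
intermediate buffers are declared larger (36*16 and 20*16) than the 17x16
and 16x16 regions this path actually uses. Each bilinear tap blends two
neighbors with coefficients from vp8_bilinear_filters, roughly as below
(a sketch; the exact arithmetic lives in vp8/common/filter.c, and the
128/7 weight/shift pair is the usual VP8 filter convention):

    /* One horizontal bilinear tap: the two coefficients in
     * vp8_bilinear_filters[offset] sum to 128, so the result is
     * rounded and renormalized with a shift of 7. */
    out[j] = (in[j] * filt[0] + in[j + 1] * filt[1] + 64) >> 7;
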
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -12,6 +12,23 @@
 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H
 
+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);
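
The header changes cover the static-dispatch build: when
CONFIG_RUNTIME_CPU_DETECT is disabled there is no function-pointer table,
and the vp8_variance_var16x16 / vp8_variance_subpixvar16x16 macros are
re-pointed at the ARMv6 symbols so call sites bind at compile time.
Schematically (assuming the usual INVOKE-macro convention from vp8's
variance.h):

    /* With runtime detection off, the invoke macro pastes the default
     * name, which the #defines above redirect to the ARMv6 symbol:
     *
     *   VARIANCE_INVOKE(rtcd, var16x16)(...)
     *     -> vp8_variance_var16x16(...)
     *     -> vp8_variance16x16_armv6(...)
     */
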
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -116,6 +116,7 @@
 
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -17,9 +17,10 @@
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/asm_enc_offsets.c
 
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/variance_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
 
 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
@@ -33,6 +34,7 @@
 
 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon