ref: 18dc92fd664357db31d7ef43337e2dee3a0f5062
parent: 305be4e4179214c58796de91e86badadbca29451
author: Timothy B. Terriberry <tterribe@xiph.org>
date: Mon Sep 27 13:18:18 EDT 2010
Add 4-tap version of 2nd-pass ARMv6 MC filter. The existing code applied a 6-tap filter with 0's on either end. We're already paying the branch penalty to avoid computing the two extra columns needed as input to this filter. We might as well save time computing the filter as well. This reduces the inner loop from 21 instructions to 16, the number of loads per iteration from 4 to 1, and the number of multiplies from 7 to 4. The gain in overall decoding performance, however, is small (less than 1%). This change also means we are now valgrind-clean on ARMv6, which is its real purpose. The errors reported here were valgrind's fault (it does not detect that 0 times an uninitialized value is initialized), but Julian Seward says it would slow down valgrind considerably to make such checks. Speeding up libvpx instead, even by a small amount, seems a much better idea, if only to enable proper valgrind checking of the rest of the codec. Change-Id: Ifb376ea195e086b60f61daf1097d8910c4d8ff16
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -11,6 +11,7 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
EXPORT |vp8_filter_block2d_second_pass_only_armv6|
@@ -188,6 +189,64 @@
bne height_loop_2nd
add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr, -- 16-bit input; the first load reads from src_ptr - 4 bytes
+; r1 unsigned char *output_ptr, -- byte output; every store advances r1 by output_pitch
+; r2 unsigned int output_pitch, -- post-increment applied to r1 after each strb
+; r3 unsigned int cnt, -- outer loop runs cnt times, inner loop cnt / 2 times
+; stack const short *vp8_filter -- three words are read from it (offsets 0, 4, 8)
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC ; 4-tap second pass; each inner iteration produces two output bytes
+ stmdb sp!, {r4 - r11, lr} ; push 9 registers (36 bytes)
+
+ ldr r11, [sp, #36] ; vp8_filter address (first stack argument, just above the 36 saved bytes)
+ mov r7, r3, lsl #16 ; height is top part of counter (r7[31:16] = cnt)
+
+ ldr r4, [r11] ; load up packed filter coefficients (two 16-bit values per word)
+ add lr, r1, r3 ; save final destination pointer (output_ptr + cnt)
+ ldr r5, [r11, #4] ; second coefficient word
+ ldr r6, [r11, #8] ; third coefficient word
+
+ pkhbt r12, r5, r4 ; pack the filter differently: r12 = r5 bottom half | r4 top half
+ pkhbt r11, r6, r5 ; r11 = r6 bottom half | r5 top half; r4 bottom and r6 top halves go unused
+ mov r4, #0x40 ; rounding factor (for smlad{x}); half of the 1 << 7 scale removed by usat below
+
+|height_loop_2nd_4|
+ ldrd r8, [r0, #-4] ; load the data: r8 = word at r0 - 4, r9 = word at r0
+ orr r7, r7, r3, lsr #1 ; loop counter: low bits = cnt / 2 inner iterations
+
+|width_loop_2nd_4|
+ ldr r10, [r0, #4]! ; pre-increment r0 by 4 and load the next two 16-bit samples
+ smladx r6, r9, r12, r4 ; apply filter: r6 = r9.lo * r12.hi + r9.hi * r12.lo + rounding
+ pkhbt r8, r9, r8 ; r8 = r9 bottom half | old r8 top half
+ smlad r5, r8, r12, r4 ; r5 = r8.lo * r12.lo + r8.hi * r12.hi + rounding
+ pkhbt r8, r10, r9 ; r8 = r10 bottom half | r9 top half
+ smladx r6, r10, r11, r6 ; r6 += r10.lo * r11.hi + r10.hi * r11.lo
+ sub r7, r7, #1 ; one inner iteration consumed (no flag update here)
+ smlad r5, r8, r11, r5 ; r5 += r8.lo * r11.lo + r8.hi * r11.hi
+
+ mov r8, r9 ; shift the data for the next loop
+ mov r9, r10 ; newest word becomes the current word
+
+ usat r6, #8, r6, asr #7 ; shift and clamp: (r6 >> 7) saturated to 0..255
+ usat r5, #8, r5, asr #7 ; same for the other output
+
+ strb r5, [r1], r2 ; the result is transposed back and stored
+ tst r7, #0xff ; Z is set once the inner count in the low byte reaches zero
+ strb r6, [r1], r2 ; second output; strb keeps the flags from tst for the bne below
+
+ bne width_loop_2nd_4
+
+ subs r7, r7, #0x10000 ; decrement the height count; these flags drive the final bne
+ add r0, r0, #16 ; update src for next loop
+ sub r1, lr, r7, lsr #16 ; update dst for next loop: (output_ptr + cnt) - remaining height count
+
+ bne height_loop_2nd_4
+
ldmia sp!, {r4 - r11, pc}
ENDP
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -50,6 +50,15 @@
const short *vp8_filter
);
+extern void vp8_filter4_block2d_second_pass_armv6 // ARMv6 asm second pass, 4-tap variant (used when yoffset is odd)
+(
+ short *src_ptr, // first-pass output; callers in this file pass FData + 2
+ unsigned char *output_ptr, // destination pixels (callers pass dst_ptr)
+ unsigned int output_pitch, // destination stride (callers pass dst_pitch)
+ unsigned int cnt, // block dimension; callers in this file pass 4, 8 or 16
+ const short *vp8_filter // vertical filter coefficients (callers pass VFilter)
+);
+
extern void vp8_filter_block2d_first_pass_only_armv6
(
unsigned char *src_ptr,
@@ -107,12 +116,16 @@
{
// Vfilter is a 4 tap filter
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
// Vfilter is 6 tap filter
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
}
}
@@ -186,11 +199,15 @@
else
{
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
}
}
@@ -224,11 +241,15 @@
else
{
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
}
}