ref: 8f910594bdc6e93269b93c930cd02a381d601a3f
parent: 1a219c22b1b3dfa48f0ad2de16d9bb32dee96aeb
parent: c231b0175dbfb8b1a1b06707bc300d2910abba7a
author: Johann <johannkoenig@google.com>
date: Wed Jul 13 00:09:55 EDT 2011
Merge "Update armv6 loopfilter to new interface"
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -54,9 +54,11 @@
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
- rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+ rtcd->loopfilter.simple_mb_v =
+ vp8_loop_filter_simple_vertical_edge_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
- rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+ rtcd->loopfilter.simple_mb_h =
+ vp8_loop_filter_simple_horizontal_edge_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -53,14 +53,11 @@
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
-;r2 const char *flimit,
+;r2 const char *blimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -72,14 +69,18 @@
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r6], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|Hnext8|
; vp8_filter_mask() function
@@ -275,14 +276,18 @@
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r6], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|MBHnext8|
@@ -584,15 +589,19 @@
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
ldr r7, [src], pstep
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
ldr r8, [src], pstep
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r12], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|Vnext8|
@@ -855,18 +864,22 @@
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
- ldr r4, [r2], #4 ; flimit
+ ldrb r4, [r2] ; blimit
pld [src, #23]
ldr r7, [src], pstep
- ldr r2, [r3], #4 ; limit
+ ldrb r2, [r3] ; limit
pld [src, #23]
ldr r8, [src], pstep
- uadd8 r4, r4, r4 ; flimit * 2
- ldr r3, [r12], #4 ; thresh
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
pld [src, #23]
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
- uadd8 r4, r4, r2 ; flimit * 2 + limit
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
|MBVnext8|
; vp8_filter_mask() function
@@ -906,6 +919,7 @@
str lr, [sp, #8]
ldr lr, [src], pstep
+
TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
ldr lr, [sp, #8] ; load back (f)limit accumulator
@@ -954,6 +968,7 @@
beq mbvskip_filter ; skip filtering
+
;vp8_hevmask() function
;calculate high edge variance
@@ -1121,6 +1136,7 @@
smlabb r8, r6, lr, r7
smlatb r6, r6, lr, r7
smlabb r9, r10, lr, r7
+
smlatb r10, r10, lr, r7
ssat r8, #8, r8, asr #7
ssat r6, #8, r6, asr #7
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -45,35 +45,28 @@
MEND
+
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
-;r2 const char *flimit,
-;r3 const char *limit,
-;stack const char *thresh,
-;stack int count
+;r2 const char *blimit
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
-
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- ldr r12, [r3] ; limit
+ ldrb r12, [r2] ; blimit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
- ldr r7, [r2] ; flimit
+ orr r12, r12, r12, lsl #8 ; blimit
ldr r2, c0x80808080
- ldr r9, [sp, #40] ; count for 8-in-parallel
- uadd8 r7, r7, r7 ; flimit * 2
- mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
- uadd8 r12, r7, r12 ; flimit * 2 + limit
+ orr r12, r12, r12, lsl #16 ; blimit
+ mov r9, #4 ; double the count. we're doing 4 at a time
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
@@ -148,20 +141,19 @@
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- ldr r12, [r2] ; r12: flimit
+ ldrb r12, [r2] ; r12: blimit
ldr r2, c0x80808080
- ldr r7, [r3] ; limit
+ orr r12, r12, r12, lsl #8
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
- uadd8 r12, r12, r12 ; flimit * 2
+ orr r12, r12, r12, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
- uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
@@ -168,7 +160,6 @@
ldrh r3, [src, #-2]
pld [src, #23]
ldrh r4, [src], pstep
- ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
@@ -175,7 +166,7 @@
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
- mov r11, r11, lsl #1 ; 4-in-parallel
+ mov r11, #4 ; double the count. we're doing 4 at a time
|simple_vnext8|
; vp8_simple_filter_mask() function
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -18,8 +18,6 @@
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#endif
#if HAVE_ARMV7
@@ -55,15 +53,6 @@
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
@@ -77,15 +66,6 @@
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
@@ -101,15 +81,12 @@
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
@@ -127,15 +104,12 @@
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
}
#endif
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -19,10 +19,10 @@
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@@ -38,13 +38,13 @@
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
--
⑨