ref: 283b0e25ac6e0a2bd3d5f0b8cd1d0a50bbda7318
parent: 622958449b9388cca0f4a4e287b3e94422e4a573
author: Attila Nagy <attilanagy@google.com>
date: Wed Jul 6 09:35:33 EDT 2011
Update armv7 loopfilter to new interface Change-Id: I65105a9c63832669237e6a6a7fcb4ea3ea683346
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -9,11 +9,11 @@
*/
-#include "vpx_ports/config.h"
-#include <math.h>
+#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
+#if HAVE_ARMV6
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
@@ -20,19 +20,25 @@
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+#endif
-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh,
+ unsigned char *v);
-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
#if HAVE_ARMV6
/*ARMV6 loopfilter functions*/
@@ -40,13 +46,13 @@
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+ vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
- vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+ vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -55,7 +61,7 @@
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
}
/* Vertical MB Filtering */
@@ -62,13 +68,13 @@
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
- vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+ vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
- vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+ vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -77,7 +83,7 @@
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
}
/* Horizontal B Filtering */
@@ -84,15 +90,15 @@
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
- vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+ vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
- vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+ vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -101,9 +107,9 @@
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
}
/* Vertical B Filtering */
@@ -110,15 +116,15 @@
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
- vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+ vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
- vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+ vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -127,9 +133,9 @@
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
}
#endif
@@ -139,83 +145,58 @@
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
-void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
if (u_ptr)
- vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
-void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
if (u_ptr)
- vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
}
-void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-}
-
/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
- if (u_ptr)
- vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
-}
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
-void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void) u_ptr;
- (void) v_ptr;
- (void) uv_stride;
- vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
- vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
}
#endif
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -12,6 +12,8 @@
#ifndef LOOPFILTER_ARM_H
#define LOOPFILTER_ARM_H
+#include "vpx_config.h"
+
#if HAVE_ARMV6
extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
@@ -46,18 +48,19 @@
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_ARMV6 */
+
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@@ -83,7 +86,8 @@
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-#endif
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -14,93 +14,81 @@
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
- REQUIRE8
- PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
-; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
; r0 unsigned char *src
; r1 int pitch
-; r2 const signed char *flimit
-; r3 const signed char *limit
-; sp const signed char *thresh,
-; sp+4 int count (unused)
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
- stmdb sp!, {lr}
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r12, [sp, #4] ; load thresh pointer
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1
+ add r1, r1, r1
- vld1.u8 {q3}, [r2], r1 ; p3
- vld1.u8 {q4}, [r2], r1 ; p2
- vld1.u8 {q5}, [r2], r1 ; p1
- vld1.u8 {q6}, [r2], r1 ; p0
- vld1.u8 {q7}, [r2], r1 ; q0
- vld1.u8 {q8}, [r2], r1 ; q1
- vld1.u8 {q9}, [r2], r1 ; q2
- vld1.u8 {q10}, [r2] ; q3
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- sub r0, r0, r1, lsl #1
+ vdup.u8 q2, r3 ; duplicate thresh
+ vld1.u8 {q3}, [r2@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r2@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r2@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r2@128] ; q2
+ vld1.u8 {q10}, [r12@128] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r12, r12, r1, lsl #1
+
bl vp8_loop_filter_neon
- vst1.u8 {q5}, [r0], r1 ; store op1
- vst1.u8 {q6}, [r0], r1 ; store op0
- vst1.u8 {q7}, [r0], r1 ; store oq0
- vst1.u8 {q8}, [r0], r1 ; store oq1
+ vst1.u8 {q5}, [r2@128], r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r2@128], r1 ; store oq0
+ vst1.u8 {q8}, [r12@128], r1 ; store oq1
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; unsigned char *v)
+
; r0 unsigned char *u,
; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
- stmdb sp!, {lr}
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.s8 {d2[], d3[]}, [r3] ; limit
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vld1.u8 {d6}, [r3], r1 ; p3
- vld1.u8 {d8}, [r3], r1 ; p2
- vld1.u8 {d10}, [r3], r1 ; p1
- vld1.u8 {d12}, [r3], r1 ; p0
- vld1.u8 {d14}, [r3], r1 ; q0
- vld1.u8 {d16}, [r3], r1 ; q1
- vld1.u8 {d18}, [r3], r1 ; q2
- vld1.u8 {d20}, [r3] ; q3
-
- ldr r3, [sp, #4] ; load thresh pointer
-
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
- vld1.u8 {d7}, [r12], r1 ; p3
- vld1.u8 {d9}, [r12], r1 ; p2
- vld1.u8 {d11}, [r12], r1 ; p1
- vld1.u8 {d13}, [r12], r1 ; p0
- vld1.u8 {d15}, [r12], r1 ; q0
- vld1.u8 {d17}, [r12], r1 ; q1
- vld1.u8 {d19}, [r12], r1 ; q2
- vld1.u8 {d21}, [r12] ; q3
- vld1.s8 {d4[], d5[]}, [r3] ; thresh
+ vld1.u8 {d6}, [r3@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r3@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r3@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r3@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r3@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r3@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r3@64] ; q3
+ vld1.u8 {d21}, [r12@64] ; q3
bl vp8_loop_filter_neon
@@ -107,16 +95,16 @@
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
- vst1.u8 {d10}, [r0], r1 ; store u op1
- vst1.u8 {d11}, [r2], r1 ; store v op1
- vst1.u8 {d12}, [r0], r1 ; store u op0
- vst1.u8 {d13}, [r2], r1 ; store v op0
- vst1.u8 {d14}, [r0], r1 ; store u oq0
- vst1.u8 {d15}, [r2], r1 ; store v oq0
- vst1.u8 {d16}, [r0] ; store u oq1
- vst1.u8 {d17}, [r2] ; store v oq1
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r2@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r2@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r2@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64] ; store u oq1
+ vst1.u8 {d17}, [r2@64] ; store v oq1
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
@@ -124,39 +112,38 @@
; const signed char *limit,
; const signed char *thresh,
; int count)
-; r0 unsigned char *src,
-; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
-; sp+4 int count (unused)
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+
|vp8_loop_filter_vertical_edge_y_neon| PROC
- stmdb sp!, {lr}
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- sub r2, r0, #4 ; src ptr down by 4 columns
- sub r0, r0, #2 ; dst ptr
- ldr r12, [sp, #4] ; load thresh pointer
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, #4 ; src ptr down by 4 columns
+ add r1, r1, r1
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1, asr #1
- vld1.u8 {d6}, [r2], r1 ; load first 8-line src data
- vld1.u8 {d8}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r2], r1
- vld1.u8 {d12}, [r2], r1
+ vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r2], r1
- vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r2], r1
- vld1.u8 {d20}, [r2], r1
+ vld1.u8 {d20}, [r12], r1
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r2], r1
+ vld1.u8 {d9}, [r12], r1
vld1.u8 {d11}, [r2], r1
- vld1.u8 {d13}, [r2], r1
+ vld1.u8 {d13}, [r12], r1
vld1.u8 {d15}, [r2], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d19}, [r2], r1
- vld1.u8 {d21}, [r2]
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d19}, [r2]
+ vld1.u8 {d21}, [r12]
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -164,6 +151,8 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
+ vdup.u8 q2, r3 ; duplicate thresh
+
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@@ -178,28 +167,34 @@
vswp d12, d11
vswp d16, d13
+
+ sub r0, r0, #2 ; dst ptr
+
vswp d14, d12
vswp d16, d15
+ add r12, r0, r1, asr #1
+
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0]
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
@@ -209,38 +204,36 @@
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
- stmdb sp!, {lr}
- sub r12, r0, #4 ; move u pointer down by 4 columns
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.s8 {d2[], d3[]}, [r3] ; limit
-
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
-
- vld1.u8 {d6}, [r12], r1 ;load u data
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d10}, [r12], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d14}, [r12], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d18}, [r12], r1
- vld1.u8 {d20}, [r12]
-
+ vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
- ldr r12, [sp, #4] ; load thresh pointer
+ ldr r12, [sp, #4] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -248,6 +241,8 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
+ vdup.u8 q2, r12 ; duplicate thresh
+
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@@ -258,18 +253,16 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
bl vp8_loop_filter_neon
- sub r0, r0, #2
- sub r2, r2, #2
-
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
+ sub r0, r0, #2
+ sub r2, r2, #2
+
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
@@ -288,7 +281,7 @@
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
@@ -316,7 +309,6 @@
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
@@ -323,35 +315,38 @@
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
- vadd.u8 q0, q0, q0 ; flimit * 2
- vadd.u8 q0, q0, q1 ; flimit * 2 + limit
- vcge.u8 q15, q1, q15
+ vmov.u8 q10, #0x80 ; 0x80
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
- vshr.u8 q2, q2, #1 ; a = a / 2
- vqadd.u8 q9, q9, q2 ; a = b + a
- vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
- vmov.u8 q0, #0x80 ; 0x80
+ vcge.u8 q15, q1, q15
; vp8_filter() function
; convert to signed
- veor q7, q7, q0 ; qs0
- veor q6, q6, q0 ; ps0
- veor q5, q5, q0 ; ps1
- veor q8, q8, q0 ; qs1
+ veor q7, q7, q10 ; qs0
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
+ vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
+
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
@@ -378,6 +373,7 @@
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
@@ -384,13 +380,13 @@
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
-
+ vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
- veor q5, q13, q0 ; *op1 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,99 +9,109 @@
;
- EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ EXPORT |vp8_loop_filter_bhs_neon|
+ EXPORT |vp8_loop_filter_mbhs_neon|
ARM
- REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh (unused)
-; //stack(r5) int count --unused
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 blimit, PRESERVE
+
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
- sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines
- vld1.u8 {q5}, [r0], r1 ; p1
- vld1.s8 {d2[], d3[]}, [r2] ; flimit
- vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
- vld1.u8 {q6}, [r0], r1 ; p0
- vmov.u8 q0, #0x80 ; 0x80
- vld1.u8 {q7}, [r0], r1 ; q0
- vmov.u8 q10, #0x03 ; 0x03
- vld1.u8 {q8}, [r0] ; q1
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
- ;vp8_filter_mask() function
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q5}, [r3@128], r1 ; p1
+ vld1.u8 {q8}, [r0@128] ; q1
+ vld1.u8 {q6}, [r3@128] ; p0
+
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
+
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- ;vp8_filter() function
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
- vadd.u8 q1, q1, q1 ; flimit * 2
- vadd.u8 q1, q1, q13 ; flimit * 2 + limit
- vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+ vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > blimit)*-1
-;;;;;;;;;;
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
- vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q12, q3, q3
+ vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
+ vmul.s16 q3, q3, q13
+ vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
- vadd.s16 q2, q2, q11
- vadd.s16 q3, q3, q12
-
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
- ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
-;;;;;;;;;;;;;
- vand q4, q4, q15 ; vp8_filter &= mask
+ vand q14, q4, q15 ; vp8_filter &= mask
- vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q4, q4, #3 ; Filter1 >>= 3
+ vshr.s8 q4, q3, #3 ; Filter1 >>= 3
- sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
- add r3, r0, r1
-
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
- vst1.u8 {q6}, [r0] ; store op0
- vst1.u8 {q7}, [r3] ; store oq0
+ vst1.u8 {q6}, [r3@128] ; store op0
+ vst1.u8 {q7}, [r0@128] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
-;-----------------
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate blim
+
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
+ pop {r4, lr}
+ b vp8_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp8_loop_filter_bhs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+ ldrb r3, [r2] ; load mblim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp8_loop_filter_mbhs_neon|
END
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,51 +9,43 @@
;
- EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
+ ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
+ EXPORT |vp8_loop_filter_bvs_neon|
+ EXPORT |vp8_loop_filter_mbvs_neon|
ARM
- REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh (unused)
-; //stack(r5) int count --unused
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
+
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
+ add r12, r1, r1
+ add r3, r0, r1
- vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
- vld1.s8 {d2[], d3[]}, [r2] ; flimit
- vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
- vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
- vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
- vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
- vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
- vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
- vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
- vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
+ vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+ vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+ vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+ vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+ vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+ vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vmov.u8 q0, #0x80 ; 0x80
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vmov.u8 q11, #0x03 ; 0x03
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vmov.u8 q12, #0x04 ; 0x04
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+ vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vswp d7, d10
vswp d12, d9
- ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6
;vp8_filter_mask() function
;vp8_hevmask() function
@@ -60,8 +52,11 @@
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
+
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
@@ -69,80 +64,91 @@
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
- vadd.u8 q1, q1, q1 ; flimit * 2
- vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
- ;vp8_filter() function
-;;;;;;;;;;
- ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0)
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
- vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+ vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q14, q13, q13
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q14
+ vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+ vmul.s16 q13, q13, q11
- ;vqadd.s8 q1, q1, q2
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
+ vmov.u8 q11, #0x03 ; 0x03
+ vmov.u8 q12, #0x04 ; 0x04
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q13
+ vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d29
+ vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d29, q13
+
add r0, r0, #1
- add r2, r0, r1
-;;;;;;;;;;;
+ add r3, r0, r1
- vand q1, q1, q15 ; vp8_filter &= mask
+ vand q14, q14, q15 ; vp8_filter &= mask
- vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+ vshr.s8 q14, q3, #3 ; Filter1 >>= 3
;calculate output
- vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
- veor q7, q10, q0 ; *oq0 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
-
- add r3, r2, r1
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ add r12, r1, r1
vswp d13, d14
- add r12, r3, r1
;store op1, op0, oq0, oq1
- vst2.8 {d12[0], d13[0]}, [r0]
- vst2.8 {d12[1], d13[1]}, [r2]
- vst2.8 {d12[2], d13[2]}, [r3]
- vst2.8 {d12[3], d13[3]}, [r12], r1
- add r0, r12, r1
- vst2.8 {d12[4], d13[4]}, [r12]
- vst2.8 {d12[5], d13[5]}, [r0], r1
- add r2, r0, r1
- vst2.8 {d12[6], d13[6]}, [r0]
- vst2.8 {d12[7], d13[7]}, [r2], r1
- add r3, r2, r1
- vst2.8 {d14[0], d15[0]}, [r2]
- vst2.8 {d14[1], d15[1]}, [r3], r1
- add r12, r3, r1
- vst2.8 {d14[2], d15[2]}, [r3]
- vst2.8 {d14[3], d15[3]}, [r12], r1
- add r0, r12, r1
- vst2.8 {d14[4], d15[4]}, [r12]
- vst2.8 {d14[5], d15[5]}, [r0], r1
- add r2, r0, r1
- vst2.8 {d14[6], d15[6]}, [r0]
- vst2.8 {d14[7], d15[7]}, [r2]
+ vst2.8 {d12[0], d13[0]}, [r0], r12
+ vst2.8 {d12[1], d13[1]}, [r3], r12
+ vst2.8 {d12[2], d13[2]}, [r0], r12
+ vst2.8 {d12[3], d13[3]}, [r3], r12
+ vst2.8 {d12[4], d13[4]}, [r0], r12
+ vst2.8 {d12[5], d13[5]}, [r3], r12
+ vst2.8 {d12[6], d13[6]}, [r0], r12
+ vst2.8 {d12[7], d13[7]}, [r3], r12
+ vst2.8 {d14[0], d15[0]}, [r0], r12
+ vst2.8 {d14[1], d15[1]}, [r3], r12
+ vst2.8 {d14[2], d15[2]}, [r0], r12
+ vst2.8 {d14[3], d15[3]}, [r3], r12
+ vst2.8 {d14[4], d15[4]}, [r0], r12
+ vst2.8 {d14[5], d15[5]}, [r3], r12
+ vst2.8 {d14[6], d15[6]}, [r0], r12
+ vst2.8 {d14[7], d15[7]}, [r3]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
-;-----------------
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+|vp8_loop_filter_bvs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ mov r4, r0
+ add r0, r0, #4
+ vdup.s8 q1, r3 ; duplicate blim
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+ add r0, r4, #8
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ add r0, r4, #12
+ pop {r4, lr}
+ b vp8_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp8_loop_filter_bvs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+ ldrb r3, [r2] ; load mblim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp8_loop_filter_mbvs_neon|
END
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -14,156 +14,144 @@
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
- REQUIRE8
- PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
-; sp+4 int count (unused)
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
- stmdb sp!, {lr}
- sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r12, [sp, #4] ; load thresh pointer
+ push {lr}
+ add r1, r1, r1 ; double stride
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
- vld1.u8 {q3}, [r0], r1 ; p3
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.u8 {q4}, [r0], r1 ; p2
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vld1.u8 {q5}, [r0], r1 ; p1
- vld1.u8 {q6}, [r0], r1 ; p0
- vld1.u8 {q7}, [r0], r1 ; q0
- vld1.u8 {q8}, [r0], r1 ; q1
- vld1.u8 {q9}, [r0], r1 ; q2
- vld1.u8 {q10}, [r0], r1 ; q3
+ vld1.u8 {q3}, [r0@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r0@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r0@128], r1 ; q2
+ vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
- sub r0, r0, r1, lsl #3
- add r0, r0, r1
- add r2, r0, r1
- add r3, r2, r1
+ sub r12, r12, r1, lsl #2
+ add r0, r12, r1, lsr #1
- vst1.u8 {q4}, [r0] ; store op2
- vst1.u8 {q5}, [r2] ; store op1
- vst1.u8 {q6}, [r3], r1 ; store op0
- add r12, r3, r1
- vst1.u8 {q7}, [r3] ; store oq0
- vst1.u8 {q8}, [r12], r1 ; store oq1
- vst1.u8 {q9}, [r12] ; store oq2
+ vst1.u8 {q4}, [r12@128],r1 ; store op2
+ vst1.u8 {q5}, [r0@128],r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r0@128],r1 ; store oq0
+ vst1.u8 {q8}, [r12@128] ; store oq1
+ vst1.u8 {q9}, [r0@128] ; store oq2
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
; sp+4 unsigned char *v
+
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
- stmdb sp!, {lr}
- sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- ldr r3, [sp, #8] ; load v ptr
- ldr r12, [sp, #4] ; load thresh pointer
- sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
- vld1.u8 {d6}, [r0], r1 ; p3
- vld1.u8 {d7}, [r3], r1 ; p3
- vld1.u8 {d8}, [r0], r1 ; p2
- vld1.u8 {d9}, [r3], r1 ; p2
- vld1.u8 {d10}, [r0], r1 ; p1
- vld1.u8 {d11}, [r3], r1 ; p1
- vld1.u8 {d12}, [r0], r1 ; p0
- vld1.u8 {d13}, [r3], r1 ; p0
- vld1.u8 {d14}, [r0], r1 ; q0
- vld1.u8 {d15}, [r3], r1 ; q0
- vld1.u8 {d16}, [r0], r1 ; q1
- vld1.u8 {d17}, [r3], r1 ; q1
- vld1.u8 {d18}, [r0], r1 ; q2
- vld1.u8 {d19}, [r3], r1 ; q2
- vld1.u8 {d20}, [r0], r1 ; q3
- vld1.u8 {d21}, [r3], r1 ; q3
+ vld1.u8 {d6}, [r0@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r0@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r0@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r0@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r0@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r0@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r0@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r0@64], r1 ; q3
+ vld1.u8 {d21}, [r12@64], r1 ; q3
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
- sub r3, r3, r1, lsl #3
+ sub r12, r12, r1, lsl #3
add r0, r0, r1
- add r3, r3, r1
+ add r12, r12, r1
- vst1.u8 {d8}, [r0], r1 ; store u op2
- vst1.u8 {d9}, [r3], r1 ; store v op2
- vst1.u8 {d10}, [r0], r1 ; store u op1
- vst1.u8 {d11}, [r3], r1 ; store v op1
- vst1.u8 {d12}, [r0], r1 ; store u op0
- vst1.u8 {d13}, [r3], r1 ; store v op0
- vst1.u8 {d14}, [r0], r1 ; store u oq0
- vst1.u8 {d15}, [r3], r1 ; store v oq0
- vst1.u8 {d16}, [r0], r1 ; store u oq1
- vst1.u8 {d17}, [r3], r1 ; store v oq1
- vst1.u8 {d18}, [r0], r1 ; store u oq2
- vst1.u8 {d19}, [r3], r1 ; store v oq2
+ vst1.u8 {d8}, [r0@64], r1 ; store u op2
+ vst1.u8 {d9}, [r12@64], r1 ; store v op2
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r12@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r12@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r12@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64], r1 ; store u oq1
+ vst1.u8 {d17}, [r12@64], r1 ; store v oq1
+ vst1.u8 {d18}, [r0@64], r1 ; store u oq2
+ vst1.u8 {d19}, [r12@64], r1 ; store v oq2
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
-; sp+4 int count (unused)
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
- stmdb sp!, {lr}
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
+ vdup.s8 q2, r12 ; thresh
+ add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
- ldr r12, [sp, #4] ; load thresh pointer
+ vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
vld1.u8 {d8}, [r0], r1
- sub sp, sp, #32
+ vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
- vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r0], r1
- vld1.u8 {d11}, [r0], r1
- vld1.u8 {d13}, [r0], r1
- vld1.u8 {d15}, [r0], r1
- vld1.u8 {d17}, [r0], r1
- vld1.u8 {d19}, [r0], r1
- vld1.u8 {d21}, [r0], r1
-
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
@@ -180,29 +168,17 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- mov r12, sp
- vst1.u8 {q3}, [r12]!
- vst1.u8 {q10}, [r12]!
+ sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
- sub r0, r0, r1, lsl #4
+ sub r12, r12, r1, lsl #3
- add r2, r0, r1
-
- add r3, r2, r1
-
- vld1.u8 {q3}, [sp]!
- vld1.u8 {q10}, [sp]!
-
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
- add r12, r3, r1
vtrn.16 q3, q5
vtrn.16 q4, q6
@@ -215,36 +191,30 @@
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0]
- vst1.8 {d8}, [r2]
- vst1.8 {d10}, [r3]
- vst1.8 {d12}, [r12], r1
- add r0, r12, r1
- vst1.8 {d14}, [r12]
- vst1.8 {d16}, [r0], r1
- add r2, r0, r1
- vst1.8 {d18}, [r0]
- vst1.8 {d20}, [r2], r1
- add r3, r2, r1
- vst1.8 {d7}, [r2]
- vst1.8 {d9}, [r3], r1
- add r12, r3, r1
- vst1.8 {d11}, [r3]
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
- add r0, r12, r1
- vst1.8 {d15}, [r12]
- vst1.8 {d17}, [r0], r1
- add r2, r0, r1
- vst1.8 {d19}, [r0]
- vst1.8 {d21}, [r2]
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
@@ -253,30 +223,29 @@
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
- stmdb sp!, {lr}
- sub r0, r0, #4 ; move src pointer down by 4 columns
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- ldr r3, [sp, #8] ; load v ptr
- ldr r12, [sp, #4] ; load thresh pointer
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move u pointer down by 4 columns
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, #4 ; move v pointer down by 4 columns
- sub r3, r3, #4 ; move v pointer down by 4 columns
-
vld1.u8 {d6}, [r0], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r3], r1
+ vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -294,20 +263,12 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
- sub sp, sp, #32
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- mov r12, sp
- vst1.u8 {q3}, [r12]!
- vst1.u8 {q10}, [r12]!
+ sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
- sub r0, r0, r1, lsl #3
- sub r3, r3, r1, lsl #3
+ sub r12, r12, r1, lsl #3
- vld1.u8 {q3}, [sp]!
- vld1.u8 {q10}, [sp]!
-
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
@@ -326,23 +287,23 @@
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r3], r1
+ vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r3], r1
+ vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r3], r1
+ vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r3], r1
+ vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r3], r1
+ vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r3], r1
+ vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r3], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d21}, [r3], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
- ldmia sp!, {pc}
+ pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
@@ -350,19 +311,12 @@
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
-; TODO:
-; The vertical filter writes p3/q3 back out because two 4 element writes are
-; much simpler than ordering and writing two 3 element sets (or three 2 elements
-; sets, or whichever other combinations are possible).
-; If we can preserve q3 and q10, the vertical filter will be able to avoid
-; storing those values on the stack and reading them back after the filter.
-
; r0,r1 PRESERVE
-; r2 flimit
-; r3 PRESERVE
-; q1 limit
+; r2 mblimit
+; r3 limit
+
; q2 thresh
-; q3 p3
+; q3 p3 PRESERVE
; q4 p2
; q5 p1
; q6 p0
@@ -369,7 +323,7 @@
; q7 q0
; q8 q1
; q9 q2
-; q10 q3
+; q10 q3 PRESERVE
|vp8_mbloop_filter_neon| PROC
@@ -378,12 +332,12 @@
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
- vmax.u8 q3, q3, q0
+ vmax.u8 q1, q1, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
@@ -391,27 +345,28 @@
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
- vmax.u8 q15, q15, q3
+ vmax.u8 q15, q15, q1
- vld1.s8 {d4[], d5[]}, [r2] ; flimit
+ vdup.u8 q1, r3 ; limit
+ vdup.u8 q2, r2 ; mblimit
vmov.u8 q0, #0x80 ; 0x80
- vadd.u8 q2, q2, q2 ; flimit * 2
- vadd.u8 q2, q2, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
- vshr.u8 q1, q1, #1 ; a = a / 2
- vqadd.u8 q12, q12, q1 ; a = b + a
- vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+ vmov.u16 q11, #3 ; #3
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
+ vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
+
+ vqadd.u8 q12, q12, q1 ; a = b + a
+
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
@@ -418,17 +373,18 @@
vorr q14, q13, q14 ; vp8_hevmask
+ vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
- vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0)
- vadd.s16 q11, q13, q13
+ vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+
vand q15, q15, q12 ; vp8_filter_mask
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q11
+ vmul.i16 q13, q13, q11
vmov.u8 q12, #3 ; #3
@@ -447,23 +403,19 @@
vand q13, q1, q14 ; Filter2 &= hev
- vmov.u8 d7, #9 ; #9
-
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
- vmov.u8 d6, #18 ; #18
+ vmov q0, q15
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
- vmov q10, q15
+ vmov q11, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
- vmov.u8 d5, #27 ; #27
-
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
@@ -471,35 +423,43 @@
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
- vmov q11, q15
+
+ vmov.u8 d5, #9 ; #9
+ vmov.u8 d4, #18 ; #18
+
vmov q13, q15
vmov q14, q15
- vmlal.s8 q10, d2, d7 ; Filter2 * 9
- vmlal.s8 q11, d3, d7
- vmlal.s8 q12, d2, d6 ; Filter2 * 18
- vmlal.s8 q13, d3, d6
- vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
+ vmlal.s8 q11, d3, d5
+ vmov.u8 d5, #27 ; #27
+ vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
+ vmlal.s8 q13, d3, d4
+ vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q15, d3, d5
- vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d21, q11, #7
+
+ vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d1, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
- vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u)
- vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u)
+ vmov.u8 q1, #0x80 ; 0x80
+
+ vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
+ vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
- veor q9, q11, q0 ; *oq2 = s^0x80
- veor q4, q10, q0 ; *op2 = s^0x80
- veor q8, q13, q0 ; *oq1 = s^0x80
- veor q5, q12, q0 ; *op2 = s^0x80
- veor q7, q15, q0 ; *oq0 = s^0x80
- veor q6, q14, q0 ; *op0 = s^0x80
+
+ veor q9, q11, q1 ; *oq2 = s^0x80
+ veor q4, q0, q1 ; *op2 = s^0x80
+ veor q8, q13, q1 ; *oq1 = s^0x80
+ veor q5, q12, q1 ; *op1 = s^0x80
+ veor q7, q15, q1 ; *oq0 = s^0x80
+ veor q6, q14, q1 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|