ref: 5bc8a5002beadaa195fa99df46f10a793055a533
parent: 2653292c7f5f4d020c3a621427c284edfdb762b2
author: Martin Storsjö <martin@martin.st>
date: Wed Feb 5 05:43:10 EST 2020
arm: looprestoration: Prepare for 16bpc wiener filter by adding _8bpc to function names
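
For context: the templated init code below switches to the BF() name-pasting macro, which resolves to the bitdepth-suffixed symbol for whichever bitdepth the template is being compiled for. A rough sketch of the scheme (the actual macro lives in dav1d's bitdepth template header; shown here for illustration only):

    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #elif BITDEPTH == 16
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif

    /* BF(dav1d_wiener_filter_h, neon) expands to dav1d_wiener_filter_h_8bpc_neon
     * in the 8bpc build, leaving dav1d_wiener_filter_h_16bpc_neon free for the
     * upcoming 16bpc NEON wiener filter. */
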
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
-// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
-// const pixel *src, ptrdiff_t stride,
-// const int16_t fh[7], const intptr_t w,
-// int h, enum LrEdgeFlags edges);
-function wiener_filter_h_neon, export=1
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges);
+function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
@@ -367,11 +367,11 @@
.purgem filter_4
endfunc
-// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
-// const int16_t *mid, int w, int h,
-// const int16_t fv[7], enum LrEdgeFlags edges,
-// ptrdiff_t mid_stride);
-function wiener_filter_v_neon, export=1
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
+function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
@@ -548,9 +548,9 @@
.purgem filter
endfunc
-// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
-// const pixel *src, int w, int h);
-function copy_narrow_neon, export=1
+// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const pixel *src, int w, int h);
+function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
-// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
-// const pixel *src, ptrdiff_t stride,
-// const int16_t fh[7], const intptr_t w,
-// int h, enum LrEdgeFlags edges);
-function wiener_filter_h_neon, export=1
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges);
+function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)
@@ -306,11 +306,11 @@
.purgem filter
endfunc
-// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
-// const int16_t *mid, int w, int h,
-// const int16_t fv[7], enum LrEdgeFlags edges,
-// ptrdiff_t mid_stride);
-function wiener_filter_v_neon, export=1
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
+function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128
@@ -482,9 +482,9 @@
.purgem filter
endfunc
-// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
-// const pixel *src, int w, int h);
-function copy_narrow_neon, export=1
+// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const pixel *src, int w, int h);
+function copy_narrow_8bpc_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -41,10 +41,10 @@
// Compared to the reference C version, this is the output of the first pass
// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
// with round_offset precompensated.
-void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
- const pixel *src, ptrdiff_t stride,
- const int16_t fh[7], const intptr_t w,
- int h, enum LrEdgeFlags edges);
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[7], const intptr_t w,
+ int h, enum LrEdgeFlags edges);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// fv[3] += 128;
@@ -53,12 +53,12 @@
// sum += mid[idx] * fv[i];
// sum = (sum + rounding_off_v) >> round_bits_v;
// This function assumes that the width is a multiple of 8.
-void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
- const int16_t *mid, int w, int h,
- const int16_t fv[7], enum LrEdgeFlags edges,
- ptrdiff_t mid_stride);
-void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
- const pixel *src, int w, int h);
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[7], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride);
+void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *src, int w, int h);
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
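
The comments in the two hunks above describe the intermediate-value convention the NEON passes use. A rough plain-C sketch of that convention for the 8bpc case (names and parameters here are illustrative, not the asm interface):

    /* Horizontal pass: the asm stores the reference first-pass output with
     * round_offset already subtracted ("precompensated"). For 8bpc the comment
     * above implies round_bits_h == 3, so round_offset == 1 << (8 + 6 - 3) == 2048. */
    static int16_t mid_value(int ref_first_pass_output) {
        const int bitdepth = 8, round_bits_h = 3;
        const int round_offset = 1 << (bitdepth + 6 - round_bits_h); /* 2048 */
        return (int16_t)(ref_first_pass_output - round_offset);
    }

    /* Vertical pass, per the comment above: the centre tap is boosted by 128,
     * then a rounded 7-tap sum over the mid rows is shifted down. */
    static int vert_output(const int16_t *mid, const int16_t fv[7],
                           int rounding_off_v, int round_bits_v) {
        int sum = mid[3] * 128; /* equivalent to fv[3] += 128 before the sum */
        for (int i = 0; i < 7; i++)
            sum += mid[i] * fv[i];
        return (sum + rounding_off_v) >> round_bits_v;
    }
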
@@ -70,27 +70,29 @@
int mid_stride = (w + 7) & ~7;
// Horizontal filter
- dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
- fh, w, h, edges);
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
+ fh, w, h, edges);
if (edges & LR_HAVE_TOP)
- dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
- fh, w, 2, edges);
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
+ fh, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
- dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
- lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
- fh, w, 2, edges);
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(lpf_stride),
+ lpf_stride, fh, w, 2, edges);
// Vertical filter
if (w >= 8)
- dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
- w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
+ BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
+ w & ~7, h, fv, edges,
+ mid_stride * sizeof(*mid));
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into dest.
ALIGN_STK_16(pixel, tmp, 64 * 8,);
- dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
- w & 7, h, fv, edges, mid_stride * sizeof(*mid));
- dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
+ BF(dav1d_wiener_filter_v, neon)(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
+ w & 7, h, fv, edges,
+ mid_stride * sizeof(*mid));
+ BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
}
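
The hunk above also shows how non-multiple-of-8 widths are handled: the aligned w & ~7 columns are filtered straight into dst, while the remaining w & 7 columns are filtered into a packed scratch buffer and copied out by the narrow copy routine. A plain-C model of what the call sites imply copy_narrow does (illustrative only; the real routine is the NEON function renamed in this patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* 8bpc model: src holds packed rows of w (1..7) pixels, dst uses the
     * frame's byte stride. */
    static void copy_narrow_model(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *src, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            memcpy(dst, src, w);
            dst += stride;
            src += w;
        }
    }
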
@@ -211,8 +213,8 @@
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]);
- dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
- w & 7, h);
+ BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+ w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
@@ -228,8 +230,8 @@
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
- dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
- w & 7, h);
+ BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+ w & 7, h);
}
} else {
ALIGN_STK_16(coef, tmp1, 64 * 384,);
@@ -250,8 +252,8 @@
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt);
- dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
- w & 7, h);
+ BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
+ w & 7, h);
}
}
}