shithub: dav1d

Download patch

ref: 8e8fb84dcda63e83671a41235f2d71e726a2e716
parent: feeaf785340e9aa910f65602e0f42e9958bd9e21
author: Martin Storsjö <martin@martin.st>
date: Wed Feb 5 05:17:59 EST 2020

arm: Use int16_t for the tmp intermediate buffer

For 8bpc and 10bpc, int16_t is sufficient here; for 12bpc, the other
intermediate int16_t buffers would also need to be widened to the size of coef anyway.

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -1661,7 +1661,7 @@
 
 #define FILTER_OUT_STRIDE 384
 
-// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+// void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
 //                                    const pixel *src, const ptrdiff_t stride,
 //                                    const int32_t *a, const int16_t *b,
 //                                    const int w, const int h);
@@ -1765,7 +1765,7 @@
         pop             {r4-r11,pc}
 endfunc
 
-// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+// void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
 //                                    const pixel *src, const ptrdiff_t stride,
 //                                    const int32_t *a, const int16_t *b,
 //                                    const int w, const int h);
@@ -1927,7 +1927,7 @@
 
 // void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
 //                               const pixel *src, const ptrdiff_t src_stride,
-//                               const coef *t1, const int w, const int h,
+//                               const int16_t *t1, const int w, const int h,
 //                               const int wt);
 function sgr_weighted1_neon, export=1
         push            {r4-r9,lr}
@@ -2011,7 +2011,7 @@
 
 // void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *src, const ptrdiff_t src_stride,
-//                               const coef *t1, const coef *t2,
+//                               const int16_t *t1, const int16_t *t2,
 //                               const int w, const int h,
 //                               const int16_t wt[2]);
 function sgr_weighted2_neon, export=1
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -1540,7 +1540,7 @@
 
 #define FILTER_OUT_STRIDE 384
 
-// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+// void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
 //                                    const pixel *src, const ptrdiff_t stride,
 //                                    const int32_t *a, const int16_t *b,
 //                                    const int w, const int h);
@@ -1657,7 +1657,7 @@
         ret
 endfunc
 
-// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+// void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
 //                                    const pixel *src, const ptrdiff_t stride,
 //                                    const int32_t *a, const int16_t *b,
 //                                    const int w, const int h);
@@ -1809,7 +1809,7 @@
 
 // void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
 //                               const pixel *src, const ptrdiff_t src_stride,
-//                               const coef *t1, const int w, const int h,
+//                               const int16_t *t1, const int w, const int h,
 //                               const int wt);
 function sgr_weighted1_neon, export=1
         dup             v31.8h, w7
@@ -1889,7 +1889,7 @@
 
 // void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *src, const ptrdiff_t src_stride,
-//                               const coef *t1, const coef *t2,
+//                               const int16_t *t1, const int16_t *t2,
 //                               const int w, const int h,
 //                               const int16_t wt[2]);
 function sgr_weighted2_neon, export=1
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -117,13 +117,13 @@
                            const enum LrEdgeFlags edges);
 void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
                              const int w, const int h, const int strength);
-void dav1d_sgr_finish_filter1_neon(coef *tmp,
+void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
                                    const int32_t *a, const int16_t *b,
                                    const int w, const int h);
 
 /* filter with a 3x3 box (radius=1) */
-static void dav1d_sgr_filter1_neon(coef *tmp,
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
                                    const pixel (*left)[4],
                                    const pixel *lpf, const ptrdiff_t lpf_stride,
@@ -160,13 +160,13 @@
                            const enum LrEdgeFlags edges);
 void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
                              const int w, const int h, const int strength);
-void dav1d_sgr_finish_filter2_neon(coef *tmp,
+void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
                                    const int32_t *a, const int16_t *b,
                                    const int w, const int h);
 
 /* filter with a 5x5 box (radius=2) */
-static void dav1d_sgr_filter2_neon(coef *tmp,
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
                                    const pixel (*left)[4],
                                    const pixel *lpf, const ptrdiff_t lpf_stride,
@@ -195,11 +195,11 @@
 
 void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
                               const pixel *src, const ptrdiff_t src_stride,
-                              const coef *t1, const int w, const int h,
+                              const int16_t *t1, const int w, const int h,
                               const int wt);
 void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
                               const pixel *src, const ptrdiff_t src_stride,
-                              const coef *t1, const coef *t2,
+                              const int16_t *t1, const int16_t *t2,
                               const int w, const int h,
                               const int16_t wt[2]);
 
@@ -210,7 +210,7 @@
                              const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
 {
     if (!dav1d_sgr_params[sgr_idx][0]) {
-        ALIGN_STK_16(coef, tmp, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
         dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
                                w, h, dav1d_sgr_params[sgr_idx][3], edges);
         if (w >= 8)
@@ -228,7 +228,7 @@
                                         w & 7, h);
         }
     } else if (!dav1d_sgr_params[sgr_idx][1]) {
-        ALIGN_STK_16(coef, tmp, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp, 64 * 384,);
         dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
                                w, h, dav1d_sgr_params[sgr_idx][2], edges);
         if (w >= 8)
@@ -245,8 +245,8 @@
                                         w & 7, h);
         }
     } else {
-        ALIGN_STK_16(coef, tmp1, 64 * 384,);
-        ALIGN_STK_16(coef, tmp2, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+        ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
         dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
                                w, h, dav1d_sgr_params[sgr_idx][2], edges);
         dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,