ref: 701f88b9e3b60a9578ab1e98803c37561e41f466
parent: 53389fcda59c9b49f5e334a9aa212d59766824ca
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Oct 8 09:03:29 EDT 2018
x86: Add dc/h/v intra prediction AVX2 asm
--- a/src/ipred.c
+++ b/src/ipred.c
@@ -758,4 +758,8 @@
c->cfl_pred[3] = cfl_pred_32xN_c;
c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM && ARCH_X86
+ bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
}
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -99,4 +99,7 @@
void dav1d_intra_pred_dsp_init_8bpc(Dav1dIntraPredDSPContext *c);
void dav1d_intra_pred_dsp_init_10bpc(Dav1dIntraPredDSPContext *c);
+void dav1d_intra_pred_dsp_init_x86_8bpc(Dav1dIntraPredDSPContext *c);
+void dav1d_intra_pred_dsp_init_x86_10bpc(Dav1dIntraPredDSPContext *c);
+
#endif /* __DAV1D_SRC_IPRED_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -98,6 +98,7 @@
)
libdav1d_tmpl_sources += files(
+ 'x86/ipred_init.c',
'x86/itx_init.c',
'x86/loopfilter_init.c',
'x86/mc_init.c',
@@ -106,6 +107,7 @@
# NASM source files
libdav1d_sources_asm = files(
'x86/cpuid.asm',
+ 'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/mc.asm',
--- a/src/recon.c
+++ b/src/recon.c
@@ -701,7 +701,8 @@
const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
// coefficient coding
- pixel edge_mem[257], *const edge = &edge_mem[128];
+ ALIGN_STK_32(pixel, edge_buf, 257,); // 32-byte aligned for the asm's aligned loads
+ pixel *const edge = edge_buf + 128;
const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
for (int init_y = 0; init_y < h4; init_y += 16) {
@@ -1106,7 +1107,8 @@
obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
}
if (b->interintra_type) {
- pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
+ ALIGN_STK_32(pixel, tl_edge_buf, 65,);
+ pixel *const tl_edge = tl_edge_buf + 32;
enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
SMOOTH_PRED : b->interintra_mode;
pixel *const tmp = t->scratch.interintra;
--- /dev/null
+++ b/src/x86/ipred.asm
@@ -1,0 +1,413 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+pb_128: times 4 db 128
+
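+; Jump table entries are 32-bit label offsets relative to (table - 2*4); the
+; lookup index is tzcnt(w) or tzcnt(h), which is never below 2 since w, h >= 4.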
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) ; skip the 10 h*/w* entries
+
+JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+
+SECTION .text
+
+INIT_YMM avx2
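+; dc_top sums the top edge via pmaddubsw against -1 (negated pair sums), then
+; tail-jumps into ipred_dc_left's .h* reduction code and finally into the
+; shared .s* store loops inside ipred_dc.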
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd ; 0x8000 >> log2(w)
+ movd xm3, r6d ; pmulhrsw multiplier for the rounded average
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2 ; all ones
+ pmaddubsw m0, m2 ; negated horizontal pair sums of the top edge
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov r5d, 0x8000
+ shrx r5d, r5d, r6d
+ movd xm3, r5d
+ lea r5, [ipred_dc_left_avx2_table]
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2 ; flip the sign back and sum the word pairs
+ pmulhrsw xm0, xm3 ; dc = round(sum / n)
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ mova m1, m0
+ jmp wq
+
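+; Full DC prediction sums both the left and top edges and divides by w + h.
+; When w != h the divisor is not a power of two: the power-of-two part is
+; handled with a shift and the remaining factor of 3 or 5 with a pmulhuw by
+; ~1/3 (0x5556) or ~1/5 (0x3334); rounding uses the (w+h)/2 bias in xm4.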
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d ; w + h
+ tzcnt r5d, r5d
+ movd xm5, r5d ; log2 of the power-of-two factor of w + h
+ lea r5, [ipred_dc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1 ; (w + h) / 2, rounding bias
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334 ; packed ~1/3 and ~1/5 fixed-point factors
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d ; h == 8: 0x5556; h == 16: count wraps to 0, low word 0x3334
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastb xm0, xm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556 ; ~65536/3 for 2:1 aspect ratios
+ mov r2d, 0x3334 ; ~65536/5 for 4:1 aspect ratios
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastb xm0, xm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32 ; h == 8 or 32 is a 2:1 ratio: keep ~1/3
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastb xm0, xm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastb m0, xm0
+.s32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-32]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+33]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w64_end:
+ vpbroadcastb m0, xm0
+ mova m1, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
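+; dc_128 splats the constant 128 through the shared .s* store loops.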
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+ mova m1, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
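+; Vertical prediction loads the top edge once and writes it to every row via
+; the shared .s* store loops.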
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+33]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
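+; Horizontal prediction broadcasts one left-edge byte per row, reading
+; downwards from tl[-1], four rows per loop iteration.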
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_h_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4, d
+.w8:
+ IPRED_H 8, q
+.w16:
+ IPRED_H 16, a
+INIT_YMM avx2
+.w32:
+ IPRED_H 32, a
+.w64:
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
+%endif
--- /dev/null
+++ b/src/x86/ipred_init.c
@@ -1,0 +1,51 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
+decl_angular_ipred_fn(dav1d_ipred_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_v_avx2);
+
+void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+ c->intra_pred[DC_PRED] = dav1d_ipred_dc_avx2;
+ c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_avx2;
+ c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_avx2;
+ c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_avx2;
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_avx2;
+ c->intra_pred[VERT_PRED] = dav1d_ipred_v_avx2;
+#endif
+}