ref: 276973ecebea1fa7657253af26f6af3ac1e85513
dir: /src/x86/mc_ssse3.asm
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
; dav1d_obmc_masks[] interleaved as (64 - x, x) byte pairs
obmc_masks: db 0, 0, 0, 0
; 2 @4
db 45, 19, 64, 0
; 4 @8
db 39, 25, 50, 14, 59, 5, 64, 0
; 8 @16
db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
; 16 @32
db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
; 32 @64
db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
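; each size-n block above starts at byte offset 2*n and holds n such pairs,
; so blend_v/blend_h can locate it directly from the block size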
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 16 db 64
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_258: times 8 dw 258
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
%macro BIDIR_JMP_TABLE 1-*
; %2 (the first width) is evaluated here at definition time, before the %rotate loop below changes it
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
; dynamically generated label
%%table:
%rep %0 - 1 ; one entry per width argument
dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
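; e.g. "BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128" emits dd offsets to
; avg_ssse3.w4 .. .w128 and defines avg_ssse3_table biased by -2*4 bytes, so
; that [avg_ssse3_table+wq*4] with wq = tzcnt(width) selects the right entry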
BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
SECTION .text
INIT_XMM ssse3
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
%macro BIDIR_FN 1 ; op
%1 0
lea stride3q, [strideq*3]
jmp wq
.w4_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*4]
.w4: ; 4 pixels per row, 4 rows per iteration
movd [dstq ], m0 ; store dw[0]
pshuflw m1, m0, q1032 ; move dw[1] into the low dword
movd [dstq+strideq*1], m1 ; store dw[1]
punpckhqdq m0, m0 ; move dw[3:2] into the low qword
movd [dstq+strideq*2], m0 ; store dw[2]
psrlq m0, 32 ; shift dw[3] into the low dword
movd [dstq+stride3q ], m0 ; store dw[3]
sub hd, 4
jg .w4_loop
RET
.w8_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*2]
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
sub hd, 2
jg .w8_loop
RET
.w16_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq]
.w16:
mova [dstq ], m0
dec hd
jg .w16_loop
RET
.w32_loop:
%1_INC_PTR 4
%1 0
lea dstq, [dstq+strideq]
.w32:
mova [dstq ], m0
%1 2
mova [dstq + 16 ], m0
dec hd
jg .w32_loop
RET
.w64_loop:
%1_INC_PTR 8
%1 0
add dstq, strideq
.w64:
%assign i 0
%rep 4
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 4
%1 2*i
%endif
%endrep
dec hd
jg .w64_loop
RET
.w128_loop:
%1_INC_PTR 16
%1 0
add dstq, strideq
.w128:
%assign i 0
%rep 8
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 8
%1 2*i
%endif
%endrep
dec hd
jg .w128_loop
RET
%endmacro
%macro AVG 1 ; src_offset
; averages the int16 intermediates from tmp1 and tmp2 and packs them to uint8 pixels
mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
paddw m0, [tmp2q+(%1+0)*mmsize] ; add the 8 matching int16 coefs from tmp2
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
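; pmulhrsw by pw_1024 computes ((x*1024 >> 14) + 1) >> 1 = (x + 16) >> 5,
; i.e. each output pixel is (tmp1 + tmp2 + 16) >> 5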
pmulhrsw m0, m2
pmulhrsw m1, m2
packuswb m0, m1 ; pack the 16-bit results in m0 & m1 to 8 bit with unsigned saturation
%endmacro
%macro AVG_INC_PTR 1
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
LEA r6, avg_ssse3_table
tzcnt wd, wm ; trailing zero count = log2(width)
movifnidn hd, hm ; load h from the stack unless it is already in a register
movsxd wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width into wq
mova m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw
add wq, r6
BIDIR_FN AVG
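; BIDIR_FN above expands the shared per-width store loops (.w4 .. .w128)
; around AVG; w_avg and mask below reuse it with their own macros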
%macro W_AVG 1 ; src_offset
; (a * weight + b * (16 - weight) + 128) >> 8
; = ((a - b) * weight + (b << 4) + 128) >> 8
; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
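; m4 is set up below as -(weight << 11); (b - a) is doubled first so that
; pmulhw effectively applies -(weight << 12), matching the last line above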
mova m0, [tmp2q+(%1+0)*mmsize]
psubw m2, m0, [tmp1q+(%1+0)*mmsize]
mova m1, [tmp2q+(%1+1)*mmsize]
psubw m3, m1, [tmp1q+(%1+1)*mmsize]
paddw m2, m2 ; compensate for the weight only being half
paddw m3, m3 ; of what it should be
pmulhw m2, m4 ; (b-a) * (-weight << 12)
pmulhw m3, m4 ; (b-a) * (-weight << 12)
paddw m0, m2 ; ((b-a) * -weight) + b
paddw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
%endmacro
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
LEA r6, w_avg_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movd m0, r6m
pshuflw m0, m0, q0000
punpcklqdq m0, m0
movsxd wq, dword [r6+wq*4]
pxor m4, m4
psllw m0, 11 ; can't shift by 12, sign bit must be preserved
psubw m4, m0
mova m5, [pw_2048+r6-w_avg_ssse3_table]
add wq, r6
BIDIR_FN W_AVG
%macro MASK 1 ; src_offset
; (a * m + b * (64 - m) + 512) >> 10
; = ((a - b) * m + (b << 6) + 512) >> 10
; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
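; m4 = 0 and m5 = pw_2048 (set up below); negating the mask bytes and moving
; them into the high byte via punpck*bw with zero yields -m << 9 as int16,
; and (b - a) is doubled, so each pmulhw produces ((b - a) * -m) >> 6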
mova m3, [maskq+(%1+0)*(mmsize/2)]
mova m0, [tmp2q+(%1+0)*mmsize] ; b
psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
mova m6, m3 ; m
psubb m3, m4, m6 ; -m
paddw m1, m1 ; (b - a) << 1
paddb m3, m3 ; -m << 1
punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
pmulhw m1, m2 ; (-m * (b - a)) << 10
paddw m0, m1 ; + b
mova m1, [tmp2q+(%1+1)*mmsize] ; b
psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
paddw m2, m2 ; (b - a) << 1
mova m6, m3 ; (-m << 1)
punpckhbw m3, m4, m6 ; (-m << 9)
pmulhw m2, m3 ; (-m * (b - a)) << 10
paddw m1, m2 ; + b
pmulhrsw m0, m5 ; round
pmulhrsw m1, m5 ; round
packuswb m0, m1 ; pack the 16-bit results to 8 bit with unsigned saturation
%endmacro
%macro MASK_INC_PTR 1
add maskq, %1*mmsize/2
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
%if ARCH_X86_64
cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
movifnidn hd, hm
%else
cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
%define hd dword r5m
%endif
%define base r6-mask_ssse3_table
LEA r6, mask_ssse3_table
tzcnt wd, wm
movsxd wq, dword [r6+wq*4]
pxor m4, m4
mova m5, [base+pw_2048]
add wq, r6
mov maskq, r6m
BIDIR_FN MASK
%undef hd
%if ARCH_X86_64
%define reg_pw_8 m8
%define reg_pw_27 m9
%define reg_pw_2048 m10
%else
%define reg_pw_8 [base+pw_8]
%define reg_pw_27 [base+pw_26] ; 64 - 38
%define reg_pw_2048 [base+pw_2048]
%endif
%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
mova m0, [tmp1q+(%1)]
mova m1, [tmp2q+(%1)]
psubw m1, m0 ; tmp2 - tmp1
pabsw m3, m1 ; abs(tmp1 - tmp2)
paddw m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
psrlw m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
psubusw m%2, reg_pw_27, m3 ; 64 - m, with m = min(38 + the above, 64)
psllw m2, m%2, 10 ; (64 - m) << 10
pmulhw m1, m2 ; ((tmp2 - tmp1) * (64 - m)) >> 6
paddw m0, m1 ; tmp1 + the weighted difference
;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
mova m1, [tmp1q+(%1)+mmsize]
mova m2, [tmp2q+(%1)+mmsize]
psubw m2, m1 ; tmp2 - tmp1
pabsw m7, m2 ; abs(tmp1 - tmp2)
paddw m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
psrlw m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
psubusw m3, reg_pw_27, m7 ; 64 - m, with m = min(38 + the above, 64)
phaddw m%2, m3 ; horizontal pair sums of (64-m) for all 16 pixels (2:1 downsample for the 420 mask)
psllw m3, 10
pmulhw m2, m3
paddw m1, m2
;********
pmulhrsw m0, reg_pw_2048 ; round/scale 2048
pmulhrsw m1, reg_pw_2048 ; round/scale 2048
packuswb m0, m1 ; concat m0 = u8.dst[15..0]
%endmacro
%macro W_MASK_420 2
W_MASK_420_B (%1*16), %2
%endmacro
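; the per-pixel mask is m = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64), hence
; the 64 - 38 = 26 constant: psubusw from 26 directly gives 64 - m, which is
; used both for the blend and, summed over each 2x2 block, for the output
; mask: (64*4 + 2 - sign - sum(64 - m)) >> 2 = (sum(m) + 2 - sign) >> 2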
%define base r6-w_mask_420_ssse3_table
%if ARCH_X86_64
; args: dst, stride, tmp1, tmp2, w, h, mask, sign
cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
lea r6, [w_mask_420_ssse3_table]
mov wd, wm
tzcnt r7d, wd
movifnidn hd, hm
movd m0, r7m
pshuflw m0, m0, q0000 ; sign
punpcklqdq m0, m0
movsxd r7, [r6+r7*4]
mova reg_pw_8, [base+pw_8]
mova reg_pw_27, [base+pw_26] ; 64 - 38
mova reg_pw_2048, [base+pw_2048]
mova m6, [base+pw_258] ; 64 * 4 + 2
add r7, r6
mov maskq, maskmp
psubw m6, m0
W_MASK_420 0, 4
jmp r7
%define loop_w r7d
%else
cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
tzcnt wd, wm
LEA r6, w_mask_420_ssse3_table
mov wd, [r6+wq*4]
mov maskq, r6mp
movd m0, r7m
pshuflw m0, m0, q0000 ; sign
punpcklqdq m0, m0
mova m6, [base+pw_258] ; 64 * 4 + 2
add wq, r6
psubw m6, m0
W_MASK_420 0, 4
jmp wd
%define loop_w dword r0m
%define hd dword r5m
%endif
.w4_loop:
add tmp1q, 2*16
add tmp2q, 2*16
W_MASK_420 0, 4
lea dstq, [dstq+strideq*2]
add maskq, 4
.w4:
movd [dstq ], m0 ; copy m0[0]
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1 ; copy m0[1]
lea dstq, [dstq+strideq*2]
punpckhqdq m0, m0
movd [dstq+strideq*0], m0 ; copy m0[2]
psrlq m0, 32
movd [dstq+strideq*1], m0 ; copy m0[3]
pshufd m5, m4, q3131 ; (64-m) pair sums of the 2nd/4th rows, repeated
pshufd m4, m4, q2020 ; (64-m) pair sums of the 1st/3rd rows, repeated
psubw m1, m6, m4 ; m6 == 64 * 4 + 2 (- sign)
psubw m1, m5 ; 258 - sign - sum of the four (64-m) values per 2x2 block
psrlw m1, 2 ; >> 2
packuswb m1, m1
movd [maskq], m1
sub hd, 4
jg .w4_loop
RET
.w8_loop:
add tmp1q, 2*16
add tmp2q, 2*16
W_MASK_420 0, 4
lea dstq, [dstq+strideq*2]
add maskq, 4
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
pshufd m1, m4, q3232
psubw m0, m6, m4
psubw m0, m1
psrlw m0, 2
packuswb m0, m0
movd [maskq], m0
sub hd, 2
jg .w8_loop
RET
.w16: ; w32/64/128
%if ARCH_X86_32
mov wd, wm ; reload w, it was clobbered in the 32-bit setup
%endif
mov loop_w, wd ; use width as counter
jmp .w16ge_inner_loop_first
.w16ge_loop:
lea tmp1q, [tmp1q+wq*2] ; skip the second row, already consumed via the wq*2 offset in the inner loop
lea tmp2q, [tmp2q+wq*2] ; skip the second row, already consumed via the wq*2 offset in the inner loop
sub dstq, wq
mov loop_w, wd
lea dstq, [dstq+strideq*2]
.w16ge_inner_loop:
W_MASK_420_B 0, 4
.w16ge_inner_loop_first:
mova [dstq ], m0
W_MASK_420_B wq*2, 5 ; second row, same 16 columns (byte offset = width * 2)
mova [dstq+strideq*1], m0
psubw m1, m6, m4 ; m6 == 64 * 4 + 2 (- sign)
psubw m1, m5 ; - odd line mask
psrlw m1, 2 ; >> 2
packuswb m1, m1
movq [maskq], m1
add tmp1q, 2*16
add tmp2q, 2*16
add maskq, 8
add dstq, 16
sub loop_w, 16
jg .w16ge_inner_loop
sub hd, 2
jg .w16ge_loop
RET
%undef reg_pw_8
%undef reg_pw_27
%undef reg_pw_2048
%undef dst_bak
%undef loop_w
%undef orig_w
%undef hd
%macro BLEND_64M 4; a, b, mask1, mask2
punpcklbw m0, %1, %2; {b;a}[7..0]
punpckhbw %1, %2 ; {b;a}[15..8]
pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
pmulhrsw m0, m5 ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
pmulhrsw %1, m5 ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16
packuswb m0, %1 ; {blendpx}[15..0] u8
%endmacro
%macro BLEND 2; a, b
psubb m3, m4, m0 ; m3 = (64 - m)
punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
punpckhbw m3, m0 ; {m;(64-m)}[15..8]
BLEND_64M %1, %2, m2, m3
%endmacro
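; in both macros pmulhrsw by pw_512 rounds the u16 sums as (x + 32) >> 6,
; so each blended pixel is (a*(64-m) + b*m + 32) >> 6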
cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_ssse3_table
LEA r6, blend_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movifnidn maskq, maskmp
movsxd wq, dword [r6+wq*4]
mova m4, [base+pb_64]
mova m5, [base+pw_512]
add wq, r6
lea r6, [dsq*3]
jmp wq
.w4:
movq m0, [maskq]; m
movd m1, [dstq+dsq*0] ; a
movd m6, [dstq+dsq*1]
punpckldq m1, m6
movq m6, [tmpq] ; b
psubb m3, m4, m0 ; m3 = (64 - m)
punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
punpcklbw m1, m6 ; {b;a}[7..0]
pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
pmulhrsw m1, m5 ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
packuswb m1, m0 ; {blendpx}[7..0] u8
movd [dstq+dsq*0], m1
psrlq m1, 32
movd [dstq+dsq*1], m1
add maskq, 8
add tmpq, 8
lea dstq, [dstq+dsq*2] ; dst_stride * 2
sub hd, 2
jg .w4
RET
.w8:
mova m0, [maskq]; m
movq m1, [dstq+dsq*0] ; a
movhps m1, [dstq+dsq*1]
mova m6, [tmpq] ; b
BLEND m1, m6
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
add maskq, 16
add tmpq, 16
lea dstq, [dstq+dsq*2] ; dst_stride * 2
sub hd, 2
jg .w8
RET
.w16:
mova m0, [maskq]; m
mova m1, [dstq] ; a
mova m6, [tmpq] ; b
BLEND m1, m6
mova [dstq], m0
add maskq, 16
add tmpq, 16
add dstq, dsq ; dst_stride
dec hd
jg .w16
RET
.w32:
%assign i 0
%rep 2
mova m0, [maskq+16*i]; m
mova m1, [dstq+16*i] ; a
mova m6, [tmpq+16*i] ; b
BLEND m1, m6
mova [dstq+i*16], m0
%assign i i+1
%endrep
add maskq, 32
add tmpq, 32
add dstq, dsq ; dst_stride
dec hd
jg .w32
RET
cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
%define base r5-blend_v_ssse3_table
LEA r5, blend_v_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, dword [r5+wq*4]
mova m5, [base+pw_512]
add wq, r5
add maskq, obmc_masks-blend_v_ssse3_table
jmp wq
.w2:
movd m3, [maskq+4]
punpckldq m3, m3
; the width-2 mask (2 pairs) covers the 4 pixels / 2 lines done per iteration
.w2_loop:
movd m1, [dstq+dsq*0] ; a {..;a;a}
pinsrw m1, [dstq+dsq*1], 1
movd m2, [tmpq] ; b
punpcklbw m0, m1, m2; {b;a}[7..0]
pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
pmulhrsw m0, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
packuswb m0, m1 ; {blendpx}[7..0] u8
movd r3d, m0
mov [dstq+dsq*0], r3w
shr r3d, 16
mov [dstq+dsq*1], r3w
add tmpq, 2*2
lea dstq, [dstq + dsq * 2]
sub hd, 2
jg .w2_loop
RET
.w4:
movddup m3, [maskq+8]
; the width-4 mask (4 pairs) covers the 8 pixels / 2 lines done per iteration
.w4_loop:
movd m1, [dstq+dsq*0] ; a
movd m2, [dstq+dsq*1] ;
punpckldq m1, m2
movq m2, [tmpq] ; b
punpcklbw m1, m2 ; {b;a}[7..0]
pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
pmulhrsw m1, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
packuswb m1, m1 ; {blendpx}[7..0] u8
movd [dstq], m1
psrlq m1, 32
movd [dstq+dsq*1], m1
add tmpq, 2*4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_loop
RET
.w8:
mova m3, [maskq+16]
; the width-8 mask (8 pairs) covers the 16 pixels / 2 lines done per iteration
.w8_loop:
movq m1, [dstq+dsq*0] ; a
movhps m1, [dstq+dsq*1]
mova m2, [tmpq]; b
BLEND_64M m1, m2, m3, m3
movq [dstq+dsq*0], m0
punpckhqdq m0, m0
movq [dstq+dsq*1], m0
add tmpq, 16
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_loop
RET
.w16:
; the width-16 mask (16 pairs in m3/m4) covers one 16-pixel line per iteration
mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
.w16_loop:
mova m1, [dstq] ; a
mova m2, [tmpq] ; b
BLEND_64M m1, m2, m3, m4
mova [dstq], m0
add tmpq, 16
add dstq, dsq
dec hd
jg .w16_loop
RET
.w32:
mova m3, [maskq+64 ] ; obmc_masks_32[0] (64-m[0])
mova m4, [maskq+80 ] ; obmc_masks_32[1] (64-m[1])
mova m6, [maskq+96 ] ; obmc_masks_32[2] (64-m[2])
mova m7, [maskq+112] ; obmc_masks_32[3] (64-m[3])
; the width-32 mask (32 pairs in m3/m4/m6/m7) covers one 32-pixel line per iteration
.w32_loop:
mova m1, [dstq+16*0] ; a
mova m2, [tmpq+16*0] ; b
BLEND_64M m1, m2, m3, m4
mova [dstq+16*0], m0
mova m1, [dstq+16*1] ; a
mova m2, [tmpq+16*1] ; b
BLEND_64M m1, m2, m6, m7
mova [dstq+16*1], m0
add tmpq, 32
add dstq, dsq
dec hd
jg .w32_loop
RET
cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base t0-blend_h_ssse3_table
%if ARCH_X86_32
; We need to keep the PIC pointer for w4, reload wd from stack instead
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 5
mov r6d, wd
%endif
LEA t0, blend_h_ssse3_table
tzcnt wd, wm
mov hd, hm
movsxd wq, dword [t0+wq*4]
mova m5, [base+pw_512]
add wq, t0
lea maskq, [base+obmc_masks+hq*4]
neg hq
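; maskq points just past the h-sized obmc block (which starts at byte offset
; 2*h and is 2*h bytes long), so with hq negated [maskq+hq*2] starts at the
; beginning of the block and walks forward as hq counts up towards zero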
jmp wq
.w2:
movd m0, [dstq+dsq*0]
pinsrw m0, [dstq+dsq*1], 1
movd m2, [maskq+hq*2]
movd m1, [tmpq]
punpcklwd m2, m2
punpcklbw m0, m1
pmaddubsw m0, m2
pmulhrsw m0, m5
packuswb m0, m0
movd r3d, m0
mov [dstq+dsq*0], r3w
shr r3d, 16
mov [dstq+dsq*1], r3w
lea dstq, [dstq+dsq*2]
add tmpq, 2*2
add hq, 2
jl .w2
RET
.w4:
%if ARCH_X86_32
mova m3, [base+blend_shuf]
%else
mova m3, [blend_shuf]
%endif
.w4_loop:
movd m0, [dstq+dsq*0]
movd m2, [dstq+dsq*1]
punpckldq m0, m2 ; a
movq m1, [tmpq] ; b
movq m2, [maskq+hq*2] ; m
pshufb m2, m3
punpcklbw m0, m1
pmaddubsw m0, m2
pmulhrsw m0, m5
packuswb m0, m0
movd [dstq+dsq*0], m0
psrlq m0, 32
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
add tmpq, 4*2
add hq, 2
jl .w4_loop
RET
.w8:
movd m4, [maskq+hq*2]
punpcklwd m4, m4
pshufd m3, m4, q0000
pshufd m4, m4, q1111
movq m1, [dstq+dsq*0] ; a
movhps m1, [dstq+dsq*1]
mova m2, [tmpq]
BLEND_64M m1, m2, m3, m4
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
add tmpq, 8*2
add hq, 2
jl .w8
RET
; w16/w32/w64/w128
.w16:
%if ARCH_X86_32
mov r6d, wm
%endif
sub dsq, r6
.w16_loop0:
movd m3, [maskq+hq*2]
pshuflw m3, m3, q0000
punpcklqdq m3, m3
mov wd, r6d
.w16_loop:
mova m1, [dstq] ; a
mova m2, [tmpq] ; b
BLEND_64M m1, m2, m3, m3
mova [dstq], m0
add dstq, 16
add tmpq, 16
sub wd, 16
jg .w16_loop
add dstq, dsq
inc hq
jl .w16_loop0
RET
; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh: total size of the block to fill
; iw, ih: size of the valid source block -> pixels beyond it fill the bottom/right
; x, y: position of the block within the source -> out-of-range parts fill the top/left
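;
; the source pointer is first clamped into the valid iw x ih area, the four
; extension sizes are clipped, the valid center is copied with left/right
; pixel replication per row, and finally whole rows are replicated for the
; top/bottom extensions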
cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
y, dst, dstride, src, sstride, \
bottomext, rightext, blk
; we assume that the buffer (stride) is larger than width, so we can
; safely overwrite by a few bytes
pxor m1, m1
%if ARCH_X86_64
%define reg_zero r12q
%define reg_tmp r10
%define reg_src srcq
%define reg_bottomext bottomextq
%define reg_rightext rightextq
%define reg_blkm r9m
%else
%define reg_zero r6
%define reg_tmp r0
%define reg_src r1
%define reg_bottomext r0
%define reg_rightext r1
%define reg_blkm r2m
%endif
;
; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
xor reg_zero, reg_zero
lea reg_tmp, [ihq-1]
cmp yq, ihq
cmovl reg_tmp, yq
test yq, yq
cmovl reg_tmp, reg_zero
%if ARCH_X86_64
imul reg_tmp, sstrideq
add srcq, reg_tmp
%else
imul reg_tmp, sstridem
mov reg_src, srcm
add reg_src, reg_tmp
%endif
;
; ref += iclip(x, 0, iw - 1)
lea reg_tmp, [iwq-1]
cmp xq, iwq
cmovl reg_tmp, xq
test xq, xq
cmovl reg_tmp, reg_zero
add reg_src, reg_tmp
%if ARCH_X86_32
mov srcm, reg_src
%endif
;
; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
mov r1, r1m ; restore bh
%endif
lea reg_bottomext, [yq+bhq]
sub reg_bottomext, ihq
lea r3, [bhq-1]
cmovl reg_bottomext, reg_zero
;
DEFINE_ARGS bw, bh, iw, ih, x, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; top_ext = iclip(-y, 0, bh - 1)
neg topextq
cmovl topextq, reg_zero
cmp reg_bottomext, bhq
cmovge reg_bottomext, r3
cmp topextq, bhq
cmovg topextq, r3
%if ARCH_X86_32
mov r4m, reg_bottomext
;
; right_ext = iclip(x + bw - iw, 0, bw - 1)
mov r0, r0m ; restore bw
%endif
lea reg_rightext, [xq+bwq]
sub reg_rightext, iwq
lea r2, [bwq-1]
cmovl reg_rightext, reg_zero
DEFINE_ARGS bw, bh, iw, ih, leftext, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; left_ext = iclip(-x, 0, bw - 1)
neg leftextq
cmovl leftextq, reg_zero
cmp reg_rightext, bwq
cmovge reg_rightext, r2
%if ARCH_X86_32
mov r3m, r1
%endif
cmp leftextq, bwq
cmovge leftextq, r2
%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext
DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
lea r3, [bottomextq+topextq]
sub centerhq, r3
%else
mov r1, centerhm ; restore r1
sub centerhq, topextq
sub centerhq, r4m
mov r1m, centerhq
%endif
;
; blk += top_ext * PXSTRIDE(dst_stride)
mov r2, topextq
%if ARCH_X86_64
imul r2, dstrideq
%else
mov r6, r6m ; restore dstq
imul r2, dstridem
%endif
add dstq, r2
mov reg_blkm, dstq ; save pointer for ext
;
; center_w = bw - left_ext - right_ext
mov centerwq, bwq
%if ARCH_X86_64
lea r3, [rightextq+leftextq]
sub centerwq, r3
%else
sub centerwq, r3m
sub centerwq, leftextq
%endif
; vloop Macro
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
%if ARCH_X86_64
%define reg_tmp r12
%else
%define reg_tmp r0
%endif
.v_loop_%3:
%if ARCH_X86_32
mov r0, r0m
mov r1, r1m
%endif
%if %1
test leftextq, leftextq
jz .body_%3
; left extension
%if ARCH_X86_64
movd m0, [srcq]
%else
mov r3, srcm
movd m0, [r3]
%endif
pshufb m0, m1
xor r3, r3
.left_loop_%3:
mova [dstq+r3], m0
add r3, mmsize
cmp r3, leftextq
jl .left_loop_%3
; body
.body_%3:
lea reg_tmp, [dstq+leftextq]
%endif
xor r3, r3
.body_loop_%3:
%if ARCH_X86_64
movu m0, [srcq+r3]
%else
mov r1, srcm
movu m0, [r1+r3]
%endif
%if %1
movu [reg_tmp+r3], m0
%else
movu [dstq+r3], m0
%endif
add r3, mmsize
cmp r3, centerwq
jl .body_loop_%3
%if %2
; right extension
%if ARCH_X86_64
test rightextq, rightextq
%else
mov r1, r3m
test r1, r1
%endif
jz .body_loop_end_%3
%if %1
add reg_tmp, centerwq
%else
lea reg_tmp, [dstq+centerwq]
%endif
%if ARCH_X86_64
movd m0, [srcq+centerwq-1]
%else
mov r3, srcm
movd m0, [r3+centerwq-1]
%endif
pshufb m0, m1
xor r3, r3
.right_loop_%3:
movu [reg_tmp+r3], m0
add r3, mmsize
%if ARCH_X86_64
cmp r3, rightextq
%else
cmp r3, r3m
%endif
jl .right_loop_%3
.body_loop_end_%3:
%endif
%if ARCH_X86_64
add dstq, dstrideq
add srcq, sstrideq
dec centerhq
jg .v_loop_%3
%else
add dstq, dstridem
mov r0, sstridem
add srcm, r0
sub dword centerhm, 1
jg .v_loop_%3
mov r0, r0m ; restore r0
%endif
%endmacro ; vloop MACRO
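; instantiate the row-copy loop for each left/right extension combination;
; the numeric suffix only keeps the local labels unique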
test leftextq, leftextq
jnz .need_left_ext
%if ARCH_X86_64
test rightextq, rightextq
jnz .need_right_ext
%else
cmp leftextq, r3m ; leftextq is 0 here, so this tests rightextq (r3m)
jne .need_right_ext
%endif
v_loop 0, 0, 0
jmp .body_done
; left/right extension variants
.need_left_ext:
%if ARCH_X86_64
test rightextq, rightextq
%else
mov r3, r3m
test r3, r3
%endif
jnz .need_left_right_ext
v_loop 1, 0, 1
jmp .body_done
.need_left_right_ext:
v_loop 1, 1, 2
jmp .body_done
.need_right_ext:
v_loop 0, 1, 3
.body_done:
; register usage in the edge-extension loops below:
; r0: bw
; r1: x loop
; r4: y loop
; r5: topextq
; r6: dstq
; r7: dstrideq
; r8: srcq
%if ARCH_X86_64
%define reg_dstride dstrideq
%else
%define reg_dstride r2
%endif
;
; bottom edge extension
%if ARCH_X86_64
test bottomextq, bottomextq
jz .top
%else
xor r1, r1
cmp r1, r4m
je .top
%endif
;
%if ARCH_X86_64
mov srcq, dstq
sub srcq, dstrideq
xor r1, r1
%else
mov r3, dstq
mov reg_dstride, dstridem
sub r3, reg_dstride
mov srcm, r3
%endif
;
.bottom_x_loop:
%if ARCH_X86_64
mova m0, [srcq+r1]
lea r3, [dstq+r1]
mov r4, bottomextq
%else
mov r3, srcm
mova m0, [r3+r1]
lea r3, [dstq+r1]
mov r4, r4m
%endif
;
.bottom_y_loop:
mova [r3], m0
add r3, reg_dstride
dec r4
jg .bottom_y_loop
add r1, mmsize
cmp r1, bwq
jl .bottom_x_loop
.top:
; top edge extension
test topextq, topextq
jz .end
%if ARCH_X86_64
mov srcq, reg_blkm
%else
mov r3, reg_blkm
mov reg_dstride, dstridem
%endif
mov dstq, dstm
xor r1, r1
;
.top_x_loop:
%if ARCH_X86_64
mova m0, [srcq+r1]
%else
mov r3, reg_blkm
mova m0, [r3+r1]
%endif
lea r3, [dstq+r1]
mov r4, topextq
;
.top_y_loop:
mova [r3], m0
add r3, reg_dstride
dec r4
jg .top_y_loop
add r1, mmsize
cmp r1, bwq
jl .top_x_loop
.end:
RET
%undef reg_dstride
%undef reg_blkm
%undef reg_tmp