ref: b73acaa894ac4bb66d04d27919e436c158be7fe8
parent: 275e91de9e3c6ec03a08e617c81923d938eaaa7f
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Mar 30 06:58:32 EDT 2020
x86: use btc instead of xor+test or relying on 32-byte alignment in fgy_32x32xn_ssse3
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -1492,13 +1492,11 @@
%if ARCH_X86_32
mov srcq, r1mp
add srcq, r4mp
- xor r8mp, 4
- test r8mp, 4
%else
lea srcq, [src_bakq+wq]
- test srcq, 16 ; this relies on buffer alignment...
%endif
- jz .next_blk
+ btc dword r8m, 2 ; toggle bit 2 (value 4) of r8m; CF = the bit's value before the toggle
+ jc .next_blk ; bit was set before the toggle -> .next_blk; otherwise fall through
add offxyd, 16
test dword r8m, 2 ; r8m & 2 = have_top_overlap
@@ -1640,11 +1638,10 @@
%if ARCH_X86_32
mov srcq, r1m
add srcq, r4m
- xor r8mp, 4
%else
lea srcq, [src_bakq+wq]
%endif
- ; assert(srcq & 16) != 0
+ xor dword r8m, 4 ; toggle bit 2 of r8m on both the x86-32 and x86-64 paths; no branch on it here
add offxyd, 16
; since this half-block had left-overlap, the next does not
@@ -1845,9 +1842,8 @@
jz .end_y_v_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
- xor hd, 0x10000
- test hd, 0x10000
- jnz .loop_y_v_overlap
+ btc hd, 16 ; toggle bit 16 of hd; CF = the bit's value before the toggle
+ jnc .loop_y_v_overlap ; bit was clear before the toggle -> run the v-overlap loop again; else fall through
jmp .loop_y
.end_y_v_overlap:
@@ -1860,13 +1856,11 @@
%if ARCH_X86_32
mov srcq, r1mp
add srcq, r4mp
- xor r8mp, 4
- test r8mp, 4
%else
lea srcq, [src_bakq+wq]
- test srcq, 16
%endif
- jz .loop_x_hv_overlap
+ btc dword r8m, 2 ; toggle bit 2 (value 4) of r8m; CF = the bit's value before the toggle
+ jc .loop_x_hv_overlap ; bit was set before the toggle -> .loop_x_hv_overlap; otherwise fall through
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
@@ -2048,9 +2042,8 @@
jz .end_y_hv_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
- xor hd, 0x10000
- test hd, 0x10000
- jnz .loop_y_hv_overlap
+ btc hd, 16 ; toggle bit 16 of hd; CF = the bit's value before the toggle
+ jnc .loop_y_hv_overlap ; bit was clear before the toggle -> run the hv-overlap loop again; else fall through
jmp .loop_y_h_overlap
.end_y_hv_overlap:
@@ -2063,11 +2056,10 @@
%if ARCH_X86_32
mov srcq, r1m
add srcq, r4m
- xor r8mp, 4
%else
lea srcq, [src_bakq+wq]
%endif
- ; assert(srcq & 16) != 0
+ xor dword r8m, 4 ; toggle bit 2 of r8m on both the x86-32 and x86-64 paths; no branch on it here
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16