ref: da5a5df86c334d60865898033ec17744e73d32dd
parent: 87a377e990f6cd1db9c214a6d33c15accefe4a32
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Thu Dec 6 02:18:24 EST 2018
ipred_h SSSE3: special-case the w=4 and w=8 store paths; alias movsxd to movifnidn on non-x86-64 builds
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -281,6 +281,10 @@
%endif
%endmacro
+%if ARCH_X86_64 == 0 ; not x86-64: alias movsxd to the movifnidn macro so call sites need no %if ARCH_X86_64 / %else split
+ %define movsxd movifnidn
+%endif
+
%macro movsxdifnidn 2
%ifnidn %1, %2
movsxd %1, %2
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -44,30 +44,50 @@
SECTION .text
-%macro IPRED_SET 4 ; width, store_type, stride, stride size, pshuflw_imm8
- pshuflw m1, m0, %4 ; extend 8 byte for 2 pos
+%macro IPRED_SET 3 ; %1 = block width, %2 = dst row offset, %3 = pshuflw imm8
+ pshuflw m1, m0, %3 ; fill the low 4 words of m1 from m0 per %3 (callers in this file pass qNNNN with one repeated digit)
punpcklqdq m1, m1
- mov%2 [dstq + %3], m1
+ mova [dstq + %2], m1 ; first 16 bytes of the row; NOTE(review): replaces the movu the old w>=16 callers passed - confirm dst alignment
%if %1 > 16
- mov%2 [dstq + 16 + %3], m1
+ mova [dstq + 16 + %2], m1 ; bytes 16-31, emitted only when width > 16
%endif
%if %1 > 32
- mov%2 [dstq + 32 + %3], m1
- mov%2 [dstq + 48 + %3], m1
+ mova [dstq + 32 + %2], m1 ; bytes 32-47, emitted only when width > 32
+ mova [dstq + 48 + %2], m1 ; bytes 48-63, emitted only when width > 32
%endif
%endmacro
-%macro IPRED_H 3 ; width, loop label, store_type
+%macro IPRED_H 1 ; %1 = block width; writes 4 rows per pass and loops back to the .w%1 label defined at the call site
sub tlq, 4
movd m0, [tlq] ; get 4 bytes of topleft data
punpcklbw m0, m0 ; extend 2 byte
- IPRED_SET %1, %3, 0, q3333
- IPRED_SET %1, %3, strideq, q2222
- IPRED_SET %1, %3, strideq*2, q1111
- IPRED_SET %1, %3, stride3q, q0000
+%if %1 == 4 ; 4-wide: one dword store per row
+ pshuflw m1, m0, q2233 ; low qword = words 3,3,2,2 of m0 -> rows 0 and 1
+ movd [dstq+strideq*0], m1 ; row 0 (same source word the generic path selects with q3333)
+ psrlq m1, 32 ; bring the row-1 dword down to the low lane
+ movd [dstq+strideq*1], m1 ; row 1
+ pshuflw m0, m0, q0011 ; low qword = words 1,1,0,0 of m0 -> rows 2 and 3
+ movd [dstq+strideq*2], m0 ; row 2
+ psrlq m0, 32 ; bring the row-3 dword down to the low lane
+ movd [dstq+stride3q ], m0 ; row 3
+
+%elif %1 == 8 ; 8-wide: one qword store per row
+ punpcklwd m0, m0 ; each of the 4 loaded bytes now fills one dword
+ punpckhdq m1, m0, m0 ; m1 = dwords 2,2,3,3: low qword is row 1, high qword is row 0
+ punpckldq m0, m0 ; m0 = dwords 0,0,1,1: low qword is row 3, high qword is row 2
+ movq [dstq+strideq*1], m1 ; row 1 from the low half
+ movhps [dstq+strideq*0], m1 ; row 0 from the high half
+ movq [dstq+stride3q ], m0 ; row 3 from the low half
+ movhps [dstq+strideq*2], m0 ; row 2 from the high half
+%else ; remaining widths (16/32/64 in this file): 16-byte stores via IPRED_SET
+ IPRED_SET %1, 0, q3333 ; row 0
+ IPRED_SET %1, strideq, q2222 ; row 1
+ IPRED_SET %1, strideq*2, q1111 ; row 2
+ IPRED_SET %1, stride3q, q0000 ; row 3
+%endif
lea dstq, [dstq+strideq*4]
sub hd, 4
- jg %2
+ jg .w%1 ; rows remain: repeat this width's body (label name derived from %1 instead of being passed in)
RET
%endmacro
@@ -76,21 +96,17 @@
lea r5, [ipred_h_ssse3_table]
tzcnt wd, wm
movifnidn hd, hm
-%if ARCH_X86_64
movsxd wq, [r5+wq*4]
-%else
- mov wq, [r5+wq*4]
-%endif
add wq, r5
lea stride3q, [strideq*3]
jmp wq
.w4:
- IPRED_H 4, .w4, d
+ IPRED_H 4 ; dedicated path: one movd (4 bytes) per row
.w8:
- IPRED_H 8, .w8, q
+ IPRED_H 8 ; dedicated path: one movq/movhps (8 bytes) per row
.w16:
- IPRED_H 16, .w16, u
+ IPRED_H 16 ; IPRED_SET path: one 16-byte store per row
.w32:
- IPRED_H 32, .w32, u
+ IPRED_H 32 ; IPRED_SET path: two 16-byte stores per row
.w64:
- IPRED_H 64, .w64, u
+ IPRED_H 64 ; IPRED_SET path: four 16-byte stores per row