ref: 39c2fb3d6bb60b45ed5b046839b23e51e5ab23ce
parent: c17a58efdfa8c03e3a1ae8e7f78483d48700499c
parent: 563376df0c45ce1cc26200a36d99526c9943f2ba
author: ruil2 <ruil2@cisco.com>
date: Fri May 27 11:17:31 EDT 2016
Merge pull request #2472 from saamas/processing-x86-general-bilinear-downsample-optimizations

[Processing/x86] GeneralBilinearDownsample optimizations
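
Adds SSSE3, SSE4.1 and AVX2 kernels for the general-ratio bilinear downsamplers
and wires them into the CPU-flag dispatch (SSSE3 for the fast luma path, SSE4.1
for the accurate chroma path, AVX2 for both). The C wrappers in
downsamplefuncs.cpp are folded into shared macros around a common helper, and a
ZERO_EXTENSION macro is added to asm_inc.asm.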
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -478,6 +478,12 @@
%endif
%endmacro
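+; Zero-extend a 32-bit register argument: on x86-64, writing a 32-bit register
+; clears the upper 32 bits of the full register; on 32-bit builds this is a no-op.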
+%macro ZERO_EXTENSION 1
+ %ifndef X86_32
+ mov dword %1, %1
+ %endif
+%endmacro
+
%macro WELS_EXTERN 1
ALIGN 16
%ifdef PREFIX
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -100,6 +100,7 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
+ sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
@@ -106,6 +107,11 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
+ sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
+ }
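+ // Later assignments override earlier ones, so the AVX2 routines take
+ // precedence over the SSSE3/SSE4.1 ones when AVX2 is available.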
+ if (iCpuFlag & WELS_CPU_AVX2) {
+ sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
+ sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2;
}
#endif//X86_ASM
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -101,6 +101,10 @@
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@@ -114,6 +118,18 @@
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
const uint32_t kuiScaleY);
+void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+ int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+ uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+ int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+ uint32_t uiScaleY);
+void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+ int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+ uint32_t uiScaleY);
+void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+ int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+ uint32_t uiScaleY);
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -247,58 +247,52 @@
}
}
-
-#ifdef X86_ASM
-void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+#if defined(X86_ASM) || defined(HAVE_NEON) || defined(HAVE_NEON_AARCH64)
+static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+ const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
+ void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
+ uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
- GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
- pSrc, kiSrcStride, uiScalex, uiScaley);
+ func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
}
-void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBit = 15;
- const uint32_t kuiScale = (1 << kiScaleBit);
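+// These macros generate the per-ISA wrapper functions. The fast wrappers use a
+// 16-bit fraction for the horizontal scale and 15 bits for the vertical scale;
+// the accurate wrappers use 15 bits for both, matching the wrapped kernels.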
+#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
+ void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
+ uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
+ GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
+ pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
+ }
- uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
- uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
+#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
+ void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
+ uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
+ GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
+ pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
+ }
+#endif
- GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
- pSrc, kiSrcStride, uiScalex, uiScaley);
-}
+#ifdef X86_ASM
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
+DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
#endif //X86_ASM
#ifdef HAVE_NEON
-void GeneralBilinearAccurateDownsamplerWrap_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBit = 15;
- const uint32_t kuiScale = (1 << kiScaleBit);
- uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
- uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
- GeneralBilinearAccurateDownsampler_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex,
- uiScaley);
-}
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
#endif
#ifdef HAVE_NEON_AARCH64
-void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
- const int32_t kiDstWidth, const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBit = 15;
- const uint32_t kuiScale = (1 << kiScaleBit);
- uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
- uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
- GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride,
- uiScalex, uiScaley);
-}
+DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
#endif
WELSVP_NAMESPACE_END
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -53,13 +53,23 @@
; Local Data (Read Only)
;***********************************************************************
-SECTION .rodata align=16
+SECTION .rodata align=32
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
-ALIGN 16
+ALIGN 32
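+; Shuffle/zero constants for the general bilinear downsamplers below: pshufb
+; treats any index byte with its high bit set (80h) as "write zero to this lane".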
+db80h_256:
+ times 32 db 80h
+shufb_0000000088888888:
+ times 8 db 0
+ times 8 db 8
+shufb_000044448888CCCC:
+ times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
shufb_mask_low:
db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:
@@ -2595,3 +2605,2260 @@
%endif
ret
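+
+;***********************************************************************
+; General bilinear downsamplers (arbitrary ratio).
+; The horizontal position i_xpos is tracked in 16.16 fixed point; its per-lane
+; integer parts live in xmm/ymm_xpos_int and the fractions in xmm/ymm_xpos_frac.
+; Each output pixel blends two source columns and two source rows: the "fast"
+; kernels use 16-bit pmulhuw/pmaddwd arithmetic, the "accurate" kernels use
+; 15-bit fractions with full 32-bit vertical products.
+;***********************************************************************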
+; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
+%macro SSE2_BilinearIncXposuw 5
+ movdqa %5, %2
+ paddw %2, %4
+ paddusw %5, %4
+ pcmpeqw %5, %2
+ paddb %1, %3
+ paddb %1, %5 ; subtract 1 if no carry
+%endmacro
+
+; outl=%1 outh=%2 in=%3
+%macro SSE2_UnpckXFracuw 3
+ pcmpeqw %1, %1
+ pxor %1, %3
+ movdqa %2, %1
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%endmacro
+
+; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4
+%macro SSE2_BilinearFastCalcXYFrac 4
+ movdqa %2, %1
+ pmulhuw %1, %3
+ pmulhuw %2, %4
+%endmacro
+
+; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
+%macro SSE2_BilinearFastPackDwordsToBytes 3
+ psrld %1, 14
+ psrld %2, 14
+ packssdw %1, %2
+ pavgw %1, %3
+ packuswb %1, %1
+%endmacro
+
+%macro SSSE3_BilinearFastDownsample2xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, xmm_0
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ pshufb xmm_tmp4, xmm_xpos_int
+ movdqa xmm_tmp5, xmm_tmp4
+ punpcklbw xmm_tmp4, xmm_0
+ punpckhbw xmm_tmp5, xmm_0
+ SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+ SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1
+ pmaddwd xmm_tmp0, xmm_tmp4
+ pmaddwd xmm_tmp1, xmm_tmp5
+ movdqu xmm_tmp4, [p_src_row1 + r_tmp0]
+ pshufb xmm_tmp4, xmm_xpos_int
+ movdqa xmm_tmp5, xmm_tmp4
+ punpcklbw xmm_tmp4, xmm_0
+ punpckhbw xmm_tmp5, xmm_0
+ pmaddwd xmm_tmp2, xmm_tmp4
+ pmaddwd xmm_tmp3, xmm_tmp5
+ paddd xmm_tmp0, xmm_tmp2
+ paddd xmm_tmp1, xmm_tmp3
+ SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
+%endmacro
+
+%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, [shufb_0000000088888888]
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movdqu xmm_tmp3, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp4, [p_src_row1 + r_tmp0]
+ movdqa xmm_tmp2, xmm_xpos_int
+ punpcklbw xmm_tmp2, [db80h_256]
+ pshufb xmm_tmp3, xmm_tmp2
+ pshufb xmm_tmp4, xmm_tmp2
+ SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+ pmaddwd xmm_tmp0, xmm_tmp3
+ pmaddwd xmm_tmp2, xmm_tmp4
+ paddd xmm_tmp0, xmm_tmp2
+ lea r_tmp0, [i_xpos + 4 * i_scalex]
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ movdqu xmm_tmp3, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp4, [p_src_row1 + r_tmp0]
+ movdqa xmm_tmp2, xmm_xpos_int
+ punpckhbw xmm_tmp2, [db80h_256]
+ pshufb xmm_tmp3, xmm_tmp2
+ pshufb xmm_tmp4, xmm_tmp2
+ SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+ pmaddwd xmm_tmp1, xmm_tmp3
+ pmaddwd xmm_tmp2, xmm_tmp4
+ paddd xmm_tmp1, xmm_tmp2
+ SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
+%endmacro
+
+%macro SSE2_GeneralBilinearFastDownsample_8px 0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp3, [p_src_row0 + r_tmp0]
+ movd xmm_tmp4, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp3, xmm_0
+ punpcklbw xmm_tmp4, xmm_0
+ movdqa xmm_tmp0, xmm_xfrac0
+ SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+ pmaddwd xmm_tmp0, xmm_tmp3
+ pmaddwd xmm_tmp2, xmm_tmp4
+ paddd xmm_tmp0, xmm_tmp2
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp3, [p_src_row0 + r_tmp0]
+ movd xmm_tmp4, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp3, xmm_0
+ punpcklbw xmm_tmp4, xmm_0
+ movdqa xmm_tmp1, xmm_xfrac1
+ SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
+ pmaddwd xmm_tmp1, xmm_tmp3
+ pmaddwd xmm_tmp2, xmm_tmp4
+ paddd xmm_tmp1, xmm_tmp2
+ SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ paddw xmm_xfrac0, xmm_xfrac_inc
+ paddw xmm_xfrac1, xmm_xfrac_inc
+%endmacro
+
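+; Accurate-variant x-position increment: fractions are 15-bit, so bit 15 becoming
+; set after the add marks a carry into the integer part (detected via a signed
+; compare against zero); the fraction is then masked back to 15 bits.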
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
+%macro SSE2_BilinearIncXposw 6
+ pxor %6, %6
+ paddw %2, %4
+ pcmpgtw %6, %2
+ paddb %1, %3
+ psubb %1, %6 ; add carry
+ pand %2, %5
+%endmacro
+
+; outl=%1 outh=%2 in=%3 7FFFh=%4
+%macro SSE2_UnpckXFracw 4
+ movdqa %1, %3
+ pxor %1, %4
+ movdqa %2, %1
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%endmacro
+
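+; Vertical blend with 32-bit precision: each result dword is
+; (data0 * frac0 + data1 * frac1) >> 29, computed with pmuludq on the even and
+; odd dword lanes separately and recombined with blendps.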
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
+%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
+ pshufd %1, %2, 10110001b
+ pshufd %6, %3, 10110001b
+ pmuludq %1, %4
+ pmuludq %6, %5
+ paddq %1, %6
+ pmuludq %2, %4
+ pmuludq %3, %5
+ paddq %2, %3
+ psllq %1, 3
+ psrlq %2, 29
+ blendps %1, %2, 0101b
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, xmm_0
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ pshufb xmm_tmp4, xmm_xpos_int
+ movdqa xmm_tmp5, xmm_tmp4
+ punpcklbw xmm_tmp4, xmm_0
+ punpckhbw xmm_tmp5, xmm_0
+ pmaddwd xmm_tmp4, xmm_tmp0
+ pmaddwd xmm_tmp5, xmm_tmp1
+ movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ pshufb xmm_tmp2, xmm_xpos_int
+ movdqa xmm_tmp3, xmm_tmp2
+ punpcklbw xmm_tmp2, xmm_0
+ punpckhbw xmm_tmp3, xmm_0
+ pmaddwd xmm_tmp2, xmm_tmp0
+ pmaddwd xmm_tmp3, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
+ movdqa xmm_tmp0, xmm_xpos_int
+ pshufb xmm_tmp0, [shufb_0000000088888888]
+ psubb xmm_xpos_int, xmm_tmp0
+ SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movdqa xmm_tmp3, xmm_xpos_int
+ punpcklbw xmm_tmp3, [db80h_256]
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex]
+ lea i_xpos, [i_xpos + 8 * i_scalex]
+ shr r_tmp0, 16
+ pshufb xmm_tmp4, xmm_tmp3
+ pshufb xmm_tmp2, xmm_tmp3
+ pmaddwd xmm_tmp4, xmm_tmp0
+ pmaddwd xmm_tmp2, xmm_tmp0
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+ movdqa xmm_tmp2, xmm_xpos_int
+ punpckhbw xmm_tmp2, [db80h_256]
+ movdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ movdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ pshufb xmm_tmp4, xmm_tmp2
+ pshufb xmm_tmp3, xmm_tmp2
+ pmaddwd xmm_tmp4, xmm_tmp1
+ pmaddwd xmm_tmp3, xmm_tmp1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
+%endmacro
+
+%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp4, [p_src_row0 + r_tmp0]
+ movd xmm_tmp2, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp4, xmm_0
+ punpcklbw xmm_tmp2, xmm_0
+ pmaddwd xmm_tmp4, xmm_xfrac0
+ pmaddwd xmm_tmp2, xmm_xfrac0
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movd xmm_tmp4, [p_src_row0 + r_tmp0]
+ movd xmm_tmp3, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
+ pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 3
+ punpcklbw xmm_tmp4, xmm_0
+ punpcklbw xmm_tmp3, xmm_0
+ pmaddwd xmm_tmp4, xmm_xfrac1
+ pmaddwd xmm_tmp3, xmm_xfrac1
+ SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
+ packssdw xmm_tmp0, xmm_tmp1
+ pavgw xmm_tmp0, xmm_0
+ packuswb xmm_tmp0, xmm_tmp0
+ movlps [p_dst], xmm_tmp0
+ add p_dst, 8
+ paddw xmm_xfrac0, xmm_xfrac_inc
+ paddw xmm_xfrac1, xmm_xfrac_inc
+ pand xmm_xfrac0, xmm_7fff
+ pand xmm_xfrac1, xmm_7fff
+%endmacro
+
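+; Shared row loop: for each output row, derive the two source row pointers from
+; i_ypos (15-bit fraction), set up the vertical weights in xmm_yfrac0/xmm_yfrac1,
+; emit the row 8 pixels at a time with the supplied macro, point-sample the last
+; pixel of the row from the upper source row, then advance i_ypos by the y scale.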
+; downsample_8px_macro=%1 b_fast=%2
+%macro SSE2_GeneralBilinearDownsampler_loop 2
+%%height:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov p_src_row1, p_src_row0
+ add p_src_row1, i_src_stride
+ movd xmm_tmp1, i_yposd
+%if %2
+ pshuflw xmm_tmp1, xmm_tmp1, 0
+ psllw xmm_tmp1, 1
+ psrlw xmm_tmp1, 1
+%else
+ pslld xmm_tmp1, 17
+ psrld xmm_tmp1, 17
+%endif
+%ifdef X86_32
+ pshufd xmm_tmp1, xmm_tmp1, 0
+ pcmpeqw xmm_tmp0, xmm_tmp0
+%if %2
+ psrlw xmm_tmp0, 1
+%else
+ psrld xmm_tmp0, 17
+%endif
+ pxor xmm_tmp0, xmm_tmp1
+ movdqa xmm_yfrac0, xmm_tmp0
+ movdqa xmm_yfrac1, xmm_tmp1
+%else
+ pshufd xmm_yfrac1, xmm_tmp1, 0
+ pcmpeqw xmm_yfrac0, xmm_yfrac0
+%if %2
+ psrlw xmm_yfrac0, 1
+%else
+ psrld xmm_yfrac0, 17
+%endif
+ pxor xmm_yfrac0, xmm_yfrac1
+%endif
+
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+ sub i_width_cnt, 1
+
+%ifdef xmm_xpos_int
+ movdqa xmm_xpos_int, xmm_xpos_int_begin
+ movdqa xmm_xpos_frac, xmm_xpos_frac_begin
+%else
+ movdqa xmm_xfrac0, xmm_xfrac0_begin
+ movdqa xmm_xfrac1, xmm_xfrac1_begin
+%endif
+
+%%width:
+ %1
+ sub i_width_cnt, 8
+ jg %%width
+
+ lea p_dst, [p_dst + i_width_cnt + 1]
+ imul i_width_cnt, i_scalex
+ add i_xpos, i_width_cnt
+ shr i_xpos, 16
+ movzx r_tmp0, byte [p_src_row0 + i_xpos]
+ mov [p_dst - 1], r_tmp0b
+%ifdef X86_32
+ mov r_tmp0, i_scaleyd
+ add i_yposd, r_tmp0
+%else
+ add i_yposd, i_scaleyd
+%endif
+ add p_dst, i_dst_stride_less_width
+ sub i_dst_height, 1
+ jg %%height
+%endmacro
+
+;**************************************************************************************************************
+;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+; uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
+ %assign push_num 0
+%ifndef X86_32
+ push r12
+ push r13
+ push rbx
+ push rbp
+ %assign push_num 4
+%ifdef WIN64
+ push rdi
+ push rsi
+ %assign push_num push_num + 2
+%endif
+%endif
+ LOAD_7_PARA
+ PUSH_XMM 16
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ ZERO_EXTENSION r6d
+ sub r1, r2 ; dst_stride - dst_width
+%ifdef X86_32
+ movd xmm0, arg8
+ movd xmm1, esp
+ and esp, -16
+ sub esp, 8 * 4 + 7 * 16
+ movd [esp], xmm1
+ %define p_dst r0
+ %define i_dst_stride_less_width [esp + 1 * 4]
+ %define i_dst_width [esp + 2 * 4]
+ %define i_dst_height dword [esp + 3 * 4]
+ %define p_src [esp + 4 * 4]
+ %define i_src_stride [esp + 5 * 4]
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd [esp + 6 * 4]
+ %define i_xpos r2
+ %define i_ypos dword [esp + 7 * 4]
+ %define i_yposd dword [esp + 7 * 4]
+ %define p_src_row0 r3
+ %define p_src_row1 r4
+ %define i_width_cnt r5
+ %define r_tmp0 r1
+ %define r_tmp0b r1b
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc [esp + 8 * 4]
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16]
+ %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16]
+ %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16]
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm0
+ %define xmm_tmp2 xmm2
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_0 [esp + 8 * 4 + 4 * 16]
+ %define xmm_xpos_int_begin [esp + 8 * 4 + 5 * 16]
+ %define xmm_xpos_frac_begin [esp + 8 * 4 + 6 * 16]
+ mov i_dst_stride_less_width, r1
+ mov i_dst_width, r2
+ mov i_dst_height, r3
+ mov p_src, r4
+ mov i_src_stride, r5
+ movd i_scaleyd, xmm0
+ pxor xmm_tmp0, xmm_tmp0
+ movdqa xmm_0, xmm_tmp0
+%else
+ %define p_dst r0
+ %define i_dst_stride_less_width r1
+ %define i_dst_width r2
+ %define i_dst_height r3
+ %define p_src r4
+ %define i_src_stride r5
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd dword arg8d
+ %define i_xpos r12
+ %define i_ypos r13
+ %define i_yposd r13d
+ %define p_src_row0 rbp
+%ifdef WIN64
+ %define p_src_row1 rsi
+ %define i_width_cnt rdi
+%else
+ %define p_src_row1 r11
+ %define i_width_cnt rax
+%endif
+ %define r_tmp0 rbx
+ %define r_tmp0b bl
+ %define xmm_0 xmm0
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc xmm8
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc xmm10
+ %define xmm_yfrac0 xmm11
+ %define xmm_yfrac1 xmm12
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm2
+ %define xmm_tmp2 xmm9
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_xpos_int_begin xmm14
+ %define xmm_xpos_frac_begin xmm15
+ pxor xmm_0, xmm_0
+%endif
+
+ sub i_dst_height, 1
+ je .final_row
+ jl .done
+
+ mov i_ypos, 1 << 14
+ movd xmm_xpos_frac, i_scalexd
+ pshufd xmm_xpos_frac, xmm_xpos_frac, 0
+ movdqa xmm_tmp0, xmm_xpos_frac
+ pslld xmm_tmp0, 2
+ pslldq xmm_xpos_frac, 4
+ paddd xmm_tmp0, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_xpos_frac
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pcmpeqw xmm_tmp1, xmm_tmp1
+ psrld xmm_tmp1, 31
+ pslld xmm_tmp1, 15
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ movdqa xmm_xpos_int, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_tmp0
+ psrld xmm_xpos_int, 16
+ psrld xmm_tmp1, 16
+ packssdw xmm_xpos_int, xmm_tmp1
+ packuswb xmm_xpos_int, xmm_xpos_int
+ movdqa xmm_tmp1, xmm_xpos_int
+ pcmpeqw xmm_tmp2, xmm_tmp2
+ psubb xmm_tmp1, xmm_tmp2
+ punpcklbw xmm_xpos_int, xmm_tmp1
+ pslld xmm_xpos_frac, 16
+ pslld xmm_tmp0, 16
+ psrad xmm_xpos_frac, 16
+ psrad xmm_tmp0, 16
+ packssdw xmm_xpos_frac, xmm_tmp0
+ movd xmm_tmp0, i_scalexd
+ pslld xmm_tmp0, 3
+ movdqa xmm_tmp1, xmm_tmp0
+ punpcklwd xmm_tmp0, xmm_tmp0
+ pshufd xmm_tmp0, xmm_tmp0, 0
+ movdqa xmm_xpos_frac_inc, xmm_tmp0
+ psrld xmm_tmp1, 16
+ psubw xmm_tmp1, xmm_tmp2
+ pxor xmm_tmp2, xmm_tmp2
+ pshufb xmm_tmp1, xmm_tmp2
+ movdqa xmm_xpos_int_inc, xmm_tmp1
+ movdqa xmm_xpos_int_begin, xmm_xpos_int
+ movdqa xmm_xpos_frac_begin, xmm_xpos_frac
+
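+ ; Pick a kernel based on the horizontal ratio (i_scalex is 16.16 fixed point):
+ ; ratios up to 2x and up to 4x gather their source pixels with pshufb from
+ ; contiguous loads; larger ratios load pixels individually with pinsrw.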
+ cmp i_scalex, 4 << 16
+ ja .scalex_above4
+ cmp i_scalex, 2 << 16
+ ja .scalex_above2_beloweq4
+ SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1
+ jmp .final_row
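+; The remaining kernels do not use xmm_tmp5, so on x86_32 yfrac0 can be kept in
+; that register instead of spilling to the stack.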
+%ifdef X86_32
+ %undef xmm_yfrac0
+ %xdefine xmm_yfrac0 xmm_tmp5
+ %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+ SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1
+ jmp .final_row
+.scalex_above4:
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+ SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0
+ movdqa xmm_xfrac0, xmm_tmp0
+ movdqa xmm_xfrac0_begin, xmm_xfrac0
+ movdqa xmm_xfrac1_begin, xmm_xfrac1
+ pcmpeqw xmm_tmp0, xmm_tmp0
+ pmullw xmm_tmp0, xmm_xfrac_inc
+ punpcklwd xmm_tmp0, xmm_xfrac_inc
+ movdqa xmm_xfrac_inc, xmm_tmp0
+ SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1
+
+.final_row:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+
+.final_row_width:
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movzx r_tmp0, byte [p_src_row0 + r_tmp0]
+ mov [p_dst], r_tmp0b
+ add p_dst, 1
+ add i_xpos, i_scalex
+ sub i_width_cnt, 1
+ jg .final_row_width
+
+.done:
+%ifdef X86_32
+ mov esp, [esp]
+%endif
+ POP_XMM
+ LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+ pop rsi
+ pop rdi
+%endif
+ pop rbp
+ pop rbx
+ pop r13
+ pop r12
+%endif
+ ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+; uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
+ %assign push_num 0
+%ifndef X86_32
+ push r12
+ push r13
+ push rbx
+ push rbp
+ %assign push_num 4
+%ifdef WIN64
+ push rdi
+ push rsi
+ %assign push_num push_num + 2
+%endif
+%endif
+ LOAD_7_PARA
+ PUSH_XMM 16
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ ZERO_EXTENSION r6d
+ sub r1, r2 ; dst_stride - dst_width
+ add r6, r6 ; 2 * scalex
+%ifdef X86_32
+ movd xmm0, arg8
+ movd xmm1, esp
+ and esp, -16
+ sub esp, 8 * 4 + 8 * 16
+ movd [esp], xmm1
+ %define p_dst r0
+ %define i_dst_stride_less_width [esp + 1 * 4]
+ %define i_dst_width [esp + 2 * 4]
+ %define i_dst_height dword [esp + 3 * 4]
+ %define p_src [esp + 4 * 4]
+ %define i_src_stride [esp + 5 * 4]
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd [esp + 6 * 4]
+ %define i_xpos r2
+ %define i_ypos dword [esp + 7 * 4]
+ %define i_yposd dword [esp + 7 * 4]
+ %define p_src_row0 r3
+ %define p_src_row1 r4
+ %define i_width_cnt r5
+ %define r_tmp0 r1
+ %define r_tmp0b r1b
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc [esp + 8 * 4]
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16]
+ %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16]
+ %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16]
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm0
+ %define xmm_tmp2 xmm2
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_0 [esp + 8 * 4 + 4 * 16]
+ %define xmm_7fff [esp + 8 * 4 + 5 * 16]
+ %define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16]
+ %define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16]
+ mov i_dst_stride_less_width, r1
+ mov i_dst_width, r2
+ mov i_dst_height, r3
+ mov p_src, r4
+ mov i_src_stride, r5
+ movd i_scaleyd, xmm0
+ pxor xmm_tmp5, xmm_tmp5
+ movdqa xmm_0, xmm_tmp5
+ pcmpeqw xmm_tmp5, xmm_tmp5
+ psrlw xmm_tmp5, 1
+ movdqa xmm_7fff, xmm_tmp5
+%else
+ %define p_dst r0
+ %define i_dst_stride_less_width r1
+ %define i_dst_width r2
+ %define i_dst_height r3
+ %define p_src r4
+ %define i_src_stride r5
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd dword arg8d
+ %define i_xpos r12
+ %define i_ypos r13
+ %define i_yposd r13d
+ %define p_src_row0 rbp
+%ifdef WIN64
+ %define p_src_row1 rsi
+ %define i_width_cnt rdi
+%else
+ %define p_src_row1 r11
+ %define i_width_cnt rax
+%endif
+ %define r_tmp0 rbx
+ %define r_tmp0b bl
+ %define xmm_0 xmm0
+ %define xmm_xpos_frac xmm1
+ %define xmm_xpos_frac_inc xmm8
+ %define xmm_xpos_int xmm3
+ %define xmm_xpos_int_inc xmm10
+ %define xmm_yfrac0 xmm11
+ %define xmm_yfrac1 xmm12
+ %define xmm_tmp0 xmm7
+ %define xmm_tmp1 xmm2
+ %define xmm_tmp2 xmm9
+ %define xmm_tmp3 xmm4
+ %define xmm_tmp4 xmm5
+ %define xmm_tmp5 xmm6
+ %define xmm_7fff xmm13
+ %define xmm_xpos_int_begin xmm14
+ %define xmm_xpos_frac_begin xmm15
+ pxor xmm_0, xmm_0
+ pcmpeqw xmm_7fff, xmm_7fff
+ psrlw xmm_7fff, 1
+%endif
+
+ sub i_dst_height, 1
+ je .final_row
+ jl .done
+
+ mov i_ypos, 1 << 14
+ movd xmm_xpos_frac, i_scalexd
+ pshufd xmm_xpos_frac, xmm_xpos_frac, 0
+ movdqa xmm_tmp0, xmm_xpos_frac
+ pslld xmm_tmp0, 2
+ pslldq xmm_xpos_frac, 4
+ paddd xmm_tmp0, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_xpos_frac
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pslldq xmm_tmp1, 4
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ pcmpeqw xmm_tmp1, xmm_tmp1
+ psrld xmm_tmp1, 31
+ pslld xmm_tmp1, 15
+ paddd xmm_xpos_frac, xmm_tmp1
+ paddd xmm_tmp0, xmm_tmp1
+ movdqa xmm_xpos_int, xmm_xpos_frac
+ movdqa xmm_tmp1, xmm_tmp0
+ psrld xmm_xpos_int, 16
+ psrld xmm_tmp1, 16
+ packssdw xmm_xpos_int, xmm_tmp1
+ packuswb xmm_xpos_int, xmm_xpos_int
+ movdqa xmm_tmp1, xmm_xpos_int
+ pcmpeqw xmm_tmp2, xmm_tmp2
+ psubb xmm_tmp1, xmm_tmp2
+ punpcklbw xmm_xpos_int, xmm_tmp1
+ pslld xmm_xpos_frac, 16
+ pslld xmm_tmp0, 16
+ psrad xmm_xpos_frac, 16
+ psrad xmm_tmp0, 16
+ packssdw xmm_xpos_frac, xmm_tmp0
+ psrlw xmm_xpos_frac, 1
+ movd xmm_tmp0, i_scalexd
+ pslld xmm_tmp0, 3
+ movdqa xmm_tmp1, xmm_tmp0
+ punpcklwd xmm_tmp0, xmm_tmp0
+ pshufd xmm_tmp0, xmm_tmp0, 0
+ psrlw xmm_tmp0, 1
+ movdqa xmm_xpos_frac_inc, xmm_tmp0
+ psrld xmm_tmp1, 16
+ pxor xmm_tmp2, xmm_tmp2
+ pshufb xmm_tmp1, xmm_tmp2
+ movdqa xmm_xpos_int_inc, xmm_tmp1
+ movdqa xmm_xpos_int_begin, xmm_xpos_int
+ movdqa xmm_xpos_frac_begin, xmm_xpos_frac
+
+ cmp i_scalex, 4 << 16
+ ja .scalex_above4
+ cmp i_scalex, 2 << 16
+ ja .scalex_above2_beloweq4
+ SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0
+ jmp .final_row
+%ifdef X86_32
+ %undef xmm_yfrac0
+ %xdefine xmm_yfrac0 xmm_tmp5
+ %undef xmm_tmp5
+%endif
+.scalex_above2_beloweq4:
+ SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0
+ jmp .final_row
+.scalex_above4:
+%xdefine xmm_xfrac0 xmm_xpos_frac
+%xdefine xmm_xfrac1 xmm_xpos_int
+%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
+%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
+%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_frac
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xpos_int_inc
+%undef xmm_xpos_frac_inc
+ SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff
+ movdqa xmm_xfrac0, xmm_tmp0
+ movdqa xmm_xfrac0_begin, xmm_xfrac0
+ movdqa xmm_xfrac1_begin, xmm_xfrac1
+ pcmpeqw xmm_tmp0, xmm_tmp0
+ pmullw xmm_tmp0, xmm_xfrac_inc
+ punpcklwd xmm_tmp0, xmm_xfrac_inc
+ movdqa xmm_xfrac_inc, xmm_tmp0
+ SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0
+
+.final_row:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+
+.final_row_width:
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movzx r_tmp0, byte [p_src_row0 + r_tmp0]
+ mov [p_dst], r_tmp0b
+ add p_dst, 1
+ add i_xpos, i_scalex
+ sub i_width_cnt, 1
+ jg .final_row_width
+
+.done:
+%ifdef X86_32
+ mov esp, [esp]
+%endif
+ POP_XMM
+ LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+ pop rsi
+ pop rdi
+%endif
+ pop rbp
+ pop rbx
+ pop r13
+ pop r12
+%endif
+ ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef xmm_0
+%undef xmm_xpos_frac
+%undef xmm_xpos_frac_inc
+%undef xmm_xpos_int
+%undef xmm_xpos_int_inc
+%undef xmm_yfrac0
+%undef xmm_yfrac1
+%undef xmm_tmp0
+%undef xmm_tmp1
+%undef xmm_tmp2
+%undef xmm_tmp3
+%undef xmm_tmp4
+%undef xmm_tmp5
+%undef xmm_7fff
+%undef xmm_xpos_int_begin
+%undef xmm_xpos_frac_begin
+%undef xmm_xfrac0
+%undef xmm_xfrac1
+%undef xmm_xfrac0_begin
+%undef xmm_xfrac1_begin
+%undef xmm_xfrac_inc
+
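+; AVX2 versions of the macros above; these operate on 256-bit registers and
+; produce 16 output pixels per iteration.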
+; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
+%macro AVX2_BilinearIncXposuw 5
+ vpaddusw %5, %2, %4
+ vpaddw %2, %2, %4
+ vpcmpeqw %5, %5, %2
+ vpaddb %1, %1, %3
+ vpaddb %1, %1, %5 ; subtract 1 if no carry
+%endmacro
+
+; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4
+%macro AVX2_UnpckXFrac 4
+ vpxor %1, %3, %4
+ vpunpckhwd %2, %1, %3
+ vpunpcklwd %1, %1, %3
+%endmacro
+
+; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5
+%macro AVX2_BilinearFastCalcXYFrac 5
+ vpmulhuw %2, %3, %5
+ vpmulhuw %1, %3, %4
+%endmacro
+
+; [in:dwordsl out:bytes] dwordsh=%2 zero=%3
+%macro AVX2_BilinearFastPackDwordsToBytes 3
+ vpsrld %1, %1, 14
+ vpsrld %2, %2, 14
+ vpackssdw %1, %1, %2
+ vpavgw %1, %1, %3
+ vpackuswb %1, %1, %1
+%endmacro
+
+%macro AVX2_BilinearFastDownsample2xOrLess_16px 0
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ lea i_xpos, [i_xpos + 8 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
+ AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+ vpunpcklbw ymm_tmp3, ymm_tmp4, ymm_0
+ vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp3
+ vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
+ AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+ vpunpckhbw ymm_tmp2, ymm_tmp4, ymm_0
+ vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp2
+ vpunpckhbw ymm_tmp2, ymm_tmp5, ymm_0
+ vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp2
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
+ AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
+ vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex2]
+ lea i_xpos, [r_tmp0 + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vpunpcklbw ymm_tmp2, ymm_xpos_int, ymm_ffff
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2
+ vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2
+ AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 2 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+ vpunpckhbw ymm_tmp2, ymm_xpos_int, ymm_ffff
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2
+ vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2
+ AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2
+ AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
+ vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_ffff
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
+ vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
+ AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5
+ vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ mov r_tmp0, i_xpos
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
+ vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int
+ vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
+ vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
+ vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
+ AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
+ vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp5
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
+ AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
+%endmacro
+
+%macro AVX2_GeneralBilinearFastDownsample_16px 0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0]
+ vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+ vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+ vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0]
+ vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+ vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+ vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
+ vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
+ vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0
+ vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0
+ AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
+ vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp5
+ vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0
+ vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0
+ AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
+ vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2
+ AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
+ vpermq ymm_tmp0, ymm_tmp0, 0010b
+ vmovdqu [p_dst], xmm_tmp0
+ add p_dst, 16
+ vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
+ vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
+%endmacro
+
+; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7
+%macro AVX2_BilinearIncXposw 7
+ vpaddb %1, %1, %3
+ vpaddw %6, %2, %4
+ vpcmpgtw %7, %2, %6
+ vpsubb %1, %1, %7 ; add carry
+ vpand %2, %6, %5
+%endmacro
+
+; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
+%macro AVX2_LinearAccurateInterpolateVerticalDwords 6
+ vpshufd %1, %2, 10110001b
+ vpshufd %6, %3, 10110001b
+ vpmuludq %1, %1, %4
+ vpmuludq %6, %6, %5
+ vpaddq %1, %1, %6
+ vpmuludq %2, %2, %4
+ vpmuludq %3, %3, %5
+ vpaddq %2, %2, %3
+ vpsllq %1, %1, 3
+ vpsrlq %2, %2, 29
+ vpblendd %1, %1, %2, 01010101b
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ lea i_xpos, [i_xpos + 8 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
+ vpunpcklbw ymm_tmp2, ymm_tmp4, ymm_0
+ vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0
+ vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
+ vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp0
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
+ vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
+ vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vpavgw ymm_tmp0, ymm_tmp0, ymm_0
+ vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
+ vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex2]
+ lea i_xpos, [r_tmp0 + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
+ vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0]
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 2 * i_scalex2]
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+ vpunpckhbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
+ vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
+ vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp1
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+ vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vpavgw ymm_tmp0, ymm_tmp0, ymm_0
+ vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
+ vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
+ vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
+ vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
+ vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
+ AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
+ vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp0
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 4 * i_scalex2]
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
+ mov r_tmp0, i_xpos
+ lea i_xpos, [i_xpos + 4 * i_scalex2]
+ shr r_tmp0, 16
+ vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0]
+ vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
+ mov r_tmp0, i_xpos
+ add i_xpos, i_scalex2
+ shr r_tmp0, 16
+ vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
+ vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
+ vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
+ vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
+ vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int
+ vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
+ vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
+ vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
+ vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
+ vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vpavgw ymm_tmp0, ymm_tmp0, ymm_0
+ vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vmovlps [p_dst], xmm_tmp0
+ vextracti128 [p_dst + 8], ymm_tmp0, 1
+ add p_dst, 16
+ AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
+%endmacro
+
+%macro AVX2_GeneralBilinearAccurateDownsample_16px 0
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0]
+ vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+ vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+ vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0]
+ vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + 1 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
+ vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
+ vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
+ vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
+ vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
+ vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
+ vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
+ vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
+ lea r_tmp0, [i_xpos + i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
+ lea r_tmp0, [i_xpos + 2 * i_scalex]
+ lea i_xpos, [i_xpos + 4 * i_scalex]
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
+ mov r_tmp0, i_xpos
+ sub r_tmp0, i_scalex
+ shr r_tmp0, 16
+ vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
+ vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
+ vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
+ vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0
+ vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac0
+ vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac0
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
+ vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0
+ vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0
+ vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac1
+ vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac1
+ AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
+ vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vpavgw ymm_tmp0, ymm_tmp0, ymm_0
+ vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vextracti128 [p_dst], ymm_tmp0, 1
+ vmovlps [p_dst + 8], xmm_tmp0
+ add p_dst, 16
+ vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
+ vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
+ vpand ymm_xfrac0, ymm_xfrac0, ymm_7fff
+ vpand ymm_xfrac1, ymm_xfrac1, ymm_7fff
+%endmacro
+
+; downsample_16px_macro=%1 (kernel emitting 16 output pixels per iteration)
+; b_fast=%2 (1 = fast y-fraction setup, 0 = accurate y-fraction setup)
+%macro AVX2_GeneralBilinearDownsampler_loop 2
+%%height:
+ ; select the two source rows for this output row: row0 = i_ypos >> 15, row1 = row0 + 1
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov p_src_row1, p_src_row0
+ add p_src_row1, i_src_stride
+ ; vertical blend weights: yfrac1 = i_ypos & 0x7fff, yfrac0 = 0x7fff - yfrac1
+ ; (broadcast to word lanes for the fast kernels, to dword lanes for the accurate ones)
+%ifdef X86_32
+%if %2
+ vpbroadcastw ymm_tmp1, i_ypos
+ vpsllw ymm_tmp1, ymm_tmp1, 1
+ vpsrlw ymm_tmp1, ymm_tmp1, 1
+ vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vpsrlw ymm_tmp0, ymm_tmp0, 1
+%else
+ vpbroadcastd ymm_tmp1, i_ypos
+ vpslld ymm_tmp1, ymm_tmp1, 17
+ vpsrld ymm_tmp1, ymm_tmp1, 17
+ vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vpsrld ymm_tmp0, ymm_tmp0, 17
+%endif
+ vpxor ymm_tmp0, ymm_tmp0, ymm_tmp1
+ vmovdqa ymm_yfrac0, ymm_tmp0
+ vmovdqa ymm_yfrac1, ymm_tmp1
+%else
+ vmovd xmm_tmp0, i_yposd
+ vpbroadcastw ymm_yfrac1, xmm_tmp0
+%if %2
+ vpsllw ymm_yfrac1, ymm_yfrac1, 1
+ vpsrlw ymm_yfrac1, ymm_yfrac1, 1
+ vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
+ vpsrlw ymm_yfrac0, ymm_yfrac0, 1
+%else
+ vpslld ymm_yfrac1, ymm_yfrac1, 17
+ vpsrld ymm_yfrac1, ymm_yfrac1, 17
+ vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
+ vpsrld ymm_yfrac0, ymm_yfrac0, 17
+%endif
+ vpxor ymm_yfrac0, ymm_yfrac0, ymm_yfrac1
+%endif
+
+ ; horizontal position in 16.16 fixed point, starting at 0.5
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+ sub i_width_cnt, 1
+
+%ifdef ymm_xpos_int
+ vmovdqa ymm_xpos_int, ymm_xpos_int_begin
+ vmovdqa ymm_xpos_frac, ymm_xpos_frac_begin
+%else
+ vmovdqa ymm_xfrac0, ymm_xfrac0_begin
+ vmovdqa ymm_xfrac1, ymm_xfrac1_begin
+%endif
+
+%%width:
+ %1
+ sub i_width_cnt, 16
+ jg %%width
+
+ lea p_dst, [p_dst + i_width_cnt + 1]
+%ifdef i_scalex2
+ mov r_tmp0, i_scalex2
+ shr r_tmp0, 1
+ imul i_width_cnt, r_tmp0
+%else
+ imul i_width_cnt, i_scalex
+%endif
+ add i_xpos, i_width_cnt
+ shr i_xpos, 16
+ movzx r_tmp0, byte [p_src_row0 + i_xpos]
+ mov [p_dst - 1], r_tmp0b
+%ifdef X86_32
+ mov r_tmp0, i_scaleyd
+ add i_yposd, r_tmp0
+%else
+ add i_yposd, i_scaleyd
+%endif
+ add p_dst, i_dst_stride_less_width
+ sub i_dst_height, 1
+ jg %%height
+%endmacro
+
+;**************************************************************************************************************
+;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+; uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearFastDownsampler_avx2
+ %assign push_num 0
+%ifndef X86_32
+ push r12
+ push r13
+ push rbx
+ push rbp
+ %assign push_num 4
+%ifdef WIN64
+ push rdi
+ push rsi
+ %assign push_num push_num + 2
+%endif
+%endif
+ LOAD_7_PARA
+ PUSH_XMM 16
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ ZERO_EXTENSION r6d
+ sub r1, r2 ; dst_stride - dst_width
+%ifdef X86_32
+ vmovd xmm0, arg8
+ vmovd xmm1, esp
+ and esp, -32
+ sub esp, 8 * 4 + 8 * 32
+ vmovd [esp], xmm1
+ %define p_dst r0
+ %define i_dst_stride_less_width [esp + 1 * 4]
+ %define i_dst_width [esp + 2 * 4]
+ %define i_dst_height dword [esp + 3 * 4]
+ %define p_src [esp + 4 * 4]
+ %define i_src_stride [esp + 5 * 4]
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd [esp + 6 * 4]
+ %define i_xpos r2
+ %define i_ypos [esp + 7 * 4]
+ %define i_yposd dword [esp + 7 * 4]
+ %define p_src_row0 r3
+ %define p_src_row1 r4
+ %define i_width_cnt r5
+ %define r_tmp0 r1
+ %define r_tmp0b r1b
+ %define ymm_xpos_frac ymm1
+ %define ymm_xpos_frac_inc [esp + 8 * 4]
+ %define ymm_xpos_int ymm3
+ %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32]
+ %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32]
+ %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32]
+ %define xmm_tmp0 xmm7
+ %define ymm_tmp0 ymm7
+ %define xmm_tmp1 xmm0
+ %define ymm_tmp1 ymm0
+ %define xmm_tmp2 xmm2
+ %define ymm_tmp2 ymm2
+ %define xmm_tmp3 xmm4
+ %define ymm_tmp3 ymm4
+ %define xmm_tmp4 xmm5
+ %define ymm_tmp4 ymm5
+ %define xmm_tmp5 xmm6
+ %define ymm_tmp5 ymm6
+ %define ymm_0 [esp + 8 * 4 + 4 * 32]
+ %define ymm_ffff [esp + 8 * 4 + 5 * 32]
+ %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
+ %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+ mov i_dst_stride_less_width, r1
+ mov i_dst_width, r2
+ mov i_dst_height, r3
+ mov p_src, r4
+ mov i_src_stride, r5
+ vmovd i_scaleyd, xmm0
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa ymm_0, ymm0
+ vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vmovdqa ymm_ffff, ymm_tmp0
+%else
+ %define p_dst r0
+ %define i_dst_stride_less_width r1
+ %define i_dst_width r2
+ %define i_dst_height r3
+ %define p_src r4
+ %define i_src_stride r5
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd dword arg8d
+ %define i_xpos r12
+ %define i_ypos r13
+ %define i_yposd r13d
+ %define p_src_row0 rbp
+%ifdef WIN64
+ %define p_src_row1 rsi
+ %define i_width_cnt rdi
+%else
+ %define p_src_row1 r11
+ %define i_width_cnt rax
+%endif
+ %define r_tmp0 rbx
+ %define r_tmp0b bl
+ %define ymm_0 ymm0
+ %define ymm_xpos_frac ymm1
+ %define ymm_xpos_frac_inc ymm2
+ %define ymm_xpos_int ymm3
+ %define ymm_xpos_int_inc ymm4
+ %define ymm_yfrac0 ymm5
+ %define ymm_yfrac1 ymm6
+ %define xmm_tmp0 xmm7
+ %define ymm_tmp0 ymm7
+ %define xmm_tmp1 xmm8
+ %define ymm_tmp1 ymm8
+ %define xmm_tmp2 xmm9
+ %define ymm_tmp2 ymm9
+ %define xmm_tmp3 xmm10
+ %define ymm_tmp3 ymm10
+ %define xmm_tmp4 xmm11
+ %define ymm_tmp4 ymm11
+ %define xmm_tmp5 xmm12
+ %define ymm_tmp5 ymm12
+ %define ymm_ffff ymm13
+ %define ymm_xpos_int_begin ymm14
+ %define ymm_xpos_frac_begin ymm15
+ vpxor ymm_0, ymm_0, ymm_0
+ vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff
+%endif
+
+ sub i_dst_height, 1
+ je .final_row
+ jl .done
+
+ mov i_yposd, 1 << 14
+ ; build the 16 initial x positions in 16.16 fixed point, xpos_k = (1 << 15) + k * i_scalex,
+ ; then split them into integer bytes (ymm_xpos_int) and 16-bit fractions (ymm_xpos_frac)
+ vmovd xmm_tmp0, i_scalexd
+ vpbroadcastd ymm_tmp0, xmm_tmp0
+ vpslld ymm_tmp1, ymm_tmp0, 2
+ vpslld ymm_tmp2, ymm_tmp0, 3
+ vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
+ vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0
+ vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
+ vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
+ vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0
+ vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
+ vpsrld ymm_tmp3, ymm_tmp3, 31
+ vpslld ymm_tmp3, ymm_tmp3, 15
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
+ vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpsrld ymm_xpos_int, ymm_tmp1, 16
+ vpsrld ymm_tmp0, ymm_tmp2, 16
+ vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ vpermq ymm_xpos_int, ymm_xpos_int, 11011000b
+ vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
+ vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
+ vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3
+ vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ vpslld ymm_tmp1, ymm_tmp1, 16
+ vpsrld ymm_tmp1, ymm_tmp1, 16
+ vpslld ymm_tmp2, ymm_tmp2, 16
+ vpsrld ymm_tmp2, ymm_tmp2, 16
+ vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2
+ vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b
+ vmovd xmm_tmp0, i_scalexd
+ vpslld xmm_tmp0, xmm_tmp0, 4
+ vpbroadcastw ymm_tmp1, xmm_tmp0
+ vmovdqa ymm_xpos_frac_inc, ymm_tmp1
+ vpsrld xmm_tmp0, xmm_tmp0, 16
+ vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3
+ vpbroadcastb ymm_tmp0, xmm_tmp0
+ vmovdqa ymm_xpos_int_inc, ymm_tmp0
+ vmovdqa ymm_xpos_int_begin, ymm_xpos_int
+ vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac
+
+ cmp i_scalex, 4 << 16
+ ja .scalex_above4
+ cmp i_scalex, 2 << 16
+ ja .scalex_above2_beloweq4
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above2_beloweq4:
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above4:
+ cmp i_scalex, 8 << 16
+ ja .scalex_above8
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above8:
+%xdefine ymm_xfrac0 ymm_xpos_frac
+%xdefine ymm_xfrac1 ymm_xpos_int
+%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
+%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
+%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_frac
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xpos_int_inc
+%undef ymm_xpos_frac_inc
+ AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
+ vpermq ymm_xfrac0, ymm_tmp0, 01001110b
+ vpermq ymm_xfrac1, ymm_xfrac1, 01001110b
+ vmovdqa ymm_xfrac0_begin, ymm_xfrac0
+ vmovdqa ymm_xfrac1_begin, ymm_xfrac1
+ vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+ vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+ vmovdqa ymm_xfrac_inc, ymm_tmp0
+ AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1
+
+.final_row:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+
+.final_row_width:
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movzx r_tmp0, byte [p_src_row0 + r_tmp0]
+ mov [p_dst], r_tmp0b
+ add p_dst, 1
+ add i_xpos, i_scalex
+ sub i_width_cnt, 1
+ jg .final_row_width
+
+.done:
+ vzeroupper
+%ifdef X86_32
+ mov esp, [esp]
+%endif
+ POP_XMM
+ LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+ pop rsi
+ pop rdi
+%endif
+ pop rbp
+ pop rbx
+ pop r13
+ pop r12
+%endif
+ ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef ymm_xpos_frac
+%undef ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_int_inc
+%undef ymm_yfrac0
+%undef ymm_yfrac1
+%undef xmm_tmp0
+%undef ymm_tmp0
+%undef xmm_tmp1
+%undef ymm_tmp1
+%undef xmm_tmp2
+%undef ymm_tmp2
+%undef xmm_tmp3
+%undef ymm_tmp3
+%undef xmm_tmp4
+%undef ymm_tmp4
+%undef xmm_tmp5
+%undef ymm_tmp5
+%undef ymm_ffff
+%undef ymm_0
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xfrac0
+%undef ymm_xfrac1
+%undef ymm_xfrac0_begin
+%undef ymm_xfrac1_begin
+%undef ymm_xfrac_inc
+
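For orientation (not part of the patch itself): the AVX2 kernels above vectorize a 16.16 fixed-point bilinear walk over pairs of source rows, 16 destination pixels at a time, with the last column of each row and the whole last row produced as plain copies. A minimal scalar sketch of that model follows; the function name is hypothetical, it assumes uiScaleX carries 16 and uiScaleY 15 fractional bits (matching the ">> 16" / ">> 15" shifts in the assembly), and the exact rounding of the fast and accurate kernels differs from this simplified version.

    #include <stdint.h>

    /* Hypothetical scalar model of the bilinear walk the AVX2 kernels vectorize. */
    static void GeneralBilinearDownsampleScalarSketch (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
        int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY) {
      if (iDstHeight < 1 || iDstWidth < 1)
        return;
      uint32_t uiYpos = 1 << 14;                                  /* cf. "mov i_yposd, 1 << 14" */
      for (int32_t y = 0; y < iDstHeight - 1; ++y) {
        const uint8_t* pRow0 = pSrc + (uiYpos >> 15) * iSrcStride;
        const uint8_t* pRow1 = pRow0 + iSrcStride;
        const uint32_t uiYfrac = uiYpos & 0x7fff;                 /* 15-bit vertical weight */
        uint32_t uiXpos = 1 << 15;                                /* 16.16 fixed point, start at 0.5 */
        for (int32_t x = 0; x < iDstWidth - 1; ++x) {
          const uint32_t uiX = uiXpos >> 16;
          const uint64_t uiXfrac = uiXpos & 0xffff;               /* 16-bit horizontal weight */
          const uint64_t uiTop = pRow0[uiX] * (0x10000 - uiXfrac) + pRow0[uiX + 1] * uiXfrac;
          const uint64_t uiBot = pRow1[uiX] * (0x10000 - uiXfrac) + pRow1[uiX + 1] * uiXfrac;
          pDst[x] = (uint8_t) ((uiTop * (0x8000 - uiYfrac) + uiBot * uiYfrac) >> 31);
          uiXpos += uiScaleX;
        }
        pDst[iDstWidth - 1] = pRow0[uiXpos >> 16];                /* last column: plain copy, as in the asm */
        pDst += iDstStride;
        uiYpos += uiScaleY;
      }
      /* final destination row: nearest sample from the selected source row */
      const uint8_t* pRow0 = pSrc + (uiYpos >> 15) * iSrcStride;
      uint32_t uiXpos = 1 << 15;
      for (int32_t x = 0; x < iDstWidth; ++x, uiXpos += uiScaleX)
        pDst[x] = pRow0[uiXpos >> 16];
    }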
+;**************************************************************************************************************
+;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
+; int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
+; uint32_t uiScaleY);
+;
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2
+ %assign push_num 0
+%ifndef X86_32
+ push r12
+ push r13
+ push rbx
+ push rbp
+ %assign push_num 4
+%ifdef WIN64
+ push rdi
+ push rsi
+ %assign push_num push_num + 2
+%endif
+%endif
+ LOAD_7_PARA
+ PUSH_XMM 16
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ ZERO_EXTENSION r6d
+ sub r1, r2 ; dst_stride - dst_width
+ add r6, r6 ; 2 * scalex
+%ifdef X86_32
+ vmovd xmm0, arg8
+ vmovd xmm1, esp
+ and esp, -32
+ sub esp, 8 * 4 + 8 * 32
+ vmovd [esp], xmm1
+ %define p_dst r0
+ %define i_dst_stride_less_width [esp + 1 * 4]
+ %define i_dst_width [esp + 2 * 4]
+ %define i_dst_height dword [esp + 3 * 4]
+ %define p_src [esp + 4 * 4]
+ %define i_src_stride [esp + 5 * 4]
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd [esp + 6 * 4]
+ %define i_xpos r2
+ %define i_ypos [esp + 7 * 4]
+ %define i_yposd dword [esp + 7 * 4]
+ %define p_src_row0 r3
+ %define p_src_row1 r4
+ %define i_width_cnt r5
+ %define r_tmp0 r1
+ %define r_tmp0b r1b
+ %define ymm_xpos_frac ymm1
+ %define ymm_xpos_frac_inc [esp + 8 * 4]
+ %define ymm_xpos_int ymm3
+ %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32]
+ %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32]
+ %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32]
+ %define xmm_tmp0 xmm7
+ %define ymm_tmp0 ymm7
+ %define xmm_tmp1 xmm0
+ %define ymm_tmp1 ymm0
+ %define xmm_tmp2 xmm2
+ %define ymm_tmp2 ymm2
+ %define xmm_tmp3 xmm4
+ %define ymm_tmp3 ymm4
+ %define xmm_tmp4 xmm5
+ %define ymm_tmp4 ymm5
+ %define xmm_tmp5 xmm6
+ %define ymm_tmp5 ymm6
+ %define ymm_0 [esp + 8 * 4 + 4 * 32]
+ %define ymm_7fff [esp + 8 * 4 + 5 * 32]
+ %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
+ %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+ mov i_dst_stride_less_width, r1
+ mov i_dst_width, r2
+ mov i_dst_height, r3
+ mov p_src, r4
+ mov i_src_stride, r5
+ vmovd i_scaleyd, xmm0
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa ymm_0, ymm0
+ vpcmpeqw ymm0, ymm0, ymm0
+ vpsrlw ymm0, ymm0, 1
+ vmovdqa ymm_7fff, ymm0
+%else
+ %define p_dst r0
+ %define i_dst_stride_less_width r1
+ %define i_dst_width r2
+ %define i_dst_height r3
+ %define p_src r4
+ %define i_src_stride r5
+ %define i_scalex r6
+ %define i_scalexd r6d
+ %define i_scaleyd dword arg8d
+ %define i_xpos r12
+ %define i_ypos r13
+ %define i_yposd r13d
+ %define p_src_row0 rbp
+%ifdef WIN64
+ %define p_src_row1 rsi
+ %define i_width_cnt rdi
+%else
+ %define p_src_row1 r11
+ %define i_width_cnt rax
+%endif
+ %define r_tmp0 rbx
+ %define r_tmp0b bl
+ %define ymm_0 ymm0
+ %define ymm_xpos_frac ymm1
+ %define ymm_xpos_int ymm3
+ %define ymm_xpos_frac_inc ymm2
+ %define ymm_xpos_int_inc ymm4
+ %define ymm_yfrac0 ymm5
+ %define ymm_yfrac1 ymm6
+ %define xmm_tmp0 xmm7
+ %define ymm_tmp0 ymm7
+ %define xmm_tmp1 xmm8
+ %define ymm_tmp1 ymm8
+ %define xmm_tmp2 xmm9
+ %define ymm_tmp2 ymm9
+ %define xmm_tmp3 xmm10
+ %define ymm_tmp3 ymm10
+ %define xmm_tmp4 xmm11
+ %define ymm_tmp4 ymm11
+ %define xmm_tmp5 xmm12
+ %define ymm_tmp5 ymm12
+ %define ymm_7fff ymm13
+ %define ymm_xpos_int_begin ymm14
+ %define ymm_xpos_frac_begin ymm15
+ vpxor ymm_0, ymm_0, ymm_0
+ vpcmpeqw ymm_7fff, ymm_7fff, ymm_7fff
+ vpsrlw ymm_7fff, ymm_7fff, 1
+%endif
+
+ sub i_dst_height, 1
+ je .final_row
+ jl .done
+
+ mov i_yposd, 1 << 14
+ ; build the 16 initial x positions in 16.16 fixed point, xpos_k = (1 << 15) + k * i_scalex;
+ ; integer parts become bytes in ymm_xpos_int, fractions are kept with 15-bit precision in ymm_xpos_frac
+ vmovd xmm_tmp0, i_scalexd
+ vpbroadcastd ymm_tmp0, xmm_tmp0
+ vpslld ymm_tmp1, ymm_tmp0, 2
+ vpslld ymm_tmp2, ymm_tmp0, 3
+ vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
+ vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
+ vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0
+ vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
+ vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
+ vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0
+ vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0
+ vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
+ vpsrld ymm_tmp3, ymm_tmp3, 31
+ vpslld ymm_tmp3, ymm_tmp3, 15
+ vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
+ vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3
+ vpsrld ymm_xpos_int, ymm_tmp1, 16
+ vpsrld ymm_tmp0, ymm_tmp2, 16
+ vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ vpermq ymm_xpos_int, ymm_xpos_int, 11011000b
+ vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
+ vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
+ vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3
+ vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
+ vpslld ymm_tmp1, ymm_tmp1, 16
+ vpsrld ymm_tmp1, ymm_tmp1, 16
+ vpslld ymm_tmp2, ymm_tmp2, 16
+ vpsrld ymm_tmp2, ymm_tmp2, 16
+ vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2
+ vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b
+ vpsrlw ymm_xpos_frac, ymm_xpos_frac, 1
+ vmovd xmm_tmp0, i_scalexd
+ vpslld xmm_tmp0, xmm_tmp0, 4
+ vpbroadcastw ymm_tmp1, xmm_tmp0
+ vpsrlw ymm_tmp1, ymm_tmp1, 1
+ vmovdqa ymm_xpos_frac_inc, ymm_tmp1
+ vpsrld xmm_tmp0, xmm_tmp0, 16
+ vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3
+ vpbroadcastb ymm_tmp0, xmm_tmp0
+ vmovdqa ymm_xpos_int_inc, ymm_tmp0
+ vmovdqa ymm_xpos_int_begin, ymm_xpos_int
+ vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac
+
+ cmp i_scalex, 4 << 16
+ ja .scalex_above4
+ cmp i_scalex, 2 << 16
+ ja .scalex_above2_beloweq4
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above2_beloweq4:
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above4:
+ cmp i_scalex, 8 << 16
+ ja .scalex_above8
+ add i_scalex, i_scalex
+%xdefine i_scalex2 i_scalex
+%undef i_scalex
+ AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0
+ shr i_scalex2, 1
+%xdefine i_scalex i_scalex2
+%undef i_scalex2
+ jmp .final_row
+.scalex_above8:
+%xdefine ymm_xfrac0 ymm_xpos_frac
+%xdefine ymm_xfrac1 ymm_xpos_int
+%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
+%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
+%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_frac
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xpos_int_inc
+%undef ymm_xpos_frac_inc
+ AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff
+ vpermq ymm_xfrac0, ymm_tmp0, 01001110b
+ vpermq ymm_xfrac1, ymm_xfrac1, 01001110b
+ vmovdqa ymm_xfrac0_begin, ymm_xfrac0
+ vmovdqa ymm_xfrac1_begin, ymm_xfrac1
+ vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
+ vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+ vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
+ vmovdqa ymm_xfrac_inc, ymm_tmp0
+ AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0
+
+.final_row:
+ mov p_src_row0, i_ypos
+ shr p_src_row0, 15
+ imul p_src_row0, i_src_stride
+ add p_src_row0, p_src
+ mov i_xpos, 1 << 15
+ mov i_width_cnt, i_dst_width
+
+.final_row_width:
+ mov r_tmp0, i_xpos
+ shr r_tmp0, 16
+ movzx r_tmp0, byte [p_src_row0 + r_tmp0]
+ mov [p_dst], r_tmp0b
+ add p_dst, 1
+ add i_xpos, i_scalex
+ sub i_width_cnt, 1
+ jg .final_row_width
+
+.done:
+ vzeroupper
+%ifdef X86_32
+ mov esp, [esp]
+%endif
+ POP_XMM
+ LOAD_7_PARA_POP
+%ifndef X86_32
+%ifdef WIN64
+ pop rsi
+ pop rdi
+%endif
+ pop rbp
+ pop rbx
+ pop r13
+ pop r12
+%endif
+ ret
+%undef p_dst
+%undef i_dst_stride_less_width
+%undef i_dst_width
+%undef i_dst_height
+%undef p_src
+%undef i_src_stride
+%undef i_scalex
+%undef i_scalexd
+%undef i_scaleyd
+%undef i_xpos
+%undef i_ypos
+%undef i_yposd
+%undef p_src_row0
+%undef p_src_row1
+%undef i_width_cnt
+%undef r_tmp0
+%undef r_tmp0b
+%undef ymm_xpos_frac
+%undef ymm_xpos_frac_inc
+%undef ymm_xpos_int
+%undef ymm_xpos_int_inc
+%undef ymm_yfrac0
+%undef ymm_yfrac1
+%undef xmm_tmp0
+%undef ymm_tmp0
+%undef xmm_tmp1
+%undef ymm_tmp1
+%undef xmm_tmp2
+%undef ymm_tmp2
+%undef xmm_tmp3
+%undef ymm_tmp3
+%undef xmm_tmp4
+%undef ymm_tmp4
+%undef xmm_tmp5
+%undef ymm_tmp5
+%undef ymm_0
+%undef ymm_7fff
+%undef ymm_xpos_int_begin
+%undef ymm_xpos_frac_begin
+%undef ymm_xfrac0
+%undef ymm_xfrac1
+%undef ymm_xfrac0_begin
+%undef ymm_xfrac1_begin
+%undef ymm_xfrac_inc
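Note (illustrative, not part of the patch): the exported *_avx2 entry points take the scale factors already in fixed point, so a caller-side wrapper is expected to derive them from the source and destination dimensions. One plausible sketch, assuming the 16/15 fractional-bit split inferred from the assembly, the prototype declared in downsample.h, and a hypothetical helper name; the real wrappers (GeneralBilinearFastDownsamplerWrap_avx2 etc.) live in downsamplefuncs.cpp and may compute the scales differently:

    #include <stdint.h>

    /* Hypothetical helper showing how the fixed-point scale arguments could be derived. */
    static void CallGeneralBilinearFastAvx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
        int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight) {
      const uint32_t uiScaleX = (uint32_t) (((uint64_t) iSrcWidth << 16) / iDstWidth);   /* 16 fractional bits */
      const uint32_t uiScaleY = (uint32_t) (((uint64_t) iSrcHeight << 15) / iDstHeight); /* 15 fractional bits */
      GeneralBilinearFastDownsampler_avx2 (pDst, iDstStride, iDstWidth, iDstHeight,
                                           pSrc, iSrcStride, uiScaleX, uiScaleY);
    }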
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -296,22 +296,24 @@
int src_stride_a; \
int src_width_a; \
int src_height_a; \
- dst_stride_c = dst_stride_a = 320; \
- src_stride_c = src_stride_a = 320; \
- src_width_c = src_width_a = 320; \
- src_height_c = src_height_a = 180; \
- dst_width_c = dst_width_a = 300; \
- dst_height_c = dst_height_a = 160; \
- for (int j = 0; j < 70000; j++) { \
- dst_c[j] = dst_a[j] = rand() % 256; \
- src_c[j] = src_a[j] = rand() % 256; \
- } \
- ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
- func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
- for (int j = 0; j < dst_height_c; j++) { \
- for (int m = 0; m < dst_width_c ; m++) { \
- ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+ for (int i = 0; i < 5; i++) { \
+ dst_stride_c = dst_stride_a = 320; \
+ src_stride_c = src_stride_a = 320; \
+ src_width_c = src_width_a = 320; \
+ src_height_c = src_height_a = 180; \
+ dst_width_c = dst_width_a = (src_width_c >> (i + 1)) + rand() % (src_width_c >> (i + 1)); \
+ dst_height_c = dst_height_a = (src_height_c >> (i + 1)) + rand() % (src_height_c >> (i + 1)); \
+ for (int j = 0; j < 70000; j++) { \
+ dst_c[j] = dst_a[j] = rand() % 256; \
+ src_c[j] = src_a[j] = rand() % 256; \
} \
+ ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
+ func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
+ for (int j = 0; j < dst_height_c; j++) { \
+ for (int m = 0; m < dst_width_c ; m++) { \
+ ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+ } \
+ } \
} \
}
@@ -343,6 +345,14 @@
WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
+ WELS_CPU_SSSE3)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
+ GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
+ WELS_CPU_AVX2)
+GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
+ GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)