ref: 64657d3cfdb1dbdc64b30acae3f7828862e91094
parent: 69a62ea58ec64c5e2d6c7f3d8f8ed4659a45cb85
author: Guangwei Wang <guangwwa@cisco.com>
date: Wed Aug 19 11:23:54 EDT 2015
Add new C and assembly functions to optimize the downsampler for 1:3 and 1:4 downscale ratios
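
Both new scalers reuse the arithmetic of the existing dyadic halve path: each
output pixel is the rounded average of the top-left 2x2 corner of a 3x3 (1:3)
or 4x4 (1:4) source tile. A minimal C sketch of that shared kernel
(illustrative only; the real code is in the downsamplefuncs.cpp hunk below,
with iStep = 3 or 4):

    #include <stdint.h>

    // Rounded bilinear average of the 2x2 top-left corner of one tile.
    static inline uint8_t TileAverage (const uint8_t* pSrcLine, int32_t kiSrcStride,
                                       int32_t i, int32_t iStep) {
      const int32_t kiSrcX = i * iStep;
      const int32_t kiRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
      const int32_t kiRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
      return (uint8_t) ((kiRow1 + kiRow2 + 1) >> 1);
    }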
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -338,4 +338,121 @@
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
+WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
+ stmdb sp!, {r4-r8, lr}
+
+ //Get the width and height
+ ldr r4, [sp, #24] //src_width
+ ldr r5, [sp, #28] //src_height
+
+    //Initialize the registers
+ mov r6, r2
+ mov r8, r0
+ mov lr, #0
+
+    //Save the tail for the unaligned size
+ mla r7, r1, r5, r0
+ vld1.32 {q15}, [r7]
+
+ add r7, r2, r3
+    //process a column of data
+comp_ds_bilinear_onethird_loop0:
+
+ vld3.8 {d0, d1, d2}, [r2]!
+ vld3.8 {d3, d4, d5}, [r2]!
+ vld3.8 {d16, d17, d18}, [r7]!
+ vld3.8 {d19, d20, d21}, [r7]!
+
+ vaddl.u8 q11, d0, d1
+ vaddl.u8 q12, d3, d4
+ vaddl.u8 q13, d16, d17
+ vaddl.u8 q14, d19, d20
+ vrshr.u16 q11, #1
+ vrshr.u16 q12, #1
+ vrshr.u16 q13, #1
+ vrshr.u16 q14, #1
+
+ vrhadd.u16 q11, q13
+ vrhadd.u16 q12, q14
+
+ vmovn.u16 d0, q11
+ vmovn.u16 d1, q12
+ vst1.8 {q0}, [r0]!
+
+ add lr, #48
+ cmp lr, r4
+ movcs lr, #0
+ addcs r6, r3, lsl #1
+ addcs r6, r6, r3
+ movcs r2, r6
+ addcs r7, r2, r3
+ addcs r8, r1
+ movcs r0, r8
+ subscs r5, #1
+ bne comp_ds_bilinear_onethird_loop0
+
+    //restore the tail for the unaligned size
+ vst1.32 {q15}, [r0]
+
+ ldmia sp!, {r4-r8,lr}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
+ stmdb sp!, {r4-r8, lr}
+
+ //Get the width and height
+ ldr r4, [sp, #24] //src_width
+ ldr r5, [sp, #28] //src_height
+
+    //Initialize the registers
+ mov r6, r2
+ mov r8, r0
+ mov lr, #0
+ lsr r5, #2
+
+    //Save the tail for the unaligned size
+ mla r7, r1, r5, r0
+ vld1.32 {q15}, [r7]
+
+ add r7, r2, r3
+    //process a column of data
+comp_ds_bilinear_quarter_loop0:
+
+ vld2.16 {q0, q1}, [r2]!
+ vld2.16 {q2, q3}, [r2]!
+ vld2.16 {q8, q9}, [r7]!
+ vld2.16 {q10, q11}, [r7]!
+
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q8, q8
+ vpaddl.u8 q10, q10
+ vrshr.u16 q0, #1
+ vrshr.u16 q2, #1
+ vrshr.u16 q8, #1
+ vrshr.u16 q10, #1
+
+ vrhadd.u16 q0, q8
+ vrhadd.u16 q2, q10
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ vst1.8 {q0}, [r0]!
+
+ add lr, #64
+ cmp lr, r4
+ movcs lr, #0
+ addcs r6, r3, lsl #2
+ movcs r2, r6
+ addcs r7, r2, r3
+ addcs r8, r1
+ movcs r0, r8
+ subscs r5, #1
+ bne comp_ds_bilinear_quarter_loop0
+
+    //restore the tail for the unaligned size
+ vst1.32 {q15}, [r0]
+
+ ldmia sp!, {r4-r8,lr}
+WELS_ASM_FUNC_END
+
#endif
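
Note on the q15/xmm7 bookkeeping in these kernels: the vector loops always
store whole 16-byte chunks, so the last store of the last row can run past
pDst + kiDstStride * kiHeight. Instead of a scalar tail loop, the 16 bytes at
that address are loaded once before the loop and written back after it. A C
model of the same trick (RunVectorLoop is a placeholder, not a real function):

    uint8_t sTail[16];
    memcpy (sTail, pDst + kiDstStride * kiHeight, 16); // bytes the loop may clobber
    RunVectorLoop (pDst, pSrc /*...*/);                // 16-byte stores may overshoot
    memcpy (pDst + kiDstStride * kiHeight, sTail, 16); // put them back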
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -84,7 +84,6 @@
WELS_ASM_AARCH64_FUNC_END
-
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
sub w9, w3, w4
sub w1, w1, w4, lsr #1
@@ -121,6 +120,113 @@
add x0, x0, w1, sxtw
sub w5, w5, #1
cbnz w5, comp_ds_bilinear_w_x32_loop0
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon
+
+    //Initialize the registers
+ mov x6, x2
+ mov x8, x0
+ mov w9, #0
+
+    //Save the tail for the unaligned size
+ smaddl x7, w1, w5, x0
+ ld1 {v16.16b}, [x7]
+
+ add x7, x2, w3, sxtw
+    //process a column of data
+comp_ds_bilinear_onethird_loop0:
+
+ ld3 {v0.16b, v1.16b, v2.16b}, [x2], #48
+ ld3 {v4.16b, v5.16b, v6.16b}, [x7], #48
+
+ uaddl v2.8h, v0.8b, v1.8b
+ uaddl2 v3.8h, v0.16b, v1.16b
+ uaddl v6.8h, v4.8b, v5.8b
+ uaddl2 v7.8h, v4.16b, v5.16b
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ urshr v6.8h, v6.8h, #1
+ urshr v7.8h, v7.8h, #1
+
+ urhadd v0.8h, v2.8h, v6.8h
+ urhadd v1.8h, v3.8h, v7.8h
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v1.8h
+ st1 {v0.8b,v1.8b}, [x0], #16
+
+ add w9, w9, #48
+
+ cmp w9, w4
+ b.cc comp_ds_bilinear_onethird_loop0
+
+ mov w9, #0
+ add x6, x6, w3, sxtw #1
+ add x6, x6, w3, sxtw
+ mov x2, x6
+ add x7, x2, w3, sxtw
+ add x8, x8, w1, sxtw
+ mov x0, x8
+ sub w5, w5, #1
+
+ cbnz w5, comp_ds_bilinear_onethird_loop0
+
+    //restore the tail for the unaligned size
+ st1 {v16.16b}, [x0]
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
+    //Initialize the registers
+ mov x6, x2
+ mov x8, x0
+ mov w9, #0
+ lsr w5, w5, #2
+
+    //Save the tail for the unaligned size
+ smaddl x7, w1, w5, x0
+ ld1 {v16.16b}, [x7]
+
+ add x7, x2, w3, sxtw
+    //process a column of data
+comp_ds_bilinear_quarter_loop0:
+
+ ld2 {v0.8h, v1.8h}, [x2], #32
+ ld2 {v2.8h, v3.8h}, [x2], #32
+ ld2 {v4.8h, v5.8h}, [x7], #32
+ ld2 {v6.8h, v7.8h}, [x7], #32
+
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v2.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v5.8h, v6.16b
+ urshr v0.8h, v0.8h, #1
+ urshr v1.8h, v1.8h, #1
+ urshr v4.8h, v4.8h, #1
+ urshr v5.8h, v5.8h, #1
+
+ urhadd v0.8h, v0.8h, v4.8h
+ urhadd v1.8h, v1.8h, v5.8h
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v1.8h
+ st1 {v0.8b,v1.8b}, [x0], #16
+
+ add w9, w9, #64
+
+ cmp w9, w4
+ b.cc comp_ds_bilinear_quarter_loop0
+
+ mov w9, #0
+ add x6, x6, w3, sxtw #2
+ mov x2, x6
+ add x7, x2, w3, sxtw
+ add x8, x8, w1, sxtw
+ mov x0, x8
+ sub w5, w5, #1
+
+ cbnz w5, comp_ds_bilinear_quarter_loop0
+
+    //restore the tail for the unaligned size
+ st1 {v16.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
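
The one-third kernels lean on the de-interleaving LD3/VLD3 loads: three
registers receive every third byte of the source. A scalar model of one
48-byte LD3 (illustrative):

    // v0 gets bytes 0,3,6,..., v1 gets 1,4,7,..., v2 gets 2,5,8,...
    for (int i = 0; i < 16; i++) {
      v0[i] = pSrc[3 * i + 0];
      v1[i] = pSrc[3 * i + 1];
      v2[i] = pSrc[3 * i + 2];
    }

Adding matching lanes of v0 and v1 with a rounding shift then yields the
horizontal (A + B + 1) >> 1 term for every third output pixel in one step;
v2 is discarded, exactly as the C reference skips the third column.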
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -53,6 +53,8 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
+ sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
#if defined(X86_ASM)
@@ -60,6 +62,7 @@
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
}
if (iCpuFlag & WELS_CPU_SSE2) {
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
@@ -68,10 +71,14 @@
if (iCpuFlag & WELS_CPU_SSSE3) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
+ sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
+ sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
}
#endif//X86_ASM
@@ -81,6 +88,8 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+ sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
}
@@ -92,6 +101,8 @@
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+ sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
+ sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
}
@@ -124,6 +135,28 @@
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+ } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
+
+ m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+ (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+
+ m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+ (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+
+ m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+ (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+
+ } else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) {
+
+ m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+ (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY);
+
+ m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+ (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV);
+
+ m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+ (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV);
+
} else {
m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
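
One asymmetry worth noting in the dispatch above: the quarter kernels receive
the source height and shift it down themselves (lsr r5, #2 / sar r5, $02),
while the one-third kernels receive the destination height, the divide-by-3
being done once at the call site. Condensed (a sketch, not the literal code):

    if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY)
      pfQuarterDownsampler  (pDst, iDstStride, pSrc, iSrcStride, iSrcWidthY, iSrcHeightY); // src height
    else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY)
      pfOneThirdDownsampler (pDst, iDstStride, pSrc, iSrcStride, iSrcWidthY, iDstHeightY); // dst height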
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -54,20 +54,29 @@
uint8_t* pSrc, const int32_t kiSrcStride,
const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
+ uint8_t* pSrc, const int32_t kiSrcStride,
+ const int32_t kiSrcWidth, const int32_t kiHeight);
+
typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
typedef HalveDownsampleFunc* PHalveDownsampleFunc;
+typedef SpecificDownsampleFunc* PSpecificDownsampleFunc;
typedef GeneralDownsampleFunc* PGeneralDownsampleFunc;
-HalveDownsampleFunc DyadicBilinearDownsampler_c;
+HalveDownsampleFunc DyadicBilinearDownsampler_c;
GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_c;
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c;
typedef struct {
// align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
PHalveDownsampleFunc pfHalfAverage[4];
+ PSpecificDownsampleFunc pfOneThirdDownsampler;
+ PSpecificDownsampleFunc pfQuarterDownsampler;
PGeneralDownsampleFunc pfGeneralRatioLuma;
PGeneralDownsampleFunc pfGeneralRatioChroma;
} SDownsampleFuncs;
@@ -93,10 +102,19 @@
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
+SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse;
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_ssse3;
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_sse4;
+
void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+ const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
+ const uint32_t kuiScaleY);
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+ const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
+ const uint32_t kuiScaleY);
+
WELSVP_EXTERN_C_END
#endif
@@ -109,6 +127,10 @@
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
+SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_neon;
+
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_neon;
+
void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
@@ -125,8 +147,13 @@
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
-void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_AArch64_neon;
+
+SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_AArch64_neon;
+
+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
+ const int32_t kiDstWidth, const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -68,6 +68,53 @@
}
}
+void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
+ uint8_t* pSrc, const int32_t kiSrcStride,
+ const int32_t kiSrcWidth, const int32_t kiSrcHeight)
+
+{
+ uint8_t* pDstLine = pDst;
+ uint8_t* pSrcLine = pSrc;
+ const int32_t kiSrcStridex4 = kiSrcStride << 2;
+ const int32_t kiDstWidth = kiSrcWidth >> 2;
+ const int32_t kiDstHeight = kiSrcHeight >> 2;
+
+ for (int32_t j = 0; j < kiDstHeight; j ++) {
+ for (int32_t i = 0; i < kiDstWidth; i ++) {
+ const int32_t kiSrcX = i << 2;
+ const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
+ const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
+
+ pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
+ }
+ pDstLine += kiDstStride;
+ pSrcLine += kiSrcStridex4;
+ }
+}
+
+void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
+ uint8_t* pSrc, const int32_t kiSrcStride,
+ const int32_t kiSrcWidth, const int32_t kiDstHeight)
+
+{
+ uint8_t* pDstLine = pDst;
+ uint8_t* pSrcLine = pSrc;
+ const int32_t kiSrcStridex3 = kiSrcStride * 3;
+ const int32_t kiDstWidth = kiSrcWidth / 3;
+
+ for (int32_t j = 0; j < kiDstHeight; j ++) {
+ for (int32_t i = 0; i < kiDstWidth; i ++) {
+ const int32_t kiSrcX = i * 3;
+ const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
+ const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
+
+ pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
+ }
+ pDstLine += kiDstStride;
+ pSrcLine += kiSrcStridex3;
+ }
+}
+
void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
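
A worked example of the quarter kernel above for one output pixel, with
made-up samples pSrcLine[0..1] = {10, 20} and pSrcLine[kiSrcStride..+1] =
{30, 40}:

    kiTempRow1  = (10 + 20 + 1) >> 1 = 15
    kiTempRow2  = (30 + 40 + 1) >> 1 = 35
    pDstLine[0] = (15 + 35 + 1) >> 1 = 25

Only the top-left 2x2 corner of each 4x4 (or 3x3) tile contributes; the
remaining tile pixels are skipped, which keeps the C output bit-exact with
the SIMD versions.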
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -67,7 +67,23 @@
add_extra_half:
dd 16384,0,0,0
+shufb_mask_quarter:
+db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h
+shufb_mask_onethird_low_1:
+db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_low_2:
+db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_low_3:
+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh
+
+shufb_mask_onethird_high_1:
+db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_high_2:
+db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h
+shufb_mask_onethird_high_3:
+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh
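+
+; Note (illustrative): pshufb writes zero for any index byte whose high bit
+; is set, i.e. dst[i] = (mask[i] & 80h) ? 0 : src[mask[i] & 0fh]. The 80h
+; entries above therefore blank exactly the lanes that the other masks of
+; each low/high triple fill, so the three shuffle results can be merged
+; with paddusb without overlap.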
+
;***********************************************************************
; Code
;***********************************************************************
@@ -1896,3 +1912,686 @@
pop r12
ret
%endif
+
+;***********************************************************************
+; void DyadicBilinearOneThirdDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
+%ifdef X86_32
+ push r6
+ %assign push_num 1
+%else
+ %assign push_num 0
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r4
+%endif
+
+    mov r6, r1          ;Save the tail for the unaligned size
+ imul r6, r5
+ add r6, r0
+ movdqa xmm7, [r6]
+
+.yloops_onethird_sse3:
+%ifdef X86_32
+ mov r4, arg5
+%else
+ mov r4, r12
+%endif
+
+ mov r6, r0 ;save base address
+ ; each loop = source bandwidth: 48 bytes
+.xloops_onethird_sse3:
+    ; 1st part horizontal loop: x48 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A
+ ; xmm2: k K * j J * i I * h H * g G * f
+ ; xmm2: * p P * o O * n N * m M * l L *
+ ;
+ ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
+ ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
+ ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P' .. A'
+ ;: p' .. a'
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;1st line
+ movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
+ movdqa xmm1, xmm0
+ movdqa xmm5, [shufb_mask_onethird_low_1]
+ movdqa xmm6, [shufb_mask_onethird_high_1]
+ pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
+ pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
+
+ movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_2]
+ movdqa xmm6, [shufb_mask_onethird_high_2]
+ pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
+ pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
+
+ paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
+ paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
+
+ movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_3]
+ movdqa xmm6, [shufb_mask_onethird_high_3]
+ pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
+ pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
+
+ paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0
+ paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1
+ pavgb xmm0, xmm1 ;1st line average -> xmm0
+
+ ;2nd line
+ movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_1]
+ movdqa xmm6, [shufb_mask_onethird_high_1]
+ pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
+ pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
+
+ movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
+ movdqa xmm4, xmm1
+ movdqa xmm5, [shufb_mask_onethird_low_2]
+ movdqa xmm6, [shufb_mask_onethird_high_2]
+ pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
+ pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
+
+ paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+ paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+
+ movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
+ movdqa xmm4, xmm1
+ movdqa xmm5, [shufb_mask_onethird_low_3]
+ movdqa xmm6, [shufb_mask_onethird_high_3]
+ pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
+ pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
+
+ paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+ paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+ pavgb xmm2, xmm3 ;2nd line average -> xmm2
+
+ pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line )
+
+ ; write pDst
+ movdqa [r0], xmm0 ;write result in dst
+
+ ; next SMB
+ lea r2, [r2+48] ;current src address
+ lea r0, [r0+16] ;current dst address
+
+ sub r4, 48 ;xloops counter
+ cmp r4, 0
+ jg near .xloops_onethird_sse3
+
+ sub r6, r0 ;offset = base address - current address
+ lea r2, [r2+2*r3] ;
+ lea r2, [r2+r3] ;
+ lea r2, [r2+2*r6] ;current line + 3 lines
+ lea r2, [r2+r6]
+ lea r0, [r0+r1]
+    lea r0, [r0+r6]     ;current dst line + 1 line
+
+ dec r5
+ jg near .yloops_onethird_sse3
+
+    movdqa [r0], xmm7   ;restore the tail for the unaligned size
+
+%ifndef X86_32
+ pop r12
+%endif
+
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
+;***********************************************************************
+; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
+%ifdef X86_32
+ push r6
+ %assign push_num 1
+%else
+ %assign push_num 0
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r4
+%endif
+
+    mov r6, r1          ;Save the tail for the unaligned size
+ imul r6, r5
+ add r6, r0
+ movdqa xmm7, [r6]
+
+.yloops_onethird_sse4:
+%ifdef X86_32
+ mov r4, arg5
+%else
+ mov r4, r12
+%endif
+
+ mov r6, r0 ;save base address
+ ; each loop = source bandwidth: 48 bytes
+.xloops_onethird_sse4:
+    ; 1st part horizontal loop: x48 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: F * e E * d D * c C * b B * a A
+ ; xmm2: k K * j J * i I * h H * g G * f
+ ; xmm2: * p P * o O * n N * m M * l L *
+ ;
+ ;2nd Line Src: xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
+ ; xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
+ ; xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P' .. A'
+ ;: p' .. a'
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;1st line
+ movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
+ movdqa xmm1, xmm0
+ movdqa xmm5, [shufb_mask_onethird_low_1]
+ movdqa xmm6, [shufb_mask_onethird_high_1]
+ pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
+ pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
+
+ movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_2]
+ movdqa xmm6, [shufb_mask_onethird_high_2]
+ pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
+ pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
+
+ paddusb xmm0, xmm2 ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
+ paddusb xmm1, xmm3 ;0 0 0 0 0 k j i h g f e d c b a -> xmm1
+
+ movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_3]
+ movdqa xmm6, [shufb_mask_onethird_high_3]
+ pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
+ pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
+
+ paddusb xmm0, xmm2 ;P O N M L K J I H G F E D C B A -> xmm0
+ paddusb xmm1, xmm3 ;p o n m l k j i h g f e d c b a -> xmm1
+ pavgb xmm0, xmm1 ;1st line average -> xmm0
+
+ ;2nd line
+ movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
+ movdqa xmm3, xmm2
+ movdqa xmm5, [shufb_mask_onethird_low_1]
+ movdqa xmm6, [shufb_mask_onethird_high_1]
+ pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
+ pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
+
+ movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
+ movdqa xmm4, xmm1
+ movdqa xmm5, [shufb_mask_onethird_low_2]
+ movdqa xmm6, [shufb_mask_onethird_high_2]
+ pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
+ pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
+
+ paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+ paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+
+ movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
+ movdqa xmm4, xmm1
+ movdqa xmm5, [shufb_mask_onethird_low_3]
+ movdqa xmm6, [shufb_mask_onethird_high_3]
+ pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
+ pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
+
+ paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
+ paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
+ pavgb xmm2, xmm3 ;2nd line average -> xmm2
+
+ pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line )
+
+ ; write pDst
+ movdqa [r0], xmm0 ;write result in dst
+
+ ; next SMB
+ lea r2, [r2+48] ;current src address
+ lea r0, [r0+16] ;current dst address
+
+ sub r4, 48 ;xloops counter
+ cmp r4, 0
+ jg near .xloops_onethird_sse4
+
+ sub r6, r0 ;offset = base address - current address
+ lea r2, [r2+2*r3] ;
+ lea r2, [r2+r3] ;
+ lea r2, [r2+2*r6] ;current line + 3 lines
+ lea r2, [r2+r6]
+ lea r0, [r0+r1]
+    lea r0, [r0+r6]     ;current dst line + 1 line
+
+ dec r5
+ jg near .yloops_onethird_sse4
+
+    movdqa [r0], xmm7   ;restore the tail for the unaligned size
+
+%ifndef X86_32
+ pop r12
+%endif
+
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
+;***********************************************************************
+; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse
+%ifdef X86_32
+ push r6
+ %assign push_num 1
+%else
+ %assign push_num 0
+%endif
+ LOAD_6_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r4
+%endif
+ sar r5, $02 ; iSrcHeight >> 2
+
+    mov r6, r1          ;Save the tail for the unaligned size
+ imul r6, r5
+ add r6, r0
+ movq xmm7, [r6]
+
+.yloops_quarter_sse:
+%ifdef X86_32
+ mov r4, arg5
+%else
+ mov r4, r12
+%endif
+
+ mov r6, r0 ;save base address
+ ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse:
+    ; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;
+ ;=> target:
+ ;: G E C A,
+ ;:
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [r2] ; 1st pSrc line
+ movq mm1, [r2+8] ; 1st pSrc line + 8
+ movq mm2, [r2+r3] ; 2nd pSrc line
+ movq mm3, [r2+r3+8] ; 2nd pSrc line + 8
+
+ pshufw mm0, mm0, 0d8h ; x X x X c C a A
+ pshufw mm1, mm1, 0d8h ; x X x X g G e E
+ pshufw mm2, mm2, 0d8h ; x X x X k K i I
+ pshufw mm3, mm3, 0d8h ; x X x X o O m M
+
+ punpckldq mm0, mm1 ; g G e E c C a A
+ punpckldq mm2, mm3 ; o O m M k K i I
+
+ ; to handle mm0,mm2
+ pshufw mm4, mm0, 0d8h ;g G c C e E a A
+ pshufw mm5, mm4, 04eh ;e E a A g G c C
+ punpcklbw mm4, mm5 ;g e G E c a C A -> mm4
+ pshufw mm4, mm4, 0d8h ;g e c a G E C A -> mm4
+
+ pshufw mm5, mm2, 0d8h ;o O k K m M i I
+ pshufw mm6, mm5, 04eh ;m M i I o O k K
+ punpcklbw mm5, mm6 ;o m O M k i K I
+ pshufw mm5, mm5, 0d8h ;o m k i O M K I -> mm5
+
+ ; to handle mm4, mm5
+ movq mm0, mm4
+ punpckldq mm0, mm6 ;x x x x G E C A
+ punpckhdq mm4, mm6 ;x x x x g e c a
+
+ movq mm1, mm5
+ punpckldq mm1, mm6 ;x x x x O M K I
+ punpckhdq mm5, mm6 ;x x x x o m k i
+
+    ; avg within MB horizontal width (8 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, held until the 2nd horizontal part is done, then both are written at once
+
+    ; 2nd part horizontal loop: x16 bytes
+ movq mm1, [r2+16] ; 1st pSrc line + 16
+ movq mm2, [r2+24] ; 1st pSrc line + 24
+ movq mm3, [r2+r3+16] ; 2nd pSrc line + 16
+ movq mm4, [r2+r3+24] ; 2nd pSrc line + 24
+
+ pshufw mm1, mm1, 0d8h
+ pshufw mm2, mm2, 0d8h
+ pshufw mm3, mm3, 0d8h
+ pshufw mm4, mm4, 0d8h
+
+ punpckldq mm1, mm2
+ punpckldq mm3, mm4
+
+ ; to handle mm1, mm3
+ pshufw mm4, mm1, 0d8h
+ pshufw mm5, mm4, 04eh
+ punpcklbw mm4, mm5
+ pshufw mm4, mm4, 0d8h
+
+ pshufw mm5, mm3, 0d8h
+ pshufw mm6, mm5, 04eh
+ punpcklbw mm5, mm6
+ pshufw mm5, mm5, 0d8h
+
+ ; to handle mm4, mm5
+ movq mm2, mm4
+ punpckldq mm2, mm6
+ punpckhdq mm4, mm6
+
+ movq mm3, mm5
+ punpckldq mm3, mm6
+ punpckhdq mm5, mm6
+
+    ; avg within MB horizontal width (8 x 2 lines)
+ pavgb mm2, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1 for the 2nd horizontal part
+
+ movd [r0 ], mm0
+ movd [r0+4], mm2
+
+ ; next SMB
+ lea r2, [r2+32]
+ lea r0, [r0+8]
+
+ sub r4, 32
+ cmp r4, 0
+ jg near .xloops_quarter_sse
+
+ sub r6, r0
+ ; next line
+ lea r2, [r2+4*r3] ; next 4 end of lines
+ lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth]
+ lea r0, [r0+r1]
+ lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
+
+ dec r5
+ jg near .yloops_quarter_sse
+
+    movq [r0], xmm7     ;restore the tail for the unaligned size
+
+ WELSEMMS
+%ifndef X86_32
+ pop r12
+%endif
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
+;***********************************************************************
+; void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3
+%ifdef X86_32
+ push r6
+ %assign push_num 1
+%else
+ %assign push_num 0
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r4
+%endif
+ sar r5, $02 ; iSrcHeight >> 2
+
+    mov r6, r1          ;Save the tail for the unaligned size
+ imul r6, r5
+ add r6, r0
+ movq xmm7, [r6]
+
+ movdqa xmm6, [shufb_mask_quarter]
+.yloops_quarter_sse3:
+%ifdef X86_32
+ mov r4, arg5
+%else
+ mov r4, r12
+%endif
+
+ mov r6, r0
+ ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse3:
+    ; 1st part horizontal loop: x32 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [r2] ; 1st_src_line
+ movdqa xmm1, [r2+16] ; 1st_src_line + 16
+ movdqa xmm2, [r2+r3] ; 2nd_src_line
+ movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
+
+ pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+ pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+ pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+ pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+ punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
+ punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
+ punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
+ punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
+
+ pavgb xmm0, xmm4
+ pavgb xmm2, xmm5
+ pavgb xmm0, xmm2 ;average
+
+ ; write pDst
+ movq [r0], xmm0
+
+ ; next SMB
+ lea r2, [r2+32]
+ lea r0, [r0+8]
+
+ sub r4, 32
+ cmp r4, 0
+ jg near .xloops_quarter_sse3
+
+ sub r6, r0
+ ; next line
+ lea r2, [r2+4*r3] ; next end of lines
+ lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth]
+ lea r0, [r0+r1]
+ lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
+
+ dec r5
+ jg near .yloops_quarter_sse3
+
+    movq [r0], xmm7     ;restore the tail for the unaligned size
+
+%ifndef X86_32
+ pop r12
+%endif
+
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
+;***********************************************************************
+; void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4
+%ifdef X86_32
+ push r6
+ %assign push_num 1
+%else
+ %assign push_num 0
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r4
+%endif
+ sar r5, $02 ; iSrcHeight >> 2
+
+    mov r6, r1          ;Save the tail for the unaligned size
+ imul r6, r5
+ add r6, r0
+ movq xmm7, [r6]
+
+ movdqa xmm6, [shufb_mask_quarter] ;mask
+
+.yloops_quarter_sse4:
+%ifdef X86_32
+ mov r4, arg5
+%else
+ mov r4, r12
+%endif
+
+ mov r6, r0
+ ; each loop = source bandwidth: 32 bytes
+.xloops_quarter_sse4:
+    ; 1st part horizontal loop: x32 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [r2] ; 1st_src_line
+ movntdqa xmm1, [r2+16] ; 1st_src_line + 16
+ movntdqa xmm2, [r2+r3] ; 2nd_src_line
+ movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
+
+ pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+ pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+ pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A
+ pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+ punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0
+ punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4
+ punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2
+ punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5
+
+ pavgb xmm0, xmm4
+ pavgb xmm2, xmm5
+ pavgb xmm0, xmm2 ;average
+
+ ; write pDst
+ movq [r0], xmm0
+
+ ; next SMB
+ lea r2, [r2+32]
+ lea r0, [r0+8]
+
+ sub r4, 32
+ cmp r4, 0
+ jg near .xloops_quarter_sse4
+
+ sub r6, r0
+ lea r2, [r2+4*r3] ; next end of lines
+    lea r2, [r2+4*r6]   ; reset to base 0 [- 4 * iDstWidth]
+ lea r0, [r0+r1]
+ lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
+
+ dec r5
+ jg near .yloops_quarter_sse4
+
+    movq [r0], xmm7     ;restore the tail for the unaligned size
+
+%ifndef X86_32
+ pop r12
+%endif
+
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
--- a/test/api/encode_decode_api_test.cpp
+++ b/test/api/encode_decode_api_test.cpp
@@ -2512,7 +2512,7 @@
const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
const char* pHashStr[] = { //DO NOT CHANGE!
- "058076b265686fc85b2b99cf7a53106f216f16c3",
+ "585663f78cadb70d9c9f179b9b53b90ffddf3178",
"f350001c333902029800bd291fbed915a4bdf19a",
"eb9d853b7daec03052c4850027ac94adc84c3a7e"
};
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@@ -131,7 +131,7 @@
},
{
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
- "a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+ "2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
},
// the following values may be adjusted for times since we start tuning the strategy
{
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -199,6 +199,79 @@
} \
}
+#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+ if (ASM) {\
+ int32_t iCpuCores = 0; \
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
+ if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
+ return; \
+ } \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+ int dst_stride_c; \
+ int src_stride_c; \
+ int src_width_c; \
+ int src_height_c; \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+ int dst_stride_a; \
+ int src_stride_a; \
+ int src_width_a; \
+ int src_height_a; \
+ dst_stride_c = dst_stride_a = 560; \
+ src_stride_c = src_stride_a = 560; \
+ src_width_c = src_width_a = 480; \
+ src_height_c = src_height_a = 30; \
+ for (int j = 0; j < 50000; j++) { \
+ dst_c[j] = dst_a[j] = rand() % 256; \
+ src_c[j] = src_a[j] = rand() % 256; \
+ } \
+ DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c/3); \
+ func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a/3); \
+ for (int j = 0; j < (src_height_c /3 ); j++) { \
+ for (int m = 0; m < (src_width_c /3); m++) { \
+ ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+ } \
+ } \
+}
+
+#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) \
+TEST (DownSampleTest, func) { \
+ if (ASM) {\
+ int32_t iCpuCores = 0; \
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
+ if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
+ return; \
+ } \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \
+ int dst_stride_c; \
+ int src_stride_c; \
+ int src_width_c; \
+ int src_height_c; \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); \
+ ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \
+ int dst_stride_a; \
+ int src_stride_a; \
+ int src_width_a; \
+ int src_height_a; \
+ dst_stride_c = dst_stride_a = 560; \
+ src_stride_c = src_stride_a = 560; \
+ src_width_c = src_width_a = 640; \
+ src_height_c = src_height_a = 80; \
+ for (int j = 0; j < 50000; j++) { \
+ dst_c[j] = dst_a[j] = rand() % 256; \
+ src_c[j] = src_a[j] = rand() % 256; \
+ } \
+ DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+ func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
+ for (int j = 0; j < (src_height_c >> 2); j++) { \
+ for (int m = 0; m < (src_width_c >> 2); m++) { \
+ ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
+ } \
+ } \
+}
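+
+// Illustrative expansion: each macro instantiation above becomes one gtest
+// case that fills src/dst with identical random bytes, runs the C kernel and
+// the optimized kernel, then compares only the valid destination region,
+// roughly:
+//   DyadicBilinearQuarterDownsampler_c   (dst_c, 560, src_c, 560, 640, 80);
+//   DyadicBilinearQuarterDownsampler_sse (dst_a, 560, src_a, 560, 640, 80);
+//   for each (j, m) in 20 x 160: ASSERT_EQ (dst_c[m + j*560], dst_a[m + j*560]);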
#define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
if (ASM) {\
@@ -259,6 +332,13 @@
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)
+
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
@@ -269,6 +349,10 @@
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)
+
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)
#endif
@@ -276,6 +360,10 @@
#if defined(HAVE_NEON_AARCH64)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
+
+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)