shithub: openh264

--- a/codec/processing/src/arm/down_sample_neon.S

+++ b/codec/processing/src/arm/down_sample_neon.S

@@ -338,4 +338,121 @@

     ldmia sp!, {r4-r12, lr}

 WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon

+    stmdb sp!, {r4-r8, lr}

+    //Load the two stack arguments; they sit above the 24 bytes pushed by the stmdb

+	ldr  r4, [sp, #24]  //src_width

+	ldr  r5, [sp, #28]	//row count; the C caller in this change passes the destination height

+	//r6/r8 = start of the current source/destination row, lr = source bytes consumed in this row

+	mov r6, r2

+	mov r8, r0

+	mov lr, #0

+	//Save the 16 bytes at dst + dst_stride * rows; they are written back after the loop

+	mla  r7, r1, r5, r0

+	vld1.32 {q15}, [r7]

+	add r7, r2, r3  //r7 = second source row of the pair

+	//Each pass turns 48 bytes from each of two source rows into 16 output bytes

+comp_ds_bilinear_onethird_loop0:

+    vld3.8 {d0, d1, d2}, [r2]!  //de-interleave by 3: d0 = bytes 0,3,6.., d1 = bytes 1,4,7.., d2 = bytes 2,5,8..

+    vld3.8 {d3, d4, d5}, [r2]!

+    vld3.8 {d16, d17, d18}, [r7]!

+    vld3.8 {d19, d20, d21}, [r7]!

+    vaddl.u8 q11, d0, d1  //widening sum of the first two bytes of every triple (d2/d5/d18/d21 are not used)

+    vaddl.u8 q12, d3, d4

+    vaddl.u8 q13, d16, d17

+    vaddl.u8 q14, d19, d20

+    vrshr.u16 q11, #1  //rounded halving: horizontal average within each row

+    vrshr.u16 q12, #1

+    vrshr.u16 q13, #1

+    vrshr.u16 q14, #1

+    vrhadd.u16 q11, q13  //rounded average of the first and second row

+    vrhadd.u16 q12, q14

+    vmovn.u16 d0, q11  //narrow the 16-bit results back to bytes

+    vmovn.u16 d1, q12

+    vst1.8 {q0}, [r0]!

+    add lr, #48

+    cmp lr, r4  //carry set (cs) means the whole row width has been consumed

+    movcs lr, #0

+    addcs r6, r3, lsl #1

+    addcs r6, r6, r3  //with the line above: r6 += 3 * src_stride (assuming the two-operand form adds into r6)

+    movcs r2, r6

+    addcs r7, r2, r3

+    addcs r8, r1  //r8 += dst_stride

+    movcs r0, r8

+    subscs r5, #1  //runs only at the end of a row; sets Z when no rows remain

+    bne	comp_ds_bilinear_onethird_loop0

+	//Write the saved 16 bytes back; r0 is now dst + dst_stride * rows

+	vst1.32 {q15}, [r0]

+    ldmia sp!, {r4-r8,lr}

+WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon

+    stmdb sp!, {r4-r8, lr}

+    //Load the two stack arguments; they sit above the 24 bytes pushed by the stmdb

+	ldr  r4, [sp, #24]  //src_width

+	ldr  r5, [sp, #28]	//src_height

+	//r6/r8 = start of the current source/destination row, lr = source bytes consumed in this row

+	mov r6, r2

+	mov r8, r0

+	mov lr, #0

+	lsr r5, #2  //r5 = src_height / 4 = number of output rows

+	//Save the 16 bytes at dst + dst_stride * rows; they are written back after the loop

+	mla  r7, r1, r5, r0

+	vld1.32 {q15}, [r7]

+	add r7, r2, r3  //r7 = second source row of the pair

+	//Each pass turns 64 bytes from each of two source rows into 16 output bytes

+comp_ds_bilinear_quarter_loop0:

+	vld2.16 {q0, q1}, [r2]!  //de-interleave 16-bit units: q0 = bytes 0-1, 4-5, 8-9.., q1 = bytes 2-3, 6-7..

+    vld2.16 {q2, q3}, [r2]!

+	vld2.16 {q8, q9}, [r7]!

+    vld2.16 {q10, q11}, [r7]!

+    vpaddl.u8 q0, q0  //pairwise widening add: sum of the first two bytes of every 4-byte group

+    vpaddl.u8 q2, q2

+    vpaddl.u8 q8, q8

+    vpaddl.u8 q10, q10

+    vrshr.u16 q0, #1  //rounded halving: horizontal average within each row

+    vrshr.u16 q2, #1

+    vrshr.u16 q8, #1

+    vrshr.u16 q10, #1

+    vrhadd.u16 q0, q8  //rounded average of the first and second row

+    vrhadd.u16 q2, q10

+    vmovn.u16 d0, q0  //narrow the 16-bit results back to bytes

+    vmovn.u16 d1, q2

+    vst1.8 {q0}, [r0]!

+    add lr, #64

+    cmp lr, r4  //carry set (cs) means the whole row width has been consumed

+    movcs lr, #0

+    addcs r6, r3, lsl #2  //r6 += 4 * src_stride (assuming the two-operand form adds into r6)

+    movcs r2, r6

+    addcs r7, r2, r3

+    addcs r8, r1  //r8 += dst_stride

+    movcs r0, r8

+    subscs r5, #1  //runs only at the end of a row; sets Z when no rows remain

+    bne	comp_ds_bilinear_quarter_loop0

+	//Write the saved 16 bytes back; r0 is now dst + dst_stride * rows

+	vst1.32 {q15}, [r0]

+    ldmia sp!, {r4-r8,lr}

+WELS_ASM_FUNC_END

 #endif

--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S

+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S

@@ -84,7 +84,6 @@

 WELS_ASM_AARCH64_FUNC_END

 WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon

     sub     w9, w3, w4

     sub     w1, w1, w4, lsr #1

@@ -121,6 +120,113 @@

     add     x0, x0, w1, sxtw

     sub     w5, w5, #1

     cbnz    w5, comp_ds_bilinear_w_x32_loop0

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon

+    //x6/x8 = start of the current source/destination row, w9 = source bytes consumed in this row

+    mov x6, x2

+    mov x8, x0

+    mov w9, #0

+    //Save the 16 bytes at dst + dst_stride * rows (w5 = rows); they are written back after the loop

+    smaddl  x7, w1, w5, x0

+    ld1 {v16.16b}, [x7]

+    add x7, x2, w3, sxtw  //x7 = second source row of the pair

+    //Each pass turns 48 bytes from each of two source rows into 16 output bytes

+comp_ds_bilinear_onethird_loop0:

+    ld3     {v0.16b, v1.16b, v2.16b}, [x2], #48  //de-interleave by 3: v0 = bytes 0,3,6.., v1 = bytes 1,4,7..

+    ld3     {v4.16b, v5.16b, v6.16b}, [x7], #48

+    uaddl   v2.8h, v0.8b, v1.8b  //widening sum of the first two bytes of each triple; v2 is reused as scratch

+    uaddl2  v3.8h, v0.16b, v1.16b

+    uaddl   v6.8h, v4.8b, v5.8b

+    uaddl2  v7.8h, v4.16b, v5.16b

+    urshr   v2.8h, v2.8h, #1  //rounded halving: horizontal average within each row

+    urshr   v3.8h, v3.8h, #1

+    urshr   v6.8h, v6.8h, #1

+    urshr   v7.8h, v7.8h, #1

+    urhadd  v0.8h, v2.8h, v6.8h  //rounded average of the first and second row

+    urhadd  v1.8h, v3.8h, v7.8h

+    xtn     v0.8b, v0.8h  //narrow the 16-bit results back to bytes

+    xtn     v1.8b, v1.8h

+    st1     {v0.8b,v1.8b}, [x0], #16

+    add     w9, w9, #48

+    cmp     w9, w4  //w4 = source width in bytes

+    b.cc    comp_ds_bilinear_onethird_loop0

+    mov     w9, #0

+    add     x6, x6, w3, sxtw #1

+    add     x6, x6, w3, sxtw  //with the line above: x6 += 3 * src_stride

+    mov     x2, x6

+    add     x7, x2, w3, sxtw

+    add     x8, x8, w1, sxtw  //x8 += dst_stride

+    mov     x0, x8

+    sub     w5, w5, #1

+    cbnz    w5, comp_ds_bilinear_onethird_loop0

+    //Write the saved 16 bytes back; x0 is now dst + dst_stride * rows

+    st1     {v16.16b}, [x0]

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon

+    //x6/x8 = start of the current source/destination row, w9 = source bytes consumed in this row

+    mov x6, x2

+    mov x8, x0

+    mov w9, #0

+    lsr w5, w5, #2  //w5 = source height / 4 = number of output rows

+    //Save the 16 bytes at dst + dst_stride * rows; they are written back after the loop

+    smaddl  x7, w1, w5, x0

+    ld1 {v16.16b}, [x7]

+    add x7, x2, w3, sxtw  //x7 = second source row of the pair

+    //Each pass turns 64 bytes from each of two source rows into 16 output bytes

+comp_ds_bilinear_quarter_loop0:

+    ld2     {v0.8h, v1.8h}, [x2], #32  //de-interleave 16-bit units: v0 = bytes 0-1, 4-5, 8-9..

+    ld2     {v2.8h, v3.8h}, [x2], #32

+    ld2     {v4.8h, v5.8h}, [x7], #32

+    ld2     {v6.8h, v7.8h}, [x7], #32

+    uaddlp  v0.8h, v0.16b  //pairwise widening add: sum of the first two bytes of every 4-byte group

+    uaddlp  v1.8h, v2.16b  //v1 is overwritten here with the sums for the second 32 bytes

+    uaddlp  v4.8h, v4.16b

+    uaddlp  v5.8h, v6.16b

+    urshr   v0.8h, v0.8h, #1  //rounded halving: horizontal average within each row

+    urshr   v1.8h, v1.8h, #1

+    urshr   v4.8h, v4.8h, #1

+    urshr   v5.8h, v5.8h, #1

+    urhadd  v0.8h, v0.8h, v4.8h  //rounded average of the first and second row

+    urhadd  v1.8h, v1.8h, v5.8h

+    xtn     v0.8b, v0.8h  //narrow the 16-bit results back to bytes

+    xtn     v1.8b, v1.8h

+    st1     {v0.8b,v1.8b}, [x0], #16

+    add     w9, w9, #64

+    cmp     w9, w4  //w4 = source width in bytes

+    b.cc    comp_ds_bilinear_quarter_loop0

+    mov     w9, #0

+    add     x6, x6, w3, sxtw #2  //x6 += 4 * src_stride

+    mov     x2, x6

+    add     x7, x2, w3, sxtw

+    add     x8, x8, w1, sxtw  //x8 += dst_stride

+    mov     x0, x8

+    sub     w5, w5, #1

+    cbnz    w5, comp_ds_bilinear_quarter_loop0

+    //Write the saved 16 bytes back; x0 is now dst + dst_stride * rows

+    st1     {v16.16b}, [x0]

 WELS_ASM_AARCH64_FUNC_END

 WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -53,6 +53,8 @@

   sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;

   sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;

   sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;

+  sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;

+  sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_c;

   sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsampler_c;

   sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsampler_c;

 #if defined(X86_ASM)

@@ -60,6 +62,7 @@

     sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse;

     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse;

     sDownsampleFunc.pfHalfAverage[2]    = DyadicBilinearDownsamplerWidthx8_sse;

+    sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;

   if (iCpuFlag & WELS_CPU_SSE2) {

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;

@@ -68,10 +71,14 @@

   if (iCpuFlag & WELS_CPU_SSSE3) {

     sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_ssse3;

     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_ssse3;

+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;

+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_ssse3;

   if (iCpuFlag & WELS_CPU_SSE41) {

     sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse4;

     sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;

+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;

+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;

 #endif//X86_ASM

@@ -81,6 +88,8 @@

     sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;

     sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;

     sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;

+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;

+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_neon;

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;

     sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearAccurateDownsamplerWrap_neon;

@@ -92,6 +101,8 @@

     sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;

     sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;

     sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;

+    sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;

+    sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_AArch64_neon;

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

     sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

@@ -124,6 +135,28 @@

         (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);

     m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

         (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);

+  } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {

+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

+                                         (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);

+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

+                                         (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);

+    m_pfDownsample.pfQuarterDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

+                                         (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);

+  } else if ((iSrcWidthY / 3) == iDstWidthY && (iSrcHeightY / 3) == iDstHeightY) {

+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

+                                          (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iDstHeightY);

+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

+                                          (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iDstHeightUV);

+    m_pfDownsample.pfOneThirdDownsampler ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

+                                          (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iDstHeightUV);

   } else {

     m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,

                                        (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);

--- a/codec/processing/src/downsample/downsample.h

+++ b/codec/processing/src/downsample/downsample.h

@@ -54,20 +54,29 @@

                                     uint8_t* pSrc, const int32_t kiSrcStride,

                                     const int32_t kiSrcWidth, const int32_t kiSrcHeight);

+typedef void (SpecificDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,

+                                       uint8_t* pSrc, const int32_t kiSrcStride,

+                                       const int32_t kiSrcWidth, const int32_t kiHeight);

 typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

                                       const int32_t kiDstHeight,

                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);

 typedef HalveDownsampleFunc*    PHalveDownsampleFunc;

+typedef SpecificDownsampleFunc* PSpecificDownsampleFunc;

 typedef GeneralDownsampleFunc*  PGeneralDownsampleFunc;

-HalveDownsampleFunc   DyadicBilinearDownsampler_c;

+HalveDownsampleFunc		DyadicBilinearDownsampler_c;

 GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_c;

+SpecificDownsampleFunc	DyadicBilinearQuarterDownsampler_c;

 typedef struct {

   // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;

   PHalveDownsampleFunc          pfHalfAverage[4];

+  PSpecificDownsampleFunc       pfOneThirdDownsampler;  // 3:1 path; callers pass the DESTINATION height as the last argument

+  PSpecificDownsampleFunc       pfQuarterDownsampler;   // 4:1 path; callers pass the SOURCE height as the last argument

   PGeneralDownsampleFunc        pfGeneralRatioLuma;

   PGeneralDownsampleFunc        pfGeneralRatioChroma;

 } SDownsampleFuncs;

@@ -93,10 +102,19 @@

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;

+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_sse;

+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_ssse3;

+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_sse4;

 void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

-    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,

+    const uint32_t kuiScaleY);

 void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

-    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,

+    const uint32_t kuiScaleY);

 WELSVP_EXTERN_C_END

 #endif

@@ -109,6 +127,10 @@

 GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_neon;

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_neon;

+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_neon;

 void GeneralBilinearAccurateDownsampler_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

     const int32_t kiDstHeight,

     uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

@@ -125,8 +147,13 @@

 GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

-void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,

-                                                      uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

+SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_AArch64_neon;

+SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_AArch64_neon;

+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,

+    const int32_t kiDstWidth, const int32_t kiDstHeight,

+    uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

 WELSVP_EXTERN_C_END

 #endif

--- a/codec/processing/src/downsample/downsamplefuncs.cpp

+++ b/codec/processing/src/downsample/downsamplefuncs.cpp

@@ -68,6 +68,53 @@

+void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,

+    uint8_t* pSrc, const int32_t kiSrcStride,

+    const int32_t kiSrcWidth, const int32_t kiSrcHeight)

+{  // 4:1 downsample in both directions; only the top-left 2x2 pixels of each 4x4 source cell are read

+  uint8_t* pDstLine     = pDst;

+  uint8_t* pSrcLine     = pSrc;

+  const int32_t kiSrcStridex4   = kiSrcStride << 2;  // source advances four rows per output row

+  const int32_t kiDstWidth      = kiSrcWidth  >> 2;  // any remainder columns are dropped

+  const int32_t kiDstHeight     = kiSrcHeight >> 2;  // any remainder rows are dropped

+  for (int32_t j = 0; j < kiDstHeight; j ++) {

+    for (int32_t i = 0; i < kiDstWidth; i ++) {

+      const int32_t kiSrcX = i << 2;  // left edge of the 4-pixel source group

+      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;  // rounded horizontal average, first row

+      const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;  // same for the row below

+      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);  // rounded average of the two row averages

+    }

+    pDstLine    += kiDstStride;

+    pSrcLine    += kiSrcStridex4;

+  }

+}

+void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,

+    uint8_t* pSrc, const int32_t kiSrcStride,

+    const int32_t kiSrcWidth, const int32_t kiDstHeight)

+{  // 3:1 downsample in both directions; note the last parameter is the OUTPUT row count, not the source height

+  uint8_t* pDstLine     = pDst;

+  uint8_t* pSrcLine     = pSrc;

+  const int32_t kiSrcStridex3   = kiSrcStride * 3;  // source advances three rows per output row

+  const int32_t kiDstWidth      = kiSrcWidth / 3;   // any remainder columns are dropped

+  for (int32_t j = 0; j < kiDstHeight; j ++) {

+    for (int32_t i = 0; i < kiDstWidth; i ++) {

+      const int32_t kiSrcX = i * 3;  // left edge of the 3-pixel source group

+      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;  // rounded horizontal average, first row

+      const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;  // same for the row below

+      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);  // rounded average of the two row averages

+    }

+    pDstLine    += kiDstStride;

+    pSrcLine    += kiSrcStridex3;

+  }

+}

 void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

                                        const int32_t kiDstHeight,

                                        uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

--- a/codec/processing/src/x86/downsample_bilinear.asm

+++ b/codec/processing/src/x86/downsample_bilinear.asm

@@ -67,7 +67,23 @@

 add_extra_half:

     dd 16384,0,0,0

+shufb_mask_quarter:             ; pshufb control: bytes 0,4,8,12 -> lanes 0-3, bytes 1,5,9,13 -> lanes 8-11; 80h zeroes a lane

+db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h

+shufb_mask_onethird_low_1:      ; source bytes 0..15:  first byte of each 3-byte group -> lanes 0-5

+db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h

+shufb_mask_onethird_low_2:      ; source bytes 16..31: first byte of each 3-byte group -> lanes 6-10

+db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h

+shufb_mask_onethird_low_3:      ; source bytes 32..47: first byte of each 3-byte group -> lanes 11-15

+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh

+shufb_mask_onethird_high_1:     ; source bytes 0..15:  second byte of each 3-byte group -> lanes 0-4

+db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h

+shufb_mask_onethird_high_2:     ; source bytes 16..31: second byte of each 3-byte group -> lanes 5-10

+db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h

+shufb_mask_onethird_high_3:     ; source bytes 32..47: second byte of each 3-byte group -> lanes 11-15

+db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh

 ;***********************************************************************

 ; Code

 ;***********************************************************************

@@ -1896,3 +1912,686 @@

     pop     r12

ret

 %endif

+;***********************************************************************

+;   void DyadicBilinearOneThirdDownsampler_ssse3(    unsigned char* pDst, const int iDstStride,

+;                   unsigned char* pSrc, const int iSrcStride,

+;                   const int iSrcWidth, const int iDstHeight );

+;***********************************************************************

+WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    mov r6, r1             ;r6 = pDst + iDstStride * rows (r5 = rows); the 16 bytes there are saved

+    imul r6, r5

+    add r6, r0

+    movdqa xmm7, [r6]                           ;movdqa: this address must be 16-byte aligned

+.yloops_onethird_sse3:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    mov r6, r0        ;remember where this dst row starts

+    ; each x pass consumes 48 source bytes per row and writes 16 dst bytes

+.xloops_onethird_sse3:

+    ; horizontal loop: 48 source bytes per pass

+    ;               mem  hi<-       ->lo

+    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A

+    ;               xmm2: k K * j J * i I * h H * g G * f

+    ;               xmm2: * p P * o O * n N * m M * l L *

+    ;

+    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'

+    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'

+    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *

+    ;=> target:

+    ;: P O N M L K J I H G F E D C B A

+    ;: p o n m l k j i h g f e d c b a

+    ;: P' ..                          A'

+    ;: p' ..                          a'

+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+    ;1st line

+    movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A

+    movdqa xmm1, xmm0

+    movdqa xmm5, [shufb_mask_onethird_low_1]

+    movdqa xmm6, [shufb_mask_onethird_high_1]

+    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0

+    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

+    movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_2]

+    movdqa xmm6, [shufb_mask_onethird_high_2]

+    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2

+    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

+    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0

+    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

+    movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_3]

+    movdqa xmm6, [shufb_mask_onethird_high_3]

+    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2

+    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

+    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0

+    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1

+    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

+    ;2nd line

+    movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_1]

+    movdqa xmm6, [shufb_mask_onethird_high_1]

+    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2

+    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

+    movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'

+    movdqa xmm4, xmm1

+    movdqa xmm5, [shufb_mask_onethird_low_2]

+    movdqa xmm6, [shufb_mask_onethird_high_2]

+    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1

+    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

+    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2

+    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

+    movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *

+    movdqa xmm4, xmm1

+    movdqa xmm5, [shufb_mask_onethird_low_3]

+    movdqa xmm6, [shufb_mask_onethird_high_3]

+    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1

+    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

+    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2

+    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3

+    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

+    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

+    ; write pDst

+    movdqa [r0], xmm0                           ;write result in dst

+    ; advance to the next 48-byte source group

+    lea r2, [r2+48]                             ;current src address

+    lea r0, [r0+16]                             ;current dst address

+    sub r4, 48                                  ;source bytes left in this row

+    cmp r4, 0

+    jg near .xloops_onethird_sse3

+    sub r6, r0                                  ;r6 = row start - current dst = -(dst bytes written this row)

+    lea r2, [r2+2*r3]                           ;with the next line: r2 += 3 * iSrcStride

+    lea r2, [r2+r3]                             ;

+    lea r2, [r2+2*r6]                           ;with the next line: r2 -= 3 * (dst bytes written), i.e. back to the row start

+    lea r2, [r2+r6]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]                             ;dst: back to the row start, plus one row

+    dec r5                                      ;one output row finished

+    jg near .yloops_onethird_sse3

+    movdqa [r0], xmm7                           ;write the saved 16 bytes back at pDst + iDstStride * rows

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

+    ret

+;***********************************************************************

+;   void DyadicBilinearOneThirdDownsampler_sse4(    unsigned char* pDst, const int iDstStride,

+;                   unsigned char* pSrc, const int iSrcStride,

+;                   const int iSrcWidth, const int iDstHeight );

+;***********************************************************************

+WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

+%ifndef X86_32

+    push r12

+    mov r12, r4

+%endif

+    mov r6, r1             ;r6 = pDst + iDstStride * rows (r5 = rows); the 16 bytes there are saved

+    imul r6, r5

+    add r6, r0

+    movdqa xmm7, [r6]                           ;movdqa: this address must be 16-byte aligned

+.yloops_onethird_sse4:

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    mov r6, r0        ;remember where this dst row starts

+    ; each x pass consumes 48 source bytes per row and writes 16 dst bytes

+.xloops_onethird_sse4:

+    ; horizontal loop: 48 source bytes per pass

+    ;               mem  hi<-       ->lo

+    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A

+    ;               xmm2: k K * j J * i I * h H * g G * f

+    ;               xmm2: * p P * o O * n N * m M * l L *

+    ;

+    ;2nd Line Src:  xmm2: F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'

+    ;               xmm1: k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'

+    ;               xmm1: *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *

+    ;=> target:

+    ;: P O N M L K J I H G F E D C B A

+    ;: p o n m l k j i h g f e d c b a

+    ;: P' ..                          A'

+    ;: p' ..                          a'

+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+    ;1st line

+    movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A

+    movdqa xmm1, xmm0

+    movdqa xmm5, [shufb_mask_onethird_low_1]

+    movdqa xmm6, [shufb_mask_onethird_high_1]

+    pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0

+    pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

+    movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_2]

+    movdqa xmm6, [shufb_mask_onethird_high_2]

+    pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2

+    pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

+    paddusb xmm0, xmm2                          ;0 0 0 0 0 K J I H G F E D C B A -> xmm0

+    paddusb xmm1, xmm3                          ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

+    movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_3]

+    movdqa xmm6, [shufb_mask_onethird_high_3]

+    pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2

+    pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

+    paddusb xmm0, xmm2                          ;P O N M L K J I H G F E D C B A -> xmm0

+    paddusb xmm1, xmm3                          ;p o n m l k j i h g f e d c b a -> xmm1

+    pavgb xmm0, xmm1                            ;1st line average                -> xmm0

+    ;2nd line

+    movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'

+    movdqa xmm3, xmm2

+    movdqa xmm5, [shufb_mask_onethird_low_1]

+    movdqa xmm6, [shufb_mask_onethird_high_1]

+    pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2

+    pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3

+    movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'

+    movdqa xmm4, xmm1

+    movdqa xmm5, [shufb_mask_onethird_low_2]

+    movdqa xmm6, [shufb_mask_onethird_high_2]

+    pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1

+    pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

+    paddusb xmm2, xmm1                          ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2

+    paddusb xmm3, xmm4                          ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

+    movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *

+    movdqa xmm4, xmm1

+    movdqa xmm5, [shufb_mask_onethird_low_3]

+    movdqa xmm6, [shufb_mask_onethird_high_3]

+    pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1

+    pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

+    paddusb xmm2, xmm1                          ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2

+    paddusb xmm3, xmm4                          ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3

+    pavgb xmm2, xmm3                            ;2nd line average                                -> xmm2

+    pavgb xmm0, xmm2                            ; bytes-average(1st line , 2nd line )

+    ; write pDst

+    movdqa [r0], xmm0                           ;write result in dst

+    ; advance to the next 48-byte source group

+    lea r2, [r2+48]                             ;current src address

+    lea r0, [r0+16]                             ;current dst address

+    sub r4, 48                                  ;source bytes left in this row

+    cmp r4, 0

+    jg near .xloops_onethird_sse4

+    sub r6, r0                                  ;r6 = row start - current dst = -(dst bytes written this row)

+    lea r2, [r2+2*r3]                           ;with the next line: r2 += 3 * iSrcStride

+    lea r2, [r2+r3]                             ;

+    lea r2, [r2+2*r6]                           ;with the next line: r2 -= 3 * (dst bytes written), i.e. back to the row start

+    lea r2, [r2+r6]

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]                             ;dst: back to the row start, plus one row

+    dec r5                                      ;one output row finished

+    jg near .yloops_onethird_sse4

+    movdqa [r0], xmm7                           ;write the saved 16 bytes back at pDst + iDstStride * rows

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

+    ret

+;***********************************************************************

+;   void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,

+;                   unsigned char* pSrc, const int iSrcStride,

+;                   const int iSrcWidth, const int iSrcHeight );

+;***********************************************************************

+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse

+%ifdef X86_32

+    push r6                ; x86-32 only: r6 is used as scratch below and popped before ret

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA            ; roles as used below: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride r4=iSrcWidth r5=iSrcHeight

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

+%ifndef X86_32

+    push r12

+    mov r12, r4            ; keep iSrcWidth: r4 is consumed as the column counter of every row

+%endif

+    sar r5, $02            ; r5 = iSrcHeight >> 2, used as the row counter

+    mov r6, r1             ; save the 8 dst bytes at pDst + iDstStride * (iSrcHeight >> 2) (the "tailer")

+    imul r6, r5

+    add r6, r0

+    movq mm7, [r6]         ; kept in mm7 (unused by the loops): no XMM register is touched, so no PUSH_XMM is needed

+.yloops_quarter_sse:

+%ifdef X86_32

+    mov r4, arg5           ; reload the source width for this row

+%else

+    mov r4, r12            ; reload the source width for this row

+%endif

+    mov r6, r0        ; remember the start of the current dst row

+    ; each iteration consumes 32 source bytes from each of two lines and writes 8 dst bytes

+.xloops_quarter_sse:

+    ; 1st part of the horizontal loop: source bytes 0..15

+    ;               mem  hi<-       ->lo

+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E

+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M

+    ;

+    ;=> only the first two bytes of every 4-byte group are kept,

+    ;   e.g. for the 1st line: g G e E c C a A

+    ;   each output byte is the rounded average of such a 2x2 patch

+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+    movq mm0, [r2]         ; 1st pSrc line

+    movq mm1, [r2+8]       ; 1st pSrc line + 8

+    movq mm2, [r2+r3]     ; 2nd pSrc line

+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8

+    pshufw mm0, mm0, 0d8h    ; x X x X c C a A

+    pshufw mm1, mm1, 0d8h    ; x X x X g G e E

+    pshufw mm2, mm2, 0d8h    ; x X x X k K i I

+    pshufw mm3, mm3, 0d8h    ; x X x X o O m M

+    punpckldq mm0, mm1       ; g G e E c C a A

+    punpckldq mm2, mm3       ; o O m M k K i I

+    ; split mm0 / mm2 into first bytes (upper case) and second bytes (lower case)

+    pshufw mm4, mm0, 0d8h       ;g G c C e E a A

+    pshufw mm5, mm4, 04eh       ;e E a A g G c C

+    punpcklbw mm4, mm5          ;g e G E c a C A  -> mm4

+    pshufw mm4, mm4, 0d8h       ;g e c a G E C A  -> mm4

+    pshufw mm5, mm2, 0d8h       ;o O k K m M i I

+    pshufw mm6, mm5, 04eh       ;m M i I o O k K

+    punpcklbw mm5, mm6          ;o m O M k i K I

+    pshufw mm5, mm5, 0d8h       ;o m k i O M K I  -> mm5

+    ; bring both halves of mm4 / mm5 into the low dword (the high dword is don't-care)

+    movq mm0, mm4

+    punpckldq mm0, mm6          ;x x x x G E C A

+    punpckhdq mm4, mm6          ;x x x x g e c a

+    movq mm1, mm5

+    punpckldq mm1, mm6          ;x x x x O M K I

+    punpckhdq mm5, mm6          ;x x x x o m k i

+    ; rounded averages, only the low 4 bytes are meaningful

+    pavgb mm0, mm4      ; (A+a+1)>>1, (C+c+1)>>1, (E+e+1)>>1, (G+g+1)>>1 -> temp_row1

+    pavgb mm1, mm5      ; (I+i+1)>>1, (K+k+1)>>1, (M+m+1)>>1, (O+o+1)>>1 -> temp_row2

+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, kept in mm0 until the 2nd part is done

+    ; 2nd part of the horizontal loop: source bytes 16..31, same steps with mm1..mm6

+    movq mm1, [r2+16]      ; 1st pSrc line + 16

+    movq mm2, [r2+24]      ; 1st pSrc line + 24

+    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16

+    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24

+    pshufw mm1, mm1, 0d8h

+    pshufw mm2, mm2, 0d8h

+    pshufw mm3, mm3, 0d8h

+    pshufw mm4, mm4, 0d8h

+    punpckldq mm1, mm2

+    punpckldq mm3, mm4

+    ; split mm1 / mm3 into first bytes and second bytes

+    pshufw mm4, mm1, 0d8h

+    pshufw mm5, mm4, 04eh

+    punpcklbw mm4, mm5

+    pshufw mm4, mm4, 0d8h

+    pshufw mm5, mm3, 0d8h

+    pshufw mm6, mm5, 04eh

+    punpcklbw mm5, mm6

+    pshufw mm5, mm5, 0d8h

+    ; bring both halves of mm4 / mm5 into the low dword

+    movq mm2, mm4

+    punpckldq mm2, mm6

+    punpckhdq mm4, mm6

+    movq mm3, mm5

+    punpckldq mm3, mm6

+    punpckhdq mm5, mm6

+    ; rounded averages, only the low 4 bytes are meaningful

+    pavgb mm2, mm4      ; horizontal average of the 1st line -> temp_row1

+    pavgb mm3, mm5      ; horizontal average of the 2nd line -> temp_row2

+    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1 for the 2nd part

+    movd [r0  ], mm0       ; 4 output bytes from the 1st part

+    movd [r0+4], mm2       ; 4 output bytes from the 2nd part

+    ; next 32 source bytes / 8 dst bytes

+    lea r2, [r2+32]

+    lea r0, [r0+8]

+    sub r4, 32

+    cmp r4, 0

+    jg near .xloops_quarter_sse

+    sub  r6, r0            ; r6 = row start - current = -(dst bytes written in this row)

+    ; next line

+    lea r2, [r2+4*r3]    ; advance pSrc by 4 source lines

+    lea r2, [r2+4*r6]    ; and rewind it by 4 * (dst bytes written) = source bytes consumed

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; pDst = start of the next dst row

+    dec r5

+    jg near .yloops_quarter_sse

+    movq [r0], mm7       ; write the saved tailer bytes back at the final dst position

+    WELSEMMS

+%ifndef X86_32

+    pop r12

+%endif

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

+    ret

+;***********************************************************************

+;   void DyadicBilinearQuarterDownsampler_ssse3(   unsigned char* pDst, const int iDstStride,

+;                   unsigned char* pSrc, const int iSrcStride,

+;                   const int iSrcWidth, const int iSrcHeight );

+;***********************************************************************

+WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3

+    ; Register roles as used below (parameters are loaded by LOAD_6_PARA):

+    ;   r0 = pDst        (advanced by 8 bytes per inner iteration)

+    ;   r1 = iDstStride

+    ;   r2 = pSrc        (advanced by 32 bytes per inner iteration)

+    ;   r3 = iSrcStride

+    ;   r4 = iSrcWidth   (counted down by 32 per inner iteration)

+    ;   r5 = iSrcHeight  (shifted right by 2, then used as the row counter)

+    ;   r6 = scratch: tail address, then dst row start / negative row length

+    ;   xmm6 = shufb_mask_quarter, xmm7 = the 8 saved tail bytes of dst

+    ; Source loads use movdqa, so every loaded source address must be 16-byte aligned.

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8             ; xmm0..xmm7 are used below (macro defined elsewhere; presumably saves what the ABI requires)

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

+%ifndef X86_32

+    push r12

+    mov r12, r4            ; keep iSrcWidth: r4 is consumed as the column counter of every row

+%endif

+    sar r5, $02            ; iSrcHeight >> 2

+    mov r6, r1             ; save the 8 dst bytes at pDst + iDstStride * (iSrcHeight >> 2) (the "tailer")

+    imul r6, r5

+    add r6, r0

+    movq xmm7, [r6]

+    movdqa xmm6, [shufb_mask_quarter]    ; byte-shuffle mask, defined elsewhere in the file

+.yloops_quarter_sse3:

+    ; Row setup: reload the full source width into r4 (the inner loop

+    ; consumes it) from arg5 on x86-32 or from r12 on x86-64, and

+    ; remember the start of the current dst row in r6 so that the number

+    ; of bytes written can be derived once the row is finished.

+    ; NOTE: the labels say "sse3", but this is the SSSE3 (pshufb) variant.

+%ifdef X86_32

+    mov r4, arg5

+%else

+    mov r4, r12

+%endif

+    mov r6, r0

+    ; each iteration consumes 32 source bytes from each of two lines and writes 8 dst bytes

+.xloops_quarter_sse3:

+    ; horizontal loop body: 32 source bytes per line

+    ;               mem  hi<-       ->lo

+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

+    ;               xmm1: p P o O n N m M l L k K j J i I

+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

+    ;               xmm3: p P o O n N m M l L k K j J i I

+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+    movdqa xmm0, [r2]          ; 1st_src_line

+    movdqa xmm1, [r2+16]       ; 1st_src_line + 16

+    movdqa xmm2, [r2+r3]       ; 2nd_src_line

+    movdqa xmm3, [r2+r3+16]    ; 2nd_src_line + 16

+    pshufb xmm0, xmm6           ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A

+    pshufb xmm1, xmm6           ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I

+    pshufb xmm2, xmm6           ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A

+    pshufb xmm3, xmm6           ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I

+    movdqa xmm4, xmm0

+    movdqa xmm5, xmm2

+    punpckldq xmm0, xmm1        ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0

+    punpckhdq xmm4, xmm1        ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4

+    punpckldq xmm2, xmm3        ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2

+    punpckhdq xmm5, xmm3        ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5

+    pavgb xmm0, xmm4            ;1st line: rounded average of each byte pair

+    pavgb xmm2, xmm5            ;2nd line: rounded average of each byte pair

+    pavgb xmm0, xmm2            ;rounded average of the two lines

+    ; write 8 result bytes to pDst

+    movq [r0], xmm0

+    ; next 32 source bytes / 8 dst bytes

+    lea r2, [r2+32]

+    lea r0, [r0+8]

+    sub r4, 32

+    cmp r4, 0

+    jg near .xloops_quarter_sse3

+    sub r6, r0           ; r6 = row start - current = -(dst bytes written in this row)

+    ; next line

+    lea r2, [r2+4*r3]    ; advance pSrc by 4 source lines

+    lea r2, [r2+4*r6]    ; and rewind it by 4 * (dst bytes written) = source bytes consumed

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; pDst = start of the next dst row

+    dec r5

+    jg near .yloops_quarter_sse3

+    movq [r0], xmm7      ; write the saved tailer bytes back at the final dst position

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

+    ret

+;***********************************************************************

+;   void DyadicBilinearQuarterDownsampler_sse4(    unsigned char* pDst, const int iDstStride,

+;                   unsigned char* pSrc, const int iSrcStride,

+;                   const int iSrcWidth, const int iSrcHeight );

+;***********************************************************************

+WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4

+%ifdef X86_32

+    push r6

+    %assign push_num 1

+%else

+    %assign push_num 0

+%endif

+    LOAD_6_PARA            ; roles as used below: r0=pDst r1=iDstStride r2=pSrc r3=iSrcStride r4=iSrcWidth r5=iSrcHeight

+    PUSH_XMM 8             ; xmm0..xmm7 are used below (macro defined elsewhere; presumably saves what the ABI requires)

+    SIGN_EXTENSION r1, r1d

+    SIGN_EXTENSION r3, r3d

+    SIGN_EXTENSION r4, r4d

+    SIGN_EXTENSION r5, r5d

+%ifndef X86_32

+    push r12

+    mov r12, r4            ; keep iSrcWidth: r4 is consumed as the column counter of every row

+%endif

+    sar r5, $02            ; iSrcHeight >> 2

+    mov r6, r1             ; save the 8 dst bytes at pDst + iDstStride * (iSrcHeight >> 2) (the "tailer")

+    imul r6, r5

+    add r6, r0

+    movq xmm7, [r6]

+    movdqa xmm6, [shufb_mask_quarter]    ; byte-shuffle mask, defined elsewhere in the file

+.yloops_quarter_sse4:

+%ifdef X86_32

+    mov r4, arg5           ; reload the source width for this row

+%else

+    mov r4, r12            ; reload the source width for this row

+%endif

+    mov r6, r0             ; remember the start of the current dst row

+    ; each iteration consumes 32 source bytes from each of two lines and writes 8 dst bytes

+.xloops_quarter_sse4:

+    ; horizontal loop body: 32 source bytes per line

+    ;               mem  hi<-       ->lo

+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

+    ;               xmm1: p P o O n N m M l L k K j J i I

+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

+    ;               xmm3: p P o O n N m M l L k K j J i I

+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+    movntdqa xmm0, [r2]            ; 1st_src_line (movntdqa needs a 16-byte aligned address)

+    movntdqa xmm1, [r2+16]         ; 1st_src_line + 16

+    movntdqa xmm2, [r2+r3]         ; 2nd_src_line

+    movntdqa xmm3, [r2+r3+16]      ; 2nd_src_line + 16

+    pshufb xmm0, xmm6               ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A

+    pshufb xmm1, xmm6               ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I

+    pshufb xmm2, xmm6               ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A

+    pshufb xmm3, xmm6               ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I

+    movdqa xmm4, xmm0

+    movdqa xmm5, xmm2

+    punpckldq xmm0, xmm1            ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0

+    punpckhdq xmm4, xmm1            ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4

+    punpckldq xmm2, xmm3            ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2

+    punpckhdq xmm5, xmm3            ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5

+    pavgb xmm0, xmm4                ;1st line: rounded average of each byte pair

+    pavgb xmm2, xmm5                ;2nd line: rounded average of each byte pair

+    pavgb xmm0, xmm2                ;rounded average of the two lines

+    ; write 8 result bytes to pDst

+    movq [r0], xmm0

+    ; next 32 source bytes / 8 dst bytes

+    lea r2, [r2+32]

+    lea r0, [r0+8]

+    sub r4, 32

+    cmp r4, 0

+    jg near .xloops_quarter_sse4

+    sub r6, r0           ; r6 = row start - current = -(dst bytes written in this row)

+    lea r2, [r2+4*r3]    ; advance pSrc by 4 source lines

+    lea r2, [r2+4*r6]    ; and rewind it by 4 * (dst bytes written) = source bytes consumed

+    lea r0, [r0+r1]

+    lea r0, [r0+r6]      ; pDst = start of the next dst row

+    dec r5

+    jg near .yloops_quarter_sse4

+    movq [r0], xmm7      ; write the saved tailer bytes back at the final dst position

+%ifndef X86_32

+    pop r12

+%endif

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop r6

+%endif

+    ret

--- a/test/api/encode_decode_api_test.cpp

+++ b/test/api/encode_decode_api_test.cpp

@@ -2512,7 +2512,7 @@

 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!

 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!

 const char* pHashStr[] = { //DO NOT CHANGE!

-  "058076b265686fc85b2b99cf7a53106f216f16c3",

+  "585663f78cadb70d9c9f179b9b53b90ffddf3178",

   "f350001c333902029800bd291fbed915a4bdf19a",

   "eb9d853b7daec03052c4850027ac94adc84c3a7e"

};

--- a/test/api/encoder_test.cpp

+++ b/test/api/encoder_test.cpp

@@ -131,7 +131,7 @@

},

     "res/Cisco_Absolute_Power_1280x720_30fps.yuv",

-    "a4707845cacc437fb52010eb020fca6d4bc1102d", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

+    "2b5965c752e1f722592c3ce9a1eb82445c9dbaa3", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

},

   // the following values may be adjusted for times since we start tuning the strategy

--- a/test/processing/ProcessUT_DownSample.cpp

+++ b/test/processing/ProcessUT_DownSample.cpp

@@ -199,6 +199,79 @@

} \

+#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) /* defines a test comparing func with DyadicBilinearOneThirdDownsampler_c */ \

+TEST (DownSampleTest, func) { \

+  if (ASM) { /* func is an assembly routine: skip the test when the CPU lacks CPUFLAGS */ \

+    int32_t iCpuCores = 0; \

+    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \

+    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \

+    return; \

+  } \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); /* buffers for the C reference (suffix _c) */ \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \

+  int dst_stride_c; \

+  int src_stride_c; \

+  int src_width_c; \

+  int src_height_c; \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); /* buffers for the routine under test (suffix _a) */ \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \

+  int dst_stride_a; \

+  int src_stride_a; \

+  int src_width_a; \

+  int src_height_a; \

+  dst_stride_c = dst_stride_a = 560; \

+  src_stride_c = src_stride_a = 560; \

+  src_width_c = src_width_a = 480; \

+  src_height_c = src_height_a = 30; \

+  for (int j = 0; j < 50000; j++) { /* both sides start from identical random dst and src contents */ \

+    dst_c[j] = dst_a[j] = rand() % 256; \

+    src_c[j] = src_a[j] = rand() % 256; \

+  } \

+  DyadicBilinearOneThirdDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c/3); /* note: the last argument is the height divided by 3 */ \

+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a/3); \

+  for (int j = 0; j < (src_height_c /3 ); j++) { /* compare only the (width/3) x (height/3) output area */ \

+    for (int m = 0; m < (src_width_c /3); m++) { \

+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \

+    } \

+  } \

+}

+#define GENERATE_DyadicBilinearQuarterDownsampler_UT(func, ASM, CPUFLAGS) /* defines a test comparing func with DyadicBilinearQuarterDownsampler_c */ \

+TEST (DownSampleTest, func) { \

+  if (ASM) { /* func is an assembly routine: skip the test when the CPU lacks CPUFLAGS */ \

+    int32_t iCpuCores = 0; \

+    uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \

+    if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \

+    return; \

+  } \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_c, 50000, 16); /* buffers for the C reference (suffix _c) */ \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_c, 50000, 16); \

+  int dst_stride_c; \

+  int src_stride_c; \

+  int src_width_c; \

+  int src_height_c; \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, dst_a, 50000, 16); /* buffers for the routine under test (suffix _a) */ \

+  ENFORCE_STACK_ALIGN_1D (uint8_t, src_a, 50000, 16); \

+  int dst_stride_a; \

+  int src_stride_a; \

+  int src_width_a; \

+  int src_height_a; \

+  dst_stride_c = dst_stride_a = 560; \

+  src_stride_c = src_stride_a = 560; /* NOTE(review): smaller than the 640-byte source width below, so source rows overlap - confirm this is intended */ \

+  src_width_c = src_width_a = 640; \

+  src_height_c = src_height_a = 80; \

+  for (int j = 0; j < 50000; j++) { /* both sides start from identical random dst and src contents */ \

+    dst_c[j] = dst_a[j] = rand() % 256; \

+    src_c[j] = src_a[j] = rand() % 256; \

+  } \

+  DyadicBilinearQuarterDownsampler_c (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); /* unlike the one-third test, the full source height is passed */ \

+  func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \

+  for (int j = 0; j < (src_height_c >> 2); j++) { /* compare only the (width/4) x (height/4) output area */ \

+    for (int m = 0; m < (src_width_c >> 2); m++) { \

+      ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \

+    } \

+  } \

+}

 #define GENERATE_GeneralBilinearDownsampler_UT(func, ref, ASM, CPUFLAGS) \

 TEST (DownSampleTest, func) { \

   if (ASM) {\

@@ -259,6 +332,13 @@

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)

+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)

+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)

+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse, 1, WELS_CPU_SSE)

+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_ssse3, 1, WELS_CPU_SSSE3)

+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_sse4, 1, WELS_CPU_SSE41)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2, GeneralBilinearFastDownsampler_ref, 1,

                                         WELS_CPU_SSE2)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,

@@ -269,6 +349,10 @@

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)

+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_neon, 1, WELS_CPU_NEON)

+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_neon, 1, WELS_CPU_NEON)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_neon,

                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)

 #endif

@@ -276,6 +360,10 @@

 #if defined(HAVE_NEON_AARCH64)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_AArch64_neon, 1, WELS_CPU_NEON)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_AArch64_neon, 1, WELS_CPU_NEON)

+GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_AArch64_neon, 1, WELS_CPU_NEON)

+GENERATE_DyadicBilinearQuarterDownsampler_UT (DyadicBilinearQuarterDownsampler_AArch64_neon, 1, WELS_CPU_NEON)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_AArch64_neon,

                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_NEON)

--

⑨