ref: 57fc3e991792ea277a309bcc9351bb800d46b380
parent: eb9f56584fae81eab9be6ab999040ed5e4a7cfcd
author: Sindre Aamås <saamas@cisco.com>
date: Fri Apr 8 13:05:38 EDT 2016
[Processing] Add AVX2 VAA routines

Process 8 lines at a time rather than 16 lines at a time, because this appears to give more reliable memory subsystem performance on Haswell.

Speedup is > 2x compared to SSE2 when not memory-bound on Haswell. On my Haswell MBP, VAACalcSadSsdBgd is ~3x faster when uncached, which appears to be related to processing 8 lines at a time as opposed to 16. The other routines are also faster than the SSE2 routines in this case, but to a lesser extent.
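For reference, each routine produces per-8x8-block statistics for every 16x16 block of the picture plus a frame-wide total. A minimal scalar sketch of the plain SAD variant is shown below; it is illustrative only and not part of this change, and the function name VAACalcSad_ref as well as the four-entries-per-16x16-block output order are assumptions based on the existing C/SSE2 layout:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch: SAD of each 8x8 block between cur and ref, plus the frame total. */
    static void VAACalcSad_ref (const uint8_t* pCur, const uint8_t* pRef,
                                int32_t iPicWidth, int32_t iPicHeight,
                                int32_t iPicStride, int32_t* pFrameSad, int32_t* pSad8x8) {
      int32_t iFrameSad = 0, iIdx = 0;
      for (int32_t y = 0; y + 16 <= iPicHeight; y += 16) {
        for (int32_t x = 0; x + 16 <= iPicWidth; x += 16) {
          /* Four 8x8 sub-blocks per 16x16 block: top-left, top-right, bottom-left, bottom-right. */
          for (int32_t b = 0; b < 4; b++) {
            int32_t bx = x + 8 * (b & 1), by = y + 8 * (b >> 1), iSad = 0;
            for (int32_t i = 0; i < 8; i++)
              for (int32_t j = 0; j < 8; j++)
                iSad += abs (pCur[(by + i) * iPicStride + bx + j] -
                             pRef[(by + i) * iPicStride + bx + j]);
            pSad8x8[iIdx++] = iSad;
            iFrameSad += iSad;
          }
        }
      }
      *pFrameSad = iFrameSad;
    }
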
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -64,6 +64,13 @@
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
}
+ if (iCpuFlag & WELS_CPU_AVX2) {
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_avx2;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_avx2;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_avx2;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_avx2;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_avx2;
+ }
#endif//X86_ASM
#ifdef HAVE_NEON
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -104,6 +104,11 @@
VAACalcSadFunc VAACalcSad_sse2;
VAACalcSadVarFunc VAACalcSadVar_sse2;
VAACalcSadSsdFunc VAACalcSadSsd_sse2;
+VAACalcSadBgdFunc VAACalcSadBgd_avx2;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_avx2;
+VAACalcSadFunc VAACalcSad_avx2;
+VAACalcSadVarFunc VAACalcSadVar_avx2;
+VAACalcSadSsdFunc VAACalcSadSsd_avx2;
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2028,3 +2028,1532 @@
%undef localsize
ret
%endif
+
+%ifdef X86_32
+%define ptrword dword
+%else
+%define ptrword qword
+%endif
+
+%define xmm_width 16
+%define ymm_width 32
+
+%macro PUSHM 1-*
+ %rep %0
+ push %1
+ %rotate 1
+ %endrep
+ %assign push_num push_num + %0
+%endmacro
+
+%macro POPM 1-*
+ %rep %0
+ %rotate -1
+ pop %1
+ %endrep
+ %assign push_num push_num - %0
+%endmacro
+
+%ifdef X86_32
+%define stack_alloc_min 4
+%else
+%define stack_alloc_min 8
+%endif
+
+; Allocate aligned stack space.
+; address_out=%1 size=%2 alignment=%3
+%macro STACK_ALLOC 3
+%if (%3) & ((%3) - 1)
+ %error non-power-of-2 alignment requested.
+%endif
+%if (%3) > 0
+ %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min
+%else
+ %assign stack_alloc_align 1
+%endif
+ %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1
+ %assign push_num push_num + stack_alloc_num
+ sub r7, stack_alloc_min * stack_alloc_num
+%if stack_alloc_align == 1
+ mov %1, r7
+%else
+ lea %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)]
+ and %1, -(stack_alloc_min * stack_alloc_align)
+%endif
+%endmacro
+
+; Deallocate stack space allocated with STACK_ALLOC.
+%macro STACK_DEALLOC 0
+ add r7, stack_alloc_min * stack_alloc_num
+ %assign push_num push_num - stack_alloc_num
+%endmacro
+
+; Max unsigned byte per quadword
+; out=%1 in=%2 tmp=%3
+%macro AVX2_Maxubq 3
+ vpsrlq %3, %2, 32
+ vpmaxub %1, %2, %3
+ vpsrlq %3, %1, 16
+ vpmaxub %1, %1, %3
+ vpsrlq %3, %1, 8
+ vpmaxub %1, %1, %3
+%endmacro
+
+; Max unsigned byte per quadword. 2 register input.
+; Results interleaved as least significant byte of even/odd doublewords.
+; out=%1 in_a=%2 in_b=%3 tmp=%4
+%macro AVX2_Maxubq2 4
+ vpblendd %4, %2, %3, 10101010b
+ vpshufd %4, %4, 10110001b
+ vpblendd %1, %2, %3, 01010101b
+ vpmaxub %1, %4, %1
+ vpsrld %4, %1, 16
+ vpmaxub %1, %1, %4
+ vpsrld %4, %1, 8
+ vpmaxub %1, %1, %4
+%endmacro
+
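+; Sum of squares of unsigned bytes, accumulated in doubleword lanes
+; (bytes are zero-extended to words, then squared and pair-added with vpmaddwd).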
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sqsumbdw 5
+ vpunpcklbw %4, %2, %3
+%if %5
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%else
+ vpmaddwd %1, %4, %4
+%endif
+ vpunpckhbw %4, %2, %3
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%endmacro
+
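+; Byte sum per quadword via vpsadbw against zero (SAD against zero = sum of bytes).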
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sumbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
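+; Per-byte absolute difference via saturating subtraction in both directions.
+; %4 duplicates %2 so that the duplicate may be passed as a memory operand.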
+; res=%1 a=%2 b=%3 a=%4 tmp=%5
+%macro AVX2_AbsDiffub 5
+ vpsubusb %5, %2, %3
+ vpsubusb %1, %3, %4
+ vpor %1, %5, %1
+%endmacro
+
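+; Per-quadword SAD of unsigned bytes via vpsadbw, optionally accumulated into %1.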
+; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sadbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8
+%macro AVX2_SadSumSqsumbdw 8
+ AVX2_Sadbdw %1, %4, %5, %7, %8
+ AVX2_Sumbdw %2, %4, %6, %7, %8
+ AVX2_Sqsumbdw %3, %4, %6, %7, %8
+%endmacro
+
+; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sad 5
+ vmovdqu %4, [%2]
+ AVX2_Sadbdw %1, %4, [%3], %4, %5
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9
+%macro AVX2_SadSumSqsum 9
+ vmovdqu %7, [%4]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSumSqsumSqdiff 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ AVX2_Sqsumbdw %4, %9, %7, %10, %11
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSdMad 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_Sumbdw %2, %8, %7, %10, %11
+ AVX2_Sumbdw %3, %9, %7, %10, %11
+ AVX2_Sadbdw %1, %8, %9, %10, %11
+%if %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ vpmaxub %4, %4, %9
+%else
+ AVX2_AbsDiffub %4, %8, %9, %8, %10
+%endif
+%endmacro
+
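+; Compute SAD, byte sums, squared sum, MAD and squared-difference statistics for one row of pixels.
+; If tmp %12 is 0 (no spare register available), the current row is reloaded from memory instead.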
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13
+%macro AVX2_SadBgdSqdiff 13
+%ifidn %12, 0
+ vmovdqu %10, [%7]
+ AVX2_Sumbdw %2, %10, %9, %11, %13
+ AVX2_Sqsumbdw %6, %10, %9, %11, %13
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %10, %13
+ AVX2_Sumbdw %3, %11, %9, %10, %13
+ vmovdqu %10, [%7]
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, [%7], %10
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, [%7], %10
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%else
+ vmovdqu %10, [%7]
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %12, %13
+ AVX2_Sumbdw %2, %10, %9, %12, %13
+ AVX2_Sumbdw %3, %11, %9, %12, %13
+ AVX2_Sqsumbdw %6, %10, %9, %12, %13
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, %10, %12
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, %10, %12
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%endif
+%endmacro
+
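+; Store the upper- or lower-half pair of per-8x8 dword results for one (x) or two (y) 16x16 blocks;
+; b_second_blocks selects the lower-half slots of the 4-entry-per-16x16-block layout.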
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accdw 5
+ vpshufd %2%4, %2%3, 1000b
+%ifidni %2, x
+ vmovlps [%1 + 8 * %5], x%4
+%elif %5 == 0
+ vmovdqu [%1], %2%4
+%else
+ vmovlps [%1 + 8], x%4
+ vextracti128 x%4, %2%4, 1
+ vmovlps [%1 + 24], x%4
+%endif
+%endmacro
+
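+; As above, but for byte-sized (MAD) results.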
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accb 5
+ vpunpckhqdq %2%4, %2%3, %2%3
+ vpunpcklbw %2%4, %2%3, %2%4
+%if %5 == 0
+ vmovd [%1 + 0], x%4
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vmovd [%1 + 4], x%4
+%endif
+%else
+ vpextrw [%1 + 2], x%4, 0
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vpextrw [%1 + 6], x%4, 0
+%endif
+%endif
+%endmacro
+
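+; Combined store of MAD bytes covering four 16x16 blocks (this and the previous 32-pixel-wide iteration);
+; with b_second_blocks set, the new lower-half bytes are blended with the upper-half bytes already in memory.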
+; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5
+%macro AVX2_Store2x8x8Accb 5
+ vpunpckhqdq y%3, y%2, y%2
+ vpunpcklbw y%3, y%2, y%3
+ vextracti128 x%4, y%3, 1
+ vpsllq x%4, x%4, 32
+ vpblendd x%4, x%3, x%4, 1010b
+%if %5
+ vpslld x%4, x%4, 16
+ vpblendw x%4, x%4, [%1], 01010101b
+%endif
+ vmovdqu [%1], x%4
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5
+%macro AVX2_Store16x16Accdw 5
+%ifidni %2, x
+%if %5
+ vmovd x%4, [%1 + 0]
+ vpaddd x%3, x%4, x%3
+%endif
+ vmovd [%1 + 0], x%3
+%elif %5 == 0
+ vmovd [%1 + 0], x%3
+ vextracti128 x%3, %2%3, 1
+ vmovd [%1 + 4], x%3
+%else
+ vextracti128 x%4, %2%3, 1
+ vpunpckldq x%4, x%3, x%4
+ vmovq x%3, [%1 + 0]
+ vpaddd x%3, x%3, x%4
+ vmovlps [%1 + 0], x%3
+%endif
+%endmacro
+
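+; Store one (x) or two (y) per-16x16-block dword results into each of two destination arrays
+; at the same offset, optionally accumulating into the values already stored.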
+; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8
+%macro AVX2_Store2x16x16Accdw 8
+%ifidni %5, x
+ mov %4, %1
+%if %8 == 0
+ vmovd [%4 + %3], x%6
+ mov %4, %2
+ vpextrd [%4 + %3], x%6, 2
+%else
+ vmovd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovd [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vpextrd [%4 + %3], x%7, 2
+%endif
+%else
+ vextracti128 x%7, %5%6, 1
+ vpblendd x%6, x%6, x%7, 1010b
+ mov %4, %1
+%if %8 == 0
+ vmovlps [%4 + %3], x%6
+ mov %4, %2
+ vmovhps [%4 + %3], x%6
+%else
+ vmovq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovlps [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovhps [%4 + %3], x%7
+%endif
+%endif
+%endmacro
+
+
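+; Compute SADs for 8 lines of one (x) or two (y) horizontally adjacent 16x16 blocks,
+; store the per-8x8 results to p_sad8x8 and add them to the frame SAD accumulator.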
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSad_8Lines 7
+%define mm_tmp0 %2
+%define mm_sad %3
+%define mm_sad2 %4
+%define mm_sad3 %5
+%define mm_sad4 %6
+%define b_second_blocks %7
+%ifdef i_stride5
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ ; Use multiple accumulators to shorten dependency chains and enable more parallelism.
+ AVX2_Sad %1 %+ mm_sad, p_cur, p_ref, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1
+%ifdef i_stride7
+ %define i_stride7_ i_stride7
+%else
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride7_ r_tmp
+%endif
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1
+%undef i_stride5_
+%undef i_stride7_
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+ ; Collapse accumulators.
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad2
+ vpaddd %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad3
+ AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+%undef mm_tmp0
+%undef mm_sad
+%undef mm_sad2
+%undef mm_sad3
+%undef mm_sad4
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSad_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSad_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r5, p_sad8x8
+ %define i_stride3 r3
+ %undef p_sad8x8
+ %define p_sad8x8 r5
+ %define r_tmp r6
+ lea i_stride3, [3 * i_stride]
+%else
+ mov rbp, p_sad8x8
+ %define i_stride3 rbx
+ %define i_stride5 r12
+ %define i_stride7 r6
+ %undef p_sad8x8
+ %define p_sad8x8 rbp
+ lea i_stride3, [3 * i_stride]
+ lea i_stride5, [5 * i_stride]
+ lea i_stride7, [i_stride + 2 * i_stride3]
+%endif
+
+ ; offset pointer so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+
+.height_loop:
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt]
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ neg i_xcnt
+ sub i_ycnt, 1
+ jnz .height_loop
+
+ pop i_xcnt
+%assign push_num push_num - 1
+%undef i_xcnt_load
+
+.done:
+ mov r6, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r6], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef r_tmp
+%undef xcnt_unit
+%undef i_stride3
+%undef i_stride5
+%undef i_stride7
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+ ret
+
+
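+; As AVX2_CalcSad_8Lines, but additionally accumulates the per-16x16 sum and squared sum of the current picture.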
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSadVar_8Lines 7
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sqsum %6
+%define b_second_blocks %7
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadVar_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadVar_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r3, p_sad8x8
+ %undef p_sad8x8
+ %define p_sad8x8 r3
+ %define i_stride3 r5
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+ ret
+
+
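+; As AVX2_CalcSadVar_8Lines, but additionally accumulates the per-16x16 squared difference between cur and ref.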
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadSsd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %4
+%define mm_sad %5
+%define mm_sum %6
+%define mm_sqsum %7
+%define mm_sqdiff %8
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+%ifdef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+%ifndef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+%endif
+%undef i_stride5_
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsd_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%define p_sqdiff16x16 ptrword arg10
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 9
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov r12, p_sad8x8
+ mov r13, p_sum16x16
+ mov r14, p_sqsum16x16
+ mov r15, p_sqdiff16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %define p_sad8x8 r12
+ %define p_sum16x16 r13
+ %define p_sqsum16x16 r14
+ %define p_sqdiff16x16 r15
+ %define i_stride3 rbx
+ %define i_stride5 rbp
+ lea i_stride5, [5 * i_stride]
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+ sub p_sqdiff16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ sub p_sqdiff16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef i_stride5
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+ ret
+
+
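+; As AVX2_CalcSad_8Lines, but additionally produces the per-8x8 sum difference (cur - ref) and maximum absolute difference (MAD).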
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadBgd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %8
+%define mm_mad %4
+%define mm_sumcur %5
+%define mm_sumref %6
+%define mm_sad %7
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_mad
+%undef mm_sumcur
+%undef mm_sumref
+%undef mm_sad
+%undef b_second_blocks
+%endmacro
+
+; Store remaining MAD accumulator for width & 32 cases.
+; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4
+%macro AVX2_StoreRemainingSingleMad 4
+ test %1, 32 / xcnt_unit
+ jz %%skip
+ mov r_tmp, p_mad8x8
+ vmovdqa y%2, prev_mad
+ AVX2_Maxubq y%2, y%2, y%3
+ AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4
+%%skip:
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sd8x8 arg8
+%define p_mad8x8 arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 10
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, 2 * ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+ %define prev_mad ymm9
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sd8x8
+ mov r13, p_mad8x8
+ %undef p_sad8x8
+ %undef p_sd8x8
+ %undef p_mad8x8
+ %define p_sad8x8 rbp
+ %define p_sd8x8 r12
+ %define p_mad8x8 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
+
+
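+; Combination of AVX2_CalcSadSsd_8Lines and AVX2_CalcSadBgd_8Lines: SAD, sums, squared sum/diff, sum difference and MAD.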
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11
+%macro AVX2_CalcSadSsdBgd_8Lines 11
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sumref %6
+%define mm_mad %7
+%define mm_sqsum %8
+%define mm_sqdiff %9
+%ifidn %10, 0
+%define tmp2 0
+%else
+%define tmp2 %1 %+ %10
+%endif
+%define b_second_blocks %11
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_sum, %1 %+ mm_sum, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef mm_mad
+%undef mm_sum
+%undef mm_sumref
+%undef mm_sad
+%undef tmp2
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sum16x16 arg8
+%define p_sqsum16x16 arg9
+%define p_sqdiff16x16 arg10
+%define p_sd8x8 arg11
+%define p_mad8x8 arg12
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 12
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%ifdef X86_32
+ STACK_ALLOC r5, 3 * ymm_width, ymm_width
+ %define mm8 0
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+ %define ymm_zero [r5 + 2 * ymm_width]
+ %define xmm_zero ymm_zero
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa sadframe_acc, ymm0
+ vmovdqa ymm_zero, ymm0
+%else
+ %define sadframe_acc ymm9
+ %define xsadframe_acc xmm9
+ %define prev_mad ymm10
+ %define ymm_zero ymm11
+ %define xmm_zero xmm11
+ vpxor xmm_zero, xmm_zero, xmm_zero
+ vpxor xsadframe_acc, xsadframe_acc, xsadframe_acc
+%endif
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ mov r14, p_sqdiff16x16
+ mov r15, p_sd8x8
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %undef p_sd8x8
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define p_sqdiff16x16 r14
+ %define p_sd8x8 r15
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_sum16x16, r_tmp
+ sub p_sqsum16x16, r_tmp
+ sub p_sqdiff16x16, r_tmp
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sum16x16, i_xcnt
+ add p_sqsum16x16, i_xcnt
+ add p_sqdiff16x16, i_xcnt
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm8
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef ymm_zero
+%undef xmm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -828,6 +828,12 @@
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)
+
+GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)