shithub: libvpx

--- /dev/null

+++ b/vp8/encoder/x86/variance_impl_ssse3.asm

@@ -1,0 +1,348 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+%define xmm_filter_shift            7

+;void vp8_filter_block2d_bil_var_ssse3

+;(

+;    unsigned char *ref_ptr,

+;    int ref_pixels_per_line,

+;    unsigned char *src_ptr,

+;    int src_pixels_per_line,

+;    unsigned int Height,

+;    int  xoffset,

+;    int  yoffset,

+;    int *sum,

+;    unsigned int *sumsquared

+;

+;)

+;Note: The filter coefficient at offset=0 is 128. Since the second operand

+;of pmaddubsw is read as signed bytes, the zero offset must be handled separately.

+;-----------------------------------------------------------------------

+; vp8_filter_block2d_bil_var_ssse3

+; Bilinear-filters a 16-byte-wide reference block and accumulates the sum

+; and the sum of squares of its difference against the source block.

+; Arguments are read from the stack through arg(n):

+;   arg(0) ref_ptr    arg(1) ref_pixels_per_line

+;   arg(2) src_ptr    arg(3) src_pixels_per_line

+;   arg(4) Height     arg(5) xoffset     arg(6) yoffset

+;   arg(7) sum        arg(8) sumsquared

+; Accumulators kept across every path:

+;   xmm6 = 8 word lanes of summed differences

+;   xmm7 = 4 dword lanes of summed squared differences

+; rbx is deliberately left untouched after GET_GOT: on 32-bit PIC builds

+; GLOBAL() addresses are formed relative to it, and GLOBAL(xmm_bi_rd) is

+; referenced on every loop iteration.

+;-----------------------------------------------------------------------

+global sym(vp8_filter_block2d_bil_var_ssse3)

+sym(vp8_filter_block2d_bil_var_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM

+    GET_GOT     rbx

+    push rsi

+    push rdi

+    push rbx

+    ; end prolog

+        pxor            xmm6,           xmm6                 ; sum accumulator = 0

+        pxor            xmm7,           xmm7                 ; sum-of-squares accumulator = 0

+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]

+        movsxd          rax,            dword ptr arg(5)     ; xoffset

+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0

+        je              filter_block2d_bil_var_ssse3_sp_only

+        shl             rax,            4                    ; point to filter coeff with xoffset

+        lea             rax,            [rax + rcx]          ; HFilter

+        movsxd          rdx,            dword ptr arg(6)     ; yoffset

+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0

+        je              filter_block2d_bil_var_ssse3_fp_only

+        shl             rdx,            4

+        lea             rdx,            [rdx + rcx]          ; VFilter

+        mov             rsi,            arg(0)               ;ref_ptr

+        mov             rdi,            arg(2)               ;src_ptr

+        movsxd          rcx,            dword ptr arg(4)     ;Height

+        ; Horizontal filter of the first reference row; result stays in xmm0

+        ; as the "previous row" for the vertical filter.

+        movdqu          xmm0,           XMMWORD PTR [rsi]

+        movdqu          xmm1,           XMMWORD PTR [rsi+1]

+        movdqa          xmm2,           xmm0

+        punpcklbw       xmm0,           xmm1

+        punpckhbw       xmm2,           xmm1

+        pmaddubsw       xmm0,           [rax]

+        pmaddubsw       xmm2,           [rax]

+        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]

+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]

+        psraw           xmm0,           xmm_filter_shift

+        psraw           xmm2,           xmm_filter_shift

+        packuswb        xmm0,           xmm2

+        ; Strides: rax, rcx, rdx, rsi and rdi are all in use and rbx must stay

+        ; intact for GLOBAL(), so 32-bit builds add the strides straight from

+        ; the argument slots; 64-bit builds cache them in r8/r9.

+%if ABI_IS_32BIT

+        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line

+%else

+        movsxd          r8,             dword ptr arg(1)     ;ref_pixels_per_line

+        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line

+        lea             rsi,            [rsi + r8]

+%endif

+filter_block2d_bil_var_ssse3_loop:

+        ; Horizontal filter of the current reference row -> xmm1 (16 bytes)

+        movdqu          xmm1,           XMMWORD PTR [rsi]

+        movdqu          xmm2,           XMMWORD PTR [rsi+1]

+        movdqa          xmm3,           xmm1

+        punpcklbw       xmm1,           xmm2

+        punpckhbw       xmm3,           xmm2

+        pmaddubsw       xmm1,           [rax]

+        pmaddubsw       xmm3,           [rax]

+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]

+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]

+        psraw           xmm1,           xmm_filter_shift

+        psraw           xmm3,           xmm_filter_shift

+        packuswb        xmm1,           xmm3

+        ; Vertical filter between the previous row (xmm0) and this row (xmm1);

+        ; this row becomes the previous row for the next iteration.

+        movdqa          xmm2,           xmm0

+        movdqa          xmm0,           xmm1

+        movdqa          xmm3,           xmm2

+        punpcklbw       xmm2,           xmm1

+        punpckhbw       xmm3,           xmm1

+        pmaddubsw       xmm2,           [rdx]

+        pmaddubsw       xmm3,           [rdx]

+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]

+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]

+        psraw           xmm2,           xmm_filter_shift

+        psraw           xmm3,           xmm_filter_shift

+        ; Widen 16 source bytes to words and accumulate diff and diff^2

+        movq            xmm1,           QWORD PTR [rdi]

+        pxor            xmm4,           xmm4

+        punpcklbw       xmm1,           xmm4

+        movq            xmm5,           QWORD PTR [rdi+8]

+        punpcklbw       xmm5,           xmm4

+        psubw           xmm2,           xmm1

+        psubw           xmm3,           xmm5

+        paddw           xmm6,           xmm2

+        paddw           xmm6,           xmm3

+        pmaddwd         xmm2,           xmm2

+        pmaddwd         xmm3,           xmm3

+        paddd           xmm7,           xmm2

+        paddd           xmm7,           xmm3

+%if ABI_IS_32BIT

+        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line

+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line

+%else

+        lea             rsi,            [rsi + r8]           ;ref_pixels_per_line

+        lea             rdi,            [rdi + r9]           ;src_pixels_per_line

+%endif

+        sub             rcx,            1

+        jnz             filter_block2d_bil_var_ssse3_loop

+        jmp             filter_block2d_bil_variance

+; Second-pass-only path (xoffset == 0): vertical filter between consecutive

+; reference rows. On entry rcx holds the address of

+; vp8_bilinear_filters_ssse3 and xmm6/xmm7 are the zeroed accumulators.

+; rbx is not written here: on 32-bit PIC builds GLOBAL() is addressed

+; relative to it and GLOBAL(xmm_bi_rd) is used inside the loop.

+filter_block2d_bil_var_ssse3_sp_only:

+        movsxd          rdx,            dword ptr arg(6)     ; yoffset

+        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0

+        je              filter_block2d_bil_var_ssse3_full_pixel

+        shl             rdx,            4

+        lea             rdx,            [rdx + rcx]          ; VFilter

+        mov             rsi,            arg(0)               ;ref_ptr

+        mov             rdi,            arg(2)               ;src_ptr

+        movsxd          rcx,            dword ptr arg(4)     ;Height

+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

+        movdqu          xmm1,           XMMWORD PTR [rsi]    ; first reference row

+        movdqa          xmm0,           xmm1

+%if ABI_IS_32BIT=0

+        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line

+%endif

+        lea             rsi,            [rsi + rax]

+filter_block2d_bil_sp_only_loop:

+        movdqu          xmm3,           XMMWORD PTR [rsi]    ; current reference row

+        movdqa          xmm2,           xmm1

+        movdqa          xmm0,           xmm3                 ; keep it for the next iteration

+        punpcklbw       xmm1,           xmm3

+        punpckhbw       xmm2,           xmm3

+        pmaddubsw       xmm1,           [rdx]

+        pmaddubsw       xmm2,           [rdx]

+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]

+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]

+        psraw           xmm1,           xmm_filter_shift

+        psraw           xmm2,           xmm_filter_shift

+        ; Widen 16 source bytes to words and accumulate diff and diff^2

+        movq            xmm3,           QWORD PTR [rdi]

+        pxor            xmm4,           xmm4

+        punpcklbw       xmm3,           xmm4

+        movq            xmm5,           QWORD PTR [rdi+8]

+        punpcklbw       xmm5,           xmm4

+        psubw           xmm1,           xmm3

+        psubw           xmm2,           xmm5

+        paddw           xmm6,           xmm1

+        paddw           xmm6,           xmm2

+        pmaddwd         xmm1,           xmm1

+        pmaddwd         xmm2,           xmm2

+        paddd           xmm7,           xmm1

+        paddd           xmm7,           xmm2

+        movdqa          xmm1,           xmm0                 ; previous row for the next iteration

+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

+        ; No general register is free on 32-bit builds (rbx is reserved for

+        ; GLOBAL()), so the source stride is added from its argument slot.

+%if ABI_IS_32BIT

+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line

+%else

+        lea             rdi,            [rdi + r9]           ;src_pixels_per_line

+%endif

+        sub             rcx,            1

+        jnz             filter_block2d_bil_sp_only_loop

+        jmp             filter_block2d_bil_variance

+filter_block2d_bil_var_ssse3_full_pixel:                     ; xoffset == 0 and yoffset == 0: no filtering, ref and src are compared directly

+        mov             rsi,            arg(0)               ;ref_ptr

+        mov             rdi,            arg(2)               ;src_ptr

+        movsxd          rcx,            dword ptr arg(4)     ;Height

+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line (overwrites rbx; no GLOBAL() reference follows on this path and the epilog pops rbx)

+        pxor            xmm0,           xmm0                 ; zero, used to widen bytes to words

+filter_block2d_bil_full_pixel_loop:                          ; one 16-byte row per iteration

+        movq            xmm1,           QWORD PTR [rsi]      ; ref bytes 0-7

+        punpcklbw       xmm1,           xmm0

+        movq            xmm2,           QWORD PTR [rsi+8]    ; ref bytes 8-15

+        punpcklbw       xmm2,           xmm0

+        movq            xmm3,           QWORD PTR [rdi]      ; src bytes 0-7

+        punpcklbw       xmm3,           xmm0

+        movq            xmm4,           QWORD PTR [rdi+8]    ; src bytes 8-15

+        punpcklbw       xmm4,           xmm0

+        psubw           xmm1,           xmm3                 ; ref - src, as words

+        psubw           xmm2,           xmm4

+        paddw           xmm6,           xmm1                 ; accumulate differences

+        paddw           xmm6,           xmm2

+        pmaddwd         xmm1,           xmm1                 ; squared differences, pairs summed into dwords

+        pmaddwd         xmm2,           xmm2

+        paddd           xmm7,           xmm1                 ; accumulate squared differences

+        paddd           xmm7,           xmm2

+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

+        sub             rcx,            1

+        jnz             filter_block2d_bil_full_pixel_loop

+        jmp             filter_block2d_bil_variance

+; First-pass-only path (yoffset == 0): horizontal filter of each reference

+; row. On entry rax holds the address of the horizontal filter taps and

+; xmm6/xmm7 are the zeroed accumulators.

+; rbx is not written here: on 32-bit PIC builds GLOBAL() is addressed

+; relative to it and GLOBAL(xmm_bi_rd) is used inside the loop.

+filter_block2d_bil_var_ssse3_fp_only:

+        mov             rsi,            arg(0)               ;ref_ptr

+        mov             rdi,            arg(2)               ;src_ptr

+        movsxd          rcx,            dword ptr arg(4)     ;Height

+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

+        pxor            xmm0,           xmm0

+%if ABI_IS_32BIT=0

+        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line

+%endif

+filter_block2d_bil_fp_only_loop:

+        movdqu          xmm1,           XMMWORD PTR [rsi]

+        movdqu          xmm2,           XMMWORD PTR [rsi+1]

+        movdqa          xmm3,           xmm1

+        punpcklbw       xmm1,           xmm2

+        punpckhbw       xmm3,           xmm2

+        pmaddubsw       xmm1,           [rax]

+        pmaddubsw       xmm3,           [rax]

+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]

+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]

+        psraw           xmm1,           xmm_filter_shift

+        psraw           xmm3,           xmm_filter_shift

+        ; Widen 16 source bytes to words and accumulate diff and diff^2

+        movq            xmm2,           QWORD PTR [rdi]      ; 8-byte load, same form as the other paths

+        pxor            xmm4,           xmm4

+        punpcklbw       xmm2,           xmm4

+        movq            xmm5,           QWORD PTR [rdi+8]

+        punpcklbw       xmm5,           xmm4

+        psubw           xmm1,           xmm2

+        psubw           xmm3,           xmm5

+        paddw           xmm6,           xmm1

+        paddw           xmm6,           xmm3

+        pmaddwd         xmm1,           xmm1

+        pmaddwd         xmm3,           xmm3

+        paddd           xmm7,           xmm1

+        paddd           xmm7,           xmm3

+        lea             rsi,            [rsi + rdx]          ;ref_pixels_per_line

+        ; No general register is free on 32-bit builds (rbx is reserved for

+        ; GLOBAL()), so the source stride is added from its argument slot.

+%if ABI_IS_32BIT

+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line

+%else

+        lea             rdi,            [rdi + r9]           ;src_pixels_per_line

+%endif

+        sub             rcx,            1

+        jnz             filter_block2d_bil_fp_only_loop

+        jmp             filter_block2d_bil_variance

+filter_block2d_bil_variance:                 ; common tail: reduce xmm6 (word sums) and xmm7 (dword squared sums) to scalars and store them

+        pxor        xmm0,           xmm0

+        pxor        xmm1,           xmm1

+        pxor        xmm5,           xmm5     ; zero, used for the dword interleaves below

+        punpcklwd   xmm0,           xmm6     ; low 4 word sums land in the upper half of each dword

+        punpckhwd   xmm1,           xmm6     ; high 4 word sums likewise

+        psrad       xmm0,           16       ; arithmetic shift: sign-extend each word sum to a dword

+        psrad       xmm1,           16

+        paddd       xmm0,           xmm1     ; 4 dword partial sums

+        movdqa      xmm1,           xmm0

+        movdqa      xmm6,           xmm7     ; copy of the 4 squared partial sums

+        punpckldq   xmm6,           xmm5     ; squared sums 0,1 interleaved with zero

+        punpckhdq   xmm7,           xmm5     ; squared sums 2,3 interleaved with zero

+        paddd       xmm6,           xmm7     ; 2 squared partial sums (dwords 0 and 2)

+        punpckldq   xmm0,           xmm5     ; same interleave for the difference sums

+        punpckhdq   xmm1,           xmm5

+        paddd       xmm0,           xmm1     ; 2 partial sums (dwords 0 and 2)

+        movdqa      xmm7,           xmm6

+        movdqa      xmm1,           xmm0

+        psrldq      xmm7,           8        ; bring the upper 8 bytes down

+        psrldq      xmm1,           8

+        paddd       xmm6,           xmm7     ; low dword = total of squared differences

+        paddd       xmm0,           xmm1     ; low dword = total of differences

+        mov         rsi,            arg(7) ;[Sum]

+        mov         rdi,            arg(8) ;[SSE]

+        movd        [rsi],       xmm0        ; *sum = low dword of xmm0

+        movd        [rdi],       xmm6        ; *sumsquared = low dword of xmm6

+    ; begin epilog

+    pop rbx

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+SECTION_RODATA

+align 16

+xmm_bi_rd:                     ; rounding term added to each word before the shift by xmm_filter_shift

+    times 8 dw 64

+align 16

+vp8_bilinear_filters_ssse3:    ; byte tap pairs for pmaddubsw, 16 bytes per entry, selected with offset << 4

+    times 8 db 128, 0          ; offset 0: not loaded by the code, which branches around the filter when an offset is 0

+    times 8 db 112, 16         ; offset 1

+    times 8 db 96,  32         ; offset 2

+    times 8 db 80,  48         ; offset 3

+    times 8 db 64,  64         ; offset 4

+    times 8 db 48,  80         ; offset 5

+    times 8 db 32,  96         ; offset 6

+    times 8 db 16,  112        ; offset 7

--- /dev/null

+++ b/vp8/encoder/x86/variance_ssse3.c

@@ -1,0 +1,140 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vp8/encoder/variance.h"

+#include "vp8/common/pragmas.h"

+#include "vpx_ports/mem.h"

+extern unsigned int vp8_get16x16var_sse2

+(

+    const unsigned char *src_ptr,

+    int source_stride,

+    const unsigned char *ref_ptr,

+    int recon_stride,

+    unsigned int *SSE,

+    int *Sum

+);

+extern void vp8_half_horiz_vert_variance16x_h_sse2

+(

+    const unsigned char *ref_ptr,

+    int ref_pixels_per_line,

+    const unsigned char *src_ptr,

+    int src_pixels_per_line,

+    unsigned int Height,

+    int *sum,

+    unsigned int *sumsquared

+);

+extern void vp8_half_horiz_variance16x_h_sse2

+(

+    const unsigned char *ref_ptr,

+    int ref_pixels_per_line,

+    const unsigned char *src_ptr,

+    int src_pixels_per_line,

+    unsigned int Height,

+    int *sum,

+    unsigned int *sumsquared

+);

+extern void vp8_half_vert_variance16x_h_sse2

+(

+    const unsigned char *ref_ptr,

+    int ref_pixels_per_line,

+    const unsigned char *src_ptr,

+    int src_pixels_per_line,

+    unsigned int Height,

+    int *sum,

+    unsigned int *sumsquared

+);

+extern void vp8_filter_block2d_bil_var_ssse3

+(

+    const unsigned char *ref_ptr,

+    int ref_pixels_per_line,

+    const unsigned char *src_ptr,

+    int src_pixels_per_line,

+    unsigned int Height,

+    int  xoffset,

+    int  yoffset,

+    int *sum,

+    unsigned int *sumsquared

+);

+/*

+ * Sub-pixel variance of a 16x16 block.

+ *

+ * The three half-pixel offset combinations (4,0), (0,4) and (4,4) are

+ * dispatched to the SSE2 helpers, each called twice (at column 0 and at

+ * column 8) with the partial results added together. Every other offset

+ * pair goes to vp8_filter_block2d_bil_var_ssse3.

+ *

+ * Stores the sum of squared differences in *sse and returns

+ * sse - (sum * sum >> 8).

+ *

+ * The square of the sum is computed in unsigned arithmetic: for a 16x16

+ * block of 8-bit pixels the sum can reach a magnitude of 255 * 256, whose

+ * square is larger than INT_MAX but still fits in 32 unsigned bits.

+ */

+unsigned int vp8_sub_pixel_variance16x16_ssse3

+(

+    const unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,

+    int  yoffset,

+    const unsigned char *dst_ptr,

+    int dst_pixels_per_line,

+    unsigned int *sse

+)

+{

+    int xsum0, xsum1;

+    unsigned int xxsum0, xxsum1;

+    // note we could avoid these if statements if the calling function

+    // just called the appropriate functions inside.

+    if (xoffset == 4 && yoffset == 0)

+    {

+        vp8_half_horiz_variance16x_h_sse2(

+            src_ptr, src_pixels_per_line,

+            dst_ptr, dst_pixels_per_line, 16,

+            &xsum0, &xxsum0);

+        vp8_half_horiz_variance16x_h_sse2(

+            src_ptr + 8, src_pixels_per_line,

+            dst_ptr + 8, dst_pixels_per_line, 16,

+            &xsum1, &xxsum1);

+        xsum0 += xsum1;

+        xxsum0 += xxsum1;

+    }

+    else if (xoffset == 0 && yoffset == 4)

+    {

+        vp8_half_vert_variance16x_h_sse2(

+            src_ptr, src_pixels_per_line,

+            dst_ptr, dst_pixels_per_line, 16,

+            &xsum0, &xxsum0);

+        vp8_half_vert_variance16x_h_sse2(

+            src_ptr + 8, src_pixels_per_line,

+            dst_ptr + 8, dst_pixels_per_line, 16,

+            &xsum1, &xxsum1);

+        xsum0 += xsum1;

+        xxsum0 += xxsum1;

+    }

+    else if (xoffset == 4 && yoffset == 4)

+    {

+        vp8_half_horiz_vert_variance16x_h_sse2(

+            src_ptr, src_pixels_per_line,

+            dst_ptr, dst_pixels_per_line, 16,

+            &xsum0, &xxsum0);

+        vp8_half_horiz_vert_variance16x_h_sse2(

+            src_ptr + 8, src_pixels_per_line,

+            dst_ptr + 8, dst_pixels_per_line, 16,

+            &xsum1, &xxsum1);

+        xsum0 += xsum1;

+        xxsum0 += xxsum1;

+    }

+    else

+    {

+      vp8_filter_block2d_bil_var_ssse3(

+          src_ptr, src_pixels_per_line,

+          dst_ptr, dst_pixels_per_line, 16,

+          xoffset, yoffset,

+          &xsum0, &xxsum0);

+    }

+    *sse = xxsum0;

+    /* Unsigned multiply: the signed product xsum0 * xsum0 can overflow int. */

+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));

+}

--- a/vp8/encoder/x86/variance_x86.h

+++ b/vp8/encoder/x86/variance_x86.h

@@ -286,6 +286,7 @@

 #if HAVE_SSSE3

 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);

 extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);

+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);

 #if !CONFIG_RUNTIME_CPU_DETECT

 #undef  vp8_variance_sad16x16x3

@@ -293,6 +294,9 @@

 #undef  vp8_variance_sad16x8x3

 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3

+#undef  vp8_variance_subpixvar16x16

+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3

 #endif

 #endif

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -334,6 +334,8 @@

         cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;

         cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;

+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;

         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -110,6 +110,8 @@

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm

 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm

+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c

+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm

 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm

 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm

--

⑨