shithub: libvpx

--- a/vp9/common/vp9_mfqe.c

+++ b/vp9/common/vp9_mfqe.c

@@ -35,14 +35,26 @@

+// C version of the 8x8 weighted filter: forwards its arguments unchanged to

+// filter_by_weight() with a block size of 8.

+// NOTE(review): filter_by_weight() is defined outside this hunk; presumably

+// it blends src into dst according to src_weight -- confirm there.

+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride, int src_weight) {

+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);

+}

+// C version of the 16x16 weighted filter: forwards its arguments unchanged to

+// filter_by_weight() with a block size of 16.

+// NOTE(review): filter_by_weight() is defined outside this hunk; presumably

+// it blends src into dst according to src_weight -- confirm there.

+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,

+                                 uint8_t *dst, int dst_stride,

+                                 int src_weight) {

+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);

+}

 static void filter_by_weight32x32(const uint8_t *src, int src_stride,

                                   uint8_t *dst, int dst_stride, int weight) {

-  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);

-  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);

-  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,

-                   dst_stride, 16, weight);

-  filter_by_weight(src + src_stride * 16 + 16, src_stride,

-                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);

+  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);

+  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,

+                            weight);

+  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,

+                            dst + dst_stride * 16, dst_stride, weight);

+  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,

+                            dst + dst_stride * 16 + 16, dst_stride, weight);

 static void filter_by_weight64x64(const uint8_t *src, int src_stride,

@@ -62,13 +74,13 @@

                           int uvd_stride, BLOCK_SIZE block_size,

                           int weight) {

   if (block_size == BLOCK_16X16) {

-    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);

-    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);

-    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);

+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);

+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);

+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);

   } else if (block_size == BLOCK_32X32) {

     filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);

-    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);

-    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);

+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);

+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);

   } else if (block_size == BLOCK_64X64) {

     filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);

     filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -274,6 +274,12 @@

 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";

 specialize qw/vp9_plane_add_noise sse2/;

 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;

+# Weighted-filter kernels (16x16 and 8x8). Each prototype is declared once

+# and given an sse2 specialization.

+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";

+specialize qw/vp9_filter_by_weight16x16 sse2/;

+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";

+specialize qw/vp9_filter_by_weight8x8 sse2/;

--- /dev/null

+++ b/vp9/common/x86/vp9_mfqe_sse2.asm

@@ -1,0 +1,287 @@

+;

+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+;  This file is a duplicate of mfqe_sse2.asm in VP8.

+;  TODO(jackychen): Find a way to fix the duplicate.

+%include "vpx_ports/x86_abi_support.asm"

+;void vp9_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+;
+; Blends a 16x16 block of src into dst, in place, one row per iteration:
+;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
+; Rows of both buffers are accessed with movdqa, so every row address must
+; be 16-byte aligned.
+; NOTE(review): src_weight is presumably in [0, 16] so that the 16-bit
+; products cannot wrap -- confirm against the callers.
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6                              ; xmm6 is used below as the zero register
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Broadcast src_weight into all eight 16-bit lanes of xmm0.
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+    ; xmm1 = 16 - src_weight in every lane.
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+    mov         rcx, 16                     ; loop count (rows)
+    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking
+    ; Local labels carry a trailing colon so NASM does not raise its
+    ; "label alone on a line without a colon" (orphan-labels) warning.
+.combine:
+    movdqa      xmm2, [rax]                 ; 16 src pixels
+    movdqa      xmm4, [rdx]                 ; 16 dst pixels
+    add         rax, rsi
+    ; src * src_weight (low 8 pixels in xmm2, high 8 in xmm3)
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+    ; dst * dst_weight (low 8 pixels in xmm4, high 8 in xmm5)
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+    ; Repack the sixteen words to bytes and write the row back to dst.
+    packuswb    xmm2, xmm3
+    movdqa      [rdx], xmm2
+    add         rdx, rdi
+    dec         rcx
+    jnz         .combine
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+;void vp9_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+;
+; Blends an 8x8 block of src into dst, in place, one row per iteration:
+;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
+; Rows are read and written 8 bytes at a time with movq.
+; NOTE(review): src_weight is presumably in [0, 16] so that the 16-bit
+; products cannot wrap -- confirm against the callers.
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Broadcast src_weight into all eight 16-bit lanes of xmm0.
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+    ; xmm1 = 16 - src_weight in every lane.
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+    mov         rcx, 8                      ; loop count (rows)
+    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacking
+    ; Local labels carry a trailing colon so NASM does not raise its
+    ; "label alone on a line without a colon" (orphan-labels) warning.
+.combine:
+    movq        xmm2, [rax]                 ; 8 src pixels
+    movq        xmm3, [rdx]                 ; 8 dst pixels
+    add         rax, rsi
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    ; Pack the eight words to bytes (upper half comes from the zero register)
+    ; and write the low 8 bytes back to dst.
+    packuswb    xmm2, xmm4
+    movq        [rdx], xmm2
+    add         rdx, rdi
+    dec         rcx
+    jnz         .combine
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)
+;
+; Walks a 16x16 block of src1 and src2 and stores two scaled statistics:
+;   *sad      = (SAD(src1, src2) + 128) >> 8
+;   *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
+; Only src2 contributes to the variance; src1 is used for the SAD alone.
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+    mov         rsi,        16              ; block height
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+    ; NOTE(review): the original comment here said that, because this works
+    ; on the actual output frames, no data alignment can be depended on.
+    ; The loads below are movdqa, which faults on addresses that are not
+    ; 16-byte aligned. Confirm that callers guarantee alignment, or switch
+    ; these two loads to movdqu.
+    ; Local labels carry a trailing colon so NASM does not raise its
+    ; "label alone on a line without a colon" (orphan-labels) warning.
+.accumulate:
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0                  ; squares of the low 8 pixels, pairwise summed
+    pmaddwd     xmm1, xmm1                  ; squares of the high 8 pixels, pairwise summed
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+    sub         rsi,        1
+    jnz         .accumulate
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store: fold the two 64-bit halves together, then
+    ; round and scale with (x + 128) >> 8.
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+    ; Accumulate sum of src2 (fold the two 64-bit halves together)
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square the sum of src2, then scale it by >> 8. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote the four dword partial sums to
+    ; quadwords (zero extended), accumulate, shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+    ; SUM(src2^2) - (SUM(src2)^2 >> 8)
+    psubd       xmm1, xmm0
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+    movd        [rax], xmm1
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+; Read-only constants used by the routines above.

+SECTION_RODATA

+align 16

+; 128-bit constant with the value 128: the rounding term added before the

+; ">> 8" scaling of the SAD and variance results. The three spellings below

+; select a form based on the assembler (__NASM_VER__) and CONFIG_BIG_ENDIAN.

+t128:

+%ifndef __NASM_VER__

+    ddq 128

+%elif CONFIG_BIG_ENDIAN

+    dq  0, 128

+%else

+    dq  128, 0

+%endif

+align 16

+; Total blend weight (16) in each of the eight word lanes; the filter

+; routines subtract src_weight from it to obtain dst_weight.

+tMFQE: ; 1 << MFQE_PRECISION

+    times 8 dw 0x10

+align 16

+; Rounding term (8) in each word lane, added before the ">> 4" in the filters.

+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)

+    times 8 dw 0x08

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -82,6 +82,7 @@

 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c

 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c

 ifeq ($(CONFIG_VP9_POSTPROC),yes)

+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

 endif