shithub: libvpx

--- a/vp8/encoder/temporal_filter.c

+++ b/vp8/encoder/temporal_filter.c

@@ -36,36 +36,9 @@

 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering

 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering

-#define USE_FILTER_LUT 0         // use lookup table to improve filter

 #if VP8_TEMPORAL_ALT_REF

-#if USE_FILTER_LUT

-// for (strength = 0; strength <= 6; strength++) {

-//   for (delta = 0; delta <= 18; delta++) {

-//     float coeff = (3.0 * delta * delta) / pow(2, strength);

-//     printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));

-//   }

-//   printf("\n");

-// }

-static int modifier_lut[7][19] =

-{

-    // Strength=0

-    {16, 13,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},

-    // Strength=1

-    {16, 15, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},

-    // Strength=2

-    {16, 15, 13,  9,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},

-    // Strength=3

-    {16, 16, 15, 13, 10,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},

-    // Strength=4

-    {16, 16, 15, 14, 13, 11,  9,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0},

-    // Strength=5

-    {16, 16, 16, 15, 15, 14, 13, 11, 10,  8,  7,  5,  3,  0,  0,  0,  0,  0,  0},

-    // Strength=6

-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10,  9,  8,  7,  5,  4,  2,  1}

-};

-#endif

 static void vp8_temporal_filter_predictors_mb_c

     MACROBLOCKD *x,

@@ -86,14 +59,11 @@

     if ((mv_row | mv_col) & 7)

-//        vp8_sixtap_predict16x16_c(yptr, stride,

-//                                    mv_col & 7, mv_row & 7, &pred[0], 16);

         x->subpixel_predict16x16(yptr, stride,

                                     mv_col & 7, mv_row & 7, &pred[0], 16);

     else

-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);

         RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);

@@ -127,7 +97,7 @@

     int strength,

     int filter_weight,

     unsigned int *accumulator,

-    unsigned int *count

+    unsigned short *count

     int i, j, k;

@@ -134,10 +104,6 @@

     int modifier;

     int byte = 0;

-#if USE_FILTER_LUT

-    int *lut = modifier_lut[strength];

-#endif

     for (i = 0,k = 0; i < block_size; i++)

         for (j = 0; j < block_size; j++, k++)

@@ -146,11 +112,10 @@

             int src_byte = frame1[byte];

             int pixel_value = *frame2++;

-#if USE_FILTER_LUT

-            modifier = abs(src_byte-pixel_value);

-            modifier = modifier>18 ? 0 : lut[modifier];

-#else

             modifier   = src_byte - pixel_value;

+            // This is an integer approximation of:

+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);

+            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);

             modifier  *= modifier;

             modifier  *= 3;

             modifier  += 1 << (strength - 1);

@@ -160,7 +125,6 @@

                 modifier = 16;

             modifier = 16 - modifier;

-#endif

             modifier *= filter_weight;

             count[k] += modifier;

@@ -331,12 +295,12 @@

     int MBs  = cpi->common.MBs;

     int mb_y_offset = 0;

     int mb_uv_offset = 0;

-    unsigned int accumulator[384];

-    unsigned int count[384];

+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);

+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);

     MACROBLOCKD *mbd = &cpi->mb.e_mbd;

     YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];

     unsigned char *dst1, *dst2;

-    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);

+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);

     // Save input state

     unsigned char *y_buffer = mbd->pre.y_buffer;

@@ -366,7 +330,7 @@

             int stride;

             vpx_memset(accumulator, 0, 384*sizeof(unsigned int));

-            vpx_memset(count, 0, 384*sizeof(unsigned int));

+            vpx_memset(count, 0, 384*sizeof(unsigned short));

 #if ALT_REF_MC_ENABLED

             // Reduced search extent by 3 for 6-tap filter & smaller UMV border

--- a/vp8/encoder/temporal_filter.h

+++ b/vp8/encoder/temporal_filter.h

@@ -22,8 +22,12 @@

      int strength, \

      int filter_weight, \

      unsigned int *accumulator, \

-     unsigned int *count \

+     unsigned short *count \

+#if ARCH_X86 || ARCH_X86_64

+#include "x86/temporal_filter_x86.h"

+#endif

 #ifndef vp8_temporal_filter_apply

 #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c

--- /dev/null

+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm

@@ -1,0 +1,207 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+; void vp8_temporal_filter_apply_sse2 | arg

+;  (unsigned char  *frame1,           |  0

+;   unsigned int    stride,           |  1

+;   unsigned char  *frame2,           |  2

+;   unsigned int    block_size,       |  3

+;   int             strength,         |  4

+;   int             filter_weight,    |  5

+;   unsigned int   *accumulator,      |  6

+;   unsigned short *count)            |  7

+global sym(vp8_temporal_filter_apply_sse2)

+sym(vp8_temporal_filter_apply_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 8

+    SAVE_XMM

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ALIGN_STACK 16, rax

+    %define block_size    0

+    %define strength      16

+    %define filter_weight 32

+    %define rounding_bit  48

+    %define rbp_backup    64

+    %define stack_size    80

+    sub         rsp,           stack_size

+    mov         [rsp + rbp_backup], rbp ; rbp is reused for stride below

+    ; end prolog

+        mov         rdx,            arg(3)

+        mov         [rsp + block_size], rdx

+        movd        xmm6,            arg(4)

+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

+        ; calculate the rounding bit outside the loop

+        ; 0x8000 >> (16 - strength)

+        mov         rdx,            16

+        sub         rdx,            arg(4) ; 16 - strength

+        movd        xmm4,           rdx    ; can't use rdx w/ shift

+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]

+        psrlw       xmm5,           xmm4

+        movdqa      [rsp + rounding_bit], xmm5

+        mov         rsi,            arg(0) ; src/frame1

+        mov         rdx,            arg(2) ; predictor frame

+        mov         rdi,            arg(6) ; accumulator

+        mov         rax,            arg(7) ; count

+        ; dup the filter weight and store for later

+        movd        xmm0,           arg(5) ; filter_weight

+        pshuflw     xmm0,           xmm0, 0

+        punpcklwd   xmm0,           xmm0

+        movdqa      [rsp + filter_weight], xmm0

+        mov         rbp,            arg(1) ; stride

+        pxor        xmm7,           xmm7   ; zero for extraction

+        lea         rcx,            [rdx + 16*16*1] ; loop bound: end of 16x16 predictor

+        cmp         dword ptr [rsp + block_size], 8

+        jne         temporal_filter_apply_load_16

+        lea         rcx,            [rdx + 8*8*1] ; loop bound: end of 8x8 predictor

+temporal_filter_apply_load_8:

+        movq        xmm0,           [rsi]  ; first row

+        lea         rsi,            [rsi + rbp] ; += stride

+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]

+        movq        xmm1,           [rsi]  ; second row

+        lea         rsi,            [rsi + rbp] ; += stride

+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]

+        jmp         temporal_filter_apply_load_finished

+temporal_filter_apply_load_16:

+        movdqa      xmm0,           [rsi]  ; src (frame1)

+        lea         rsi,            [rsi + rbp] ; += stride

+        movdqa      xmm1,           xmm0

+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]

+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

+temporal_filter_apply_load_finished:

+        movdqa      xmm2,           [rdx]  ; predictor (frame2)

+        movdqa      xmm3,           xmm2

+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]

+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

+        ; modifier = src_byte - pixel_value

+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]

+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

+        ; modifier *= modifier

+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2

+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

+        ; modifier *= 3

+        pmullw      xmm0,           [GLOBAL(_const_3w)]

+        pmullw      xmm1,           [GLOBAL(_const_3w)]

+        ; modifier += 0x8000 >> (16 - strength)

+        paddw       xmm0,           [rsp + rounding_bit]

+        paddw       xmm1,           [rsp + rounding_bit]

+        ; modifier >>= strength

+        psrlw       xmm0,           [rsp + strength]

+        psrlw       xmm1,           [rsp + strength]

+        ; modifier = 16 - modifier

+        ; saturation takes care of modifier > 16

+        movdqa      xmm3,           [GLOBAL(_const_16w)]

+        movdqa      xmm2,           [GLOBAL(_const_16w)]

+        psubusw     xmm3,           xmm1

+        psubusw     xmm2,           xmm0

+        ; modifier *= filter_weight

+        pmullw      xmm2,           [rsp + filter_weight]

+        pmullw      xmm3,           [rsp + filter_weight]

+        ; count

+        movdqa      xmm4,           [rax]

+        movdqa      xmm5,           [rax+16]

+        ; += modifier (count entries are 16-bit, so word adds are correct here)

+        paddw       xmm4,           xmm2

+        paddw       xmm5,           xmm3

+        ; write back

+        movdqa      [rax],          xmm4

+        movdqa      [rax+16],       xmm5

+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

+        ; load and extract the predictor up to shorts

+        pxor        xmm7,           xmm7

+        movdqa      xmm0,           [rdx]

+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))

+        movdqa      xmm1,           xmm0

+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]

+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

+        ; modifier *= pixel_value

+        pmullw      xmm0,           xmm2

+        pmullw      xmm1,           xmm3

+        ; expand to double words

+        movdqa      xmm2,           xmm0

+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]

+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]

+        movdqa      xmm3,           xmm1

+        punpcklwd   xmm1,           xmm7   ; [ 8-11]

+        punpckhwd   xmm3,           xmm7   ; [12-15]

+        ; accumulator

+        movdqa      xmm4,           [rdi]

+        movdqa      xmm5,           [rdi+16]

+        movdqa      xmm6,           [rdi+32]

+        movdqa      xmm7,           [rdi+48]

+        ; += modifier * pixel_value; accumulator entries are 32-bit, so use dword adds

+        paddd       xmm4,           xmm0

+        paddd       xmm5,           xmm2

+        paddd       xmm6,           xmm1

+        paddd       xmm7,           xmm3

+        ; write back

+        movdqa      [rdi],          xmm4

+        movdqa      [rdi+16],       xmm5

+        movdqa      [rdi+32],       xmm6

+        movdqa      [rdi+48],       xmm7

+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

+        cmp         rdx,            rcx

+        je          temporal_filter_apply_epilog

+        pxor        xmm7,           xmm7   ; zero for extraction

+        cmp         dword ptr [rsp + block_size], 16

+        je          temporal_filter_apply_load_16

+        jmp         temporal_filter_apply_load_8

+temporal_filter_apply_epilog:

+    ; begin epilog

+    mov         rbp,            [rsp + rbp_backup] ; restore frame pointer (held stride)

+    add         rsp,            stack_size

+    pop         rsp

+    pop         rdi

+    pop         rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+SECTION_RODATA

+align 16

+_const_3w:

+    times 8 dw 3       ; eight words of 3: multiplier applied to the squared difference

+align 16

+_const_top_bit:

+    times 8 dw 1<<15   ; eight words of 0x8000: shifted right to form the rounding bit

+align 16

+_const_16w:

+    times 8 dw 16      ; eight words of 16: minuend for "16 - modifier"

--- /dev/null

+++ b/vp8/encoder/x86/temporal_filter_x86.h

@@ -1,0 +1,27 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H

+#define __INC_VP8_TEMPORAL_FILTER_X86_H

+#if HAVE_SSE2

+extern prototype_apply(vp8_temporal_filter_apply_sse2);

+#if !CONFIG_RUNTIME_CPU_DETECT

+#undef  vp8_temporal_filter_apply

+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2

+#endif // !CONFIG_RUNTIME_CPU_DETECT

+#endif // HAVE_SSE2

+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -309,6 +309,8 @@

         /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/

         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;

+        cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;

 #endif

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -94,6 +94,7 @@

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h

+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c

 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c

 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm

@@ -107,6 +108,7 @@

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm

+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm

 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm

 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm

--

⑨