shithub: libvpx

ref: 5caec339bea5e37ae8707f9d783feb2aa51cd111
parent: 13a946ec77d9ea8e3547f6016847e8d2c8f331ab
author: Johann <johann.koenig@duck.com>
date: Thu Oct 25 08:23:03 EDT 2018

vp8 bilinear: rewrite 4x4

~20% faster than the MMX version. Removes the last usage of
vp8_bilinear_filters_x86_[48].

Change-Id: Iee976fab9655d0020440f26c4403ce50103af913
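
For context: the predictor is a two-pass, two-tap filter. A horizontal pass
over 5 source rows (height + 1) produces a 4-wide buffer of 16-bit values,
and a vertical pass over adjacent pairs of those rows yields the 4x4 output,
rounding with +64 and shifting by VP8_FILTER_SHIFT (7) in each pass. A
minimal scalar model of the computation (illustrative only; the table values
are vp8_bilinear_filters from vp8/common/filter.c):

    #define VP8_FILTER_SHIFT 7
    static const int bilinear[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 },
                                        { 80, 48 }, { 64, 64 }, { 48, 80 },
                                        { 32, 96 }, { 16, 112 } };

    static void bilinear_4x4_ref(const unsigned char *src, int src_stride,
                                 int xoffset, int yoffset, unsigned char *dst,
                                 int dst_stride) {
      unsigned short tmp[4 * 5]; /* height + 1 intermediate rows */
      int r, c;
      for (r = 0; r < 5; ++r) { /* horizontal pass */
        for (c = 0; c < 4; ++c)
          tmp[r * 4 + c] = (src[c] * bilinear[xoffset][0] +
                            src[c + 1] * bilinear[xoffset][1] + 64) >>
                           VP8_FILTER_SHIFT;
        src += src_stride;
      }
      for (r = 0; r < 4; ++r) { /* vertical pass */
        for (c = 0; c < 4; ++c)
          dst[c] = (tmp[r * 4 + c] * bilinear[yoffset][0] +
                    tmp[(r + 1) * 4 + c] * bilinear[yoffset][1] + 64) >>
                   VP8_FILTER_SHIFT;
        dst += dst_stride;
      }
    }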

--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -379,17 +379,13 @@
                       make_tuple(8, 4, &vp8_bilinear_predict8x4_neon),
                       make_tuple(4, 4, &vp8_bilinear_predict4x4_neon)));
 #endif
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
-    MMX, BilinearPredictTest,
-    ::testing::Values(make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
-#endif
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2),
-                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2)));
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2)));
 #endif
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,7 +167,7 @@
 specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
+specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/;
 
 #
 # Encoder functions below this point.
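
The specialize line feeds libvpx's run-time CPU dispatch: rtcd generates a
function pointer per prototype and assigns the best available flavor at
startup. A hand-written sketch of the generated shape (the _c and _sse2
symbol names are real; the scaffolding is paraphrased, not the actual
generated header):

    void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch,
                                   int xofst, int yofst, unsigned char *dst,
                                   int dst_pitch);
    void vp8_bilinear_predict4x4_sse2(unsigned char *src, int src_pitch,
                                      int xofst, int yofst, unsigned char *dst,
                                      int dst_pitch);
    RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src,
                                                int src_pitch, int xofst,
                                                int yofst, unsigned char *dst,
                                                int dst_pitch);

    /* in setup_rtcd_internal(), after CPU feature detection: */
    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
    if (flags & HAS_SSE2)
      vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2;
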
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -14,6 +14,7 @@
 #include "./vp8_rtcd.h"
 #include "./vpx_config.h"
 #include "vp8/common/filter.h"
+#include "vpx_dsp/x86/mem_sse2.h"
 #include "vpx_ports/mem.h"
 
 static INLINE void horizontal_16x16(uint8_t *src, const int stride,
@@ -240,4 +241,96 @@
   horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);
 
   vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
+}
+
+static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset) {
+  int h;
+  const __m128i zero = _mm_setzero_si128();
+
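+  /* xoffset == 0 means no horizontal filtering: widen each 4-pixel row to
+   * 16 bits. 5 rows, because the vertical pass needs height + 1 inputs. */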
+  if (xoffset == 0) {
+    for (h = 0; h < 5; ++h) {
+      const __m128i a = load_unaligned_u32(src);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      _mm_storel_epi64((__m128i *)dst, a_u16);
+      src += stride;
+      dst += 4;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
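+    /* Each output is (a * f0 + b * f1 + 64) >> 7, with taps summing to 128. */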
+    for (h = 0; h < 5; ++h) {
+      const __m128i a = load_unaligned_u32(src);
+      const __m128i b = load_unaligned_u32(src + 1);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      _mm_storel_epi64((__m128i *)dst, shifted);
+      src += stride;
+      dst += 4;
+    }
+  }
+}
+
+static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset) {
+  int h;
+
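+  /* Each 8-lane register holds two consecutive 4-wide rows, so every
+   * iteration produces two output rows. */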
+  if (yoffset == 0) {
+    for (h = 0; h < 4; h += 2) {
+      const __m128i row = _mm_load_si128((__m128i *)src);
+      __m128i packed = _mm_packus_epi16(row, row);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      packed = _mm_srli_si128(packed, 4);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      src += 8;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
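+    /* row_1 is one 4-wide row below row_0; same rounding math as the
+     * horizontal pass. */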
+    for (h = 0; h < 4; h += 2) {
+      const __m128i row_0 = _mm_load_si128((__m128i *)src);
+      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
+      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      __m128i packed = _mm_packus_epi16(shifted, shifted);
+      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      packed = _mm_srli_si128(packed, 4);
+      dst += stride;
+      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      dst += stride;
+      src += 8;
+    }
+  }
+}
+
+void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
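+  /* 4-wide by 5-row (height + 1) buffer of 16-bit horizontal results. */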
+  uint16_t FData[4 * 5];
+
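+  /* Full-pel (0,0) motion never reaches here; callers use a copy instead. */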
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);
+
+  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
 }
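
A note on the arithmetic: the two taps always sum to 128, so
(a * f0 + b * f1 + 64) >> 7 is an exact fixed-point weighted average. For the
half-pel case (offset 4, taps {64, 64}) it collapses to the usual rounded
average, which this throwaway check confirms (illustrative only):

    #include <assert.h>
    int main(void) {
      int a, b;
      for (a = 0; a < 256; ++a)
        for (b = 0; b < 256; ++b)
          assert(((a * 64 + b * 64 + 64) >> 7) == ((a + b + 1) >> 1));
      return 0;
    }
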
--- a/vp8/common/x86/filter_x86.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/x86/filter_x86.h"
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
-  { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
-  { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-  { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-  { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-  { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
--- a/vp8/common/x86/filter_x86.h
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_VP8_COMMON_X86_FILTER_X86_H_
-#define VPX_VP8_COMMON_X86_FILTER_X86_H_
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
- * duplicated values */
-
-/* duplicated 4x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
-
-/* duplicated 8x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VPX_VP8_COMMON_X86_FILTER_X86_H_
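
These tables only existed because the MMX/SSE assembly wanted each tap
pre-duplicated across a whole register in memory. The intrinsics version
broadcasts the taps from the canonical vp8_bilinear_filters table at run
time instead, as in the new code above:

    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
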
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -10,9 +10,7 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
 
-
 %define BLOCK_HEIGHT_WIDTH 4
 %define vp8_filter_weight 128
 %define VP8_FILTER_SHIFT  7
@@ -203,125 +201,6 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;void bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
-sym(vp8_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 
 
 SECTION_RODATA
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -11,7 +11,6 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "filter_x86.h"
 
 extern const short vp8_six_tap_x86[8][6 * 8];
 
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -70,8 +70,6 @@
 
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -26,6 +26,17 @@
   return v;
 }
 
+static INLINE __m128i load_unaligned_u32(const void *a) {
+  uint32_t val;
+  memcpy(&val, a, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
+  const uint32_t val = _mm_cvtsi128_si32(v);
+  memcpy(a, &val, sizeof(val));
+}
+
 #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
 #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
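
The new helpers route 4-byte loads and stores through memcpy, which is safe
at any alignment and free of strict-aliasing problems; optimizing compilers
reduce the memcpy to a single 32-bit mov. A toy round trip (illustrative):

    uint8_t in[4] = { 1, 2, 3, 4 }, out[4];
    const __m128i v = load_unaligned_u32(in); /* 4 bytes into the low lane */
    store_unaligned_u32(out, v);              /* low 4 bytes written back */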