shithub: libvpx

ref: 0c00af126dcda6fddcc683cc15c1be06b3285054
parent: 7e77938d7200a5a55490130e156a918a241100bd
author: Alex Converse <aconverse@google.com>
date: Tue Oct 6 11:59:03 EDT 2015

Add vpx_highbd_convolve_{copy,avg}_sse2

single-threaded:
swanky (silvermont): ~1% faster overall
peppy (celeron,haswell): ~1.5% faster overall

Change-Id: Ib74f014374c63c9eaf2d38191cbd8e2edcc52073
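
For context, the two functions this change specializes are the trivial members of the high-bit-depth convolve family: they ignore the filter arguments and either copy or average a w x h block of 16-bit samples. A minimal scalar sketch of that behavior follows (illustration only; libvpx's actual C reference goes through its CONVERT_TO_SHORTPTR pointer convention and uses different argument names, and the helper names here are hypothetical):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: copy a w x h block of 16-bit pixels. */
    static void highbd_copy_block(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  int w, int h) {
      int r;
      for (r = 0; r < h; ++r) {
        memcpy(dst, src, w * sizeof(*src));
        src += src_stride;
        dst += dst_stride;
      }
    }

    /* Sketch: average src into dst with round-to-nearest; this is the
       per-lane operation pavgw performs: (a + b + 1) >> 1. */
    static void highbd_avg_block(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c)
          dst[c] = (uint16_t)((src[c] + dst[c] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
      }
    }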

--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -973,6 +973,14 @@
                       w, h, bd); \
 }
 #if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_USE_X86INC
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_avg_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_avg_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve_avg_sse2, 12)
+#endif  // CONFIG_USE_X86INC
 WRAP(convolve8_horiz_sse2, 8)
 WRAP(convolve8_avg_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
@@ -1116,7 +1124,11 @@
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
+#else
     wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1124,7 +1136,11 @@
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
+#else
     wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1132,7 +1148,11 @@
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
+#else
     wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
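
A note on the test changes above: WRAP(func, bd) builds a fixed-bit-depth wrapper named wrap_<func>_<bd> around the bd-parameterized vpx_highbd_* function, so the bit-depth-agnostic convolve test can exercise the new SSE2 copy/avg at 8, 10, and 12 bits; the ConvolveFunctions tables then substitute those wrappers for the C ones when CONFIG_USE_X86INC is set. Roughly what one expansion looks like (the exact macro body lives earlier in convolve_test.cc; argument names here are illustrative):

    void wrap_convolve_copy_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h) {
      vpx_highbd_convolve_copy_sse2(src, src_stride, dst, dst_stride,
                                    filter_x, x_step_q4, filter_y, y_step_q4,
                                    w, h, 10);
    }
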
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -434,10 +434,10 @@
   # Sub Pixel Filters
   #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy/;
+  specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg/;
+  specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
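
The rtcd_defs change adds "$sse2_x86inc" to the specialize lines, which asks the RTCD generator to declare an SSE2 variant and dispatch to it at runtime, but only in builds where the x86inc assembly is usable (that is when CONFIG_USE_X86INC is enabled). A rough sketch of what the generated dispatch amounts to for one entry (the typedef name is hypothetical and the real header is emitted by rtcd.pl at build time, so details differ):

    typedef void (*highbd_convolve_fn_t)(
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
        ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
        const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

    /* Declarations for the C and SSE2 implementations, plus a dispatch
       pointer that runtime CPU detection fills in: */
    extern highbd_convolve_fn_t vpx_highbd_convolve_copy;

    /* ...inside the generated setup routine: */
    vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
    if (flags & HAS_SSE2)
      vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
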
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -12,13 +12,28 @@
 
 SECTION .text
 
-%macro convolve_fn 1
+%macro convolve_fn 1-2
 INIT_XMM sse2
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+                                 fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                               fx, fxs, fy, fys, w, h
+%endif
   mov r4d, dword wm
+%ifidn %2, highbd
+  shl r4d, 1
+  shl srcq, 1
+  shl src_strideq, 1
+  shl dstq, 1
+  shl dst_strideq, 1
+%else
   cmp r4d, 4
   je .w4
+%endif
   cmp r4d, 8
   je .w8
   cmp r4d, 16
@@ -25,8 +40,49 @@
   je .w16
   cmp r4d, 32
   je .w32
+%ifidn %2, highbd
+  cmp r4d, 64
+  je .w64
 
   mov                    r4d, dword hm
+.loop128:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop128
+  RET
+%endif
+
+.w64
+  mov                    r4d, dword hm
 .loop64:
   movu                    m0, [srcq]
   movu                    m1, [srcq+16]
@@ -34,10 +90,10 @@
   movu                    m3, [srcq+48]
   add                   srcq, src_strideq
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+16]
-  pavgb                   m2, [dstq+32]
-  pavgb                   m3, [dstq+48]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
 %endif
   mova             [dstq   ], m0
   mova             [dstq+16], m1
@@ -57,10 +113,10 @@
   movu                    m3, [srcq+src_strideq+16]
   lea                   srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq            +16]
-  pavgb                   m2, [dstq+dst_strideq]
-  pavgb                   m3, [dstq+dst_strideq+16]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq            +16]
+  pavg                    m2, [dstq+dst_strideq]
+  pavg                    m3, [dstq+dst_strideq+16]
 %endif
   mova [dstq               ], m0
   mova [dstq            +16], m1
@@ -82,10 +138,10 @@
   movu                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+dst_strideq]
-  pavgb                   m2, [dstq+dst_strideq*2]
-  pavgb                   m3, [dstq+r6q]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+dst_strideq]
+  pavg                    m2, [dstq+dst_strideq*2]
+  pavg                    m3, [dstq+r6q]
 %endif
   mova  [dstq              ], m0
   mova  [dstq+dst_strideq  ], m1
@@ -108,10 +164,10 @@
   movu                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+dst_strideq]
-  pavgb                   m2, [dstq+dst_strideq*2]
-  pavgb                   m3, [dstq+r6q]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+dst_strideq]
+  pavg                    m2, [dstq+dst_strideq*2]
+  pavg                    m3, [dstq+r6q]
 %endif
   mova  [dstq              ], m0
   mova  [dstq+dst_strideq  ], m1
@@ -122,6 +178,7 @@
   jnz .loop8
   RET
 
+%ifnidn %2, highbd
 .w4:
   mov                    r4d, dword hm
   lea                    r5q, [src_strideq*3]
@@ -137,10 +194,10 @@
   movh                    m5, [dstq+dst_strideq]
   movh                    m6, [dstq+dst_strideq*2]
   movh                    m7, [dstq+r6q]
-  pavgb                   m0, m4
-  pavgb                   m1, m5
-  pavgb                   m2, m6
-  pavgb                   m3, m7
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
 %endif
   movh  [dstq              ], m0
   movh  [dstq+dst_strideq  ], m1
@@ -150,7 +207,12 @@
   sub                    r4d, 4
   jnz .loop4
   RET
+%endif
 %endmacro
 
 convolve_fn copy
 convolve_fn avg
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
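
Summary of the assembly change: convolve_fn becomes a one-or-two-argument macro, and passing `highbd` as the second argument emits the vpx_highbd_convolve_{copy,avg}_sse2 entry points. The high-bit-depth path defines pavg as pavgw instead of pavgb, and shifts the width, both pointers, and both strides left by one so the existing byte-count loops operate on 16-bit samples unchanged: a 4-wide highbd row is 8 bytes and falls into the .w8 path (hence no .w4 label), while a 64-wide highbd row is 128 bytes and gets the new .loop128. For illustration, one 16-byte chunk of the avg path corresponds to the following intrinsics (a sketch only, with a hypothetical helper name; the assembly uses aligned stores to dst):

    #include <emmintrin.h>
    #include <stdint.h>

    /* One 8-lane chunk of the highbd avg path: pavgw == _mm_avg_epu16,
       an unsigned rounding average per 16-bit lane. */
    static void avg_8_pixels(const uint16_t *src, uint16_t *dst) {
      const __m128i s = _mm_loadu_si128((const __m128i *)src);
      const __m128i d = _mm_loadu_si128((const __m128i *)dst);
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(s, d));
    }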