shithub: libvpx

Download patch

ref: 914f7c36d7d394f1569b62bf8df21963046f4e37
parent: c684d5e5f2c7c38b867b0c177c82af26335dcb3f
parent: 5a23352c030d2b190976ea55a9a759c734bd9eaa
author: Scott LaVarnway <slavarnway@google.com>
date: Thu May 19 07:22:01 EDT 2011

Merge "Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3."

--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -584,23 +584,39 @@
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp8_intra_pred_uv_ho_mmx2)
-sym(vp8_intra_pred_uv_ho_mmx2):
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1)
+sym(vp8_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
+%ifidn %1, ssse3
+    push        rbx
+%endif
     ; end prolog
 
     ; read from left and write out
+%ifidn %1, mmx2
     mov         edx,        4
+%endif
     mov         rsi,        arg(2) ;src;
     movsxd      rax,        dword ptr arg(3) ;src_stride;
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+    ; GLOBAL() may address through the GOT register on 32-bit PIC builds,
+    ; so load the shuffle mask before rbx is reused for src_stride*3.
+    GET_GOT     rbx
+    movdqa      xmm2,       [GLOBAL(dc_00001111)]
+    RESTORE_GOT
+    lea         rbx,        [rax*3]
+    lea         rdx,        [rcx*3]
+%endif
     dec         rsi
-vp8_intra_pred_uv_ho_mmx2_loop:
+%ifidn %1, mmx2
+vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
     movd        mm1,        [rsi+rax]
     punpcklbw   mm0,        mm0
@@ -612,15 +628,53 @@
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_ho_mmx2_loop
+    jnz vp8_intra_pred_uv_ho_%1_loop
+%else
+    ; rows 0-3: interleave the left-edge bytes of two rows, then pshufb
+    ; spreads byte 0 over the low 8 bytes and byte 1 over the high 8 bytes
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+    lea         rsi,        [rsi+rax*4]
+    lea         rdi,        [rdi+rcx*4]
+    ; rows 4-7: same sequence on the next four rows
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+%endif
 
     ; begin epilog
+%ifidn %1, ssse3
+    pop         rbx
+%endif
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret
+%endmacro
 
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
+
 SECTION_RODATA
 dc_128:
     times 8 db 128
@@ -629,3 +683,8 @@
 align 16
 dc_1024:
     times 8 dw 0x400
+align 16
+; pshufb control: bytes 0-7 select source byte 0, bytes 8-15 select byte 1
+dc_00001111:
+    times 8 db 0
+    times 8 db 1
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -23,6 +23,7 @@
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
@@ -31,7 +32,8 @@
                                                 unsigned char *dst_u,
                                                 unsigned char *dst_v,
                                                 int dst_stride,
-                                                build_intra_predictors_mbuv_fn_t tm_func)
+                                                build_intra_predictors_mbuv_fn_t tm_func,
+                                                build_intra_predictors_mbuv_fn_t ho_func)
 {
     int mode = x->mode_info_context->mbmi.uv_mode;
     build_intra_predictors_mbuv_fn_t fn;
@@ -39,7 +41,7 @@
 
     switch (mode) {
         case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
-        case  H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+        case  H_PRED: fn = ho_func; break;
         case TM_PRED: fn = tm_func; break;
         case DC_PRED:
             if (x->up_available) {
@@ -65,7 +67,8 @@
 {
     vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
                                         &x->predictor[320], 8,
-                                        vp8_intra_pred_uv_tm_sse2);
+                                        vp8_intra_pred_uv_tm_sse2,
+                                        vp8_intra_pred_uv_ho_mmx2);
 }
 
 void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
@@ -72,7 +75,8 @@
 {
     vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
                                         &x->predictor[320], 8,
-                                        vp8_intra_pred_uv_tm_ssse3);
+                                        vp8_intra_pred_uv_tm_ssse3,
+                                        vp8_intra_pred_uv_ho_ssse3);
 }
 
 void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
@@ -79,7 +83,8 @@
 {
     vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
                                         x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_sse2);
+                                        vp8_intra_pred_uv_tm_sse2,
+                                        vp8_intra_pred_uv_ho_mmx2);
 }
 
 void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
@@ -86,5 +91,6 @@
 {
     vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
                                         x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_ssse3);
+                                        vp8_intra_pred_uv_tm_ssse3,
+                                        vp8_intra_pred_uv_ho_ssse3);
 }