shithub: libvpx

Download patch

ref: 68f6f6c4cc50b83ff77471d7366f323b5f4a869e
parent: 955b3b66bdfc127f07381448daff7ece442b3b6c
author: Kaustubh Raste <kaustubh.raste@imgtec.com>
date: Wed Oct 5 06:12:12 EDT 2016

Modify vp8 idct msa functions store method

vp8_short_inv_walsh4x4_msa - Optimized to process in short vector type
Updated below functions to store exact number of bytes in output rather than complete vector
idct4x4_addblk_msa
idct4x4_addconst_msa
dequant_idct4x4_addblk_msa
dequant_idct4x4_addblk_2x_msa
dequant_idct_addconst_2x_msa

Change-Id: Ic1b3752e2421dc7d70a082dcdaab9d140d7e5d9c

--- a/vp8/common/mips/msa/idct_msa.c
+++ b/vp8/common/mips/msa/idct_msa.c
@@ -90,8 +90,7 @@
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 res0, res1, res2, res3;
   v16i8 zero = { 0 };
-  v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
-  v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+  v16i8 pred0, pred1, pred2, pred3;
 
   LD_SH2(input, 8, input0, input1);
   UNPCK_SH_SW(input0, in0, in1);
@@ -111,20 +110,17 @@
   res1 = CLIP_SW_0_255(res1);
   res2 = CLIP_SW_0_255(res2);
   res3 = CLIP_SW_0_255(res3);
-  LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
-  VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
-  VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
-  ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
 }
 
 static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
                                  int32_t pred_stride, uint8_t *dest,
                                  int32_t dest_stride) {
-  v8i16 vec;
-  v8i16 res0, res1, res2, res3;
+  v8i16 vec, res0, res1, res2, res3, dst0, dst1;
   v16i8 zero = { 0 };
-  v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
-  v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+  v16i8 pred0, pred1, pred2, pred3;
 
   vec = __msa_fill_h(in_dc);
   vec = __msa_srari_h(vec, 3);
@@ -133,55 +129,59 @@
              res2, res3);
   ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
   CLIP_SH4_0_255(res0, res1, res2, res3);
-  LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
-  VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
-  VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
-  ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+  PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1);
+  dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0);
+  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride);
 }
 
 void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) {
-  v8i16 input0, input1;
-  v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
-  v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+  v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1;
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
 
   LD_SH2(input, 8, input0, input1);
-  UNPCK_SH_SW(input0, in0, in1);
-  UNPCK_SH_SW(input1, in2, in3);
-  BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
-  BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
-  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
-  BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
-  BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
-  ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
-  SRA_4V(vt0, vt1, vt2, vt3, 3);
-  mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
-  mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
-  mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
-  mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
-  mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
-  mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
-  mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
-  mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
-  mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
-  mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
-  mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
-  mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
-  mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
-  mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
-  mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
-  mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
+  input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  out0 = tmp2 + tmp3;
+  out1 = tmp2 - tmp3;
+  VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  tmp0 = tmp2 + tmp3;
+  tmp1 = tmp2 - tmp3;
+  ADD2(tmp0, 3, tmp1, 3, out0, out1);
+  out0 >>= 3;
+  out1 >>= 3;
+  mb_dq_coeff[0] = __msa_copy_s_h(out0, 0);
+  mb_dq_coeff[16] = __msa_copy_s_h(out0, 4);
+  mb_dq_coeff[32] = __msa_copy_s_h(out1, 0);
+  mb_dq_coeff[48] = __msa_copy_s_h(out1, 4);
+  mb_dq_coeff[64] = __msa_copy_s_h(out0, 1);
+  mb_dq_coeff[80] = __msa_copy_s_h(out0, 5);
+  mb_dq_coeff[96] = __msa_copy_s_h(out1, 1);
+  mb_dq_coeff[112] = __msa_copy_s_h(out1, 5);
+  mb_dq_coeff[128] = __msa_copy_s_h(out0, 2);
+  mb_dq_coeff[144] = __msa_copy_s_h(out0, 6);
+  mb_dq_coeff[160] = __msa_copy_s_h(out1, 2);
+  mb_dq_coeff[176] = __msa_copy_s_h(out1, 6);
+  mb_dq_coeff[192] = __msa_copy_s_h(out0, 3);
+  mb_dq_coeff[208] = __msa_copy_s_h(out0, 7);
+  mb_dq_coeff[224] = __msa_copy_s_h(out1, 3);
+  mb_dq_coeff[240] = __msa_copy_s_h(out1, 7);
 }
 
 static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
                                        uint8_t *dest, int32_t dest_stride) {
   v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
-  v8i16 in0, in1, in2, in3;
-  v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
-  v16i8 dest0, dest1, dest2, dest3;
-  v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
-  v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+  v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h;
+  v16u8 dest0, dest1, dest2, dest3;
+  v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
   v2i64 zero = { 0 };
-  v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
 
   LD_SH2(input, 8, input0, input1);
   LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
@@ -196,7 +196,7 @@
   VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
   SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
   TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
-  LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+  LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
   ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
              res2, res3);
   ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2,
@@ -206,9 +206,9 @@
   res1 = CLIP_SW_0_255(res1);
   res2 = CLIP_SW_0_255(res2);
   res3 = CLIP_SW_0_255(res3);
-  VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
-  VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
-  ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
 }
 
 static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
@@ -215,10 +215,8 @@
                                           int16_t *dequant_input, uint8_t *dest,
                                           int32_t dest_stride) {
   v16u8 dest0, dest1, dest2, dest3;
-  v8i16 in0, in1, in2, in3;
-  v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
-  v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
-  v8i16 res0, res1, res2, res3;
+  v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+  v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
   v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
   v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
   v16i8 zero = { 0 };
@@ -247,11 +245,8 @@
              res2, res3);
   ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
   CLIP_SH4_0_255(res0, res1, res2, res3);
-  PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2,
-              res3);
-  PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
-  PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
-  ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+  PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l);
+  ST8x4_UB(vt0l, vt1l, dest, dest_stride);
 
   __asm__ __volatile__(
       "sw   $zero,    0(%[input])  \n\t"
@@ -276,10 +271,9 @@
 
 static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
                                          uint8_t *dest, int32_t dest_stride) {
-  v8i16 input_dc0, input_dc1, vec;
+  v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3;
   v16u8 dest0, dest1, dest2, dest3;
   v16i8 zero = { 0 };
-  v8i16 res0, res1, res2, res3;
 
   input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
   input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
@@ -292,11 +286,8 @@
              res2, res3);
   ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
   CLIP_SH4_0_255(res0, res1, res2, res3);
-  PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2,
-              res3);
-  PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
-  PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
-  ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+  PCKEV_B2_SH(res1, res0, res3, res2, res0, res1);
+  ST8x4_UB(res0, res1, dest, dest_stride);
 }
 
 void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -1221,6 +1221,8 @@
   }
 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
 
 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3)                                                \