ref: d341f843e2a5b03fd4a10ee83237c1e32a2b8671
parent: 33a9d53c1037ab0168ef982d6a90cbdae58c2209
author: Jingning Han <jingning@google.com>
date: Wed Jul 22 06:52:07 EDT 2015
Refactor forward/inverse transform msa implementations

This commit factors out common macro definitions from the forward and
inverse transform implementations into vpx_dsp. It removes the
duplicate macro definitions from encoder and decoder folders.

Change-Id: I92301acbd3317075e9c5f03328a25abb123bca78
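[Editor's note: the central macro consolidated by this patch, DOTP_CONST_PAIR,
performs the usual VP9 constant-pair rotation with DCT rounding on each 16-bit
lane (via DOTP_SH2_SW + SRARI_W2_SW + pckev). A minimal scalar sketch of the
per-lane arithmetic follows; round_shift and dct_rotate_pair are illustrative
names, not part of this patch.]

    #include <stdint.h>

    /* DCT_CONST_BITS is 14 in libvpx (vp9/common/vp9_idct.h). */
    #define DCT_CONST_BITS 14

    /* Rounding arithmetic right shift, as SRARI_W2_SW applies
     * per 32-bit element. */
    static int16_t round_shift(int32_t x) {
      return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }

    /* One lane of DOTP_CONST_PAIR(in0, in1, cnst0, cnst1, out0, out1):
     * a butterfly rotation by the cosine pair (cnst0, cnst1). */
    static void dct_rotate_pair(int16_t in0, int16_t in1,
                                int16_t cnst0, int16_t cnst1,
                                int16_t *out0, int16_t *out1) {
      *out0 = round_shift(in0 * cnst0 - in1 * cnst1);
      *out1 = round_shift(in0 * cnst1 + in1 * cnst0);
    }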
--- a/vp9/common/mips/msa/vp9_idct16x16_msa.c
+++ b/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -25,12 +25,12 @@
reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
- VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
- VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
- VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
- VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
- VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
reg8);
@@ -38,8 +38,8 @@
reg10);
/* stage 2 */
- VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
- VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
reg9 = reg1 - loc2;
reg1 = reg1 + loc2;
@@ -46,8 +46,8 @@
reg7 = reg15 - loc3;
reg15 = reg15 + loc3;
- VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
- VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
loc1 = reg15 + reg3;
@@ -63,8 +63,8 @@
tmp7 = loc1;
reg0 = loc2;
- VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
- VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
loc0 = reg9 + reg5;
reg5 = reg9 - reg5;
@@ -77,13 +77,13 @@
loc2 = reg4 - loc0;
tmp5 = loc1;
- VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
reg10 = loc0;
reg11 = loc1;
- VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
reg13 = loc2;
@@ -117,12 +117,12 @@
/* load bottom 8x8 */
LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
- VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
- VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
- VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
- VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
- VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
reg0 = reg2 - loc1;
@@ -135,8 +135,8 @@
reg10 = reg10 + loc2;
/* stage 2 */
- VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
- VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
reg9 = reg1 - loc2;
reg1 = reg1 + loc2;
@@ -143,8 +143,8 @@
reg7 = reg15 - loc3;
reg15 = reg15 + loc3;
- VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
- VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
loc1 = reg15 + reg3;
@@ -160,8 +160,8 @@
tmp7 = loc1;
reg0 = loc2;
- VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
- VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
loc0 = reg9 + reg5;
reg5 = reg9 - reg5;
@@ -174,13 +174,13 @@
loc2 = reg4 - loc0;
tmp5 = loc1;
- VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
reg10 = loc0;
reg11 = loc1;
- VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
reg13 = loc2;
@@ -350,17 +350,17 @@
k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
- VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
- VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
- VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
r1 = LD_SH(input + 1 * 16);
r2 = LD_SH(input + 2 * 16);
@@ -375,12 +375,12 @@
k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
- VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
- VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
out1 = -out1;
@@ -397,7 +397,7 @@
k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
- VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
out8 = -out8;
@@ -414,7 +414,7 @@
k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
- VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+ MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
out4 = -out4;
SRARI_H2_SH(out4, out5, 6);
dst4 = LD_UB(dst + 3 * dst_stride);
@@ -426,7 +426,7 @@
ST8x1_UB(res4, dst + 3 * dst_stride);
ST8x1_UB(res5, dst + 12 * dst_stride);
- VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
out13 = -out13;
SRARI_H2_SH(out12, out13, 6);
dst12 = LD_UB(dst + 2 * dst_stride);
@@ -440,7 +440,7 @@
k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
- VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
SRARI_H2_SH(out6, out7, 6);
dst6 = LD_UB(dst + 4 * dst_stride);
dst7 = LD_UB(dst + 11 * dst_stride);
@@ -451,7 +451,7 @@
ST8x1_UB(res6, dst + 4 * dst_stride);
ST8x1_UB(res7, dst + 11 * dst_stride);
- VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
SRARI_H2_SH(out10, out11, 6);
dst10 = LD_UB(dst + 6 * dst_stride);
dst11 = LD_UB(dst + 9 * dst_stride);
@@ -464,7 +464,7 @@
k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
- VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
SRARI_H2_SH(out2, out3, 6);
dst2 = LD_UB(dst + 7 * dst_stride);
dst3 = LD_UB(dst + 8 * dst_stride);
@@ -475,7 +475,7 @@
ST8x1_UB(res2, dst + 7 * dst_stride);
ST8x1_UB(res3, dst + 8 * dst_stride);
- VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
SRARI_H2_SH(out14, out15, 6);
dst14 = LD_UB(dst + 5 * dst_stride);
dst15 = LD_UB(dst + 10 * dst_stride);
--- a/vp9/common/mips/msa/vp9_idct32x32_msa.c
+++ b/vp9/common/mips/msa/vp9_idct32x32_msa.c
@@ -47,16 +47,16 @@
/* Even stage 1 */
LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
- VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
- VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
loc1 = vec3;
loc0 = vec1;
- VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
- VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
@@ -63,10 +63,10 @@
/* Even stage 2 */
LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
- VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
vec0 = reg0 + reg4;
reg0 = reg0 - reg4;
@@ -84,8 +84,8 @@
reg4 = reg5 - vec1;
reg5 = reg5 + vec1;
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
vec0 = reg0 - reg6;
reg0 = reg0 + reg6;
@@ -92,8 +92,8 @@
vec1 = reg7 - reg1;
reg7 = reg7 + reg1;
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
/* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
@@ -137,10 +137,10 @@
reg6 = LD_SH(tmp_buf + 25 * 8);
reg7 = LD_SH(tmp_buf + 31 * 8);
- VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
- VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
vec0 = reg0 + reg3;
reg0 = reg0 - reg3;
@@ -157,16 +157,16 @@
ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
/* 4 Stores */
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
- VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
/* Odd stage 2 */
@@ -180,21 +180,21 @@
reg6 = LD_SH(tmp_buf + 27 * 8);
reg7 = LD_SH(tmp_buf + 29 * 8);
- VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
- VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */
SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
vec0, vec1, vec2, vec3);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
- VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
- VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */
@@ -204,7 +204,7 @@
ST_SH(reg0, (tmp_odd_buf + 13 * 8));
ST_SH(reg1, (tmp_odd_buf + 14 * 8));
- VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
/* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
@@ -218,10 +218,10 @@
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
/* Load 8 & Store 8 */
@@ -233,10 +233,10 @@
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}
@@ -363,16 +363,16 @@
LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
tmp_buf += (2 * 32);
- VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
- VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
- VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
loc1 = vec3;
loc0 = vec1;
- VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
- VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
@@ -381,10 +381,10 @@
/* Load 8 */
LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
- VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
vec0 = reg0 + reg4;
reg0 = reg0 - reg4;
@@ -402,8 +402,8 @@
reg4 = reg5 - vec1;
reg5 = reg5 + vec1;
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
vec0 = reg0 - reg6;
reg0 = reg0 + reg6;
@@ -410,8 +410,8 @@
vec1 = reg7 - reg1;
reg7 = reg7 + reg1;
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
/* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
/* Store 8 */
@@ -448,10 +448,10 @@
reg6 = LD_SH(tmp_buf + 25 * 32);
reg7 = LD_SH(tmp_buf + 31 * 32);
- VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
- VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
vec0 = reg0 + reg3;
reg0 = reg0 - reg3;
@@ -467,15 +467,15 @@
ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
ST_SH2(vec0, vec1, tmp_odd_buf, 8);
/* 4 Stores */
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
- VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
- VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
/* Odd stage 2 */
@@ -489,18 +489,18 @@
reg6 = LD_SH(tmp_buf + 27 * 32);
reg7 = LD_SH(tmp_buf + 29 * 32);
- VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
- VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
- VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
- VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */
SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
- VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
- VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */
@@ -507,7 +507,7 @@
ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
- VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
/* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
@@ -519,10 +519,10 @@
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
/* Load 8 & Store 8 */
@@ -533,10 +533,10 @@
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
- VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}
--- a/vp9/common/mips/msa/vp9_idct_msa.h
+++ b/vp9/common/mips/msa/vp9_idct_msa.h
@@ -14,53 +14,8 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
-#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
- \
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
- ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
- \
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
-}
-
-#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
- dst0, dst1, dst2, dst3) { \
- v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
- v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
- \
- DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
- tp0_m, tp2_m, tp3_m, tp4_m); \
- DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
- tp5_m, tp6_m, tp7_m, tp8_m); \
- BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
- BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
- SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
- SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
- dst0, dst1, dst2, dst3); \
-}
-
-#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
- v8i16 dst_m; \
- v4i32 tp0_m, tp1_m; \
- \
- DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
- SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
- dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
- \
- dst_m; \
-})
-
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
@@ -79,9 +34,9 @@
\
ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
- VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst1_m, cnst2_m, cnst3_m, in7, in0, \
- in4, in3); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
+ cnst1_m, cnst2_m, cnst3_m, in7, in0, \
+ in4, in3); \
\
SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
cnst2_m = -cnst0_m; \
@@ -93,9 +48,9 @@
ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
\
- VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst1_m, cnst2_m, cnst3_m, in5, in2, \
- in6, in1); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
+ cnst1_m, cnst2_m, cnst3_m, in5, in2, \
+ in6, in1); \
BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
out7 = -s0_m; \
out0 = s1_m; \
@@ -109,9 +64,9 @@
\
ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
- VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
- cnst2_m, cnst3_m, cnst1_m, out1, out6, \
- s0_m, s1_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
+ cnst2_m, cnst3_m, cnst1_m, out1, out6, \
+ s0_m, s1_m); \
\
SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
@@ -118,10 +73,10 @@
\
ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
- out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
- out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
\
out1 = -out1; \
out3 = -out3; \
@@ -128,38 +83,6 @@
out5 = -out5; \
}
-#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
- v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
- v8i16 madd_s0_m, madd_s1_m; \
- \
- ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
- c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
- SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
-}
-
-#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
- out0, out1, out2, out3) { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
-}
-
#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \
v8i16 out0_m, r0_m, r1_m; \
\
@@ -422,38 +345,38 @@
k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
- VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
- g0_m, g1_m, g2_m, g3_m); \
+ MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
+ g0_m, g1_m, g2_m, g3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
- VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
- g4_m, g5_m, g6_m, g7_m); \
+ MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
+ g4_m, g5_m, g6_m, g7_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
- VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
- g8_m, g9_m, g10_m, g11_m); \
+ MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
+ g8_m, g9_m, g10_m, g11_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
- VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
- g12_m, g13_m, g14_m, g15_m); \
+ MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
+ g12_m, g13_m, g14_m, g15_m); \
\
/* stage 2 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
- VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
- h0_m, h1_m, h2_m, h3_m); \
+ MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
+ h0_m, h1_m, h2_m, h3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
- VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
- h4_m, h5_m, h6_m, h7_m); \
+ MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
+ h4_m, h5_m, h6_m, h7_m); \
BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
@@ -463,10 +386,10 @@
k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
- VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
- out4, out6, out5, out7); \
- VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
- out12, out14, out13, out15); \
+ MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
+ out4, out6, out5, out7); \
+ MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
+ out12, out14, out13, out15); \
\
/* stage 4 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
@@ -473,9 +396,9 @@
k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
- VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
- VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
- VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
- VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+ MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
}
#endif /* VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ */
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -14,53 +14,8 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
- \
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
- ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
- \
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
-}
-
-#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
- dst0, dst1, dst2, dst3) { \
- v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
- v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
- \
- DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
- tp0_m, tp2_m, tp3_m, tp4_m); \
- DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
- tp5_m, tp6_m, tp7_m, tp8_m); \
- BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
- BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
- SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
- SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
- dst0, dst1, dst2, dst3); \
-}
-
-#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
- v8i16 dst_m; \
- v4i32 tp0_m, tp1_m; \
- \
- DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
- SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
- dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
- \
- dst_m; \
-})
-
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
@@ -125,38 +80,6 @@
out1 = -out1; \
out3 = -out3; \
out5 = -out5; \
-}
-
-#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
- v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
- v8i16 madd_s0_m, madd_s1_m; \
- \
- ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
- c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
- SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
-}
-
-#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
- out0, out1, out2, out3) { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
- cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
- m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
}
#define LD_HADD(psrc, stride) ({ \
--- /dev/null
+++ b/vpx_dsp/mips/txfm_macros_msa.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(cnst0); \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = (v4i32)__msa_fill_h(cnst1); \
+ k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ \
+ ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
+ DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ \
+ DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+}
+
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
+ dst0, dst1, dst2, dst3) { \
+ v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
+ v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
+ \
+ DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
+ tp0_m, tp2_m, tp3_m, tp4_m); \
+ DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
+ tp5_m, tp6_m, tp7_m, tp8_m); \
+ BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
+ BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
+ SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
+ SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
+ dst0, dst1, dst2, dst3); \
+}
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
+ v8i16 dst_m; \
+ v4i32 tp0_m, tp1_m; \
+ \
+ DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
+ SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
+ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
+ \
+ dst_m; \
+})
+
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
+ c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
+ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
+}
+
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
+ out0, out1, out2, out3) { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
+ cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
+ m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
+ cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
+ m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+}
+#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -72,6 +72,7 @@
endif
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
endif # CONFIG_VP9_ENCODER
+DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
# quantization
ifeq ($(CONFIG_VP9_ENCODER),yes)