ref: 5b7882139c2f64d4ff830e1665e04a4d72f9e484
parent: a1cee8dc919df1980d802e1a9bce1259ec34cba8
author: jinbo <jinbo-hf@loongson.cn>
date: Wed Jul 1 04:56:25 EDT 2020
vp8,vpx_dsp:[loongson] fix bugs reported by clang 1. Adjust variable type to match clang compiler. Clang is more strict on the type of asm operands, float or double type variable should use constraint 'f', integer variable should use constraint 'r'. 2. Fix problem of using r-value in output operands. clang reports error: 'invalid use of a cast in a inline asm context requiring an l-value: remove the cast or build with -fheinous-gnu-extensions'. Change-Id: Iae9e08f55f249059066c391534013e320812463e
--- a/vp8/common/mips/mmi/idctllm_mmi.c
+++ b/vp8/common/mips/mmi/idctllm_mmi.c
@@ -41,14 +41,18 @@
int pred_stride, unsigned char *dst_ptr,
int dst_stride) {
double ftmp[12];
- uint32_t tmp[0];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+ uint64_t tmp[1];
+ double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
__asm__ volatile (
+ "dli %[tmp0], 0x0004000400040004 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_04] \n\t"
+ "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t"
+ "dli %[tmp0], 0x22a322a322a322a3 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t"
MMI_LI(%[tmp0], 0x02)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
@@ -186,9 +190,10 @@
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
[ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
[ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
- [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
- : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
- [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+ [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+ [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+ [ff_ph_22a3]"=&f"(ff_ph_22a3)
+ : [ip]"r"(input),
[pred_stride]"r"((mips_reg)pred_stride),
[dst_stride]"r"((mips_reg)dst_stride)
: "memory"
@@ -198,12 +203,13 @@
void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride) {
- int a1 = ((input_dc + 4) >> 3);
- double ftmp[5];
+ int a0 = ((input_dc + 4) >> 3);
+ double a1, ftmp[5];
int low32;
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dmtc1 %[a0], %[a1] \n\t"
"pshufh %[a1], %[a1], %[ftmp0] \n\t"
"ulw %[low32], 0x00(%[pred_ptr]) \n\t"
"mtc1 %[low32], %[ftmp1] \n\t"
@@ -244,9 +250,9 @@
"gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
[ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
- [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+ [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
: [dst_stride]"r"((mips_reg)dst_stride),
- [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+ [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
: "memory"
);
}
@@ -254,14 +260,15 @@
void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
int i;
int16_t output[16];
- double ftmp[12];
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+ double ff_ph_03, ftmp[12];
+ uint64_t tmp[1];
__asm__ volatile (
+ "dli %[tmp0], 0x0003000300030003 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_03] \n\t"
MMI_LI(%[tmp0], 0x03)
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
"gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
@@ -317,8 +324,8 @@
[ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
[ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
- [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
- : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+ : [ip]"r"(input), [op]"r"(output)
: "memory"
);
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -13,28 +13,25 @@
#include "vp8/common/onyxc_int.h"
#include "vpx_ports/asmdefs_mmi.h"
-DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_003f) = { 0x003f003f003f003fULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_0900) = { 0x0900090009000900ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_1200) = { 0x1200120012001200ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_1b00) = { 0x1b001b001b001b00ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
-
void vp8_loop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
mips_reg addr[2];
double ftmp[12];
+ double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
@@ -91,9 +88,9 @@
"pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
- "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
@@ -134,8 +131,8 @@
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
"packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
@@ -149,8 +146,8 @@
"packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
"psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
@@ -188,17 +185,18 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_03]"=&f"(ff_pb_03)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
- [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
- [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe),
- [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04),
- [ff_pb_03]"f"(ff_pb_03)
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
: "memory"
);
+ /* clang-format on */
}
void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
@@ -206,11 +204,23 @@
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
mips_reg addr[2];
double ftmp[13];
+ double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
@@ -315,8 +325,8 @@
/* abs (p1-q1) */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp1] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp1] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t"
"paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
@@ -354,8 +364,8 @@
"paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t"
"paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
@@ -379,8 +389,8 @@
"paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
"packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
@@ -450,15 +460,16 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_fe]"f"(ff_pb_fe)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
/* clang-format off */
@@ -484,10 +495,29 @@
void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
double ftmp[13];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+ ff_ph_1200, ff_ph_1b00;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0x1200120012001200 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1200] \n\t"
+ "dli %[tmp0], 0x1b001b001b001b00 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t"
@@ -550,8 +580,8 @@
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
@@ -584,8 +614,8 @@
"pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t"
VP8_MBLOOP_HPSRAB
"paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
@@ -593,8 +623,8 @@
VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
@@ -649,18 +679,20 @@
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00),
+ [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
- [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
- [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
+/* clang-format off */
#define VP8_MBLOOP_VPSRAB_ADDH \
"pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
@@ -673,15 +705,30 @@
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+/* clang-format on */
void vp8_mbloop_filter_vertical_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, srct[1]);
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
double ftmp[14];
+ double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
@@ -783,8 +830,8 @@
/* abs (p1-q1) / 2 */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
@@ -824,8 +871,8 @@
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp12] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
@@ -842,8 +889,8 @@
/* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp12] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
"paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
"paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
@@ -948,17 +995,19 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
- [count]"+&r"(count)
+ [count]"+&r"(count),
+ [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900),
+ [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900),
- [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04),
- [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
+/* clang-format off */
#define VP8_SIMPLE_HPSRAB \
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
@@ -966,23 +1015,38 @@
"psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
"psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+/* clang-format on */
void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit) {
- uint32_t tmp[1], count = 2;
+ uint64_t tmp[1], count = 2;
mips_reg addr[2];
double ftmp[12];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+ /* clang-format off */
__asm__ volatile (
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
@@ -996,7 +1060,7 @@
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
"pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
- "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
+ "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
"psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
@@ -1020,7 +1084,7 @@
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
- "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
VP8_SIMPLE_HPSRAB
@@ -1048,30 +1112,43 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
: [blimit]"r"(blimit),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
- [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
: "memory"
);
+ /* clang-format on */
}
void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit) {
- uint32_t tmp[1], count = 2;
+ uint64_t tmp[1], count = 2;
mips_reg addr[2];
- DECLARE_ALIGNED(8, const uint64_t, srct[1]);
- double ftmp[12];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+ double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+ /* clang-format off */
__asm__ volatile (
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
-
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
@@ -1118,8 +1195,8 @@
"punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
"punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
"pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
"psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
@@ -1149,14 +1226,14 @@
"pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
@@ -1164,14 +1241,14 @@
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
@@ -1235,16 +1312,17 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
: [blimit]"r"(blimit), [srct]"r"(srct),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
- [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
: "memory"
);
+ /* clang-format on */
}
/* Horizontal MB filtering */
--- a/vp8/common/mips/mmi/sixtap_filter_mmi.c
+++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -70,9 +70,8 @@
unsigned int output_height,
unsigned int output_width,
const int16_t *vp8_filter) {
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-
+ uint64_t tmp[1];
+ double ff_ph_40;
#if _MIPS_SIM == _ABIO32
register double fzero asm("$f0");
register double ftmp0 asm("$f2");
@@ -103,7 +102,10 @@
register double ftmp11 asm("$f12");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
"ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
"ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
"ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
@@ -111,10 +113,10 @@
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
"pxor %[fzero], %[fzero], %[fzero] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
@@ -166,12 +168,12 @@
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
[ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
- [src_ptr]"+&r"(src_ptr)
+ [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40)
: [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
- [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
- [ff_ph_40]"f"(ff_ph_40)
+ [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width)
: "memory"
);
+ /* clang-format on */
}
/* Horizontal filter: pixel_step is always W */
@@ -178,9 +180,10 @@
static INLINE void vp8_filter_block1dc_v6_mmi(
uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- uint32_t tmp[1];
+ double ff_ph_40;
+ uint64_t tmp[1];
mips_reg addr[1];
+
#if _MIPS_SIM == _ABIO32
register double fzero asm("$f0");
register double ftmp0 asm("$f2");
@@ -215,7 +218,10 @@
register double ftmp13 asm("$f14");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
"ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
"ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
"ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
@@ -223,8 +229,8 @@
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
"pxor %[fzero], %[fzero], %[fzero] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp13] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp13] \n\t"
/* In order to make full use of memory load delay slot,
* Operation of memory loading and calculating has been rearranged.
@@ -285,15 +291,16 @@
[ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
[ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
- [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
+ [ff_ph_40]"=&f"(ff_ph_40)
: [pixels_per_line]"r"((mips_reg)pixels_per_line),
[pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
[pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
[vp8_filter]"r"(vp8_filter),
- [output_pitch]"r"((mips_reg)output_pitch),
- [ff_ph_40]"f"(ff_ph_40)
+ [output_pitch]"r"((mips_reg)output_pitch)
: "memory"
);
+ /* clang-format on */
}
/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
@@ -313,6 +320,7 @@
register double ftmp1 asm("$f2");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
"pxor %[fzero], %[fzero], %[fzero] \n\t"
@@ -335,6 +343,7 @@
[output_width]"r"(output_width)
: "memory"
);
+ /* clang-format on */
}
static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
@@ -350,6 +359,7 @@
register double ftmp1 asm("$f2");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format on */
__asm__ volatile (
"pxor %[fzero], %[fzero], %[fzero] \n\t"
@@ -371,6 +381,7 @@
[output_pitch]"r"((mips_reg)output_pitch)
: "memory"
);
+ /* clang-format on */
}
#define sixtapNxM(n, m) \
--- a/vp8/encoder/mips/mmi/dct_mmi.c
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -46,6 +46,7 @@
void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
uint64_t tmp[1];
int16_t *ip = input;
+ double ff_ph_op1, ff_ph_op3;
#if _MIPS_SIM == _ABIO32
register double ftmp0 asm("$f0");
@@ -83,13 +84,16 @@
DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x14e808a914e808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op1] \n\t"
+ "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op3] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
@@ -129,7 +133,7 @@
// op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
MMI_LI(%[tmp0], 0x0c)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
"punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
@@ -169,7 +173,7 @@
"paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
MMI_LI(%[tmp0], 0x04)
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
@@ -211,9 +215,9 @@
[ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
[ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
[ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
- [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+ [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+ [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
: [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
- [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
[ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
[ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
[ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
@@ -220,6 +224,7 @@
[ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
: "memory"
);
+ /* clang-format on */
}
void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
@@ -228,17 +233,22 @@
}
void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
- double ftmp[13];
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+ double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+ uint64_t tmp[1];
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0000000100000001 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_01] \n\t"
+ "dli %[tmp0], 0x0000000300000003 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_03] \n\t"
+ "dli %[tmp0], 0x0001000000010000 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_mask] \n\t"
MMI_LI(%[tmp0], 0x02)
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
@@ -337,7 +347,7 @@
"psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
MMI_LI(%[tmp0], 0x03)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
"pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
@@ -393,7 +403,7 @@
"packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
MMI_LI(%[tmp0], 0x72)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
"pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
"pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
@@ -413,13 +423,12 @@
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]),
- [tmp0]"=&r"(tmp[0]),
- [ip]"+&r"(input)
- : [op]"r"(output),
- [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
- [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
- [ff_ph_01]"f"(ff_ph_01)
+ [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask),
+ [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01),
+ [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03),
+ [ff_ph_01]"=&f"(ff_ph_01)
+ : [op]"r"(output), [pitch]"r"((mips_reg)pitch)
: "memory"
);
+ /* clang-format on */
}
--- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
+++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -42,16 +42,17 @@
double ftmp[13];
uint64_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
- int eob = 0;
+ int64_t eob = 0;
+ double ones;
__asm__ volatile(
// loop 0 ~ 7
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pcmpeqh %[ones], %[ones], %[ones] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
- "li %[tmp0], 0x0f \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0f \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
"gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"
@@ -165,18 +166,18 @@
"gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"
- "li %[tmp0], 0x10 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x10 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
"psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
- "li %[tmp0], 0xaa \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0xaa \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
- "li %[tmp0], 0xffff \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0xffff \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
"gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
"gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
@@ -184,7 +185,8 @@
[ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
[ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
[ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
- [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
: [coeff_ptr] "r"((mips_reg)coeff_ptr),
[qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
[dequant_ptr] "r"((mips_reg)dequant_ptr),
@@ -191,8 +193,7 @@
[round_ptr] "r"((mips_reg)round_ptr),
[quant_ptr] "r"((mips_reg)quant_ptr),
[dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
- [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
- [ones] "f"(ones)
+ [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
: "memory");
*d->eob = eob;
--- a/vpx_dsp/mips/sad_mmi.c
+++ b/vpx_dsp/mips/sad_mmi.c
@@ -364,6 +364,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -383,6 +384,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -405,7 +407,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -424,11 +428,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -450,6 +455,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -469,6 +475,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -493,7 +500,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -512,11 +521,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -539,6 +549,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -558,6 +569,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -586,7 +598,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -605,11 +619,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -632,6 +647,7 @@
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -651,6 +667,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -679,7 +696,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -697,11 +716,12 @@
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -724,6 +744,7 @@
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -743,6 +764,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -767,7 +789,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -785,11 +809,12 @@
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
--- a/vpx_dsp/mips/variance_mmi.c
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -414,6 +414,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -496,6 +497,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (64 * high));
}
@@ -519,6 +521,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -577,6 +580,7 @@
[sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / 2048);
}
@@ -590,6 +594,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -653,6 +658,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (32 * high));
}
@@ -676,6 +682,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -729,6 +736,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (16 * high));
}
@@ -753,6 +761,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -801,6 +810,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (8 * high));
}
@@ -825,6 +835,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp10] \n\t"
@@ -872,6 +883,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (4 * high));
}
@@ -894,6 +906,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -925,6 +938,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -947,6 +961,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -978,6 +993,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -1021,22 +1037,39 @@
uint8_t *temp2_ptr = temp2;
mips_reg l_counter = counter;
double ftmp[15];
+ double ff_ph_40, mask;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ uint64_t x0, x1, y0, y1, all;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
-
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[15]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
@@ -1072,15 +1105,13 @@
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR16XN(H) \
@@ -1105,19 +1136,38 @@
mips_reg l_counter = counter;
double ftmp[15];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[7]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR8XN(H) \
@@ -1188,19 +1236,38 @@
mips_reg l_counter = counter;
double ftmp[7];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp6])
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp6])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[3]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
@@ -1232,15 +1299,14 @@
: [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
[ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
[ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
- [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+ [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR4XN(H) \