ref: 08a668af32d05de2b29d29372b1749dd29ee8a61
parent: 09519a55c75f020c76ba7a8ca35141f895d2f472
author: Shiyou Yin <yinshiyou-hf@loongson.cn>
date: Fri Dec 15 12:06:47 EST 2017
vp8: [loongson] optimize loopfilter v2.

Optimize vp8_mbloop_filter_vertical_edge_mmi and
vp8_mbloop_filter_horizontal_edge_mmi: make full use of the memory-load
delay slot and drop unnecessary instructions.

Change-Id: I61da2c3a44c06044225461f46bf487d83cba6c16
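The core of the change is the addressing pattern: instead of materialising each row address in the addr[0]/addr[1] temporaries with shifts and subtracts from the base pointer, the rewritten loops keep a single moving src_ptr and bump it by src_pixel_step before each load, so each load can issue right after the add that produced its address. A minimal C sketch of that access pattern follows; it is illustrative only, not part of the patch, and the function and array names in it are hypothetical (only src_ptr and src_pixel_step come from the source).

/*
 * Illustrative sketch (not part of the patch): rewind once to p3, then
 * step forward one row per load, mirroring the horizontal filter's
 * MMI_SUBU-then-MMI_ADDU sequence.  load_rows_stepped and rows[] are
 * hypothetical names.
 */
#include <stdint.h>

static void load_rows_stepped(const uint8_t *src_ptr, int src_pixel_step,
                              uint64_t rows[8]) {
  /* src_ptr points at the q0 row; p3 sits four rows above the edge. */
  src_ptr -= 4 * src_pixel_step;
  for (int i = 0; i < 8; ++i) {
    uint64_t v = 0;
    /* Byte-wise assembly models the unaligned gsldlc1/gsldrc1 pair. */
    for (int b = 0; b < 8; ++b) v |= (uint64_t)src_ptr[b] << (8 * b);
    rows[i] = v;                 /* p3, p2, p1, p0, q0, q1, q2, q3 */
    src_ptr += src_pixel_step;   /* one add per row; no addr[] temporaries */
  }
}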
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -461,96 +461,87 @@
);
}
+/* clang-format off */
#define VP8_MBLOOP_HPSRAB \
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
- "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
- "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" \
- "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
- "psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
- "packsshb %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
+ "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
+ "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
+ "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
-#define VP8_MBLOOP_HPSRAB_PMULHH(reg1, reg2) \
- "pmulhh " #reg1 ", " #reg1 ", " #reg2 " \n\t"
+#define VP8_MBLOOP_HPSRAB_ADD(reg) \
+ "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
+ "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
+ "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
+ "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
+ "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+/* clang-format on */
-#define VP8_MBLOOP_HPSRAB_ADD(reg) \
- "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
- "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
- "punpckhbh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
- VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp3], reg) \
- VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp8], reg) \
- "paddh %[ftmp3], %[ftmp3], %[ff_ph_003f] \n\t" \
- "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
- "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
- "psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
- "packsshb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
-
void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
uint32_t tmp[1];
- mips_reg addr[2];
- DECLARE_ALIGNED(8, const uint64_t, srct[1]);
- double ftmp[10];
+ double ftmp[13];
__asm__ volatile (
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t"
"gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
+ /* ftmp1: p3 */
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ /* ftmp3: p2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+ /* ftmp4: p1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+ /* ftmp5: p0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ /* ftmp6: q0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ /* ftmp7: q1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ /* ftmp8: q2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ /* ftmp2: q3 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
- MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
- MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
- MMI_SUBU(%[addr1], %[addr0], %[tmp0])
- "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
"psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
-
- /* ftmp4:p1 */
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
- MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
-
- /* ftmp5:p0 */
- MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
- "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
- "pasubub %[ftmp1], %[ftmp4], %[ftmp5] \n\t"
- "sdc1 %[ftmp1], 0x00(%[srct]) \n\t"
- "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
-
- /* ftmp6:q0 */
- "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
-
- /* ftmp7:q1 */
- "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
- "pasubub %[ftmp1], %[ftmp7], %[ftmp6] \n\t"
- "sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
- "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
-
- MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
-
- MMI_ADDU(%[addr1], %[addr0], %[tmp0])
- "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
@@ -563,9 +554,7 @@
"mtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "gsldlc1 %[ftmp9], 0x07(%[blimit]) \n\t"
- "gsldrc1 %[ftmp9], 0x00(%[blimit]) \n\t"
- "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
/* ftmp0: mask */
@@ -573,15 +562,13 @@
"gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
- "ldc1 %[ftmp1], 0x00(%[srct]) \n\t"
- "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
- "ldc1 %[ftmp2], 0x08(%[srct]) \n\t"
- "psubusb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t"
"paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
- /* ftmp1:hev*/
+ /* ftmp1: hev */
"xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
@@ -588,7 +575,6 @@
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
-
"psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
@@ -595,7 +581,7 @@
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
- "sdc1 %[ftmp2], 0x00(%[srct]) \n\t"
+ "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"li %[tmp0], 0x0b \n\t"
@@ -606,59 +592,55 @@
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
- "ldc1 %[ftmp2], 0x00(%[srct]) \n\t"
- "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
"li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp9] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
- "psubsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
- "paddsb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
-
- MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
- "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
- "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
- "paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
- "psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
- "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
- MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
- "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
- "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
-
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
- MMI_SUBU(%[addr1], %[addr0], %[tmp0])
- "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
- MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
- "gsldlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
- "gsldrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+ "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
- "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
- "paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
- "psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
- "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
- MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
- "gssdrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
- MMI_SUBU(%[addr1], %[addr0], %[tmp0])
- "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
- "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
-
- "addiu %[count], %[count], -0x01 \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "addiu %[count], %[count], -0x01 \n\t"
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
@@ -665,16 +647,16 @@
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
- [tmp0]"=&r"(tmp[0]),
- [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
- : [limit]"r"(limit), [blimit]"r"(blimit),
- [srct]"r"(srct), [thresh]"r"(thresh),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
- [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
- [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
+ [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
+ [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
+ [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
+ [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
: "memory"
);
}
@@ -696,64 +678,60 @@
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1];
- mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[1]);
- double ftmp[13];
+ double ftmp[14];
__asm__ volatile (
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
- MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
- MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
- MMI_ADDU(%[addr0], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
- MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
- "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
- "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
- "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
- MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
- "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
- "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
- "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
- "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
- "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
- "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
- "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
- MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
- MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
- "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
- "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
- "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
- MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
- MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
- "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
- MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
- "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
- "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
- "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
- "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
- "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
- "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
- "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
- "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
+ "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
/* ftmp9:q0 ftmp10:q1 */
"punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
"punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
@@ -771,36 +749,38 @@
"punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
- "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
- "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
-
/* abs (q3-q2) */
"pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
- "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
- "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp3: abs(q1-q0) */
"pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
- "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp4: abs(p1-p0) */
"pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
- "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
- "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p3-p2) */
"pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
- "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
- /* abs (p0-q0) */
+
+ "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
+ /* abs (p0-q0) * 2 */
"pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
- /* abs (p1-q1) */
+ /* abs (p1-q1) / 2 */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
"li %[tmp0], 0x01 \n\t"
@@ -807,24 +787,23 @@
"mtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
-
- "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
- "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
- "psubusb %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+ "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
"xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ /* ftmp0: mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
- "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
- "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
- /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
- "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
- "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
+ /* abs(p1-p0) - thresh */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
+ /* abs(q1-q0) - thresh */
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
"or %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
"pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1: hev */
"xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+ /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
"xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
"xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
@@ -837,30 +816,30 @@
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ /* filter_value &= mask */
"and %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ /* Filter2 = filter_value & hev */
"and %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ /* filter_value &= ~hev */
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
"li %[tmp0], 0x0b \n\t"
"mtc1 %[tmp0], %[ftmp12] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp9: qs0 */
"psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
- "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
- "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
-
+ /* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"li %[tmp0], 0x07 \n\t"
@@ -872,8 +851,10 @@
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
+ /* ftmp9: oq0 */
"xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
+ /* ftmp6: op0 */
"xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
@@ -882,8 +863,10 @@
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
+ /* ftmp10: oq1 */
"xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
+ /* ftmp5: op1 */
"xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
@@ -891,8 +874,10 @@
"pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
+ /* ftmp11: oq2 */
"xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2: op2 */
"xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
"ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
@@ -916,41 +901,40 @@
"punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
"punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
"punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
"punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
-
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
- MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
- "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
- "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
- "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
- MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
- MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
-
"punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
"punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
- MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
- "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
- MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
- MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
- MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
- "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
- "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
"addiu %[count], %[count], -0x01 \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
@@ -962,9 +946,9 @@
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [addr0]"=&r"(addr[0]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
+ [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),