shithub: libvpx

Download patch

ref: afd2d797ebff575db406b5d2f44e5a0ce00d3da6
parent: c7e2bd62987ba5e4f8c74feed9175e07bb99ff22
author: Kaustubh Raste <kaustubh.raste@imgtec.com>
date: Mon Jan 9 10:28:30 EST 2017

Fix mips dspr2 idct4x4 functions for large coefficient input

Change-Id: I06730eec80ca81e0b7436d26232465b79f447e89

--- a/vpx_dsp/mips/itrans4_dspr2.c
+++ b/vpx_dsp/mips/itrans4_dspr2.c
@@ -15,7 +15,7 @@
 
 #if HAVE_DSPR2
 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
-  int16_t step_0, step_1, step_2, step_3;
+  int step_0, step_1, step_2, step_3;
   int Temp0, Temp1, Temp2, Temp3;
   const int const_2_power_13 = 8192;
   int i;
@@ -97,23 +97,13 @@
 
 void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                      int stride) {
-  int16_t step_0, step_1, step_2, step_3;
+  int step_0, step_1, step_2, step_3;
   int Temp0, Temp1, Temp2, Temp3;
   const int const_2_power_13 = 8192;
+  const int const_255 = 255;
   int i;
   uint8_t *dest_pix;
-  uint8_t *cm = vpx_ff_cropTbl;
 
-  /* prefetch vpx_ff_cropTbl */
-  prefetch_load(vpx_ff_cropTbl);
-  prefetch_load(vpx_ff_cropTbl + 32);
-  prefetch_load(vpx_ff_cropTbl + 64);
-  prefetch_load(vpx_ff_cropTbl + 96);
-  prefetch_load(vpx_ff_cropTbl + 128);
-  prefetch_load(vpx_ff_cropTbl + 160);
-  prefetch_load(vpx_ff_cropTbl + 192);
-  prefetch_load(vpx_ff_cropTbl + 224);
-
   for (i = 0; i < 4; ++i) {
     dest_pix = (dest + i);
 
@@ -172,43 +162,55 @@
         "sra      %[Temp0],             %[Temp0],       4               \n\t"
         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
+        "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
+        "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
+        "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
+        "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
 
+        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
         "addi     %[Temp0],             %[Temp0],       8               \n\t"
         "sra      %[Temp0],             %[Temp0],       4               \n\t"
         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
+        "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
+        "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
+        "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
+        "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
 
+        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
         "addi     %[Temp0],             %[Temp0],       8               \n\t"
         "sra      %[Temp0],             %[Temp0],       4               \n\t"
         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
+        "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
+        "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
+        "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
+        "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
 
+        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
         "addi     %[Temp0],             %[Temp0],       8               \n\t"
         "sra      %[Temp0],             %[Temp0],       4               \n\t"
         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
+        "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
+        "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
+        "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
+        "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
 
         : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
           [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
           [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
           [dest_pix] "+r"(dest_pix)
-        : [const_2_power_13] "r"(const_2_power_13),
+        : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
           [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
+          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
           [stride] "r"(stride));
 
     input += 4;
@@ -273,6 +275,39 @@
 
           : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+    }
+  } else if (a1 > 255) {
+    int32_t a11, a12, vector_a11, vector_a12;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    a11 = a1 >> 3;
+    a12 = a1 - (a11 * 7);
+
+    __asm__ __volatile__(
+        "replv.qb       %[vector_a11],  %[a11]     \n\t"
+        "replv.qb       %[vector_a12],  %[a12]     \n\t"
+
+        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+        : [a11] "r"(a11), [a12] "r"(a12));
+
+    for (r = 4; r--;) { /* per row: dest[0..3] += 7 * a11 + a12, saturating */
+      __asm__ __volatile__(
+          "lw             %[t2],          0(%[dest])                      \n\t"
+          "addu_s.qb      %[vector_a],    %[t2],          %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
+          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a12]   \n\t"
+          "sw             %[vector_a],    0(%[dest])                      \n\t"
+          "add            %[dest],        %[dest],        %[stride]       \n\t"
+
+          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+            [vector_a12] "r"(vector_a12));
     }
   } else {
     /* use quad-byte