shithub: libvpx

Download patch

ref: 7c880906a0b4c7586ac167f13eb721548989061f
parent: df0d3a415216340a44953c6ed936bc2a4d7a1175
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Wed Dec 31 19:04:49 EST 1969

convolutions: make things a bit faster by inlining and unrolling loops

--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -31,8 +31,7 @@
     for (x = 0; x < w; ++x) {
       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7])); /* signed: int16_t taps can make the sum negative */
       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
       x_q4 += x_step_q4;
     }
@@ -53,8 +52,7 @@
     for (x = 0; x < w; ++x) {
       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7])); /* signed: int16_t taps can make the sum negative */
       dst[x] = ROUND_POWER_OF_TWO(
           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
       x_q4 += x_step_q4;
@@ -76,9 +74,7 @@
     for (y = 0; y < h; ++y) {
       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7])); /* signed: int16_t taps can make the sum negative */
       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
       y_q4 += y_step_q4;
     }
@@ -99,9 +95,7 @@
     for (y = 0; y < h; ++y) {
       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7])); /* signed: int16_t taps can make the sum negative */
       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
           dst[y * dst_stride] +
               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -45,33 +45,15 @@
 
 typedef int16_t tran_coef_t;
 
-static INLINE uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
+#define clip_pixel(val) ((uint8_t)(((int)(val) > 255) ? 255 : (((int)(val) < 0) ? 0 : (val)))) /* NB: these macros evaluate their arguments more than once */
+#define clamp(value, low, high) ((int)((int)(value) < (int)(low) ? (low) : ((int)(value) > (int)(high) ? (high) : (value))))
+#define fclamp(value, low, high) ((double)((double)(value) < (double)(low) ? (low) : ((double)(value) > (double)(high) ? (high) : (value))))
+#define lclamp(value, low, high) ((int64_t)((int64_t)(value) < (int64_t)(low) ? (low) : ((int64_t)(value) > (int64_t)(high) ? (high) : (value))))
+#define clip_pixel_highbd(val, bd) ((uint16_t)((bd) == 12 ? clamp((val), 0, 4095) : ((bd) == 10 ? clamp((val), 0, 1023) : clamp((val), 0, 255))))
 
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
-  switch (bd) {
-    case 8:
-    default: return (uint16_t)clamp(val, 0, 255);
-    case 10: return (uint16_t)clamp(val, 0, 1023);
-    case 12: return (uint16_t)clamp(val, 0, 4095);
-  }
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
 #endif  // VPX_VPX_DSP_VPX_DSP_COMMON_H_
+