ref: a41a4860c0b3be7815f37b4ec833e87218307c4f
parent: c43af9a8a3adc7bd3888e746ce7b7bd581c476ae
author: Jingning Han <jingning@google.com>
date: Fri Jun 14 07:28:56 EDT 2013
Make fdct32 computation flow within 16-bit range. This commit makes use of dual fdct32x32 versions for the rate-distortion optimization loop and the encoding process, respectively. The one for the rd loop requires only 16-bit precision for intermediate steps. The original fdct32x32 that allows higher intermediate precision (18 bits) was retained for the encoding process only. This allows speed-up for fdct32x32 in the rd loop. No performance loss observed. Change-Id: I3237770e39a8f87ed17ae5513c87228533397cc3
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -71,12 +71,6 @@
return rv;
}
-static INLINE int dct_32_round(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(-131072 <= rv && rv <= 131071);
- return rv;
-}
-
typedef void (*transform_1d)(int16_t*, int16_t*);
typedef struct {
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -577,6 +577,9 @@
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_fdct32x32_rd
+
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16 sse2
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -139,6 +139,9 @@
int optimize;
+ // indicate if it is in the rd search loop or encoding process
+ int rd_search;
+
// TODO(jingning): Need to refactor the structure arrays that buffers the
// coding mode decisions of each partition type.
PICK_MODE_CONTEXT ab4x4_context[4][4][4];
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -991,8 +991,18 @@
}
}
+static INLINE int dct_32_round(int input) {
+ int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ assert(-131072 <= rv && rv <= 131071);
+ return rv;
+}
-static void dct32_1d(int *input, int *output) {
+static INLINE int half_round_shift(int input) {
+ int rv = (input + 1 + (input < 0)) >> 2;
+ return rv;
+}
+
+static void dct32_1d(int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1101,6 +1111,44 @@
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
+ // scale the magnitude down by 4 (rounded >> 2), hence the intermediate
+ // values are within the range of 16 bits.
+ if (round) {
+ step[0] = half_round_shift(step[0]);
+ step[1] = half_round_shift(step[1]);
+ step[2] = half_round_shift(step[2]);
+ step[3] = half_round_shift(step[3]);
+ step[4] = half_round_shift(step[4]);
+ step[5] = half_round_shift(step[5]);
+ step[6] = half_round_shift(step[6]);
+ step[7] = half_round_shift(step[7]);
+ step[8] = half_round_shift(step[8]);
+ step[9] = half_round_shift(step[9]);
+ step[10] = half_round_shift(step[10]);
+ step[11] = half_round_shift(step[11]);
+ step[12] = half_round_shift(step[12]);
+ step[13] = half_round_shift(step[13]);
+ step[14] = half_round_shift(step[14]);
+ step[15] = half_round_shift(step[15]);
+
+ step[16] = half_round_shift(step[16]);
+ step[17] = half_round_shift(step[17]);
+ step[18] = half_round_shift(step[18]);
+ step[19] = half_round_shift(step[19]);
+ step[20] = half_round_shift(step[20]);
+ step[21] = half_round_shift(step[21]);
+ step[22] = half_round_shift(step[22]);
+ step[23] = half_round_shift(step[23]);
+ step[24] = half_round_shift(step[24]);
+ step[25] = half_round_shift(step[25]);
+ step[26] = half_round_shift(step[26]);
+ step[27] = half_round_shift(step[27]);
+ step[28] = half_round_shift(step[28]);
+ step[29] = half_round_shift(step[29]);
+ step[30] = half_round_shift(step[30]);
+ step[31] = half_round_shift(step[31]);
+ }
+
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
@@ -1283,12 +1331,12 @@
int output[32 * 32];
// Columns
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
- for (j = 0; j < 32; j++)
+ for (j = 0; j < 32; ++j)
temp_in[j] = input[j * shortpitch + i] << 2;
- dct32_1d(temp_in, temp_out);
- for (j = 0; j < 32; j++)
+ dct32_1d(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1297,8 +1345,37 @@
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- dct32_1d(temp_in, temp_out);
+ dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+}
+
+// Note that although we use dct_32_round in dct32_1d computation flow,
+// this 2d fdct32x32 for rate-distortion optimization loop is operating
+// within 16 bits precision.
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
+ int shortpitch = pitch >> 1;
+ int i, j;
+ int output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ int temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = input[j * shortpitch + i] << 2;
+ dct32_1d(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = output[j + i * 32];
+ dct32_1d(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = temp_out[j];
}
}
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -603,6 +603,8 @@
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ x->rd_search = 1;
+
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index != 0)
return;
@@ -1975,6 +1977,7 @@
const int mis = cm->mode_info_stride;
const int bwl = mi_width_log2(bsize);
const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
+ x->rd_search = 0;
if (cm->frame_type == KEY_FRAME) {
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -454,7 +454,10 @@
switch (ss_txfrm_size / 2) {
case TX_32X32:
- vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+ if (x->rd_search)
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+ else
+ vp9_short_fdct32x32(src_diff, coeff, bw * 2);
break;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
--
⑨