ref: 794a7bedbd43fe062c0e11308938f9793f2facb1
parent: a272ff25cd99f47950dddb55e94b370e95b70016
author: Scott LaVarnway <slavarnway@google.com>
date: Thu May 16 09:52:15 EDT 2013
WIP: 8x8 idct/recon merge. This patch eliminates the intermediate diff buffer usage by combining the short idct and the add residual into one function. The encoder can use the same code as well. Change-Id: Iacfd57324fbe2b7beca5d7f3dcae25c976e67f45
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -16,6 +16,7 @@
extern "C" {
#include "vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
@@ -100,11 +101,15 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
- int16_t test_output_block[64];
+ uint8_t dst[64], src[64];
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
@@ -119,10 +124,10 @@
test_temp_block[j] *= 4;
}
}
- vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
@@ -145,18 +150,22 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
- int16_t test_output_block[64];
+ uint8_t dst[64], src[64];
- // Initialize a test block with input range {-255, 255}.
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8() % 2 ? 255 : 0;
+ dst[j] = src[j] > 0 ? 0 : 255;
+ }
+ // Initialize a test block with extreme input values {-255, 255}.
for (int j = 0; j < 64; ++j)
- test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -112,20 +112,23 @@
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input[64], coeff[64];
- int16_t output_c[64];
double output_r[64];
+ uint8_t dst[64], src[64];
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
- input[j] = rnd.Rand8() - rnd.Rand8();
+ input[j] = src[j] - dst[j];
- const int pitch = 16;
reference_dct_2d(input, output_r);
for (int j = 0; j < 64; ++j)
coeff[j] = round(output_r[j]);
- vp9_short_idct8x8_c(coeff, output_c, pitch);
+ vp9_short_idct8x8_add_c(coeff, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = output_c[j] -input[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 8x8 FDCT/IDCT has error " << error
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -219,14 +219,13 @@
output[7] = step1[0] - step1[7];
}
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
- // Rows
+ // First transform rows
for (i = 0; i < 8; ++i) {
idct8_1d(input, outptr);
input += 8;
@@ -233,13 +232,14 @@
outptr += 8;
}
- // Columns
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
}
}
@@ -400,8 +400,8 @@
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
- int pitch, int tx_type) {
+void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+ int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -421,14 +421,14 @@
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
- }
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]); }
}
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
@@ -447,7 +447,8 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
}
}
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -88,9 +88,6 @@
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_4x4 sse2
-prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_8x8 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
@@ -188,11 +185,11 @@
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct4x4 sse2
-prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct8x8 sse2
+prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_add sse2
-prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_8x8 sse2
+prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_8x8_add sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
@@ -215,8 +212,8 @@
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_32x32_add
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht8x8
+prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht8x8_add
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -403,8 +403,18 @@
in6 = _mm_subs_epi16(stp1_1, stp1_6); \
in7 = _mm_subs_epi16(stp1_0, stp2_7);
-void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ in_x = _mm_add_epi16(in_x, d0); \
+ in_x = _mm_packus_epi16(in_x, in_x); \
+ _mm_storel_epi64((__m128i *)(dest), in_x); \
+ dest += stride; \
+ }
+
+void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -461,19 +471,17 @@
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
}
-void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -612,15 +620,14 @@
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
}
#define IDCT16x16_1D \
@@ -750,16 +757,6 @@
MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
stg6_0, stg4_0, stg6_0, stg4_0, \
stp2_10, stp2_13, stp2_11, stp2_12) \
- }
-
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
- dest += stride; \
}
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -101,10 +101,6 @@
add_residual(diff, dest, stride, 4, 4);
}
-void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 8, 8);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -151,11 +147,8 @@
vp9_idct_add_8x8(input, dest, stride, eob);
} else {
if (eob > 0) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
- vp9_short_iht8x8(input, output, 8, tx_type);
+ vp9_short_iht8x8_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 128);
- vp9_add_residual_8x8(output, dest, stride);
}
}
}
@@ -210,8 +203,6 @@
}
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -233,20 +224,15 @@
vp9_add_constant_residual_8x8(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- vp9_short_idct10_8x8(input, output, 16);
-
+ vp9_short_idct10_8x8_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[8] = input[9] = input[10] = 0;
input[16] = input[17] = 0;
input[24] = 0;
-
- vp9_add_residual_8x8(output, dest, stride);
#endif
} else {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct8x8(input, output, 8 << 1);
+ vp9_short_idct8x8_add(input, dest, stride);
vpx_memset(input, 0, 128);
- vp9_add_residual_8x8(output, dest, stride);
}
}
}
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -58,70 +58,6 @@
*(int *)dest = _mm_cvtsi128_si32(p2);
}
-void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
- const int width = 8;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
- const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
- const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
- const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
- const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
- const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
-
- // Prediction data.
- __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
- p0 = _mm_unpacklo_epi8(p0, zero);
- p1 = _mm_unpacklo_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p2, zero);
- p3 = _mm_unpacklo_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p4, zero);
- p5 = _mm_unpacklo_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p6, zero);
- p7 = _mm_unpacklo_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p2 = _mm_packus_epi16(p2, p3);
- p4 = _mm_packus_epi16(p4, p5);
- p6 = _mm_packus_epi16(p6, p7);
-
- _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
- _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
- p2 = _mm_srli_si128(p2, 8);
- _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
- _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
- p4 = _mm_srli_si128(p4, 8);
- _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
- _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
- p6 = _mm_srli_si128(p6, 8);
- _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -534,11 +534,12 @@
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
- vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
} else {
- vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw, tx_type);
+ vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride,
+ tx_type);
}
*wip_txfrm_size = 8;
break;
@@ -589,7 +590,7 @@
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
- if (wip_txfrm_size < 32)
+ if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
}
@@ -606,7 +607,7 @@
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
}
@@ -628,13 +629,13 @@
// wip version... will use foreach_transformed_block when done
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
wip_txfrm_size = 0;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
#endif
}
--
⑨