ref: bf8a49abbd3e637ae933806aa47564dd7ef324d1
parent: a02f391cbe618c6d84b35d161269266c43bc84ca
author: Linfeng Zhang <linfengz@google.com>
date: Wed Apr 5 13:54:42 EDT 2017
Clean CONVERT_TO_BYTEPTR/SHORTPTR in convolve

Replace them with CAST_TO_BYTEPTR/SHORTPTR. The rule is: if a short
pointer is cast to a byte pointer, any offset operation on the byte
pointer must be doubled. We achieve this by adding the offset to the
short pointer first, then casting the result back to a byte pointer.

BUG=webm:1388

Change-Id: I9e18a73ba45ddae58fc9dae470c0ff34951fe248
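For illustration, a minimal standalone sketch of the two macro families
and the offset rule. The macro bodies are assumed to match
vpx_dsp/vpx_dsp_common.h (the CONVERT macros shift the pointer value,
the CAST macros are plain casts); buf16 and offset are hypothetical
names used only for this example:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed to match vpx_dsp/vpx_dsp_common.h: CONVERT shifts the
     * address, CAST is a plain cast. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
    #define CAST_TO_SHORTPTR(x) ((uint16_t *)(x))
    #define CAST_TO_BYTEPTR(x) ((uint8_t *)(x))

    static uint16_t buf16[64]; /* hypothetical high-bitdepth buffer */

    int main(void) {
      const int offset = 5;

      /* Old style: CONVERT_TO_BYTEPTR halves the address, so "+ offset"
       * on the fake byte pointer advances one full uint16_t element per
       * step once the pointer is shifted back. */
      uint8_t *old_style = CONVERT_TO_BYTEPTR(buf16) + offset;
      assert(CONVERT_TO_SHORTPTR(old_style) == buf16 + offset);

      /* New style: CAST_TO_BYTEPTR is a plain cast, so "+ offset" on
       * the byte pointer moves offset bytes, i.e. only half an element
       * per step. Hence the rule: add the offset to the short pointer,
       * then cast the result back. */
      uint8_t *new_style = CAST_TO_BYTEPTR(buf16 + offset);
      assert(CAST_TO_SHORTPTR(new_style) == buf16 + offset);

      return 0;
    }

The plain casts keep every pointer a real address. That is why the diff
below moves each offset inside the cast (e.g.
CAST_TO_BYTEPTR(input16_ + offset)) and, at boundaries where a pointer
is still in the old shifted form, rewrites it as
CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src)).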
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -301,9 +301,9 @@
filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
dst_stride, output_width, output_height);
} else {
- highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+ highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride,
hfilter, vfilter,
- CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+ CAST_TO_SHORTPTR(dst_ptr), dst_stride,
output_width, output_height, use_highbd);
}
#else
@@ -324,8 +324,8 @@
filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
dst_stride, output_width, output_height);
} else {
- highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter,
- vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+ highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter,
+ vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride,
output_width, output_height, use_highbd);
}
#else
@@ -460,7 +460,7 @@
if (UUT_->use_highbd_ == 0) {
return input_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(input16_) + offset;
+ return CAST_TO_BYTEPTR(input16_ + offset);
}
#else
return input_ + offset;
@@ -473,7 +473,7 @@
if (UUT_->use_highbd_ == 0) {
return output_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(output16_) + offset;
+ return CAST_TO_BYTEPTR(output16_ + offset);
}
#else
return output_ + offset;
@@ -486,7 +486,7 @@
if (UUT_->use_highbd_ == 0) {
return output_ref_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+ return CAST_TO_BYTEPTR(output16_ref_ + offset);
}
#else
return output_ref_ + offset;
@@ -498,7 +498,7 @@
if (UUT_->use_highbd_ == 0) {
return list[index];
} else {
- return CONVERT_TO_SHORTPTR(list)[index];
+ return CAST_TO_SHORTPTR(list)[index];
}
#else
return list[index];
@@ -510,7 +510,7 @@
if (UUT_->use_highbd_ == 0) {
list[index] = (uint8_t)val;
} else {
- CONVERT_TO_SHORTPTR(list)[index] = val;
+ CAST_TO_SHORTPTR(list)[index] = val;
}
#else
list[index] = (uint8_t)val;
@@ -718,7 +718,7 @@
if (UUT_->use_highbd_ == 0) {
ref = ref8;
} else {
- ref = CONVERT_TO_BYTEPTR(ref16);
+ ref = CAST_TO_BYTEPTR(ref16);
}
#else
uint8_t ref[kOutputStride * kMaxDimension];
@@ -797,7 +797,7 @@
if (UUT_->use_highbd_ == 0) {
ref = ref8;
} else {
- ref = CONVERT_TO_BYTEPTR(ref16);
+ ref = CAST_TO_BYTEPTR(ref16);
}
#else
uint8_t ref[kOutputStride * kMaxDimension];
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -37,8 +37,9 @@
const int subpel_x, const int subpel_y, const struct scale_factors *sf,
int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
- ys, w, h, bd);
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src)), src_stride,
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), dst_stride, kernel[subpel_x],
+ xs, kernel[subpel_y], ys, w, h, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2417,10 +2417,11 @@
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
- kernel[x_q4 & 0xf], 16 * src_w / dst_w,
- kernel[y_q4 & 0xf], 16 * src_h / dst_h,
- 16 / factor, 16 / factor, bd);
+ vpx_highbd_convolve8(
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src_ptr)), src_stride,
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst_ptr)), dst_stride,
+ kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf],
+ 16 * src_h / dst_h, 16 / factor, 16 / factor, bd);
} else {
vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -2053,9 +2053,11 @@
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
- this_mode_pred->data, this_mode_pred->stride,
- NULL, 0, NULL, 0, bw, bh, xd->bd);
+ vpx_highbd_convolve_copy(
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)),
+ best_pred->stride,
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(this_mode_pred->data)),
+ this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL,
@@ -2162,9 +2164,11 @@
if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
- pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0,
- bw, bh, xd->bd);
+ vpx_highbd_convolve_copy(
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)),
+ best_pred->stride,
+ CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(pd->dst.buf)), pd->dst.stride,
+ NULL, 0, NULL, 0, bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -599,9 +599,10 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- recon = CONVERT_TO_BYTEPTR(recon);
- vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,
- bs, bs, xd->bd);
+ vpx_highbd_convolve_copy(CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)),
+ dst_stride, recon, 32, NULL, 0, NULL, 0, bs,
+ bs, xd->bd);
+ recon = CONVERT_TO_BYTEPTR(recon16);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
} else {
--- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -145,8 +145,8 @@
vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_x);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
@@ -348,8 +348,8 @@
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_x);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
@@ -579,8 +579,8 @@
vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_y);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
@@ -748,8 +748,8 @@
filter_x, x_step_q4, filter_y, y_step_q4, w,
h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_y);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
--- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -18,8 +18,8 @@
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -18,8 +18,8 @@
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
--- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -18,7 +18,7 @@
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
@@ -29,13 +29,12 @@
* height and filter a multiple of 4 lines. Since this goes in to the temp
* buffer which has lots of extra room and is subsequently discarded this is
* safe if somewhat less than ideal. */
- vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
- src_stride, CONVERT_TO_BYTEPTR(temp), w,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- intermediate_height, bd);
+ vpx_highbd_convolve8_horiz_neon(
+ CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp),
+ w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
+ vpx_highbd_convolve8_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst,
dst_stride, filter_x, x_step_q4, filter_y,
y_step_q4, w, h, bd);
}
@@ -45,7 +44,7 @@
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
@@ -55,11 +54,10 @@
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
- src_stride, CONVERT_TO_BYTEPTR(temp), w,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
+ vpx_highbd_convolve8_horiz_neon(
+ CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp),
+ w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd);
+ vpx_highbd_convolve8_avg_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst,
dst_stride, filter_x, x_step_q4, filter_y,
y_step_q4, w, h, bd);
}
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -324,8 +324,8 @@
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -348,8 +348,8 @@
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -374,8 +374,8 @@
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
@@ -400,8 +400,8 @@
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
@@ -449,12 +449,12 @@
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
+ highbd_convolve_horiz(CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) -
+ src_stride * (SUBPEL_TAPS / 2 - 1)),
+ src_stride, CAST_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
x_step_q4, w, intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,
- bd);
+ highbd_convolve_vert(CAST_TO_BYTEPTR(temp + 64 * (SUBPEL_TAPS / 2 - 1)), 64,
+ dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -541,10 +541,10 @@
assert(w <= 64);
assert(h <= 64);
- vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,
- 0, NULL, 0, w, h, bd);
+ vpx_highbd_convolve8_c(src, src_stride, CAST_TO_BYTEPTR(temp), 64, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve_avg_c(CAST_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, 0,
+ NULL, 0, w, h, bd);
}
void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
@@ -553,8 +553,8 @@
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int r;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
@@ -575,8 +575,8 @@
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -107,8 +107,8 @@
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *src = CAST_TO_SHORTPTR(src8); \
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8); \
if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
@@ -162,36 +162,37 @@
} \
}
-#define HIGH_FUN_CONV_2D(avg, opt) \
- void vpx_highbd_convolve8_##avg##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt( \
- src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt( \
- src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
- } \
- } else { \
- vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
+#define HIGH_FUN_CONV_2D(avg, opt) \
+ void vpx_highbd_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - 3 * src_stride), \
+ src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ CAST_TO_BYTEPTR(fdata2 + 192), 64, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src, src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ CAST_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -21,8 +21,8 @@
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int width, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_y;
(void)filter_x_stride;
@@ -104,8 +104,8 @@
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int width, int h, int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *src = CAST_TO_SHORTPTR(src8);
+ uint16_t *dst = CAST_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_y;
(void)filter_x_stride;
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -32,9 +32,7 @@
mov r4d, dword wm
%ifidn %2, highbd
shl r4d, 1
- shl srcq, 1
shl src_strideq, 1
- shl dstq, 1
shl dst_strideq, 1
%else
cmp r4d, 4