shithub: libvpx

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -280,7 +280,6 @@

 typedef struct blockd {

   int16_t *diff;

-  int16_t *dequant;

   /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */

   uint8_t **base_pre;

@@ -335,6 +334,7 @@

   int subsampling_y;

   struct buf_2d dst;

   struct buf_2d pre[2];

+  int16_t *dequant;

};

 #define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))

--- a/vp9/decoder/vp9_decodframe.c

+++ b/vp9/decoder/vp9_decodframe.c

@@ -206,12 +206,10 @@

   const int qindex = get_qindex(mb, segment_id, pc->base_qindex);

   mb->q_index = qindex;

-  for (i = 0; i < 16; i++)

-    mb->block[i].dequant = pc->y_dequant[qindex];

+  mb->plane[0].dequant = pc->y_dequant[qindex];

+  for (i = 1; i < MAX_MB_PLANE; i++)

+    mb->plane[i].dequant = pc->uv_dequant[qindex];

-  for (i = 16; i < 24; i++)

-    mb->block[i].dequant = pc->uv_dequant[qindex];

   if (mb->lossless) {

     assert(qindex == 0);

     mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;

@@ -354,7 +352,8 @@

       xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =

           vp9_find_bpred_context(xd, b);

       if (!xd->mode_info_context->mbmi.mb_skip_coeff)

-        vp9_decode_coefs_4x4(pbi, xd, r, PLANE_TYPE_Y_WITH_DC, i, b->dequant);

+        vp9_decode_coefs_4x4(pbi, xd, r, PLANE_TYPE_Y_WITH_DC, i,

+                             xd->plane[0].dequant);

 #endif

       vp9_intra4x4_predict(xd, b, b_mode, *(b->base_dst) + b->dst,

                            b->dst_stride);

@@ -363,7 +362,7 @@

 #if CONFIG_NEWBINTRAMODES

     if (!xd->mode_info_context->mbmi.mb_skip_coeff)

-      vp9_decode_mb_tokens_4x4_uv(pbi, xd, r, xd->block[16].dequant);

+      vp9_decode_mb_tokens_4x4_uv(pbi, xd, r, xd->plane[1].dequant);

 #endif

     vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);

     xd->itxm_add_uv_block(xd->plane[1].qcoeff, xd->plane[1].dst.buf,

@@ -596,7 +595,7 @@

       mb_init_dequantizer(pbi, xd);

     // dequantization and idct

-    eobtotal = vp9_decode_tokens(pbi, xd, r, bsize, xd->block[0].dequant);

+    eobtotal = vp9_decode_tokens(pbi, xd, r, bsize, xd->plane[0].dequant);

     if (eobtotal == 0) {  // skip loopfilter

       for (n = 0; n < bw * bh; n++) {

         const int x_idx = n & (bw - 1), y_idx = n >> bwl;

@@ -671,7 +670,7 @@

     if (mode != I4X4_PRED)

 #endif

       eobtotal = vp9_decode_tokens(pbi, xd, r, BLOCK_SIZE_MB16X16,

-                                   xd->block[0].dequant);

+                                   xd->plane[0].dequant);

--- a/vp9/encoder/vp9_asm_enc_offsets.c

+++ /dev/null

@@ -1,29 +1,0 @@

-/*

- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_ports/asm_offsets.h"

-#include "vpx_config.h"

-#include "vp9/encoder/vp9_block.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/encoder/vp9_onyx_int.h"

-#include "vp9/encoder/vp9_treewriter.h"

-#include "vp9/encoder/vp9_tokenize.h"

-BEGIN

-/* regular quantize */

-DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));

-END

-/* add asserts for any offset that is not supported by assembly code

- * add asserts for any size that is not supported by assembly code

- */

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -509,7 +509,7 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> bwl;

-    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,

                ta + x_idx, tl + y_idx, TX_32X32, 64 * bw * bh);

@@ -532,7 +532,7 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> bwl;

-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,

                ta + x_idx, tl + y_idx, TX_16X16, 16 * bw * bh);

@@ -560,7 +560,7 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> bwl;

-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,

                ta + x_idx, tl + y_idx, TX_8X8, 4 * bw * bh);

@@ -585,7 +585,7 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> bwl;

-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,

                ta + x_idx, tl + y_idx, TX_4X4, bh * bw);

@@ -599,7 +599,7 @@

   assert(bsize == BLOCK_SIZE_SB64X64);

   for (b = 256; b < 384; b += 64) {

-    const int cidx = b >= 320 ? 20 : 16;

+    const int plane = 1 + (b >= 320);

     a = ta + vp9_block2above_sb64[TX_32X32][b];

     l = tl + vp9_block2left_sb64[TX_32X32][b];

     a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

@@ -610,7 +610,7 @@

     l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

     a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;

     l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;

-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.plane[plane].dequant,

                &a_ec, &l_ec, TX_32X32, 256);

@@ -638,11 +638,10 @@

   for (plane = 0; plane < 2; plane++) {

-    const int cidx = 16 + plane * 4;

     for (n = 0; n < bw * bh; n++) {

       const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

       optimize_b(cm, x, uvoff + n * 16, PLANE_TYPE_UV,

-                 x->e_mbd.block[cidx].dequant,

+                 x->e_mbd.plane[plane + 1].dequant,

                  &ta[plane][x_idx], &tl[plane][y_idx],

                  TX_16X16, bh * bw * 64);

@@ -671,11 +670,10 @@

   for (plane = 0; plane < 2; plane++) {

-    const int cidx = 16 + plane * 4;

     for (n = 0; n < bw * bh; n++) {

       const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

       optimize_b(cm, x, uvoff + n * 4, PLANE_TYPE_UV,

-                 x->e_mbd.block[cidx].dequant,

+                 x->e_mbd.plane[plane + 1].dequant,

                  &ta[plane][x_idx], &tl[plane][y_idx],

                  TX_8X8, bh * bw * 16);

@@ -708,11 +706,10 @@

   for (plane = 0; plane < 2; plane++) {

-    const int cidx = 16 + plane * 4;

     for (n = 0; n < bw * bh; n++) {

       const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

       optimize_b(cm, x, uvoff + n, PLANE_TYPE_UV,

-                 x->e_mbd.block[cidx].dequant,

+                 x->e_mbd.plane[plane + 1].dequant,

                  &ta[plane][x_idx], &tl[plane][y_idx],

                  TX_4X4, bh * bw * 4);

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -28,7 +28,6 @@

 void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {

   MACROBLOCKD *const xd = &mb->e_mbd;

-  BLOCKD *const d = &xd->block[0];

   int i, rc, eob;

   int zbin;

   int x, y, z, sz;

@@ -41,7 +40,7 @@

   int16_t *round_ptr       = mb->plane[0].round;

   int16_t *quant_ptr       = mb->plane[0].quant;

   uint8_t *quant_shift_ptr = mb->plane[0].quant_shift;

-  int16_t *dequant_ptr     = d->dequant;

+  int16_t *dequant_ptr     = xd->plane[0].dequant;

   int zbin_oq_value        = mb->plane[0].zbin_extra;

   const int *pt_scan = get_scan_4x4(tx_type);

@@ -84,7 +83,6 @@

   MACROBLOCKD *const xd = &mb->e_mbd;

   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);

   const int c_idx = plane_idx(pb_idx.plane);

-  BLOCKD *const d = &xd->block[c_idx];

   int i, rc, eob;

   int zbin;

   int x, y, z, sz;

@@ -99,7 +97,7 @@

   int16_t *round_ptr       = mb->plane[pb_idx.plane].round;

   int16_t *quant_ptr       = mb->plane[pb_idx.plane].quant;

   uint8_t *quant_shift_ptr = mb->plane[pb_idx.plane].quant_shift;

-  int16_t *dequant_ptr     = d->dequant;

+  int16_t *dequant_ptr     = xd->plane[pb_idx.plane].dequant;

   int zbin_oq_value        = mb->plane[pb_idx.plane].zbin_extra;

   if (c_idx == 0) assert(pb_idx.plane == 0);

@@ -152,7 +150,6 @@

                                       pb_idx.block, 16);

   int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,

                                     pb_idx.block, 16);

-  BLOCKD *const d = &xd->block[c_idx];

   const int *pt_scan = get_scan_8x8(tx_type);

   if (c_idx == 0) assert(pb_idx.plane == 0);

@@ -171,7 +168,7 @@

     int16_t *round_ptr  = mb->plane[pb_idx.plane].round;

     int16_t *quant_ptr  = mb->plane[pb_idx.plane].quant;

     uint8_t *quant_shift_ptr = mb->plane[pb_idx.plane].quant_shift;

-    int16_t *dequant_ptr = d->dequant;

+    int16_t *dequant_ptr = xd->plane[pb_idx.plane].dequant;

     int zbin_oq_value = mb->plane[pb_idx.plane].zbin_extra;

     eob = -1;

@@ -286,7 +283,6 @@

   MACROBLOCKD *const xd = &mb->e_mbd;

   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);

   const int c_idx = plane_idx(pb_idx.plane);

-  BLOCKD *const d = &xd->block[c_idx];

   const int *pt_scan = get_scan_16x16(tx_type);

   if (c_idx == 0) assert(pb_idx.plane == 0);

@@ -301,7 +297,7 @@

            mb->plane[pb_idx.plane].quant_shift,

            BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),

            BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),

-           d->dequant,

+           xd->plane[pb_idx.plane].dequant,

            mb->plane[pb_idx.plane].zbin_extra,

            &xd->plane[pb_idx.plane].eobs[pb_idx.block],

            pt_scan, 1);

@@ -311,7 +307,6 @@

   MACROBLOCKD *const xd = &mb->e_mbd;

   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);

   const int c_idx = plane_idx(pb_idx.plane);

-  BLOCKD *const d = &xd->block[c_idx];

   if (c_idx == 0) assert(pb_idx.plane == 0);

   if (c_idx == 16) assert(pb_idx.plane == 1);

@@ -325,7 +320,7 @@

            mb->plane[pb_idx.plane].quant_shift,

            BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),

            BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),

-           d->dequant,

+           xd->plane[pb_idx.plane].dequant,

            mb->plane[pb_idx.plane].zbin_extra,

            &xd->plane[pb_idx.plane].eobs[pb_idx.block],

            vp9_default_zig_zag1d_32x32, 2);

@@ -528,8 +523,7 @@

   x->plane[0].round = cpi->Y1round[qindex];

   x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[qindex];

   x->plane[0].zbin_extra = (int16_t)zbin_extra;

-  for (i = 0; i < 16; i++)

-    x->e_mbd.block[i].dequant = cpi->common.y_dequant[qindex];

+  x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];

   // UV

   zbin_extra = (cpi->common.uv_dequant[qindex][1] *

@@ -542,9 +536,8 @@

     x->plane[i].round = cpi->UVround[qindex];

     x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];

     x->plane[i].zbin_extra = (int16_t)zbin_extra;

+    x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];

-  for (i = 16; i < 24; i++)

-    x->e_mbd.block[i].dequant = cpi->common.uv_dequant[qindex];

   x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -3050,7 +3050,7 @@

         // Hence quantizer step is also 8 times. To get effective quantizer

         // we need to divide by 8 before sending to modeling function.

         model_rd_from_var_lapndz(var, 16 * bw * 16 * bh,

-                                 xd->block[0].dequant[1] >> 3,

+                                 xd->plane[0].dequant[1] >> 3,

                                  &tmp_rate_y, &tmp_dist_y);

         var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,

                                             x->plane[1].src.stride,

@@ -3058,7 +3058,7 @@

                                             xd->plane[1].dst.stride,

                                             &sse);

         model_rd_from_var_lapndz(var, 8 * bw * 8 * bh,

-                                 xd->block[16].dequant[1] >> 3,

+                                 xd->plane[1].dequant[1] >> 3,

                                  &tmp_rate_u, &tmp_dist_u);

         var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,

                                             x->plane[1].src.stride,

@@ -3066,7 +3066,7 @@

                                             xd->plane[1].dst.stride,

                                             &sse);

         model_rd_from_var_lapndz(var, 8 * bw * 8 * bh,

-                                 xd->block[20].dequant[1] >> 3,

+                                 xd->plane[2].dequant[1] >> 3,

                                  &tmp_rate_v, &tmp_dist_v);

         rd = RDCOST(x->rdmult, x->rddiv,

                     rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,

@@ -3138,17 +3138,17 @@

         // Note our transform coeffs are 8 times an orthogonal transform.

         // Hence quantizer step is also 8 times. To get effective quantizer

         // we need to divide by 8 before sending to modeling function.

-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3,

+        model_rd_from_var_lapndz(var, 16 * 16, xd->plane[0].dequant[1] >> 3,

                                  &tmp_rate_y, &tmp_dist_y);

         var = vp9_variance8x8(x->plane[1].src.buf, x->plane[1].src.stride,

                               xd->plane[1].dst.buf, xd->plane[1].dst.stride,

                               &sse);

-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3,

+        model_rd_from_var_lapndz(var, 8 * 8, xd->plane[1].dequant[1] >> 3,

                                  &tmp_rate_u, &tmp_dist_u);

         var = vp9_variance8x8(x->plane[2].src.buf, x->plane[1].src.stride,

                               xd->plane[2].dst.buf, xd->plane[1].dst.stride,

                               &sse);

-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3,

+        model_rd_from_var_lapndz(var, 8 * 8, xd->plane[2].dequant[1] >> 3,

                                  &tmp_rate_v, &tmp_dist_v);

         rd = RDCOST(x->rdmult, x->rddiv,

                     rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,

@@ -3225,8 +3225,8 @@

     x->skip = 1;

   else if (x->encode_breakout) {

     unsigned int var, sse;

-    int threshold = (xd->block[0].dequant[1]

-                     * xd->block[0].dequant[1] >> 4);

+    int threshold = (xd->plane[0].dequant[1]

+                     * xd->plane[0].dequant[1] >> 4);

     if (threshold < x->encode_breakout)

       threshold = x->encode_breakout;

@@ -3244,7 +3244,7 @@

     if ((int)sse < threshold) {

-      unsigned int q2dc = xd->block[0].dequant[0];

+      unsigned int q2dc = xd->plane[0].dequant[0];

       /* If there is no codeable 2nd order dc

          or a very small uniform pixel change change */

       if ((sse - var < q2dc * q2dc >> 4) ||

--- a/vp9/encoder/x86/vp9_quantize_sse2.asm

+++ b/vp9/encoder/x86/vp9_quantize_sse2.asm

@@ -9,7 +9,6 @@

 %include "vpx_ports/x86_abi_support.asm"

-%include "vp9_asm_enc_offsets.asm"

 ; void vp9_regular_quantize_b_sse2 | arg

--- a/vp9/encoder/x86/vp9_quantize_sse4.asm

+++ b/vp9/encoder/x86/vp9_quantize_sse4.asm

@@ -9,7 +9,6 @@

 %include "vpx_ports/x86_abi_support.asm"

-%include "vp9_asm_enc_offsets.asm"

 ; void vp9_regular_quantize_b_sse4 | arg

--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm

@@ -9,7 +9,6 @@

 %include "vpx_ports/x86_abi_support.asm"

-%include "vp9_asm_enc_offsets.asm"

 ; void vp9_fast_quantize_b_ssse3 | arg

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -26,7 +26,6 @@

 #INCLUDES += common

 #INCLUDES += encoder

-VP9_CX_SRCS-yes += encoder/vp9_asm_enc_offsets.c

 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c

 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c

 VP9_CX_SRCS-yes += encoder/vp9_dct.c

@@ -117,6 +116,3 @@

 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

-$(eval $(call asm_offsets_template,\

-         vp9_asm_enc_offsets.asm, $(VP9_PREFIX)encoder/vp9_asm_enc_offsets.c))

--

⑨