shithub: libvpx

--- a/vp8/encoder/arm/quantize_arm.c

+++ b/vp8/encoder/arm/quantize_arm.c

@@ -29,7 +29,7 @@

 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)

-    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);

+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);

/*

--- a/vp8/encoder/block.h

+++ b/vp8/encoder/block.h

@@ -33,6 +33,7 @@

     // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries

     short *quant;

+    short *quant_fast;

     short *quant_shift;

     short *zbin;

     short *zrun_zbin_boost;

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -179,6 +179,7 @@

         // dc values

         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);

+        cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;

         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,

                            cpi->Y1quant_shift[Q] + 0, quant_val);

         cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;

@@ -187,6 +188,7 @@

         cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;

         quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);

+        cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;

         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,

                            cpi->Y2quant_shift[Q] + 0, quant_val);

         cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;

@@ -195,6 +197,7 @@

         cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;

         quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);

+        cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;

         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,

                            cpi->UVquant_shift[Q] + 0, quant_val);

         cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;

@@ -208,6 +211,7 @@

             int rc = vp8_default_zig_zag1d[i];

             quant_val = vp8_ac_yquant(Q);

+            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;

             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,

                                cpi->Y1quant_shift[Q] + rc, quant_val);

             cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;

@@ -216,6 +220,7 @@

             cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;

             quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);

+            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;

             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,

                                cpi->Y2quant_shift[Q] + rc, quant_val);

             cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;

@@ -224,6 +229,7 @@

             cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;

             quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);

+            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;

             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,

                                cpi->UVquant_shift[Q] + rc, quant_val);

             cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;

@@ -325,6 +331,7 @@

     for (i = 0; i < 16; i++)

         x->block[i].quant = cpi->Y1quant[QIndex];

+        x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];

         x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];

         x->block[i].zbin = cpi->Y1zbin[QIndex];

         x->block[i].round = cpi->Y1round[QIndex];

@@ -339,6 +346,7 @@

     for (i = 16; i < 24; i++)

         x->block[i].quant = cpi->UVquant[QIndex];

+        x->block[i].quant_fast = cpi->UVquant_fast[QIndex];

         x->block[i].quant_shift = cpi->UVquant_shift[QIndex];

         x->block[i].zbin = cpi->UVzbin[QIndex];

         x->block[i].round = cpi->UVround[QIndex];

@@ -349,6 +357,7 @@

     // Y2

     zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;

+    x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];

     x->block[24].quant = cpi->Y2quant[QIndex];

     x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];

     x->block[24].zbin = cpi->Y2zbin[QIndex];

@@ -1270,7 +1279,18 @@

     if (cpi->sf.RD)

+        /* Are we using the fast quantizer for the mode selection? */

+        if(cpi->sf.use_fastquant_for_pick)

+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);

         inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);

+        /* switch back to the regular quantizer for the encode */

+        if (cpi->sf.improved_quant)

+        {

+            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);

+        }

     else

 #endif

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -591,6 +591,7 @@

     sf->max_fs_radius = 32;

     sf->iterative_sub_pixel = 1;

     sf->optimize_coefficients = 1;

+    sf->use_fastquant_for_pick = 0;

     sf->first_step = 0;

     sf->max_step_search_steps = MAX_MVSEARCH_STEPS;

@@ -758,7 +759,7 @@

             cpi->mode_check_freq[THR_SPLITG] = 4;

             cpi->mode_check_freq[THR_SPLITA] = 4;

-            cpi->mode_check_freq[THR_SPLITMV] = 0;

+            cpi->mode_check_freq[THR_SPLITMV] = 2;

             sf->thresh_mult[THR_TM       ] = 1500;

             sf->thresh_mult[THR_V_PRED   ] = 1500;

@@ -789,8 +790,7 @@

                 sf->thresh_mult[THR_SPLITA   ] = 20000;

-            sf->improved_quant = 0;

-            sf->improved_dct = 0;

+            sf->use_fastquant_for_pick = 1;

             sf->first_step = 1;

             sf->max_step_search_steps = MAX_MVSEARCH_STEPS;

@@ -798,6 +798,8 @@

         if (Speed > 1)

+            sf->use_fastquant_for_pick = 0;

             cpi->mode_check_freq[THR_SPLITG] = 15;

             cpi->mode_check_freq[THR_SPLITA] = 15;

             cpi->mode_check_freq[THR_SPLITMV] = 7;

@@ -830,6 +832,11 @@

                 sf->thresh_mult[THR_NEWA     ] = 2500;

                 sf->thresh_mult[THR_SPLITA   ] = 50000;

+            sf->first_step = 1;

+            sf->improved_quant = 0;

+            sf->improved_dct = 0;

             // Only do recode loop on key frames, golden frames and

             // alt ref frames

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -182,6 +182,8 @@

     int first_step;

     int optimize_coefficients;

+    int use_fastquant_for_pick;

 } SPEED_FEATURES;

 typedef struct

@@ -269,6 +271,9 @@

     DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);

     DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);

+    DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);

     MACROBLOCK mb;

--- a/vp8/encoder/quantize.c

+++ b/vp8/encoder/quantize.c

@@ -27,7 +27,7 @@

     short *coeff_ptr       = b->coeff;

     short *zbin_ptr        = b->zbin;

     short *round_ptr       = b->round;

-    short *quant_ptr       = b->quant;

+    short *quant_ptr       = b->quant_fast;

     short *quant_shift_ptr = b->quant_shift;

     short *qcoeff_ptr      = d->qcoeff;

     short *dqcoeff_ptr     = d->dqcoeff;

@@ -74,7 +74,7 @@

     int x, y, z, sz;

     short *coeff_ptr   = b->coeff;

     short *round_ptr   = b->round;

-    short *quant_ptr   = b->quant;

+    short *quant_ptr   = b->quant_fast;

     short *qcoeff_ptr  = d->qcoeff;

     short *dqcoeff_ptr = d->dqcoeff;

     short *dequant_ptr = d->dequant;

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -1026,6 +1026,7 @@

 typedef struct

   MV *ref_mv;

+  MV *mvp;

   int segment_rd;

   int segment_num;

@@ -1039,6 +1040,9 @@

   int mvthresh;

   int *mdcounts;

+  MV sv_mvp[4];     // save 4 mvp from 8x8

+  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16

 } BEST_SEG_INFO;

@@ -1124,7 +1128,7 @@

                 int sseshift;

                 int num00;

                 int step_param = 0;

-                int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;

+                int further_steps;

                 int n;

                 int thissme;

                 int bestsme = INT_MAX;

@@ -1136,7 +1140,28 @@

                 if (best_label_rd < label_mv_thresh)

                     break;

+                if(cpi->compressor_speed)

+                    if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8)

+                    {

+                        bsi->mvp = &bsi->sv_mvp[i];

+                        if (i==1 && segmentation == BLOCK_16X8) bsi->mvp = &bsi->sv_mvp[2];

+                        step_param = bsi->sv_istep[i];

+                    }

+                    // use previous block's result as next block's MV predictor.

+                    if (segmentation == BLOCK_4X4 && i>0)

+                    {

+                        bsi->mvp = &(x->e_mbd.block[i-1].bmi.mv.as_mv);

+                        if (i==4 || i==8 || i==12) bsi->mvp = &(x->e_mbd.block[i-4].bmi.mv.as_mv);

+                        step_param = 2;

+                    }

+                }

+                further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;

+                {

                     int sadpb = x->sadperbit4;

                     // find first label

@@ -1151,7 +1176,7 @@

                     else

-                        bestsme = cpi->diamond_search_sad(x, c, e, bsi->ref_mv,

+                        bestsme = cpi->diamond_search_sad(x, c, e, bsi->mvp,

                                                           &mode_mv[NEW4X4], step_param,

                                                           sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);

@@ -1166,7 +1191,7 @@

                                 num00--;

                             else

-                                thissme = cpi->diamond_search_sad(x, c, e, bsi->ref_mv,

+                                thissme = cpi->diamond_search_sad(x, c, e, bsi->mvp,

                                                                   &temp_mv, step_param + n,

                                                                   sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);

@@ -1185,7 +1210,7 @@

                     // Should we do a full search (best quality only)

                     if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)

-                        thissme = cpi->full_search_sad(x, c, e, bsi->ref_mv,

+                        thissme = cpi->full_search_sad(x, c, e, bsi->mvp,

                                                        sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost,bsi->ref_mv);

                         if (thissme < bestsme)

@@ -1254,8 +1279,9 @@

         segmentyrate += bestlabelyrate;

         this_segment_rd += best_label_rd;

-        if (this_segment_rd > bsi->segment_rd)

+        if (this_segment_rd >= bsi->segment_rd)

             break;

     } /* for each label */

     if (this_segment_rd < bsi->segment_rd)

@@ -1277,6 +1303,21 @@

+static __inline  /* Helper: convert a search range `sr` into an initial step parameter written to `*sp`. */

+void vp8_cal_step_param(int sr, int *sp)  /* sr: requested search range; sp: out-parameter receiving the computed step parameter. */

+{

+    int step = 0;  /* Ends up as floor(log2(sr)) once sr has been clamped below. */

+    if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;  /* Clamp the range from above to MAX_FIRST_STEP. */

+    else if (sr < 1) sr = 1;  /* Clamp from below so zero or negative ranges behave like 1 (step stays 0). */

+    while (sr>>=1)  /* Halve sr until it reaches zero; one iteration per halving that leaves sr non-zero. */

+        step++;

+    *sp = MAX_MVSEARCH_STEPS - 1 - step;  /* Larger range -> smaller result; sr == 1 yields MAX_MVSEARCH_STEPS - 1. */

+}

 static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,

                                            MV *best_ref_mv, int best_rd,

                                            int *mdcounts, int *returntotrate,

@@ -1285,14 +1326,12 @@

     int i;

     BEST_SEG_INFO bsi;

-    BEST_SEG_INFO bsi_8x8;

-    int check_8x16 = 0;

-    int check_16x8 = 0;

     vpx_memset(&bsi, 0, sizeof(bsi));

     bsi.segment_rd = best_rd;

     bsi.ref_mv = best_ref_mv;

+    bsi.mvp = best_ref_mv;

     bsi.mvthresh = mvthresh;

     bsi.mdcounts = mdcounts;

@@ -1300,6 +1339,7 @@

         bsi.modes[i] = ZERO4X4;

     if(cpi->compressor_speed == 0)

         /* for now, we will keep the original segmentation order

@@ -1311,12 +1351,73 @@

     else

+        int sr;

         vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8);

         if (bsi.segment_rd < best_rd)

-          vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);

-          vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);

-          vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);

+            int col_min = (best_ref_mv->col - MAX_POSSIBLE_MV) >>3;

+            int col_max = (best_ref_mv->col + MAX_POSSIBLE_MV) >>3;

+            int row_min = (best_ref_mv->row - MAX_POSSIBLE_MV) >>3;

+            int row_max = (best_ref_mv->row + MAX_POSSIBLE_MV) >>3;

+            int tmp_col_min = x->mv_col_min;

+            int tmp_col_max = x->mv_col_max;

+            int tmp_row_min = x->mv_row_min;

+            int tmp_row_max = x->mv_row_max;

+            /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */

+            if (x->mv_col_min < col_min )

+                x->mv_col_min = col_min;

+            if (x->mv_col_max > col_max )

+                x->mv_col_max = col_max;

+            if (x->mv_row_min < row_min )

+                x->mv_row_min = row_min;

+            if (x->mv_row_max > row_max )

+                x->mv_row_max = row_max;

+            /* Get 8x8 result */

+            bsi.sv_mvp[0] = bsi.mvs[0].as_mv;

+            bsi.sv_mvp[1] = bsi.mvs[2].as_mv;

+            bsi.sv_mvp[2] = bsi.mvs[8].as_mv;

+            bsi.sv_mvp[3] = bsi.mvs[10].as_mv;

+            /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of 2 MV. */

+            /* block 8X16 */

+            {

+                sr = MAXF((abs(bsi.sv_mvp[0].row - bsi.sv_mvp[2].row))>>3, (abs(bsi.sv_mvp[0].col - bsi.sv_mvp[2].col))>>3);

+                vp8_cal_step_param(sr, &bsi.sv_istep[0]);

+                sr = MAXF((abs(bsi.sv_mvp[1].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[1].col - bsi.sv_mvp[3].col))>>3);

+                vp8_cal_step_param(sr, &bsi.sv_istep[1]);

+                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);

+            }

+            /* block 16X8 */

+            {

+                sr = MAXF((abs(bsi.sv_mvp[0].row - bsi.sv_mvp[1].row))>>3, (abs(bsi.sv_mvp[0].col - bsi.sv_mvp[1].col))>>3);

+                vp8_cal_step_param(sr, &bsi.sv_istep[0]);

+                sr = MAXF((abs(bsi.sv_mvp[2].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[2].col - bsi.sv_mvp[3].col))>>3);

+                vp8_cal_step_param(sr, &bsi.sv_istep[1]);

+                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);

+            }

+            /* If 8x8 is better than 16x8/8x16, then do 4x4 search */

+            if (bsi.segment_num == BLOCK_8X8)  /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */

+            {

+                bsi.mvp = &bsi.sv_mvp[0];

+                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);

+            }

+            /* restore UMV window */

+            x->mv_col_min = tmp_col_min;

+            x->mv_col_max = tmp_col_max;

+            x->mv_row_min = tmp_row_min;

+            x->mv_row_max = tmp_row_max;

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -32,7 +32,7 @@

     short *coeff_ptr   = b->coeff;

     short *zbin_ptr    = b->zbin;

     short *round_ptr   = b->round;

-    short *quant_ptr   = b->quant;

+    short *quant_ptr   = b->quant_fast;

     short *qcoeff_ptr  = d->qcoeff;

     short *dqcoeff_ptr = d->dqcoeff;

     short *dequant_ptr = d->dequant;

@@ -90,7 +90,7 @@

     short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;

     short *coeff_ptr   = b->coeff;

     short *round_ptr   = b->round;

-    short *quant_ptr   = b->quant;

+    short *quant_ptr   = b->quant_fast;

     short *qcoeff_ptr  = d->qcoeff;

     short *dqcoeff_ptr = d->dqcoeff;

     short *dequant_ptr = d->dequant;

@@ -183,7 +183,7 @@

                     d->qcoeff,

                     d->dequant,

                     b->round,

-                    b->quant,

+                    b->quant_fast,

                     d->dqcoeff

);

--

⑨