shithub: libvpx

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -196,6 +196,64 @@

   if (eob > 0) {

     TX_TYPE tx_type = DCT_DCT;

     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      if (xd->lossless) {

+        tx_type = DCT_DCT;

+        vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);

+      } else {

+        const PLANE_TYPE plane_type = pd->plane_type;

+        // Run the high-bitdepth inverse transform matching tx_size. tx_type

+        // is looked up per size; 32x32 always uses DCT_DCT.

+        switch (tx_size) {

+          case TX_4X4:

+            tx_type = get_tx_type_4x4(plane_type, xd, block);

+            vp9_high_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+            break;

+          case TX_8X8:

+            tx_type = get_tx_type(plane_type, xd);

+            vp9_high_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+            break;

+          case TX_16X16:

+            tx_type = get_tx_type(plane_type, xd);

+            vp9_high_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+            break;

+          case TX_32X32:

+            tx_type = DCT_DCT;

+            vp9_high_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);

+            break;

+          default:

+            assert(0 && "Invalid transform size");

+            // Bail out even when assert() is compiled out, as the 8-bit

+            // switches in this function do.

+            return;

+        }

+      }

+    } else {

+      if (xd->lossless) {

+        tx_type = DCT_DCT;

+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);

+      } else {

+        const PLANE_TYPE plane_type = pd->plane_type;

+        switch (tx_size) {

+          case TX_4X4:

+            tx_type = get_tx_type_4x4(plane_type, xd, block);

+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);

+            break;

+          case TX_8X8:

+            tx_type = get_tx_type(plane_type, xd);

+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);

+            break;

+          case TX_16X16:

+            tx_type = get_tx_type(plane_type, xd);

+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);

+            break;

+          case TX_32X32:

+            tx_type = DCT_DCT;

+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);

+            break;

+          default:

+            assert(0 && "Invalid transform size");

+            return;

+        }

+      }

+    }

+#else

     if (xd->lossless) {

       tx_type = DCT_DCT;

       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);

@@ -220,8 +278,10 @@

           break;

         default:

           assert(0 && "Invalid transform size");

+          return;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     if (eob == 1) {

       vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));

@@ -599,6 +659,9 @@

                  cm->y_dc_delta_q == 0 &&

                  cm->uv_dc_delta_q == 0 &&

                  cm->uv_ac_delta_q == 0;

+#if CONFIG_VP9_HIGHBITDEPTH

+  xd->bd = (int)cm->bit_depth;

+#endif

 static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {

@@ -1139,8 +1202,17 @@

 static void read_bitdepth_colorspace_sampling(

     VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {

-  if (cm->profile >= PROFILE_2)

+  if (cm->profile >= PROFILE_2) {

     cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;

+#if CONFIG_VP9_HIGHBITDEPTH

+    cm->use_highbitdepth = 1;

+#endif

+  } else {

+    cm->bit_depth = VPX_BITS_8;

+#if CONFIG_VP9_HIGHBITDEPTH

+    cm->use_highbitdepth = 0;

+#endif

+  }

   cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);

   if (cm->color_space != SRGB) {

     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range

@@ -1244,6 +1316,10 @@

         // case (normative).

         cm->color_space = BT_601;

         cm->subsampling_y = cm->subsampling_x = 1;

+        cm->bit_depth = VPX_BITS_8;

+#if CONFIG_VP9_HIGHBITDEPTH

+        cm->use_highbitdepth = 0;

+#endif

       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);

@@ -1284,6 +1360,9 @@

+#if CONFIG_VP9_HIGHBITDEPTH

+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;

+#endif

   if (pbi->need_resync) {

     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,

--- a/vp9/encoder/vp9_aq_variance.c

+++ b/vp9/encoder/vp9_aq_variance.c

@@ -34,6 +34,9 @@

 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]

 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};

+#if CONFIG_VP9_HIGHBITDEPTH

+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};

+#endif

 unsigned int vp9_vaq_segment_id(int energy) {

   ENERGY_IN_BOUNDS(energy);

@@ -126,14 +129,40 @@

     const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;

     const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;

     int avg;

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      high_variance(x->plane[0].src.buf, x->plane[0].src.stride,

+                    CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, &sse,

+                    &avg);

+      sse >>= 2 * (xd->bd - 8);

+      avg >>= (xd->bd - 8);

+    } else {

+      variance(x->plane[0].src.buf, x->plane[0].src.stride,

+               vp9_64_zeros, 0, bw, bh, &sse, &avg);

+    }

+#else

     variance(x->plane[0].src.buf, x->plane[0].src.stride,

              vp9_64_zeros, 0, bw, bh, &sse, &avg);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     var = sse - (((int64_t)avg * avg) / (bw * bh));

     return (256 * var) / (bw * bh);

   } else {

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,

+                               x->plane[0].src.stride,

+                               CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros),

+                               0, &sse);

+    } else {

+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,

+                               x->plane[0].src.stride,

+                               vp9_64_zeros, 0, &sse);

+    }

+#else

     var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,

                              x->plane[0].src.stride,

                              vp9_64_zeros, 0, &sse);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     return (256 * var) >> num_pels_log2_lookup[bs];

--- a/vp9/encoder/vp9_bitstream.c

+++ b/vp9/encoder/vp9_bitstream.c

@@ -120,16 +120,28 @@

 static void pack_mb_tokens(vp9_writer *w,

-                           TOKENEXTRA **tp, const TOKENEXTRA *const stop) {

+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,

+                           vpx_bit_depth_t bit_depth) {

   TOKENEXTRA *p = *tp;

   while (p < stop && p->token != EOSB_TOKEN) {

     const int t = p->token;

     const struct vp9_token *const a = &vp9_coef_encodings[t];

-    const vp9_extra_bit *const b = &vp9_extra_bits[t];

     int i = 0;

     int v = a->value;

     int n = a->len;

+#if CONFIG_VP9_HIGHBITDEPTH

+    const vp9_extra_bit *b;

+    if (bit_depth == VPX_BITS_12)

+      b = &vp9_extra_bits_high12[t];

+    else if (bit_depth == VPX_BITS_10)

+      b = &vp9_extra_bits_high10[t];

+    else

+      b = &vp9_extra_bits[t];

+#else

+    const vp9_extra_bit *const b = &vp9_extra_bits[t];

+    (void) bit_depth;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     /* skip one or two nodes */

     if (p->skip_eob_node) {

@@ -387,7 +399,7 @@

   assert(*tok < tok_end);

-  pack_mb_tokens(w, tok, tok_end);

+  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);

 static void write_partition(const VP9_COMMON *const cm,

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -61,16 +61,51 @@

 // Eventually this should be replaced by custom no-reference routines,

 //  which will be faster.

 static const uint8_t VP9_VAR_OFFS[64] = {

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128,

-  128, 128, 128, 128, 128, 128, 128, 128

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128

};

+#if CONFIG_VP9_HIGHBITDEPTH

+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128,

+    128, 128, 128, 128, 128, 128, 128, 128

+};

+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,

+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4

+};

+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,

+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16

+};

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,

                                               const struct buf_2d *ref,

                                               BLOCK_SIZE bs) {

@@ -80,6 +115,32 @@

   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);

+#if CONFIG_VP9_HIGHBITDEPTH

+// High-bitdepth counterpart of get_sby_perpixel_variance().

+//

+// Measures `ref` with the block-size variance function against a constant

+// offset table chosen by bit depth (128 for 8-bit, 128*4 for 10-bit,

+// 128*16 for 12-bit; any other `bd` falls back to the 8-bit table). The

+// table is passed with a stride of 0. The result is shifted right, with

+// rounding, by num_pels_log2_lookup[bs] bits.

+static unsigned int high_get_sby_perpixel_variance(

+    VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {

+  const uint16_t *flat_ref;

+  unsigned int sse;

+  unsigned int variance;

+  if (bd == 10)

+    flat_ref = VP9_HIGH_VAR_OFFS_10;

+  else if (bd == 12)

+    flat_ref = VP9_HIGH_VAR_OFFS_12;

+  else

+    flat_ref = VP9_HIGH_VAR_OFFS_8;

+  variance = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,

+                                CONVERT_TO_BYTEPTR(flat_ref), 0, &sse);

+  return ROUND_POWER_OF_TWO(variance, num_pels_log2_lookup[bs]);

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,

                                                    const struct buf_2d *ref,

                                                    int mi_row, int mi_col,

@@ -419,6 +480,22 @@

   } else {

     d = VP9_VAR_OFFS;

     dp = 0;

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      switch (xd->bd) {

+        case 10:

+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);

+          break;

+        case 12:

+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);

+          break;

+        case 8:

+        default:

+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);

+          break;

+      }

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   // Fill in the entire tree of 8x8 variances for splits.

@@ -734,7 +811,17 @@

   // Set to zero to make sure we do not use the previous encoded frame stats

   mbmi->skip = 0;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    x->source_variance =

+        high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);

+  } else {

+    x->source_variance =

+        get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);

+  }

+#else

   x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   // Save rdmult before it might be changed, so it can be restored later.

   orig_rdmult = x->rdmult;

@@ -3170,9 +3257,34 @@

   for (i = 0; i < cm->mb_rows; i++) {

     for (j = 0; j < cm->mb_cols; j++) {

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (cm->use_highbitdepth) {

+        switch (cm->bit_depth) {

+          case VPX_BITS_8:

+            vp9_high_get16x16var(src, src_stride, last_src, last_stride,

+                                 &var16->sse, &var16->sum);

+            break;

+          case VPX_BITS_10:

+            vp9_high_10_get16x16var(src, src_stride, last_src, last_stride,

+                                    &var16->sse, &var16->sum);

+            break;

+          case VPX_BITS_12:

+            vp9_high_12_get16x16var(src, src_stride, last_src, last_stride,

+                                    &var16->sse, &var16->sum);

+            break;

+          default:

+            assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"

+                   " or VPX_BITS_12");

+            return -1;

+        }

+      } else {

+        vp9_get16x16var(src, src_stride, last_src, last_stride,

+                        &var16->sse, &var16->sum);

+      }

+#else

       vp9_get16x16var(src, src_stride, last_src, last_stride,

                       &var16->sse, &var16->sum);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       var16->var = var16->sse -

           (((uint32_t)var16->sum * var16->sum) >> 8);

@@ -3314,7 +3426,15 @@

   cm->tx_mode = select_tx_mode(cpi);

+#if CONFIG_VP9_HIGHBITDEPTH

+  // Pick the 4x4 forward transform: Walsh-Hadamard when lossless, DCT

+  // otherwise. High-bitdepth frames take the vp9_high_* variants and

+  // 8-bit frames take the regular ones.

+  if (cm->use_highbitdepth)

+    x->fwd_txm4x4 = xd->lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;

+  else

+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;

+  // The high-bitdepth inverse 4x4 is kept in its own pointer, so it is set

+  // regardless of which forward transform was chosen.

+  x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add : vp9_high_idct4x4_add;

+#else

   x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;

   if (xd->lossless) {

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -51,6 +51,29 @@

+#if CONFIG_VP9_HIGHBITDEPTH

+// Computes the residual diff = src - pred for a rows x cols block of 16-bit

+// samples.

+//

+// src8 and pred8 are converted with CONVERT_TO_SHORTPTR before use, and

+// their strides are applied to the resulting uint16_t pointers. `bd` is

+// accepted but unused.

+void vp9_high_subtract_block_c(int rows, int cols,

+                               int16_t *diff, ptrdiff_t diff_stride,

+                               const uint8_t *src8, ptrdiff_t src_stride,

+                               const uint8_t *pred8, ptrdiff_t pred_stride,

+                               int bd) {

+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);

+  uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);

+  int16_t *diff_row = diff;

+  int y;

+  (void) bd;

+  for (y = 0; y < rows; ++y) {

+    int x;

+    for (x = 0; x < cols; ++x)

+      diff_row[x] = src_row[x] - pred_row[x];

+    // Step all three cursors to the next row.

+    diff_row += diff_stride;

+    pred_row += pred_stride;

+    src_row += src_stride;

+  }

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {

   struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];

@@ -58,6 +81,13 @@

   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    vp9_high_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,

+                            pd->dst.buf, pd->dst.stride, x->e_mbd.bd);

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,

                      pd->dst.buf, pd->dst.stride);

@@ -124,6 +154,8 @@

   int64_t rd_cost0, rd_cost1;

   int rate0, rate1, error0, error1, t0, t1;

   int best, band, pt, i, final_eob;

+  const TOKENVALUE *dct_value_tokens;

+  const int16_t *dct_value_cost;

   assert((!type && !plane) || (type && plane));

   assert(eob <= default_eob);

@@ -140,9 +172,24 @@

   tokens[eob][0].qc = 0;

   tokens[eob][1] = tokens[eob][0];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->bd == 12) {

+    dct_value_tokens = vp9_dct_value_tokens_high12_ptr;

+    dct_value_cost = vp9_dct_value_cost_high12_ptr;

+  } else if (xd->bd == 10) {

+    dct_value_tokens = vp9_dct_value_tokens_high10_ptr;

+    dct_value_cost = vp9_dct_value_cost_high10_ptr;

+  } else {

+    dct_value_tokens = vp9_dct_value_tokens_ptr;

+    dct_value_cost = vp9_dct_value_cost_ptr;

+  }

+#else

+  dct_value_tokens = vp9_dct_value_tokens_ptr;

+  dct_value_cost = vp9_dct_value_cost_ptr;

+#endif

   for (i = 0; i < eob; i++)

     token_cache[scan[i]] =

-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];

+        vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];

   for (i = eob; i-- > 0;) {

     int base_bits, d2, dx;

@@ -156,7 +203,7 @@

       /* Evaluate the first possibility for this state. */

       rate0 = tokens[next][0].rate;

       rate1 = tokens[next][1].rate;

-      t0 = (vp9_dct_value_tokens_ptr + x)->token;

+      t0 = (dct_value_tokens + x)->token;

       /* Consider both possible successor states. */

       if (next < default_eob) {

         band = band_translate[i + 1];

@@ -169,8 +216,13 @@

       UPDATE_RD_COST();

       /* And pick the best. */

       best = rd_cost1 < rd_cost0;

-      base_bits = vp9_dct_value_cost_ptr[x];

+      base_bits = dct_value_cost[x];

       dx = mul * (dqcoeff[rc] - coeff[rc]);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+        dx >>= xd->bd - 8;

+      }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       d2 = dx * dx;

       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);

       tokens[i][0].error = d2 + (best ? error1 : error0);

@@ -203,7 +255,7 @@

         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;

         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;

       } else {

-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;

+        t0 = t1 = (dct_value_tokens + x)->token;

       if (next < default_eob) {

         band = band_translate[i + 1];

@@ -222,10 +274,19 @@

       UPDATE_RD_COST();

       /* And pick the best. */

       best = rd_cost1 < rd_cost0;

-      base_bits = vp9_dct_value_cost_ptr[x];

+      base_bits = dct_value_cost[x];

       if (shortcut) {

-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;

+        // Apply the dequantizer-step adjustment to dx exactly once. For

+        // high-bitdepth buffers the step is first shifted down by

+        // (bd - 8) bits.

+#if CONFIG_VP9_HIGHBITDEPTH

+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;

+        } else {

+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;

+        }

+#else

+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

         d2 = dx * dx;

       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);

@@ -310,7 +371,7 @@

   else

     vp9_high_fdct32x32(src, dst, src_stride);

-#endif

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,

                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {

@@ -328,6 +389,44 @@

   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

   src_diff = &p->src_diff[4 * (j * diff_stride + i)];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    switch (tx_size) {

+      case TX_32X32:

+        high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

+        vp9_high_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,

+                                   p->round_fp, p->quant_fp, p->quant_shift,

+                                   qcoeff, dqcoeff, pd->dequant, p->zbin_extra,

+                                   eob, scan_order->scan, scan_order->iscan);

+        break;

+      case TX_16X16:

+        vp9_high_fdct16x16(src_diff, coeff, diff_stride);

+        vp9_high_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,

+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                             pd->dequant, p->zbin_extra, eob,

+                             scan_order->scan, scan_order->iscan);

+        break;

+      case TX_8X8:

+        vp9_high_fdct8x8(src_diff, coeff, diff_stride);

+        vp9_high_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,

+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                             pd->dequant, p->zbin_extra, eob,

+                             scan_order->scan, scan_order->iscan);

+        break;

+      case TX_4X4:

+        x->fwd_txm4x4(src_diff, coeff, diff_stride);

+        vp9_high_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,

+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                             pd->dequant, p->zbin_extra, eob,

+                             scan_order->scan, scan_order->iscan);

+        break;

+      default:

+        assert(0);

+    }

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   switch (tx_size) {

     case TX_32X32:

       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

@@ -379,6 +478,40 @@

   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

   src_diff = &p->src_diff[4 * (j * diff_stride + i)];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    switch (tx_size) {

+      case TX_32X32:

+        vp9_high_fdct32x32_1(src_diff, coeff, diff_stride);

+        vp9_high_quantize_dc_32x32(coeff, x->skip_block, p->round,

+                                   p->quant_fp[0], qcoeff, dqcoeff,

+                                   pd->dequant[0], eob);

+        break;

+      case TX_16X16:

+        vp9_high_fdct16x16_1(src_diff, coeff, diff_stride);

+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,

+                             p->quant_fp[0], qcoeff, dqcoeff,

+                             pd->dequant[0], eob);

+        break;

+      case TX_8X8:

+        vp9_high_fdct8x8_1(src_diff, coeff, diff_stride);

+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,

+                             p->quant_fp[0], qcoeff, dqcoeff,

+                             pd->dequant[0], eob);

+        break;

+      case TX_4X4:

+        x->fwd_txm4x4(src_diff, coeff, diff_stride);

+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,

+                             p->quant_fp[0], qcoeff, dqcoeff,

+                             pd->dequant[0], eob);

+        break;

+      default:

+        assert(0);

+    }

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   switch (tx_size) {

     case TX_32X32:

       vp9_fdct32x32_1(src_diff, coeff, diff_stride);

@@ -426,6 +559,44 @@

   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

   src_diff = &p->src_diff[4 * (j * diff_stride + i)];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+     switch (tx_size) {

+      case TX_32X32:

+        high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

+        vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,

+                                  p->round, p->quant, p->quant_shift, qcoeff,

+                                  dqcoeff, pd->dequant, p->zbin_extra, eob,

+                                  scan_order->scan, scan_order->iscan);

+        break;

+      case TX_16X16:

+        vp9_high_fdct16x16(src_diff, coeff, diff_stride);

+        vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,

+                            p->quant, p->quant_shift, qcoeff, dqcoeff,

+                            pd->dequant, p->zbin_extra, eob,

+                            scan_order->scan, scan_order->iscan);

+        break;

+      case TX_8X8:

+        vp9_high_fdct8x8(src_diff, coeff, diff_stride);

+        vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,

+                            p->quant, p->quant_shift, qcoeff, dqcoeff,

+                            pd->dequant, p->zbin_extra, eob,

+                            scan_order->scan, scan_order->iscan);

+        break;

+      case TX_4X4:

+        x->fwd_txm4x4(src_diff, coeff, diff_stride);

+        vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,

+                            p->quant, p->quant_shift, qcoeff, dqcoeff,

+                            pd->dequant, p->zbin_extra, eob,

+                            scan_order->scan, scan_order->iscan);

+        break;

+      default:

+        assert(0);

+    }

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   switch (tx_size) {

     case TX_32X32:

       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

@@ -520,6 +691,34 @@

   if (x->skip_encode || p->eobs[block] == 0)

     return;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    switch (tx_size) {

+      case TX_32X32:

+        vp9_high_idct32x32_add(dqcoeff, dst, pd->dst.stride,

+                               p->eobs[block], xd->bd);

+        break;

+      case TX_16X16:

+        vp9_high_idct16x16_add(dqcoeff, dst, pd->dst.stride,

+                               p->eobs[block], xd->bd);

+        break;

+      case TX_8X8:

+        vp9_high_idct8x8_add(dqcoeff, dst, pd->dst.stride,

+                             p->eobs[block], xd->bd);

+        break;

+      case TX_4X4:

+        // this is like vp9_short_idct4x4 but has a special case around eob<=1

+        // which is significant (not just an optimization) for the lossless

+        // case.

+        x->high_itxm_add(dqcoeff, dst, pd->dst.stride,

+                         p->eobs[block], xd->bd);

+        break;

+      default:

+        assert(0 && "Invalid transform size");

+    }

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   switch (tx_size) {

     case TX_32X32:

@@ -557,8 +756,15 @@

   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

-  if (p->eobs[block] > 0)

+  if (p->eobs[block] > 0) {

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+       x->high_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);

+       return;

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);

+  }

 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {

@@ -621,6 +827,115 @@

   dst = &pd->dst.buf[4 * (j * dst_stride + i)];

   src = &p->src.buf[4 * (j * src_stride + i)];

   src_diff = &p->src_diff[4 * (j * diff_stride + i)];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    switch (tx_size) {

+      case TX_32X32:

+        scan_order = &vp9_default_scan_orders[TX_32X32];

+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;

+        vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,

+                                x->skip_encode ? src : dst,

+                                x->skip_encode ? src_stride : dst_stride,

+                                dst, dst_stride, i, j, plane);

+        if (!x->skip_recode) {

+          vp9_high_subtract_block(32, 32, src_diff, diff_stride,

+                                  src, src_stride, dst, dst_stride, xd->bd);

+          high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

+          vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,

+                                    p->round, p->quant, p->quant_shift, qcoeff,

+                                    dqcoeff, pd->dequant, p->zbin_extra, eob,

+                                    scan_order->scan, scan_order->iscan);

+        }

+        if (!x->skip_encode && *eob) {

+          vp9_high_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);

+        }

+        break;

+      case TX_16X16:

+        tx_type = get_tx_type(pd->plane_type, xd);

+        scan_order = &vp9_scan_orders[TX_16X16][tx_type];

+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;

+        vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,

+                                x->skip_encode ? src : dst,

+                                x->skip_encode ? src_stride : dst_stride,

+                                dst, dst_stride, i, j, plane);

+        if (!x->skip_recode) {

+          vp9_high_subtract_block(16, 16, src_diff, diff_stride,

+                                  src, src_stride, dst, dst_stride, xd->bd);

+          vp9_high_fht16x16(src_diff, coeff, diff_stride, tx_type);

+          vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,

+                              p->quant, p->quant_shift, qcoeff, dqcoeff,

+                              pd->dequant, p->zbin_extra, eob,

+                              scan_order->scan, scan_order->iscan);

+        }

+        if (!x->skip_encode && *eob) {

+          vp9_high_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,

+                                *eob, xd->bd);

+        }

+        break;

+      case TX_8X8:

+        tx_type = get_tx_type(pd->plane_type, xd);

+        scan_order = &vp9_scan_orders[TX_8X8][tx_type];

+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;

+        vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,

+                                x->skip_encode ? src : dst,

+                                x->skip_encode ? src_stride : dst_stride,

+                                dst, dst_stride, i, j, plane);

+        if (!x->skip_recode) {

+          vp9_high_subtract_block(8, 8, src_diff, diff_stride,

+                                  src, src_stride, dst, dst_stride, xd->bd);

+          vp9_high_fht8x8(src_diff, coeff, diff_stride, tx_type);

+          vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,

+                              p->quant, p->quant_shift, qcoeff, dqcoeff,

+                              pd->dequant, p->zbin_extra, eob,

+                              scan_order->scan, scan_order->iscan);

+        }

+        if (!x->skip_encode && *eob) {

+          vp9_high_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,

+                              xd->bd);

+        }

+        break;

+      case TX_4X4:

+        tx_type = get_tx_type_4x4(pd->plane_type, xd, block);

+        scan_order = &vp9_scan_orders[TX_4X4][tx_type];

+        mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;

+        vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,

+                                x->skip_encode ? src : dst,

+                                x->skip_encode ? src_stride : dst_stride,

+                                dst, dst_stride, i, j, plane);

+        if (!x->skip_recode) {

+          vp9_high_subtract_block(4, 4, src_diff, diff_stride,

+                                  src, src_stride, dst, dst_stride, xd->bd);

+          if (tx_type != DCT_DCT)

+            vp9_high_fht4x4(src_diff, coeff, diff_stride, tx_type);

+          else

+            x->fwd_txm4x4(src_diff, coeff, diff_stride);

+          vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,

+                              p->quant, p->quant_shift, qcoeff, dqcoeff,

+                              pd->dequant, p->zbin_extra, eob,

+                              scan_order->scan, scan_order->iscan);

+        }

+        if (!x->skip_encode && *eob) {

+          if (tx_type == DCT_DCT)

+            // this is like vp9_short_idct4x4 but has a special case around

+            // eob<=1 which is significant (not just an optimization) for the

+            // lossless case.

+            x->high_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);

+          else

+            vp9_high_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);

+        }

+        break;

+      default:

+        assert(0);

+        return;

+    }

+    if (*eob)

+      *(args->skip) = 0;

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   switch (tx_size) {

     case TX_32X32:

--- a/vp9/encoder/vp9_extend.c

+++ b/vp9/encoder/vp9_extend.c

@@ -55,6 +55,52 @@

+#if CONFIG_VP9_HIGHBITDEPTH

+// Copies a w x h plane of 16-bit samples from src8 to dst8, then pads the
+// destination by replicating its edge samples: extend_left / extend_right
+// samples on either side of every row, followed by extend_top / extend_bottom
+// whole (already padded) rows above and below. src8 and dst8 are passed
+// through CONVERT_TO_SHORTPTR before use, and both pitches are applied to the
+// resulting uint16_t pointers.
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+  const int padded_width = extend_left + extend_right + w;
+  const uint16_t *in_row = src;
+  const uint16_t *edge_row;
+  uint16_t *out_row = dst - extend_left;
+  int r;
+
+  // Pass 1: build each padded row -- left border, payload, right border.
+  for (r = 0; r < h; ++r) {
+    vpx_memset16(out_row, in_row[0], extend_left);
+    vpx_memcpy(out_row + extend_left, in_row, w * sizeof(uint16_t));
+    vpx_memset16(out_row + extend_left + w, in_row[w - 1], extend_right);
+    in_row += src_pitch;
+    out_row += dst_pitch;
+  }
+
+  // Pass 2: replicate the first padded row upwards into the top border.
+  edge_row = dst - extend_left;
+  out_row = dst + dst_pitch * (-extend_top) - extend_left;
+  for (r = 0; r < extend_top; ++r) {
+    vpx_memcpy(out_row, edge_row, padded_width * sizeof(uint16_t));
+    out_row += dst_pitch;
+  }
+
+  // Pass 3: replicate the last padded row downwards into the bottom border.
+  edge_row = dst + dst_pitch * (h - 1) - extend_left;
+  out_row = dst + dst_pitch * (h) - extend_left;
+  for (r = 0; r < extend_bottom; ++r) {
+    vpx_memcpy(out_row, edge_row, padded_width * sizeof(uint16_t));
+    out_row += dst_pitch;
+  }
+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,

                                YV12_BUFFER_CONFIG *dst) {

   // Extend src frame in buffer

@@ -74,6 +120,26 @@

   const int el_uv = el_y >> uv_width_subsampling;

   const int eb_uv = eb_y >> uv_height_subsampling;

   const int er_uv = er_y >> uv_width_subsampling;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {

+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,

+                                 dst->y_buffer, dst->y_stride,

+                                 src->y_width, src->y_height,

+                                 et_y, el_y, eb_y, er_y);

+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,

+                                 dst->u_buffer, dst->uv_stride,

+                                 src->uv_width, src->uv_height,

+                                 et_uv, el_uv, eb_uv, er_uv);

+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,

+                                 dst->v_buffer, dst->uv_stride,

+                                 src->uv_width, src->uv_height,

+                                 et_uv, el_uv, eb_uv, er_uv);

+    return;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   copy_and_extend_plane(src->y_buffer, src->y_stride,

                         dst->y_buffer, dst->y_stride,

--- a/vp9/encoder/vp9_firstpass.c

+++ b/vp9/encoder/vp9_firstpass.c

@@ -281,6 +281,60 @@

   return sse;

+#if CONFIG_VP9_HIGHBITDEPTH

+// Selects the high-bitdepth MSE kernel for a block size and bit depth.
+// bd == 10 and bd == 12 pick the vp9_high_10_* / vp9_high_12_* kernels; every
+// other bd value picks the unsuffixed vp9_high_* kernels. Within a depth,
+// BLOCK_8X8, BLOCK_16X8 and BLOCK_8X16 map to their own kernels and every
+// other block size maps to the 16x16 kernel.
+static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  if (bd == 10) {
+    switch (bsize) {
+      case BLOCK_8X8: return vp9_high_10_mse8x8;
+      case BLOCK_16X8: return vp9_high_10_mse16x8;
+      case BLOCK_8X16: return vp9_high_10_mse8x16;
+      default: return vp9_high_10_mse16x16;
+    }
+  }
+  if (bd == 12) {
+    switch (bsize) {
+      case BLOCK_8X8: return vp9_high_12_mse8x8;
+      case BLOCK_16X8: return vp9_high_12_mse16x8;
+      case BLOCK_8X16: return vp9_high_12_mse8x16;
+      default: return vp9_high_12_mse16x16;
+    }
+  }
+  // Any bd other than 10 or 12 falls through to the unsuffixed kernels.
+  switch (bsize) {
+    case BLOCK_8X8: return vp9_high_mse8x8;
+    case BLOCK_16X8: return vp9_high_mse16x8;
+    case BLOCK_8X16: return vp9_high_mse8x16;
+    default: return vp9_high_mse16x16;
+  }
+}

+// Returns the SSE between src and ref as written by the kernel that
+// highbd_get_block_variance_fn(bsize, bd) selects. The kernel's own return
+// value is discarded; only the value stored through its last argument is
+// used.
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int block_sse;
+  highbd_get_block_variance_fn(bsize, bd)(src->buf, src->stride,
+                                          ref->buf, ref->stride, &block_sse);
+  return block_sse;
+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 // Refine the motion search range according to the frame dimension

 // for first pass test.

 static int get_search_range(const VP9_COMMON *cm) {

@@ -311,6 +365,11 @@

   // Override the default variance function to use MSE.

   v_fn_ptr.vf = get_block_variance_fn(bsize);

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   // Center the initial step/diamond search on best mv.

   tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,

@@ -562,6 +621,24 @@

          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;

       vp9_encode_intra_block_plane(x, bsize, 0);

       this_error = vp9_get_mb_ss(x->plane[0].src_diff);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (cm->use_highbitdepth) {

+        switch (cm->bit_depth) {

+          case VPX_BITS_8:

+            break;

+          case VPX_BITS_10:

+            this_error >>= 4;

+            break;

+          case VPX_BITS_12:

+            this_error >>= 8;

+            break;

+          default:

+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "

+                        "VPX_BITS_10 or VPX_BITS_12");

+            return;

+        }

+      }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {

         vp9_clear_system_state();

@@ -601,8 +678,18 @@

         struct buf_2d unscaled_last_source_buf_2d;

         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;

-        motion_error = get_prediction_error(bsize, &x->plane[0].src,

-                                            &xd->plane[0].pre[0]);

+#if CONFIG_VP9_HIGHBITDEPTH

+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+          motion_error = highbd_get_prediction_error(

+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);

+        } else {

+          motion_error = get_prediction_error(

+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);

+        }

+#else

+        motion_error = get_prediction_error(

+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

         // Compute the motion error of the 0,0 motion using the last source

         // frame as the reference. Skip the further motion search on

@@ -611,8 +698,18 @@

             cpi->unscaled_last_source->y_buffer + recon_yoffset;

         unscaled_last_source_buf_2d.stride =

             cpi->unscaled_last_source->y_stride;

-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,

-                                                &unscaled_last_source_buf_2d);

+#if CONFIG_VP9_HIGHBITDEPTH

+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+          raw_motion_error = highbd_get_prediction_error(

+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);

+        } else {

+          raw_motion_error = get_prediction_error(

+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);

+        }

+#else

+        raw_motion_error = get_prediction_error(

+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

         // TODO(pengchong): Replace the hard-coded threshold

         if (raw_motion_error > 25 || lc != NULL) {

@@ -648,8 +745,18 @@

             int gf_motion_error;

             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;

-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,

-                                                   &xd->plane[0].pre[0]);

+#if CONFIG_VP9_HIGHBITDEPTH

+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+              gf_motion_error = highbd_get_prediction_error(

+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);

+            } else {

+              gf_motion_error = get_prediction_error(

+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);

+            }

+#else

+            gf_motion_error = get_prediction_error(

+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

             first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,

                                      &gf_motion_error);

--- a/vp9/encoder/vp9_mcomp.c

+++ b/vp9/encoder/vp9_mcomp.c

@@ -284,16 +284,7 @@

   int tc = bc;                                                             \

   bestmv->row *= 8;                                                        \

-  bestmv->col *= 8;                                                        \

-  if (second_pred != NULL) {                                               \

-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \

-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \

-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \

-  } else {                                                                 \

-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);          \

-  }                                                                        \

-  *distortion = besterr;                                                   \

-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);

+  bestmv->col *= 8;

 int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,

                                         MV *bestmv, const MV *ref_mv,

@@ -309,6 +300,29 @@

                                         const uint8_t *second_pred,

                                         int w, int h) {

   SETUP_SUBPEL_SEARCH;

+  if (second_pred != NULL) {

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);

+      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,

+                             y_stride);

+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,

+                        sse1);

+    } else {

+      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);

+      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);

+      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);

+    }

+#else

+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);

+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);

+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+  } else {

+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);

+  }

+  *distortion = besterr;

+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);

   if (sad_list &&

       sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&

@@ -401,6 +415,29 @@

                                  const uint8_t *second_pred,

                                  int w, int h) {

   SETUP_SUBPEL_SEARCH;

+  if (second_pred != NULL) {

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);

+      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,

+                             y_stride);

+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,

+                        sse1);

+    } else {

+      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);

+      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);

+      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);

+    }

+#else

+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);

+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);

+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+  } else {

+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);

+  }

+  *distortion = besterr;

+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);

   (void) sad_list;  // to silence compiler warning

   // Each subsequent iteration checks at least one point in

--- a/vp9/encoder/vp9_picklpf.c

+++ b/vp9/encoder/vp9_picklpf.c

@@ -40,7 +40,15 @@

   vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,

                         partial_frame);

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (cm->use_highbitdepth) {

+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);

+  } else {

+    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);

+  }

+#else

   filt_err = vp9_get_y_sse(sd, cm->frame_to_show);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   // Re-instate the unfiltered frame

   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);

@@ -145,7 +153,26 @@

     const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);

     // These values were determined by linear fitting the result of the

     // searched level, filt_guess = q * 0.316206 + 3.87252

+#if CONFIG_VP9_HIGHBITDEPTH
+    // Same linear fit at every bit depth: the offset is ~3.8725 once shifted
+    // (1015158 / 2^18, 4060632 / 2^20, 16242526 / 2^22), while the shift
+    // grows by 2 bits per step -- presumably to cancel the larger q that
+    // vp9_ac_quant() reports at 10 and 12 bits (TODO confirm).
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH

     if (cm->frame_type == KEY_FRAME)

       filt_guess -= 4;

     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);

--- a/vp9/encoder/vp9_pickmode.c

+++ b/vp9/encoder/vp9_pickmode.c

@@ -241,13 +241,44 @@

             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],

+                                 dc_quant >> (xd->bd - 5), &rate, &dist);

+  } else {

+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],

+                                 dc_quant >> 3, &rate, &dist);

+  }

+#else

   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],

                                dc_quant >> 3, &rate, &dist);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   *out_rate_sum = rate >> 1;

   *out_dist_sum = dist << 3;

-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],

-                               ac_quant >> 3, &rate, &dist);

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    vp9_model_rd_from_var_lapndz(var,

+                                 1 << num_pels_log2_lookup[bsize],

+                                 ac_quant >> (xd->bd - 5),

+                                 &rate,

+                                 &dist);

+  } else {

+    vp9_model_rd_from_var_lapndz(var,

+                                 1 << num_pels_log2_lookup[bsize],

+                                 ac_quant >> 3,

+                                 &rate,

+                                 &dist);

+  }

+#else

+  vp9_model_rd_from_var_lapndz(var,

+                               1 << num_pels_log2_lookup[bsize],

+                               ac_quant >> 3,

+                               &rate,

+                               &dist);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   *out_rate_sum += rate;

   *out_dist_sum += dist << 4;

@@ -293,9 +324,17 @@

     // The encode_breakout input

     const unsigned int min_thresh =

         MIN(((unsigned int)x->encode_breakout << 4), max_thresh);

+#if CONFIG_VP9_HIGHBITDEPTH

+    const int shift = 2 * xd->bd - 16;

+#endif

     // Calculate threshold according to dequant value.

     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;

+#if CONFIG_VP9_HIGHBITDEPTH

+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {

+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);

     // Adjust ac threshold according to partition size.

@@ -303,6 +342,11 @@

         8 - (b_width_log2(bsize) + b_height_log2(bsize));

     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {

+      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   } else {

     thresh_ac = 0;

     thresh_dc = 0;

@@ -438,9 +482,8 @@

   // var_y and sse_y are saved to be used in skipping checking

   unsigned int var_y = UINT_MAX;

   unsigned int sse_y = UINT_MAX;

-  const int intra_cost_penalty =

-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(

+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,

                                            intra_cost_penalty, 0);

   const int intra_mode_cost = 50;

@@ -461,14 +504,25 @@

   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.

   PRED_BUFFER tmp[4];

   DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);

+#if CONFIG_VP9_HIGHBITDEPTH

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);

+#endif

   struct buf_2d orig_dst = pd->dst;

   PRED_BUFFER *best_pred = NULL;

   PRED_BUFFER *this_mode_pred = NULL;

+  const int pixels_in_block = bh * bw;

   if (cpi->sf.reuse_inter_pred_sby) {

     int i;

     for (i = 0; i < 3; i++) {

-      tmp[i].data = &pred_buf[bw * bh * i];

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (cm->use_highbitdepth)

+        tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);

+      else

+        tmp[i].data = &pred_buf[pixels_in_block * i];

+#else

+      tmp[i].data = &pred_buf[pixels_in_block * i];

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       tmp[i].stride = bw;

       tmp[i].in_use = 0;

@@ -703,8 +757,18 @@

   if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&

       best_pred->data != orig_dst.buf) {

     pd->dst = orig_dst;

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (cm->use_highbitdepth) {

+      vp9_high_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,

+                             NULL, 0, NULL, 0, bw, bh, xd->bd);

+    } else {

+      vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,

+                        NULL, 0, NULL, 0, bw, bh);

+    }

+#else

     vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,

                       NULL, 0, bw, bh);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   mbmi->mode          = best_mode;

--- a/vp9/encoder/vp9_rd.c

+++ b/vp9/encoder/vp9_rd.c

@@ -155,7 +155,7 @@

 #else

   int rdmult = 88 * q * q / 24;

-#endif

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {

     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;

     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];

@@ -187,7 +187,7 @@

 #else

   (void) bit_depth;

   q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;

-#endif

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   // TODO(debargha): Adjust the function below.

   return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);

@@ -213,7 +213,7 @@

 #else

   cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];

   cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];

-#endif

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {

@@ -598,3 +598,24 @@

     if (sf->disable_split_mask & (1 << i))

       rd->thresh_mult_sub8x8[i] = INT_MAX;

+// Intra cost penalty derived from vp9_dc_quant(qindex, qdelta, bit_depth).
+// Without CONFIG_VP9_HIGHBITDEPTH the result is 20 * q. With it, the result
+// is 20 * q at VPX_BITS_8, 5 * q at VPX_BITS_10 and
+// ROUND_POWER_OF_TWO(5 * q, 2) at VPX_BITS_12; any other bit_depth trips the
+// assert and returns -1.
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  const int dc_q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == VPX_BITS_8)
+    return 20 * dc_q;
+  if (bit_depth == VPX_BITS_10)
+    return 5 * dc_q;
+  if (bit_depth == VPX_BITS_12)
+    return ROUND_POWER_OF_TWO(5 * dc_q, 2);
+  assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  return -1;
+#else
+  return 20 * dc_q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}

--- a/vp9/encoder/vp9_rd.h

+++ b/vp9/encoder/vp9_rd.h

@@ -162,6 +162,10 @@

                           int mi_row, int mi_col,

                           const struct scale_factors *scale,

                           const struct scale_factors *scale_uv);

+// Intra cost penalty computed from vp9_dc_quant(qindex, qdelta, bit_depth);
+// defined in vp9_rd.c.
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,

+                               vpx_bit_depth_t bit_depth);

 #ifdef __cplusplus

 }  // extern "C"

 #endif

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -228,9 +228,13 @@

     // Fast approximate the modelling function.

     if (cpi->oxcf.speed > 4) {

       int64_t rate;

-      int64_t dist;

       int64_t square_error = sse;

       int quantizer = (pd->dequant[1] >> 3);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+        quantizer >>= (xd->bd - 8);

+      }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       if (quantizer < 120)

         rate = (square_error * (280 - quantizer)) >> 8;

@@ -240,8 +244,19 @@

       rate_sum += rate;

       dist_sum += dist;

     } else {

+#if CONFIG_VP9_HIGHBITDEPTH
+      // Feed the accumulated sum_sse in both branches, matching the
+      // non-high-bitdepth build below, so an 8-bit encode models the same
+      // error whether or not CONFIG_VP9_HIGHBITDEPTH is compiled in. The
+      // high-bitdepth quantizer shift is bd - 5, which equals 3 when bd is 8.
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> (xd->bd - 5),
+                                     &rate, &dist);
+      } else {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> 3, &rate, &dist);
+      }
+#else
       vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH

       rate_sum += rate;

       dist_sum += dist;

@@ -266,6 +281,31 @@

   return error;

+#if CONFIG_VP9_HIGHBITDEPTH

+// Returns the sum of squared differences between coeff and dqcoeff over
+// block_size entries, and stores the sum of squared coeff values in *ssz.
+// Both sums are shifted right by 2 * (bd - 8) bits, after adding a rounding
+// term of half the divisor when that shift is positive.
+int64_t vp9_high_block_error_c(const tran_low_t *coeff,
+                               const tran_low_t *dqcoeff,
+                               intptr_t block_size,
+                               int64_t *ssz, int bd) {
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  int64_t error = 0;
+  int64_t sqcoeff = 0;
+  int i = 0;
+
+  while (i < block_size) {
+    // The difference is formed in tran_low_t before being widened.
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    const int64_t c = coeff[i];
+    error += diff * diff;
+    sqcoeff += c * c;
+    ++i;
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+
+  *ssz = (sqcoeff + rounding) >> shift;
+  return (error + rounding) >> shift;
+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 /* The trailing '0' is a terminator which is used inside cost_coeffs() to

  * decide whether to include cost of a trailing EOB node or not (i.e. we

  * can skip this if the last coefficient in this transform block, e.g. the

@@ -351,8 +391,14 @@

   return cost;

+#if CONFIG_VP9_HIGHBITDEPTH

 static void dist_block(int plane, int block, TX_SIZE tx_size,

+                       struct rdcost_block_args* args, int bd) {

+#else

+static void dist_block(int plane, int block, TX_SIZE tx_size,

                        struct rdcost_block_args* args) {

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   const int ss_txfrm_size = tx_size << 1;

   MACROBLOCK* const x = args->x;

   MACROBLOCKD* const xd = &x->e_mbd;

@@ -362,8 +408,13 @@

   int shift = tx_size == TX_32X32 ? 0 : 2;

   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+#if CONFIG_VP9_HIGHBITDEPTH

+  args->dist = vp9_high_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,

+                                    &this_sse, bd) >> shift;

+#else

   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,

                                &this_sse) >> shift;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   args->sse  = this_sse >> shift;

   if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {

@@ -370,6 +421,11 @@

     // TODO(jingning): tune the model to better capture the distortion.

     int64_t p = (pd->dequant[1] * pd->dequant[1] *

                     (1 << ss_txfrm_size)) >> (shift + 2);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      p >>= ((xd->bd - 8) * 2);

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     args->dist += (p >> 4);

     args->sse  += p;

@@ -399,12 +455,28 @@

   if (!is_inter_block(mbmi)) {

     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      dist_block(plane, block, tx_size, args, xd->bd);

+    } else {

+      dist_block(plane, block, tx_size, args, 8);

+    }

+#else

     dist_block(plane, block, tx_size, args);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   } else if (max_txsize_lookup[plane_bsize] == tx_size) {

     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {

       // full forward transform and quantization

       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+        dist_block(plane, block, tx_size, args, xd->bd);

+      } else {

+        dist_block(plane, block, tx_size, args, 8);

+      }

+#else

       dist_block(plane, block, tx_size, args);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {

       // compute DC coefficient

       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);

@@ -424,7 +496,15 @@

   } else {

     // full forward transform and quantization

     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      dist_block(plane, block, tx_size, args, xd->bd);

+    } else {

+      dist_block(plane, block, tx_size, args, 8);

+    }

+#else

     dist_block(plane, block, tx_size, args);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   rate_block(plane, block, plane_bsize, tx_size, args);

@@ -659,6 +739,9 @@

   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];

   int idx, idy;

   uint8_t best_dst[8 * 8];

+#if CONFIG_VP9_HIGHBITDEPTH

+  uint16_t best_dst16[8 * 8];

+#endif

   assert(ib < 4);

@@ -666,6 +749,108 @@

   vpx_memcpy(tl, l, sizeof(tl));

   xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {

+      int64_t this_rd;

+      int ratey = 0;

+      int64_t distortion = 0;

+      int rate = bmode_costs[mode];

+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))

+        continue;

+      // Only do the oblique modes if the best so far is

+      // one of the neighboring directional modes

+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {

+        if (conditional_skipintra(mode, *best_mode))

+            continue;

+      }

+      vpx_memcpy(tempa, ta, sizeof(ta));

+      vpx_memcpy(templ, tl, sizeof(tl));

+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {

+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {

+          const int block = ib + idy * 2 + idx;

+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];

+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];

+          int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,

+                                                              p->src_diff);

+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);

+          xd->mi[0].src_mi->bmi[block].as_mode = mode;

+          vp9_predict_intra_block(xd, block, 1,

+                                  TX_4X4, mode,

+                                  x->skip_encode ? src : dst,

+                                  x->skip_encode ? src_stride : dst_stride,

+                                  dst, dst_stride, idx, idy, 0);

+          vp9_high_subtract_block(4, 4, src_diff, 8, src, src_stride,

+                                  dst, dst_stride, xd->bd);

+          if (xd->lossless) {

+            const scan_order *so = &vp9_default_scan_orders[TX_4X4];

+            vp9_high_fwht4x4(src_diff, coeff, 8);

+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);

+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,

+                                 so->scan, so->neighbors,

+                                 cpi->sf.use_fast_coef_costing);

+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)

+              goto next_highbd;

+            vp9_high_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),

+                                 dst, dst_stride,

+                                 p->eobs[block], xd->bd);

+          } else {

+            int64_t unused;

+            const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);

+            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];

+            vp9_high_fht4x4(src_diff, coeff, 8, tx_type);

+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);

+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,

+                                 so->scan, so->neighbors,

+                                 cpi->sf.use_fast_coef_costing);

+            distortion += vp9_high_block_error(coeff,

+                                               BLOCK_OFFSET(pd->dqcoeff, block),

+                                               16, &unused, xd->bd) >> 2;

+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)

+              goto next_highbd;

+            vp9_high_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),

+                                dst, dst_stride, p->eobs[block], xd->bd);

+          }

+        }

+      }

+      rate += ratey;

+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

+      if (this_rd < best_rd) {

+        *bestrate = rate;

+        *bestratey = ratey;

+        *bestdistortion = distortion;

+        best_rd = this_rd;

+        *best_mode = mode;

+        vpx_memcpy(a, tempa, sizeof(tempa));

+        vpx_memcpy(l, templ, sizeof(templ));

+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {

+          vpx_memcpy(best_dst16 + idy * 8,

+                     CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),

+                     num_4x4_blocks_wide * 4 * sizeof(uint16_t));

+        }

+      }

+    next_highbd:

+      {}

+    }

+    if (best_rd >= rd_thresh || x->skip_encode)

+      return best_rd;

+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {

+      vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),

+                 best_dst16 + idy * 8,

+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));

+    }

+    return best_rd;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {

     int64_t this_rd;

     int ratey = 0;

@@ -1118,6 +1303,16 @@

   for (ref = 0; ref < 1 + is_compound; ++ref) {

     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,

                                                pd->pre[ref].stride)];

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    vp9_high_build_inter_predictor(pre, pd->pre[ref].stride,

+                                   dst, pd->dst.stride,

+                                   &mi->bmi[i].as_mv[ref].as_mv,

+                                   &xd->block_refs[ref]->sf, width, height, ref,

+                                   kernel, MV_PRECISION_Q3,

+                                   mi_col * MI_SIZE + 4 * (i % 2),

+                                   mi_row * MI_SIZE + 4 * (i / 2), xd->bd);

+  } else {

     vp9_build_inter_predictor(pre, pd->pre[ref].stride,

                               dst, pd->dst.stride,

                               &mi->bmi[i].as_mv[ref].as_mv,

@@ -1126,11 +1321,32 @@

                               mi_col * MI_SIZE + 4 * (i % 2),

                               mi_row * MI_SIZE + 4 * (i / 2));

+#else

+    vp9_build_inter_predictor(pre, pd->pre[ref].stride,

+                              dst, pd->dst.stride,

+                              &mi->bmi[i].as_mv[ref].as_mv,

+                              &xd->block_refs[ref]->sf, width, height, ref,

+                              kernel, MV_PRECISION_Q3,

+                              mi_col * MI_SIZE + 4 * (i % 2),

+                              mi_row * MI_SIZE + 4 * (i / 2));

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+  }

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    vp9_high_subtract_block(

+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,

+        src, p->src.stride, dst, pd->dst.stride, xd->bd);

+  } else {

+    vp9_subtract_block(

+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,

+        src, p->src.stride, dst, pd->dst.stride);

+  }

+#else

   vp9_subtract_block(height, width,

                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,

-                     src, p->src.stride,

-                     dst, pd->dst.stride);

+                     src, p->src.stride, dst, pd->dst.stride);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   k = i;

   for (idy = 0; idy < height / 4; ++idy) {

@@ -1143,8 +1359,19 @@

       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),

                     coeff, 8);

       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+        thisdistortion += vp9_high_block_error(coeff,

+                                               BLOCK_OFFSET(pd->dqcoeff, k),

+                                               16, &ssz, xd->bd);

+      } else {

+        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),

+                                          16, &ssz);

+      }

+#else

       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),

                                         16, &ssz);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

       thissse += ssz;

       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,

                               so->scan, so->neighbors,

@@ -1901,7 +2128,12 @@

   int_mv ref_mv[2];

   int ite, ref;

   // Prediction buffer from second frame.

+#if CONFIG_VP9_HIGHBITDEPTH

+  uint8_t *second_pred;

+  uint8_t *second_pred_alloc;

+#else

   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);

   // Do joint motion search in compound mode to get more accurate mv.

@@ -1912,6 +2144,15 @@

     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),

     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])

};

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));

+    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);

+  } else {

+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));

+    second_pred = second_pred_alloc;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   for (ref = 0; ref < 2; ++ref) {

     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];

@@ -1950,6 +2191,28 @@

     ref_yv12[1] = xd->plane[0].pre[1];

     // Get pred block from second frame.

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      vp9_high_build_inter_predictor(ref_yv12[!id].buf,

+                                     ref_yv12[!id].stride,

+                                     second_pred, pw,

+                                     &frame_mv[refs[!id]].as_mv,

+                                     &xd->block_refs[!id]->sf,

+                                     pw, ph, 0,

+                                     kernel, MV_PRECISION_Q3,

+                                     mi_col * MI_SIZE, mi_row * MI_SIZE,

+                                     xd->bd);

+    } else {

+      vp9_build_inter_predictor(ref_yv12[!id].buf,

+                                ref_yv12[!id].stride,

+                                second_pred, pw,

+                                &frame_mv[refs[!id]].as_mv,

+                                &xd->block_refs[!id]->sf,

+                                pw, ph, 0,

+                                kernel, MV_PRECISION_Q3,

+                                mi_col * MI_SIZE, mi_row * MI_SIZE);

+    }

+#else

     vp9_build_inter_predictor(ref_yv12[!id].buf,

                               ref_yv12[!id].stride,

                               second_pred, pw,

@@ -1958,6 +2221,7 @@

                               pw, ph, 0,

                               kernel, MV_PRECISION_Q3,

                               mi_col * MI_SIZE, mi_row * MI_SIZE);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     // Compound motion search on first ref frame.

     if (id)

@@ -2026,7 +2290,11 @@

                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);

+#if CONFIG_VP9_HIGHBITDEPTH

+  vpx_free(second_pred_alloc);

+#else

   vpx_free(second_pred);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 static INLINE void restore_dst_buf(MACROBLOCKD *xd,

@@ -2068,6 +2336,13 @@

     // Calculate threshold according to dequant value.

     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      const int shift = 2 * xd->bd - 16;

+      if (shift > 0)

+        thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);

     // Adjust threshold according to partition size.

@@ -2074,6 +2349,13 @@

     thresh_ac >>= 8 - (b_width_log2(bsize) +

         b_height_log2(bsize));

     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+      const int shift = 2 * xd->bd - 16;

+      if (shift > 0)

+        thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);

+    }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   } else {

     thresh_ac = 0;

     thresh_dc = 0;

@@ -2145,7 +2427,13 @@

   int refs[2] = { mbmi->ref_frame[0],

     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };

   int_mv cur_mv[2];

+#if CONFIG_VP9_HIGHBITDEPTH

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);

+  uint8_t *tmp_buf = tmp_buf8;

+#else

   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   int pred_exists = 0;

   int intpel_mv;

   int64_t rd, tmp_rd, best_rd = INT64_MAX;

@@ -2162,6 +2450,14 @@

       (((mi_row + mi_col) >> bsl) +

        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);

+  } else {

+    tmp_buf = tmp_buf8;

+  }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

   if (pred_filter_search) {

     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;

     if (xd->up_available)

@@ -2575,8 +2871,8 @@

   int64_t dist_uv[TX_SIZES];

   int skip_uv[TX_SIZES];

   PREDICTION_MODE mode_uv[TX_SIZES];

-  const int intra_cost_penalty =

-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(

+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

   int best_skip2 = 0;

   uint8_t ref_frame_skip_mask[2] = { 0 };

   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };

@@ -3011,9 +3307,14 @@

         // based on qp, activity mask and history

         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&

             (mode_index > MIN_EARLY_TERM_INDEX)) {

-          const int qstep = xd->plane[0].dequant[1];

+          int qstep = xd->plane[0].dequant[1];

           // TODO(debargha): Enhance this by specializing for each mode_index

           int scale = 4;

+#if CONFIG_VP9_HIGHBITDEPTH

+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+            qstep >>= (xd->bd - 8);

+          }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

           if (x->source_variance < UINT_MAX) {

             const int var_adjust = (x->source_variance < 16);

             scale -= var_adjust;

@@ -3329,8 +3630,8 @@

   int64_t dist_uv;

   int skip_uv;

   PREDICTION_MODE mode_uv = DC_PRED;

-  const int intra_cost_penalty =

-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(

+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);

   int_mv seg_mvs[4][MAX_REF_FRAMES];

   b_mode_info best_bmodes[4];

   int best_skip2 = 0;

@@ -3748,9 +4049,14 @@

         // based on qp, activity mask and history

         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&

             (ref_index > MIN_EARLY_TERM_INDEX)) {

-          const int qstep = xd->plane[0].dequant[1];

+          int qstep = xd->plane[0].dequant[1];

           // TODO(debargha): Enhance this by specializing for each mode_index

           int scale = 4;

+#if CONFIG_VP9_HIGHBITDEPTH

+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+            qstep >>= (xd->bd - 8);

+          }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

           if (x->source_variance < UINT_MAX) {

             const int var_adjust = (x->source_variance < 16);

             scale -= var_adjust;

--- a/vp9/encoder/vp9_rdopt.h

+++ b/vp9/encoder/vp9_rdopt.h

@@ -54,7 +54,6 @@

                                       BLOCK_SIZE bsize,

                                       PICK_MODE_CONTEXT *ctx,

                                       int64_t best_rd_so_far);

 #ifdef __cplusplus

 }  // extern "C"

 #endif

--- a/vp9/encoder/vp9_tokenize.h

+++ b/vp9/encoder/vp9_tokenize.h

@@ -53,6 +53,12 @@

  *  fields are not.

*/

 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;

+#if CONFIG_VP9_HIGHBITDEPTH

+extern const int16_t *vp9_dct_value_cost_high10_ptr;

+extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;

+extern const int16_t *vp9_dct_value_cost_high12_ptr;

+extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 #ifdef __cplusplus

 }  // extern "C"

--

⑨