shithub: libvpx

Download patch

ref: 516db21c2c903a7d9b0b5fc156277e9bb538ade9
parent: e42b280e1170d485af47000f411ff45d56af33bd
author: Deb Mukherjee <debargha@google.com>
date: Wed Jan 9 01:26:54 EST 2013

Further enhancements/fixes on dct/dwt hybrid txfm

Fixes some scaling issues. Adds an option to only compute the
dct on the low-low subband for 32x32 and 64x64 blocks using
only a single 16x16 dct after 1 and 2 wavelet decomposition
levels respectively. Also adds an option to use an 8x8 dct
as building block.

Currently with the 2/6 filter and with a single 16x16 dct on
the low low band, the results compared to full 32x32 dct are
as follows:
derf: -0.15%
yt: -0.29%
std-hd: -0.18%
hd: -0.6%
These are my current recommended settings, since the 2/6 filter
is very simple.

Results with 8x8 dct are about 0.3% worse.

Change-Id: I00100cdc96e32deced591985785ef0d06f325e44

--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -47,6 +47,18 @@
 #define MAX_MV_REFS 9
 #define MAX_MV_REF_CANDIDATES 4
 
+#if CONFIG_DWTDCTHYBRID
+#define DWT_MAX_LENGTH     64
+#define DWT_TYPE           26    // 26/53/97
+#define DWT_PRECISION_BITS 2
+#define DWT_PRECISION_RND  ((1 << DWT_PRECISION_BITS) / 2)  // half of 1 << BITS
+
+#define DWTDCT16X16        0  // 16x16 dct as the building block
+#define DWTDCT16X16_LEAN   1  // single 16x16 dct on the low-low band only
+#define DWTDCT8X8          2  // 8x8 dct as the building block
+#define DWTDCT_TYPE        DWTDCT16X16_LEAN  // selects one of the modes above
+#endif
+
 typedef struct {
   int r, c;
 } POS;
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -70,7 +70,8 @@
   12, 13, 14, 15
 };
 
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
   5, 3, 6, 3, 5, 4, 6, 6,
   6, 5, 5, 6, 6, 6, 6, 6,
   6, 6, 6, 6, 6, 6, 6, 6,
@@ -143,7 +144,215 @@
 };
 
 #if CONFIG_DWTDCTHYBRID
+
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
 DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+  0,    1,   32,   64,   33,    2,    3,   34,
+  65,   96, 128,   97,   66,   35,    4,  5,
+  36,   67,   98,  129,  160,  192,  161,  130,
+  99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224, 256,  225,  194,  163,
+  132,  101,   70,   39,    8,    9,   40,   71,
+  102,  133,  164,  195,  226,  257,  288,  320,
+  289,  258,  227,  196,  165,  134,  103,   72,
+  41,   10,   11,   42,   73,  104,  135,  166,
+  197,  228,  259,  290,  321,  352,  384,  353,
+  322,  291,  260,  229,  198,  167,  136,  105,
+  74,   43,   12,   13,   44,   75,  106,  137,
+  168,  199,  230,  261,  292,  323,  354,  385,
+  416,  448,  417,  386,  355,  324,  293,  262,
+  231,  200,  169,  138,  107,   76,   45,   14,
+  15,   46,   77,  108,  139,  170,  201,  232,
+  263,  294,  325,  356,  387,  418,  449,  480,
+  481,  450,  419,  388,  357,  326,  295,  264,
+  233,  202,  171,  140,  109,   78,   47,   79,
+  110,  141,  172,  203,  234,  265,  296,  327,
+  358,  389,  420,  451,  482,  483,  452,  421,
+  390,  359,  328,  297,  266,  235,  204,  173,
+  142,  111,  143,  174,  205,  236,  267,  298,
+  329,  360,  391,  422,  453,  484,  485,  454,
+  423,  392,  361,  330,  299,  268,  237,  206,
+  175,  207,  238,  269,  300,  331,  362,  393,
+  424,  455,  486,  487,  456,  425,  394,  363,
+  332,  301,  270,  239,  271,  302,  333,  364,
+  395,  426,  457,  488,  489,  458,  427,  396,
+  365,  334,  303,  335,  366,  397,  428,  459,
+  490,  491,  460,  429,  398,  367,  399,  430,
+  461,  492,  493,  462,  431,  463,  494,  495,
+
+  16,   512,  528, 17,  513,  529,   48,  544,
+  560, 80,  576,  592,   49,  545,  561,   18,
+  514,  530,   19,  515,  531,   50,  546,  562,
+  81,  577,  593,  112,  608,  624,  144,  640,
+  656,  113,  609,  625,   82,  578,  594,   51,
+  547,  563,   20,  516,  532,   21,  517,  533,
+  52,  548,  564,   83,  579,  595,  114,  610,
+  626,  145,  641,  657,  176,  672,  688,  208,
+  704,  720,  177,  673,  689,  146,  642,  658,
+  115,  611,  627,   84,  580,  596,   53,  549,
+  565,   22,  518,  534,   23,  519,  535,   54,
+  550,  566,   85,  581,  597,  116,  612,  628,
+  147,  643,  659,  178,  674,  690,  209,  705,
+  721,  240,  736,  752,  272,  768,  784,  241,
+  737,  753,  210,  706,  722,  179,  675,  691,
+  148,  644,  660,  117,  613,  629,   86,  582,
+  598,   55,  551,  567,   24,  520,  536,   25,
+  521,  537,   56,  552,  568,   87,  583,  599,
+  118,  614,  630,  149,  645,  661,  180,  676,
+  692,  211,  707,  723,  242,  738,  754,  273,
+  769,  785,  304,  800,  816,  336,  832,  848,
+  305,  801,  817,  274,  770,  786,  243,  739,
+  755,  212,  708,  724,  181,  677,  693,  150,
+  646,  662,  119,  615,  631,   88,  584,  600,
+  57,  553,  569,   26,  522,  538,   27,  523,
+  539,   58,  554,  570,   89,  585,  601,  120,
+  616,  632,  151,  647,  663,  182,  678,  694,
+  213,  709,  725,  244,  740,  756,  275,  771,
+  787,  306,  802,  818,  337,  833,  849,  368,
+  864,  880,  400,  896,  912,  369,  865,  881,
+  338,  834,  850,  307,  803,  819,  276,  772,
+  788,  245,  741,  757,  214,  710,  726,  183,
+
+  679,  695,  152,  648,  664,  121,  617,  633,
+  90,  586,  602,   59,  555,  571,   28,  524,
+  540,   29,  525,  541,   60,  556,  572,   91,
+  587,  603,  122,  618,  634,  153,  649,  665,
+  184,  680,  696,  215,  711,  727,  246,  742,
+  758,  277,  773,  789,  308,  804,  820,  339,
+  835,  851,  370,  866,  882,  401,  897,  913,
+  432,  928,  944,  464,  960,  976,  433,  929,
+  945,  402,  898,  914,  371,  867,  883,  340,
+  836,  852,  309,  805,  821,  278,  774,  790,
+  247,  743,  759,  216,  712,  728,  185,  681,
+  697,  154,  650,  666,  123,  619,  635,   92,
+  588,  604,   61,  557,  573,   30,  526,  542,
+  31,  527,  543,   62,  558,  574,   93,  589,
+  605,  124,  620,  636,  155,  651,  667,  186,
+  682,  698,  217,  713,  729,  248,  744,  760,
+  279,  775,  791,  310,  806,  822,  341,  837,
+  853,  372,  868,  884,  403,  899,  915,  434,
+  930,  946,  465,  961,  977,  496,  992, 1008,
+  497,  993, 1009,  466,  962,  978,  435,  931,
+  947,  404,  900,  916,  373,  869,  885,  342,
+  838,  854,  311,  807,  823,  280,  776,  792,
+  249,  745,  761,  218,  714,  730,  187,  683,
+  699,  156,  652,  668,  125,  621,  637,   94,
+  590,  606,   63,  559,  575,   95,  591,  607,
+  126,  622,  638,  157,  653,  669,  188,  684,
+  700,  219,  715,  731,  250,  746,  762,  281,
+  777,  793,  312,  808,  824,  343,  839,  855,
+  374,  870,  886,  405,  901,  917,  436,  932,
+  948,  467,  963,  979,  498,  994, 1010,  499,
+  995, 1011,  468,  964,  980,  437,  933,  949,
+  406,  902,  918,  375,  871,  887,  344,  840,
+
+  856,  313,  809,  825,  282,  778,  794,  251,
+  747,  763,  220,  716,  732,  189,  685,  701,
+  158,  654,  670,  127,  623,  639,  159,  655,
+  671,  190,  686,  702,  221,  717,  733,  252,
+  748,  764,  283,  779,  795,  314,  810,  826,
+  345,  841,  857,  376,  872,  888,  407,  903,
+  919,  438,  934,  950,  469,  965,  981,  500,
+  996, 1012,  501,  997, 1013,  470,  966,  982,
+  439,  935,  951,  408,  904,  920,  377,  873,
+  889,  346,  842,  858,  315,  811,  827,  284,
+  780,  796,  253,  749,  765,  222,  718,  734,
+  191,  687,  703,  223,  719,  735,  254,  750,
+  766,  285,  781,  797,  316,  812,  828,  347,
+  843,  859,  378,  874,  890,  409,  905,  921,
+  440,  936,  952,  471,  967,  983,  502,  998,
+  1014,  503,  999, 1015,  472,  968,  984,  441,
+  937,  953,  410,  906,  922,  379,  875,  891,
+  348,  844,  860,  317,  813,  829,  286,  782,
+  798,  255,  751,  767,  287,  783,  799,  318,
+  814,  830,  349,  845,  861,  380,  876,  892,
+  411,  907,  923,  442,  938,  954,  473,  969,
+  985,  504, 1000, 1016,  505, 1001, 1017,  474,
+  970,  986,  443,  939,  955,  412,  908,  924,
+  381,  877,  893,  350,  846,  862,  319,  815,
+  831,  351,  847,  863,  382,  878,  894,  413,
+  909,  925,  444,  940,  956,  475,  971,  987,
+  506, 1002, 1018,  507, 1003, 1019,  476,  972,
+  988,  445,  941,  957,  414,  910,  926,  383,
+  879,  895,  415,  911,  927,  446,  942,  958,
+  477,  973,  989,  508, 1004, 1020,  509, 1005,
+  1021,  478,  974,  990,  447,  943,  959,  479,
+  975,  991,  510, 1006, 1022,  511, 1007, 1023,
+};
+
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
   0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,
   6, 6, 6,
   6,
@@ -351,7 +560,206 @@
   975,  991,  510, 1006, 1022,  511, 1007, 1023,
 };
 
-#else  // CONFIG_DWTDCTHYBRID
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+  0,    1,   32,   64,   33,    2,    3,   34,
+  65,   96,  128,   97,   66,   35,    4,    5,
+  36,   67,   98,  129,  160,  192,  161,  130,
+  99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224,  225,  194,  163,  132,
+  101,   70,   39,   71,  102,  133,  164,  195,
+  226,  227,  196,  165,  134,  103,  135,  166,
+  197,  228,  229,  198,  167,  199,  230,  231,
+
+  8,  256,  264,    9,  257,  265,   40,  288, 296, 72,  320,  328,
+  41,  289,  297,   10, 258,  266, 11,  259,  267,   42,  290,  298,
+  73,  321,  329,  104,  352,  360,  136,  384, 392,  105,  353,  361,
+  74,  322,  330,   43, 291,  299,   12,  260,  268,   13,  261,  269,
+  44,  292,  300,   75,  323,  331,  106,  354, 362,  137,  385,  393,
+  168,  416,  424,  200, 448,  456,  169,  417,  425,  138,  386,  394,
+  107,  355,  363,   76,  324,  332,   45,  293, 301,   14,  262,  270,
+  15,  263,  271,   46, 294,  302,   77,  325,  333,  108,  356,  364,
+  139,  387,  395,  170, 418,  426,  201,  449, 457,  232,  480,  488,
+  233,  481,  489,  202, 450,  458,  171,  419,  427,  140,  388,  396,
+  109,  357,  365,   78,  326,  334,   47,  295, 303,   79,  327,  335,
+  110,  358,  366,  141, 389,  397,  172,  420,  428,  203,  451,  459,
+  234,  482,  490,  235,  483,  491,  204,  452, 460,  173,  421,  429,
+  142,  390,  398,  111, 359,  367,  143,  391,  399,  174,  422,  430,
+  205,  453,  461,  236,  484,  492,  237,  485, 493,  206,  454,  462,
+  175,  423,  431,  207, 455,  463,  238,  486,  494,  239,  487,  495,
+
+  16,  512,  528,   17,  513,  529,   18,  514,
+  530,   19,  515,  531,   20,  516,  532,   21,
+  517,  533,   22,  518,  534,   23,  519,  535,
+  24,  520,  536,   25,  521,  537,   26,  522,
+  538,   27,  523,  539,   28,  524,  540,   29,
+  525,  541,   30,  526,  542,   31,  527,  543,
+  48,  544,  560,   49,  545,  561,   50,  546,
+  562,   51,  547,  563,   52,  548,  564,   53,
+  549,  565,   54,  550,  566,   55,  551,  567,
+  56,  552,  568,   57,  553,  569,   58,  554,
+  570,   59,  555,  571,   60,  556,  572,   61,
+  557,  573,   62,  558,  574,   63,  559,  575,
+  80,  576,  592,   81,  577,  593,   82,  578,
+  594,   83,  579,  595,   84,  580,  596,   85,
+  581,  597,   86,  582,  598,   87,  583,  599,
+  88,  584,  600,   89,  585,  601,   90,  586,
+  602,   91,  587,  603,   92,  588,  604,   93,
+  589,  605,   94,  590,  606,   95,  591,  607,
+  112,  608,  624,  113,  609,  625,  114,  610,
+  626,  115,  611,  627,  116,  612,  628,  117,
+  613,  629,  118,  614,  630,  119,  615,  631,
+  120,  616,  632,  121,  617,  633,  122,  618,
+  634,  123,  619,  635,  124,  620,  636,  125,
+  621,  637,  126,  622,  638,  127,  623,  639,
+  144,  640,  656,  145,  641,  657,  146,  642,
+  658,  147,  643,  659,  148,  644,  660,  149,
+  645,  661,  150,  646,  662,  151,  647,  663,
+  152,  648,  664,  153,  649,  665,  154,  650,
+  666,  155,  651,  667,  156,  652,  668,  157,
+  653,  669,  158,  654,  670,  159,  655,  671,
+  176,  672,  688,  177,  673,  689,  178,  674,
+  690,  179,  675,  691,  180,  676,  692,  181,
+  677,  693,  182,  678,  694,  183,  679,  695,
+  184,  680,  696,  185,  681,  697,  186,  682,
+  698,  187,  683,  699,  188,  684,  700,  189,
+  685,  701,  190,  686,  702,  191,  687,  703,
+  208,  704,  720,  209,  705,  721,  210,  706,
+  722,  211,  707,  723,  212,  708,  724,  213,
+  709,  725,  214,  710,  726,  215,  711,  727,
+  216,  712,  728,  217,  713,  729,  218,  714,
+  730,  219,  715,  731,  220,  716,  732,  221,
+  717,  733,  222,  718,  734,  223,  719,  735,
+  240,  736,  752,  241,  737,  753,  242,  738,
+  754,  243,  739,  755,  244,  740,  756,  245,
+  741,  757,  246,  742,  758,  247,  743,  759,
+  248,  744,  760,  249,  745,  761,  250,  746,
+  762,  251,  747,  763,  252,  748,  764,  253,
+  749,  765,  254,  750,  766,  255,  751,  767,
+  272,  768,  784,  273,  769,  785,  274,  770,
+  786,  275,  771,  787,  276,  772,  788,  277,
+  773,  789,  278,  774,  790,  279,  775,  791,
+  280,  776,  792,  281,  777,  793,  282,  778,
+  794,  283,  779,  795,  284,  780,  796,  285,
+  781,  797,  286,  782,  798,  287,  783,  799,
+  304,  800,  816,  305,  801,  817,  306,  802,
+  818,  307,  803,  819,  308,  804,  820,  309,
+  805,  821,  310,  806,  822,  311,  807,  823,
+  312,  808,  824,  313,  809,  825,  314,  810,
+  826,  315,  811,  827,  316,  812,  828,  317,
+  813,  829,  318,  814,  830,  319,  815,  831,
+  336,  832,  848,  337,  833,  849,  338,  834,
+  850,  339,  835,  851,  340,  836,  852,  341,
+  837,  853,  342,  838,  854,  343,  839,  855,
+  344,  840,  856,  345,  841,  857,  346,  842,
+  858,  347,  843,  859,  348,  844,  860,  349,
+  845,  861,  350,  846,  862,  351,  847,  863,
+  368,  864,  880,  369,  865,  881,  370,  866,
+  882,  371,  867,  883,  372,  868,  884,  373,
+  869,  885,  374,  870,  886,  375,  871,  887,
+  376,  872,  888,  377,  873,  889,  378,  874,
+  890,  379,  875,  891,  380,  876,  892,  381,
+  877,  893,  382,  878,  894,  383,  879,  895,
+  400,  896,  912,  401,  897,  913,  402,  898,
+  914,  403,  899,  915,  404,  900,  916,  405,
+  901,  917,  406,  902,  918,  407,  903,  919,
+  408,  904,  920,  409,  905,  921,  410,  906,
+  922,  411,  907,  923,  412,  908,  924,  413,
+  909,  925,  414,  910,  926,  415,  911,  927,
+  432,  928,  944,  433,  929,  945,  434,  930,
+  946,  435,  931,  947,  436,  932,  948,  437,
+  933,  949,  438,  934,  950,  439,  935,  951,
+  440,  936,  952,  441,  937,  953,  442,  938,
+  954,  443,  939,  955,  444,  940,  956,  445,
+  941,  957,  446,  942,  958,  447,  943,  959,
+  464,  960,  976,  465,  961,  977,  466,  962,
+  978,  467,  963,  979,  468,  964,  980,  469,
+  965,  981,  470,  966,  982,  471,  967,  983,
+  472,  968,  984,  473,  969,  985,  474,  970,
+  986,  475,  971,  987,  476,  972,  988,  477,
+  973,  989,  478,  974,  990,  479,  975,  991,
+  496,  992, 1008,  497,  993, 1009,  498,  994,
+  1010,  499,  995, 1011,  500,  996, 1012,  501,
+  997, 1013,  502,  998, 1014,  503,  999, 1015,
+  504, 1000, 1016,  505, 1001, 1017,  506, 1002,
+  1018,  507, 1003, 1019,  508, 1004, 1020,  509,
+  1005, 1021,  510, 1006, 1022,  511, 1007, 1023,
+};
+#endif
+
+#else
 
 DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
   0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -1536,6 +1536,7 @@
 #if !CONFIG_DWTDCTHYBRID
 #define DownshiftMultiplyBy2(x) x * 2
 #define DownshiftMultiply(x) x
+
 static void idct16(double *input, double *output, int stride) {
   static const double C1 = 0.995184726672197;
   static const double C2 = 0.98078528040323;
@@ -1738,6 +1739,7 @@
   output[stride*9] = step[6] - step[ 9];
   output[stride*8] = step[7] - step[ 8];
 }
+
 static void butterfly_32_idct_1d(double *input, double *output, int stride) {
   static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
   static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
@@ -1878,13 +1880,8 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
-#else  // CONFIG_DWTDCTHYBRID
+#else  // !CONFIG_DWTDCTHYBRID
 
-#define DWT_MAX_LENGTH   32
-#define DWT_TYPE         26    // 26/53/97
-#define DWT_PRECISION_BITS 2
-#define DWT_PRECISION_RND ((1 << DWT_PRECISION_BITS) / 2)
-
 #if DWT_TYPE == 53
 
 // Note: block length must be even for this implementation
@@ -2388,6 +2385,72 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
+static void idct8_1d(double *x) {  // in-place 8-point 1-D inverse dct on x[0..7]
+  int i, j;
+  double t[8];  // scratch, so x is not overwritten while it is still being read
+  static const double idctmat[64] = {  // row-major 8x8: row = output, col = input
+    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,
+    0.35355339059327,   0.2777851165098,  0.19134171618254, 0.097545161008064,
+    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,
+    -0.35355339059327, -0.49039264020161, -0.46193976625564,  -0.2777851165098,
+    0.35355339059327,   0.2777851165098, -0.19134171618254, -0.49039264020162,
+    -0.35355339059327, 0.097545161008064,  0.46193976625564,  0.41573480615127,
+    0.35355339059327, 0.097545161008063, -0.46193976625564,  -0.2777851165098,
+    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,
+    0.35355339059327, -0.097545161008063, -0.46193976625564,   0.2777851165098,
+    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,
+    0.35355339059327,  -0.2777851165098, -0.19134171618254,  0.49039264020161,
+    -0.35355339059327, -0.097545161008064,  0.46193976625564, -0.41573480615127,
+    0.35355339059327, -0.41573480615127,  0.19134171618254, 0.097545161008065,
+    -0.35355339059327,  0.49039264020162, -0.46193976625564,   0.2777851165098,
+    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,
+    0.35355339059327,  -0.2777851165098,  0.19134171618255, -0.097545161008064
+  };
+  for (i = 0; i < 8; ++i) {  // t = idctmat * x
+    t[i] = 0;
+    for (j = 0; j < 8; ++j)
+      t[i] += idctmat[i * 8 + j] * x[j];
+  }
+  for (i = 0; i < 8; ++i) {  // write the result back over the input
+    x[i] = t[i];
+  }
+}
+
+static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
+                                  int scale) {  // 2-D 8x8 inverse dct, float path
+  double X[8 * 8], Y[8];  // X: working 8x8 block, Y: one column of X
+  int i, j;
+  int shortpitch = pitch >> 1;  // row stride of coefs in int16_t units
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    for (i = 0; i < 8; i++) {  // load the strided input into the dense X
+      for (j = 0; j < 8; j++) {
+        X[i * 8 + j] = (double)coefs[i * shortpitch + j];
+      }
+    }
+    for (i = 0; i < 8; i++)  // 1-D inverse transform of every row
+      idct8_1d(X + 8 * i);
+    for (i = 0; i < 8; i++) {  // 1-D inverse transform of every column
+      for (j = 0; j < 8; ++j)
+        Y[j] = X[i + 8 * j];
+      idct8_1d(Y);
+      for (j = 0; j < 8; ++j)
+        X[i + 8 * j] = Y[j];
+    }
+    for (i = 0; i < 8; i++) {  // block is dense 8x8; result scaled by 2^(scale-3)
+      for (j = 0; j < 8; j++) {  // ldexp: the old (8 >> scale) was 0 for scale > 3
+        block[i * 8 + j] = (int16_t)round(ldexp(X[i * 8 + j], scale - 3));
+      }
+    }
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+#define multiply_bits(d, n) ((n) < 0 ? (d) >> -(n) : (d) << (n))  // n < 0: >> |n|
+
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+
 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
   // assume output is a 32x32 buffer
   // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
@@ -2396,7 +2459,7 @@
   int16_t buffer2[32 * 32];
   // Note: pitch is in bytes, short_pitch is in short units
   const int short_pitch = pitch >> 1;
-  int i;
+  int i, j;
 
   // TODO(debargha): Implement more efficiently by adding output pitch
   // argument to the idct16x16 function
@@ -2405,6 +2468,46 @@
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
   }
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
+#endif
+}
+
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 32x32 buffer
+  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
+  int16_t buffer[16 * 16];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[32 * 32];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
   vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
                           1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
@@ -2431,6 +2534,66 @@
 #endif
 }
 
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 32x32 buffer
+  // Temporary buffer to hold an 8x8 block for 8x8 inverse dct
+  int16_t buffer[8 * 8];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[32 * 32];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct8x8 function
+  vp9_short_idct8x8_c_f(input, buffer, pitch,  // rows 0-7, cols 0-7
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8, buffer, pitch,  // rows 0-7, cols 8-15
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,  // rows 8-15, cols 0-7
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
+               sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,  // rows 8-15, cols 8-15
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,  // 8 * 33 == 8 * 32 + 8
+               sizeof(*buffer2) * 8);
+  }
+  for (i = 0; i < 16; ++i) {  // rows 0-15, cols 16-31: scaled copy, no dct
+    for (j = 16; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {  // rows 16-31, all cols: scaled copy, no dct
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
+#endif
+}
+
+#endif
+
+#if CONFIG_TX64X64
 void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
   // assume output is a 64x64 buffer
   // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
@@ -2448,6 +2611,20 @@
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
   }
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 16; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#elif DWTDCT_TYPE == DWTDCT16X16
   vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
                           2 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
@@ -2467,33 +2644,19 @@
   }
 
   // Copying and scaling highest bands into buffer2
-#if DWT_PRECISION_BITS < 1
   for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 64 + 32 + j] =
-          input[i * short_pitch + 32 + j] >> (1 - DWT_PRECISION_BITS);
-    }
-  }
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 64; ++j) {
+    for (j = 32; j < 64; ++j) {
       buffer2[i * 64 + j] =
-          input[(i + 32) * short_pitch + j] >> (1 - DWT_PRECISION_BITS);
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
     }
   }
-#else
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 64 + 32 + j] =
-          input[i * short_pitch + 32 + j] << (DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 0; i < 32; ++i) {
+  for (i = 32; i < 64; ++i) {
     for (j = 0; j < 64; ++j) {
       buffer2[i * 64 + j] =
-          input[(i + 32) * short_pitch + j] << (DWT_PRECISION_BITS - 1);
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
     }
   }
-#endif
+#endif  // DWTDCT_TYPE
 
 #if DWT_TYPE == 26
   dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
@@ -2503,4 +2666,5 @@
   dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
 #endif
 }
-#endif  // CONFIG_DWTDCTHYBRID
+#endif  // CONFIG_TX64X64
+#endif  // !CONFIG_DWTDCTHYBRID
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1686,11 +1686,6 @@
 
 #else  // CONFIG_DWTDCTHYBRID
 
-#define DWT_MAX_LENGTH   64
-#define DWT_TYPE         26    // 26/53/97
-#define DWT_PRECISION_BITS 2
-#define DWT_PRECISION_RND ((1 << DWT_PRECISION_BITS) / 2)
-
 #if DWT_TYPE == 53
 
 // Note: block length must be even for this implementation
@@ -2139,10 +2134,97 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
+// Float 8x8 forward DCT. Reads an 8x8 block of shorts from |block| (row
+// stride |pitch| bytes), pre-scales it by 2^(3 - scale) (scale must be <= 3)
+// and writes the 64 rounded coefficients to |coefs| with a row stride of 8.
+void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {
+  int j1, i, j, k;
+  float b[8];
+  float b1[8];
+  float d[8][8];
+  float f0 = (float) .7071068;
+  float f1 = (float) .4903926;
+  float f2 = (float) .4619398;
+  float f3 = (float) .4157348;
+  float f4 = (float) .3535534;
+  float f5 = (float) .2777851;
+  float f6 = (float) .1913417;
+  float f7 = (float) .0975452;
+  pitch = pitch / 2;  // byte stride -> stride in shorts
+  for (i = 0, k = 0; i < 8; i++, k += pitch) {
+    for (j = 0; j < 8; j++) {
+      // Multiply instead of shifting: left-shifting a negative value is UB.
+      b[j] = (float)(block[k + j] * (1 << (3 - scale)));
+    }
+    /* Horizontal transform */
+    for (j = 0; j < 4; j++) {
+      j1 = 7 - j;
+      b1[j] = b[j] + b[j1];
+      b1[j1] = b[j] - b[j1];
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[i][0] = (b[0] + b[1]) * f4;
+    d[i][4] = (b[0] - b[1]) * f4;
+    d[i][2] = b[2] * f6 + b[3] * f2;
+    d[i][6] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[i][1] = b1[4] * f7 + b1[7] * f1;
+    d[i][5] = b1[5] * f3 + b1[6] * f5;
+    d[i][7] = b1[7] * f7 - b1[4] * f1;
+    d[i][3] = b1[6] * f3 - b1[5] * f5;
+  }
+  /* Vertical transform */
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 4; j++) {
+      j1 = 7 - j;
+      b1[j] = d[j][i] + d[j1][i];
+      b1[j1] = d[j][i] - d[j1][i];
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[0][i] = (b[0] + b[1]) * f4;
+    d[4][i] = (b[0] - b[1]) * f4;
+    d[2][i] = b[2] * f6 + b[3] * f2;
+    d[6][i] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[1][i] = b1[4] * f7 + b1[7] * f1;
+    d[5][i] = b1[5] * f3 + b1[6] * f5;
+    d[7][i] = b1[7] * f7 - b1[4] * f1;
+    d[3][i] = b1[6] * f3 - b1[5] * f5;
+  }
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);
+    }
+  }
+}
+
+// Scales d down by 2^n; a negative n scales up by 2^(-n) instead.
+#define divide_bits(d, n) ((n) < 0 ? (d) << -(n) : (d) >> (n))
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+
 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
   // assume out is a 32x32 buffer
   short buffer[16 * 16];
-  int i;
+  int i, j;
   const int short_pitch = pitch >> 1;
 #if DWT_TYPE == 26
   dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
@@ -2156,7 +2238,37 @@
   vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+}
 
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[16 * 16];
+  int i, j;
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
   vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
@@ -2170,6 +2282,52 @@
     vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
 }
 
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // Two-level wavelet analysis into |out| (assumed to be a 32x32 buffer),
+  // followed by an 8x8 DCT on each 8x8 block of the top-left 16x16 area.
+  // Offsets of those four blocks within the 32-wide buffer:
+  static const int band_offset[4] = { 0, 8, 32 * 8, 33 * 8 };
+  short dct_out[8 * 8];
+  const int short_pitch = pitch >> 1;
+  int band, row, col;
+
+#if DWT_TYPE == 26
+  dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);
+#endif
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct8x8 function
+  for (band = 0; band < 4; ++band) {
+    short *const band_ptr = out + band_offset[band];
+
+    // Transform into the scratch block, then copy it back row by row.
+    vp9_short_fdct8x8_c_f(band_ptr, dct_out, 64, 1 + DWT_PRECISION_BITS);
+    for (row = 0; row < 8; ++row) {
+      vpx_memcpy(band_ptr + row * 32, dct_out + row * 8, sizeof(short) * 8);
+    }
+  }
+
+  // Rescale everything outside the top-left 16x16 area: columns 16-31 of
+  // rows 0-15, and all columns of rows 16-31.
+  for (row = 0; row < 32; ++row) {
+    const int first_col = (row < 16) ? 16 : 0;
+
+    for (col = first_col; col < 32; ++col) {
+      short *const coef = out + row * 32 + col;
+
+      *coef = divide_bits(*coef, DWT_PRECISION_BITS - 2);
+    }
+  }
+}
+
+#endif
+
 #if CONFIG_TX64X64
 void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
   // assume out is a 64x64 buffer
@@ -2189,6 +2347,18 @@
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
 
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+  // Only the top-left 16x16 block holds DCT output here. Every other
+  // coefficient -- columns 16-63 of rows 0-15 and all of rows 16-63 -- is
+  // scaled with divide_bits(..., DWT_PRECISION_BITS - 1), the same region
+  // that vp9_short_idct64x64_c scales back with multiply_bits().
+  for (i = 0; i < 64; ++i) {
+    const int first_col = (i < 16) ? 16 : 0;
+    for (j = first_col; j < 64; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#elif DWTDCT_TYPE == DWTDCT16X16
   vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
@@ -2204,29 +2374,17 @@
   // There is no dct used on the highest bands for now.
   // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
   // TODO(debargha): experiment with turning these coeffs to 0
-#if DWT_PRECISION_BITS < 1
   for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 64 + 32 + j] <<= (1 - DWT_PRECISION_BITS);
+    for (j = 32; j < 64; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
     }
   }
-  for (i = 0; i < 32; ++i) {
+  for (i = 32; i < 64; ++i) {
     for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] <<= (1 - DWT_PRECISION_BITS);
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
     }
   }
-#else
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 64 + 32 + j] >>= (DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] >>= (DWT_PRECISION_BITS - 1);
-    }
-  }
-#endif
+#endif  // DWTDCT_TYPE
 }
 #endif  // CONFIG_TX64X64
 #endif  // CONFIG_DWTDCTHYBRID
--