shithub: libvpx

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -893,7 +893,7 @@

     specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;

     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/;

+    specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";

     # Need to add 34 eob idct32x32 neon implementation.

     $vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;

--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm

+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm

@@ -17,12 +17,31 @@

 SECTION_RODATA

 pw_11585x2: times 8 dw 23170

+pw_m2404x2: times 8 dw -2404*2 ; "x2" tables hold twice the coefficient: pmulhrsw computes a rounded (a*b) >> 15, so a doubled b gives a rounded (a*coef) >> 14

+pw_m4756x2: times 8 dw -4756*2

+pw_m5520x2: times 8 dw -5520*2

+pw_16364x2: times 8 dw 16364*2 ; largest entry: 32728 still fits in a signed 16-bit word

+pw_16305x2: times 8 dw 16305*2

+pw_16207x2: times 8 dw 16207*2

+pw_16069x2: times 8 dw 16069*2

+pw_15893x2: times 8 dw 15893*2

+pw_15679x2: times 8 dw 15679*2

+pw_15426x2: times 8 dw 15426*2

+pw__3981x2: times 8 dw  3981*2 ; extra underscores pad the label to the same width as the others

+pw__3196x2: times 8 dw  3196*2

+pw__1606x2: times 8 dw  1606*2

+pw___804x2: times 8 dw   804*2

 pd_8192:    times 4 dd 8192

+pw_32:      times 8 dw 32

 pw_16:      times 8 dw 16

 %macro TRANSFORM_COEFFS 2

 pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2

 pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1

+pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 ; both coefficients negated; this is the second table BUTTERFLY_4Xmm looks up

 %endmacro

 TRANSFORM_COEFFS    6270, 15137

@@ -29,6 +48,15 @@

 TRANSFORM_COEFFS    3196, 16069

 TRANSFORM_COEFFS   13623,  9102

+; constants for 32x32_34 -- NOTE(review): in the code added by this patch only the 11585/11585 pair is passed to a BUTTERFLY macro; confirm the other six pairs are referenced elsewhere

+TRANSFORM_COEFFS      804, 16364

+TRANSFORM_COEFFS    15426,  5520

+TRANSFORM_COEFFS     3981, 15893

+TRANSFORM_COEFFS    16207,  2404

+TRANSFORM_COEFFS     1606, 16305

+TRANSFORM_COEFFS    15679,  4756

+TRANSFORM_COEFFS    11585, 11585

 %macro PAIR_PP_COEFFS 2

 dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2

 %endmacro

@@ -80,6 +108,15 @@

   packssdw           m%2, m%6

 %endmacro

+%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 -- butterfly using the (-coef2, coef1) and (-coef1, -coef2) tables; dst1/dst2 are overwritten, tmp1/tmp2 are clobbered

+  punpckhwd          m%6, m%2, m%1 ; tmp1 = high four words of dst2 and dst1, interleaved

+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4] ; MUL_ADD_2X is defined outside this hunk; presumably multiplies tmp1 by each table and rounds, leaving the two results in tmp2 and tmp1 -- verify

+  punpcklwd          m%2, m%1 ; dst2 = low four words of dst2 and dst1, interleaved

+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4] ; same operation on the low half, results in dst1 and dst2

+  packssdw           m%1, m%7 ; narrow dwords to words with signed saturation: low half from dst1, high half from tmp2

+  packssdw           m%2, m%6 ; likewise for dst2 / tmp1

+%endmacro

 ; matrix transpose

 %macro INTERLEAVE_2X 4

   punpckh%1          m%4, m%2, m%3

@@ -298,4 +335,453 @@

RET

+%define  idx0 16 * 0

+%define  idx1 16 * 1

+%define  idx2 16 * 2

+%define  idx3 16 * 3

+%define  idx4 16 * 4

+%define  idx5 16 * 5

+%define  idx6 16 * 6

+%define  idx7 16 * 7

+%define  idx8 16 * 0

+%define  idx9 16 * 1

+%define idx10 16 * 2

+%define idx11 16 * 3

+%define idx12 16 * 4

+%define idx13 16 * 5

+%define idx14 16 * 6

+%define idx15 16 * 7

+%define idx16 16 * 0

+%define idx17 16 * 1

+%define idx18 16 * 2

+%define idx19 16 * 3

+%define idx20 16 * 4

+%define idx21 16 * 5

+%define idx22 16 * 6

+%define idx23 16 * 7

+%define idx24 16 * 0

+%define idx25 16 * 1

+%define idx26 16 * 2

+%define idx27 16 * 3

+%define idx28 16 * 4

+%define idx29 16 * 5

+%define idx30 16 * 6

+%define idx31 16 * 7

+%macro IDCT32X32_34x 4 ; args 1-4: byte offsets from stp for outputs 0-7, 8-15, 16-23 and 24-31

+  ; FROM idct32x32_add_neon.asm

+  ; Inputs: m0-m7 = the eight input vectors, m8 = rounding constant handed to the BUTTERFLY macros, r4 = rsp + transposed_in (m0/m2/m4/m6 are spilled through r4 and reloaded through rsp + transposed_in).

+  ; Instead of doing the transforms stage by stage, it is done by loading

+  ; some input values and doing as many stages as possible to minimize the

+  ; storing/loading of intermediate results. To fit within registers, the

+  ; final coefficients are cut into four blocks:

+  ; BLOCK A: 16-19,28-31

+  ; BLOCK B: 20-23,24-27

+  ; BLOCK C: 8-11,12-15

+  ; BLOCK D: 0-3,4-7

+  ; Blocks A and C are straight calculation through the various stages. In

+  ; block B, further calculations are performed using the results from

+  ; block A. In block D, further calculations are performed using the results

+  ; from block C and then the final calculations are done using results from

+  ; block A and B which have been combined at the end of block B.

+  ; idxN (defined above) is 16 * (N mod 8): the 16-byte slot of output N inside its group of eight.

+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                m11, m1

+  pmulhrsw             m1, [pw___804x2] ; stp1_16

+  mova      [r4 +      0], m0 ; spill m0; reloaded in block D stage 4

+  pmulhrsw            m11, [pw_16364x2] ; stp2_31

+  mova      [r4 + 16 * 2], m2 ; spill m2; reloaded in block C stage 2

+  mova                m12, m7

+  pmulhrsw             m7, [pw_15426x2] ; stp1_28

+  mova      [r4 + 16 * 4], m4 ; spill m4; reloaded in block D stage 3

+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19

+  mova      [r4 + 16 * 6], m6 ; spill m6; reloaded in block C stage 2

+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                 m2, m1   ; stp1_16

+  mova                 m0, m11  ; stp1_31

+  mova                 m4, m7   ; stp1_28

+  mova                m15, m12  ; stp1_19

+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30

+  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18

+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19

+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18

+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28

+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29

+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29

+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28

+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                 m6, m5

+  pmulhrsw             m5, [pw__3981x2] ; stp1_20

+  mova [stp + %4 + idx28], m12 ; park block A values 28-31 in memory until block B stage 6 reloads them

+  mova [stp + %4 + idx29], m15

+  pmulhrsw             m6, [pw_15893x2] ; stp2_27

+  mova [stp + %4 + idx30], m2

+  mova                 m2, m3

+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23

+  mova [stp + %4 + idx31], m11

+  pmulhrsw             m2, [pw_16207x2] ; stp2_24

+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                m13, m5 ; stp1_20

+  mova                m14, m6 ; stp1_27

+  mova                m15, m3 ; stp1_23

+  mova                m11, m2 ; stp1_24

+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26

+  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22

+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20

+  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21

+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27

+  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26

+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20

+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21

+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23

+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22

+  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21

+  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20

+  mova [stp + %3 + idx16], m1 ; 16-19 wait in memory for the "12-15, 16-19 final stage" below

+  mova [stp + %3 + idx17], m0

+  mova [stp + %3 + idx18], m4

+  mova [stp + %3 + idx19], m7

+  mova                 m4, [stp + %4 + idx28] ; bring back the block A values parked in block B stage 1

+  mova                 m7, [stp + %4 + idx29]

+  mova                m10, [stp + %4 + idx30]

+  mova                m12, [stp + %4 + idx31]

+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27

+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26

+  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25

+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24

+  mova [stp + %4 + idx28], m4 ; updated 28-31 go back to the same slots for the final stages

+  mova [stp + %4 + idx29], m7

+  mova [stp + %4 + idx30], m10

+  mova [stp + %4 + idx31], m12

+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+%if 0 ; overflow occurs in SUM_SUB when using test streams

+  mova                m10, [pw_11585x2]

+  SUM_SUB               6,    5,  9

+  pmulhrsw             m6, m10  ; stp1_27

+  pmulhrsw             m5, m10  ; stp1_20

+  SUM_SUB              13, 14,  9

+  pmulhrsw            m13, m10  ; stp1_26

+  pmulhrsw            m14, m10  ; stp1_21

+  SUM_SUB              11, 15,  9

+  pmulhrsw            m11, m10  ; stp1_25

+  pmulhrsw            m15, m10  ; stp1_22

+  SUM_SUB               2,  3,  9

+  pmulhrsw             m2, m10  ; stp1_24

+  pmulhrsw             m3, m10  ; stp1_23

+%else

+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27

+  SWAP 6, 5 ; after the swap m6 is what gets stored as 27 and m5 as 20 below

+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26

+  SWAP 13, 14

+  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25

+  SWAP 11, 15

+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24

+  SWAP 2, 3

+%endif

+  mova [stp + %4 + idx24], m2 ; 24-27 are stored now; 20-23 are stored between the block C multiplies below

+  mova [stp + %4 + idx25], m11

+  mova [stp + %4 + idx26], m13

+  mova [stp + %4 + idx27], m6

+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  ;

+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                 m0, [rsp + transposed_in + 16 *  2] ; the m2 value spilled at the top of the macro

+  mova                 m6, [rsp + transposed_in + 16 *  6] ; the m6 value spilled at the top of the macro

+  mova                 m1, m0

+  pmulhrsw             m0, [pw__1606x2] ; stp1_8

+  mova [stp + %3 + idx20], m5

+  mova [stp + %3 + idx21], m14

+  pmulhrsw             m1, [pw_16305x2] ; stp2_15

+  mova [stp + %3 + idx22], m15

+  mova                 m7, m6

+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11

+  mova [stp + %3 + idx23], m3

+  pmulhrsw             m6, [pw_15679x2] ; stp1_12

+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                 m3, m0 ; stp1_8

+  mova                 m2, m1 ; stp1_15

+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14

+  mova                 m4, m7 ; stp1_11

+  mova                 m5, m6 ; stp1_12

+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10

+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11

+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10

+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12

+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13

+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+%if 0 ; overflow occurs in SUM_SUB when using test streams

+  mova                m10, [pw_11585x2]

+  SUM_SUB               5,    4,  9

+  pmulhrsw             m5, m10  ; stp1_13

+  pmulhrsw             m4, m10  ; stp1_10

+  SUM_SUB               6,    7,  9

+  pmulhrsw             m6, m10  ; stp1_12

+  pmulhrsw             m7, m10  ; stp1_11

+%else

+  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13

+  SWAP 5, 4

+  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12

+  SWAP 6, 7

+%endif

+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova [stp + %2 +  idx8], m0 ; 8-11 are parked here and reloaded after the "12-15, 16-19 final stage"

+  mova [stp + %2 +  idx9], m2

+  mova [stp + %2 + idx10], m4

+  mova [stp + %2 + idx11], m7

+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  ;

+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  ;

+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                m11, [rsp + transposed_in + 16 *  4] ; the m4 value spilled at the top of the macro

+  mova                m12, m11

+  pmulhrsw            m11, [pw__3196x2] ; stp1_4

+  pmulhrsw            m12, [pw_16069x2] ; stp1_7

+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  mova                 m0, [rsp + transposed_in + 16 *  0] ; the m0 value spilled at the top of the macro

+  mova                m10, [pw_11585x2]

+  mova                 m7, m0

+  pmulhrsw             m0, m10  ; stp1_1

+  pmulhrsw             m7, m10  ; stp1_0

+  mova                m14, m11 ; stp1_4

+  mova                m13, m12 ; stp1_7

+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+%if 0 ; overflow occurs in SUM_SUB when using test streams

+  SUM_SUB              13,   14,  9

+  pmulhrsw            m13, m10  ; stp1_6

+  pmulhrsw            m14, m10  ; stp1_5

+%else

+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6

+  SWAP 13, 14

+%endif

+  mova                 m4, m0 ; stp1_1

+  mova                 m2, m7 ; stp1_0

+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7

+  SUM_SUB               7, 13, 9 ;  stp1_1, stp1_6

+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5

+  SUM_SUB               4, 11, 9 ;  stp1_3, stp1_4

+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+  SUM_SUB               0,  1, 9 ;  stp1_0, stp1_15

+  SUM_SUB               7,  3, 9 ;  stp1_1, stp1_14

+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13

+  SUM_SUB               4,  6, 9 ;  stp1_3, stp1_12

+  ; 0-3, 28-31 final stage

+  mova                m15, [stp + %4 + idx30]

+  mova                m10, [stp + %4 + idx31]

+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31

+  SUM_SUB               7, 15, 9 ;  stp1_1, stp1_30

+  mova [stp + %1 +  idx0], m0

+  mova [stp + %1 +  idx1], m7

+  mova [stp + %4 + idx30], m15

+  mova [stp + %4 + idx31], m10

+  mova                 m7, [stp + %4 + idx28]

+  mova                 m0, [stp + %4 + idx29]

+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29

+  SUM_SUB               4,  7, 9 ;  stp1_3, stp1_28

+  mova [stp + %1 +  idx2], m2

+  mova [stp + %1 +  idx3], m4

+  mova [stp + %4 + idx28], m7

+  mova [stp + %4 + idx29], m0

+  ; 12-15, 16-19 final stage

+  mova                 m0, [stp + %3 + idx16]

+  mova                 m7, [stp + %3 + idx17]

+  mova                 m2, [stp + %3 + idx18]

+  mova                 m4, [stp + %3 + idx19]

+  SUM_SUB               1,  0, 9 ;  stp1_15, stp1_16

+  SUM_SUB               3,  7, 9 ;  stp1_14, stp1_17

+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18

+  SUM_SUB               6,  4, 9 ;  stp1_12, stp1_19

+  mova [stp + %2 + idx12], m6

+  mova [stp + %2 + idx13], m5

+  mova [stp + %2 + idx14], m3

+  mova [stp + %2 + idx15], m1

+  mova [stp + %3 + idx16], m0

+  mova [stp + %3 + idx17], m7

+  mova [stp + %3 + idx18], m2

+  mova [stp + %3 + idx19], m4

+  mova                 m4, [stp + %2 +  idx8] ; reload the 8-11 values parked in block C stage 7

+  mova                 m5, [stp + %2 +  idx9]

+  mova                 m6, [stp + %2 + idx10]

+  mova                 m7, [stp + %2 + idx11]

+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11

+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10

+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9

+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8

+  ; 4-7, 24-27 final stage

+  mova                 m0, [stp + %4 + idx27]

+  mova                 m1, [stp + %4 + idx26]

+  mova                 m2, [stp + %4 + idx25]

+  mova                 m3, [stp + %4 + idx24]

+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27

+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26

+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25

+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24

+  mova [stp + %4 + idx27], m0

+  mova [stp + %4 + idx26], m1

+  mova [stp + %4 + idx25], m2

+  mova [stp + %4 + idx24], m3

+  mova [stp + %1 +  idx4], m11

+  mova [stp + %1 +  idx5], m14

+  mova [stp + %1 +  idx6], m13

+  mova [stp + %1 +  idx7], m12

+  ; 8-11, 20-23 final stage

+  mova                 m0, [stp + %3 + idx20]

+  mova                 m1, [stp + %3 + idx21]

+  mova                 m2, [stp + %3 + idx22]

+  mova                 m3, [stp + %3 + idx23]

+  SUM_SUB               7,  0, 9 ;  stp1_11, stp1_20

+  SUM_SUB               6,  1, 9 ;  stp1_10, stp1_21

+  SUM_SUB               5,  2, 9 ;   stp1_9, stp1_22

+  SUM_SUB               4,  3, 9 ;   stp1_8, stp1_23

+  mova [stp + %2 +  idx8], m4

+  mova [stp + %2 +  idx9], m5

+  mova [stp + %2 + idx10], m6

+  mova [stp + %2 + idx11], m7

+  mova [stp + %3 + idx20], m0

+  mova [stp + %3 + idx21], m1

+  mova [stp + %3 + idx22], m2

+  mova [stp + %3 + idx23], m3

+%endmacro

+%macro RECON_AND_STORE 1 ; arg 1: byte offset from rsp of the buffer to read; rounds, adds to the 32x32 destination and stores; overwrites m0-m8, m11, r6, stp and advances outputq by 32 strides

+  mova            m11, [pw_32] ; rounding bias for the arithmetic shift by 6 below

+  lea             stp, [rsp + %1]

+  mov              r6, 32 ; loop counter: one iteration per destination row

+  pxor             m8, m8 ; zero register used to widen destination bytes to words

+%%recon_and_store:

+  mova             m0, [stp + 16 * 32 * 0] ; one 16-byte vector from each of four regions spaced 16*32 bytes apart

+  mova             m1, [stp + 16 * 32 * 1]

+  mova             m2, [stp + 16 * 32 * 2]

+  mova             m3, [stp + 16 * 32 * 3]

+  add             stp, 16 ; step to the next vector in every region

+  paddw            m0, m11

+  paddw            m1, m11

+  paddw            m2, m11

+  paddw            m3, m11

+  psraw            m0, 6 ; together with the bias: (x + 32) >> 6

+  psraw            m1, 6

+  psraw            m2, 6

+  psraw            m3, 6

+  movh             m4, [outputq +  0] ; current destination row, read in four pieces 8 bytes apart

+  movh             m5, [outputq +  8]

+  movh             m6, [outputq + 16]

+  movh             m7, [outputq + 24]

+  punpcklbw        m4, m8 ; interleave with zero: bytes become unsigned words

+  punpcklbw        m5, m8

+  punpcklbw        m6, m8

+  punpcklbw        m7, m8

+  paddw            m0, m4 ; add the rounded values to the destination pixels

+  paddw            m1, m5

+  paddw            m2, m6

+  paddw            m3, m7

+  packuswb         m0, m1 ; back to bytes with unsigned saturation

+  packuswb         m2, m3

+  mova [outputq +  0], m0 ; NOTE(review): if mova is an aligned move this requires outputq (and the stride) to be 16-byte aligned -- confirm the callers guarantee it

+  mova [outputq + 16], m2

+  lea         outputq, [outputq + strideq] ; NOTE(review): uses the full-width strideq although the prototype declares the stride as int -- confirm the upper bits are valid on entry

+  dec              r6

+  jnz %%recon_and_store

+%endmacro

+%define i32x32_size     16*32*5

+%define pass_two_start  16*32*0

+%define transposed_in   16*32*4

+%define pass_one_start  16*32*0

+%define stp r8

+INIT_XMM ssse3

+cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride ; stack scratch of i32x32_size bytes: results live in the first 16*32*4 bytes (pass_one_start == pass_two_start == 0), the transposed-input spill area starts at transposed_in

+  mova            m8, [pd_8192] ; rounding constant handed to the BUTTERFLY macros inside IDCT32X32_34x

+  lea            stp, [rsp + pass_one_start]

+idct32x32_34:

+  mov             r3, inputq

+  lea             r4, [rsp + transposed_in] ; IDCT32X32_34x spills through r4 and reloads through rsp + transposed_in

+idct32x32_34_transpose:

+  mova            m0, [r3 +       0] ; eight 16-byte loads spaced 16*4 bytes apart (presumably the first 8 coefficients of input rows 0-7, with 16-bit coefficients -- confirm tran_low_t width)

+  mova            m1, [r3 + 16 *  4]

+  mova            m2, [r3 + 16 *  8]

+  mova            m3, [r3 + 16 * 12]

+  mova            m4, [r3 + 16 * 16]

+  mova            m5, [r3 + 16 * 20]

+  mova            m6, [r3 + 16 * 24]

+  mova            m7, [r3 + 16 * 28]

+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

+  IDCT32X32_34x 16*0, 16*32, 16*64, 16*96 ; pass 1, executed once: the four groups of eight outputs land 16*32 bytes apart

+  lea            stp, [stp + 16 * 8] ; NOTE(review): has no effect -- stp is overwritten two instructions below before it is read

+  mov             r6, 4 ; pass 2 runs four times

+  lea            stp, [rsp + pass_one_start]

+  lea             r9, [rsp + pass_one_start] ; r9 = read cursor over the pass-1 results

+idct32x32_34_2:

+  lea             r4, [rsp + transposed_in]

+  mov             r3, r9

+idct32x32_34_transpose_2:

+  mova            m0, [r3 +      0] ; eight consecutive vectors = one group of pass-1 outputs

+  mova            m1, [r3 + 16 * 1]

+  mova            m2, [r3 + 16 * 2]

+  mova            m3, [r3 + 16 * 3]

+  mova            m4, [r3 + 16 * 4]

+  mova            m5, [r3 + 16 * 5]

+  mova            m6, [r3 + 16 * 6]

+  mova            m7, [r3 + 16 * 7]

+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

+  IDCT32X32_34x 16*0, 16*8, 16*16, 16*24 ; pass 2: the 32 outputs are contiguous and start at stp, which equals r9, so they replace the block just read

+  lea            stp, [stp + 16 * 32] ; next 16*32-byte output block

+  add             r9, 16 * 32 ; next group of eight pass-1 vectors

+  dec             r6

+  jnz idct32x32_34_2

+  RECON_AND_STORE pass_two_start

+  RET

 %endif

--

⑨