ref: 9fdfb8e92823abadc54d825d0603a7e9fd344520
dir: /vp8/encoder/x86/dct_sse2.asm/
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

global sym(vp8_short_fdct4x4_wmt)

; The x_c* values below are the cosines scaled by (1 << DCTCONSTANTSBITS),
; i.e. by 65536.  They are not referenced by the code in this file; the
; coefficient table dct_matrix_sse2 stores the same cosines at half that
; scale (23170, 30274, 12540).
%define DCTCONSTANTSBITS (16)
%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
%define x_c1 (60547) ; cos(pi  /8) * (1<<16)
%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)

; Arithmetic right shift applied to the 32-bit sums of each pass.  The
; rounding tables at the end of the file are derived from these values.
%define _1STSTAGESHIFT 14
%define _2NDSTAGESHIFT 16

;-----------------------------------------------------------------------
; void vp8_short_fdct4x4_wmt(short *input, short *output)
;
; Forward 4x4 transform computed as two passes of a matrix multiply.
;
; In:   arg(0) = input   32 bytes are read with two unaligned loads
;       arg(1) = output  32 bytes are written with two unaligned stores
;
; Each pass treats its 16 shorts as four groups of four (g0..g3).  For
; every coefficient row k of dct_matrix_sse2 it forms the four 32-bit
; dot products (g0.k, g1.k, g2.k, g3.k), adds the rounding constant and
; shifts right.  Packing those "columns" back to shorts leaves the data
; transposed, so running the same code a second time transforms the
; other dimension.
;
; Register use: rax = input, rcx = output, rdx = &dct_matrix_sse2,
; xmm0-xmm5 scratch.  xmm6 and above are deliberately not used, so no
; XMM register that is callee-saved under the Microsoft x64 convention
; is modified.  rbx is handed to GET_GOT / RESTORE_GOT.
;-----------------------------------------------------------------------
;; using matrix multiply
sym(vp8_short_fdct4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)             ; input
    mov         rcx, arg(1)             ; output

    lea         rdx, [dct_matrix_sse2 GLOBAL]

    movdqu      xmm0, [rax]             ; g0 | g1 (input shorts 0..7)
    movdqu      xmm1, [rax+16]          ; g2 | g3 (input shorts 8..15)

    ; ------------------------------------------------------------------
    ; first pass
    ; ------------------------------------------------------------------

    ; first column (coefficient row 0).  The row is read straight from
    ; memory, like rows 1..3 below, instead of being parked in xmm7.
    movdqa      xmm2, xmm0
    pmaddwd     xmm2, [rdx]             ; pair sums for g0, g1
    movdqa      xmm3, xmm1
    pmaddwd     xmm3, [rdx]             ; pair sums for g2, g3

    ; interleave so that the two pair sums of every group line up lane
    ; by lane, then add them to obtain one full dot product per group
    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3
    movdqa      xmm3, xmm2
    punpckldq   xmm2, xmm4              ; first pair sum of g0..g3
    punpckhdq   xmm3, xmm4              ; second pair sum of g0..g3
    paddd       xmm2, xmm3

    paddd       xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
    psrad       xmm2, _1STSTAGESHIFT

    ; second column (coefficient row 1)
    movdqa      xmm3, xmm0
    pmaddwd     xmm3, [rdx+16]
    movdqa      xmm4, xmm1
    pmaddwd     xmm4, [rdx+16]

    movdqa      xmm5, xmm3
    punpckldq   xmm3, xmm4
    punpckhdq   xmm5, xmm4
    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm5
    punpckhdq   xmm4, xmm5
    paddd       xmm3, xmm4

    paddd       xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
    psrad       xmm3, _1STSTAGESHIFT

    packssdw    xmm2, xmm3              ; xmm2 = column 0 | column 1 as shorts

    ; third column (coefficient row 2)
    movdqa      xmm3, xmm0
    pmaddwd     xmm3, [rdx+32]
    movdqa      xmm4, xmm1
    pmaddwd     xmm4, [rdx+32]

    movdqa      xmm5, xmm3
    punpckldq   xmm3, xmm4
    punpckhdq   xmm5, xmm4
    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm5
    punpckhdq   xmm4, xmm5
    paddd       xmm3, xmm4

    paddd       xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
    psrad       xmm3, _1STSTAGESHIFT

    ; fourth column (this is the last column, so we do not have to save
    ; the source any more)
    pmaddwd     xmm0, [rdx+48]
    pmaddwd     xmm1, [rdx+48]

    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm4
    punpckhdq   xmm1, xmm4
    paddd       xmm0, xmm1

    paddd       xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
    psrad       xmm0, _1STSTAGESHIFT

    packssdw    xmm3, xmm0              ; xmm3 = column 2 | column 3 as shorts

    ; ------------------------------------------------------------------
    ; done with one pass
    ; now start second pass
    ; ------------------------------------------------------------------
    movdqa      xmm0, xmm2              ; keep the first-pass result as the
    movdqa      xmm1, xmm3              ; source for columns 2..4 below

    ; first column (coefficient row 0)
    pmaddwd     xmm2, [rdx]
    pmaddwd     xmm3, [rdx]

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3
    movdqa      xmm3, xmm2
    punpckldq   xmm2, xmm4
    punpckhdq   xmm3, xmm4
    paddd       xmm2, xmm3

    paddd       xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
    psrad       xmm2, _2NDSTAGESHIFT

    ; second column (coefficient row 1)
    movdqa      xmm3, xmm0
    pmaddwd     xmm3, [rdx+16]
    movdqa      xmm4, xmm1
    pmaddwd     xmm4, [rdx+16]

    movdqa      xmm5, xmm3
    punpckldq   xmm3, xmm4
    punpckhdq   xmm5, xmm4
    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm5
    punpckhdq   xmm4, xmm5
    paddd       xmm3, xmm4

    paddd       xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
    psrad       xmm3, _2NDSTAGESHIFT

    packssdw    xmm2, xmm3
    movdqu      [rcx], xmm2             ; output shorts 0..7

    ; third column (coefficient row 2)
    movdqa      xmm3, xmm0
    pmaddwd     xmm3, [rdx+32]
    movdqa      xmm4, xmm1
    pmaddwd     xmm4, [rdx+32]

    movdqa      xmm5, xmm3
    punpckldq   xmm3, xmm4
    punpckhdq   xmm5, xmm4
    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm5
    punpckhdq   xmm4, xmm5
    paddd       xmm3, xmm4

    paddd       xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
    psrad       xmm3, _2NDSTAGESHIFT

    ; fourth column (sources are consumed in place)
    pmaddwd     xmm0, [rdx+48]
    pmaddwd     xmm1, [rdx+48]

    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm4
    punpckhdq   xmm1, xmm4
    paddd       xmm0, xmm1

    paddd       xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
    psrad       xmm0, _2NDSTAGESHIFT

    packssdw    xmm3, xmm0
    movdqu      [rcx+16], xmm3          ; output shorts 8..15

    ; NOTE(review): this resets rsp before RESTORE_GOT runs.  The body
    ; above never moves rsp itself, so the instruction only matters if
    ; the prolog macros pushed something.  If GET_GOT saves rbx on the
    ; stack in some configuration, RESTORE_GOT would then pop the wrong
    ; slot -- confirm against vpx_ports/x86_abi_support.asm.
    mov         rsp, rbp
    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding terms: half of the divisor implied by each stage's shift, added
; before psrad so that the shift rounds instead of truncating.

;static unsigned int dct1st_stage_rounding_sse2[4] =
align 16
dct1st_stage_rounding_sse2:
    times 4 dd (1 << (_1STSTAGESHIFT - 1))      ; 8192

;static unsigned int dct2nd_stage_rounding_sse2[4] =
align 16
dct2nd_stage_rounding_sse2:
    times 4 dd (1 << (_2NDSTAGESHIFT - 1))      ; 32768

; Four coefficient rows of eight shorts.  Every row holds its four
; coefficients twice so that one pmaddwd covers two groups of four input
; shorts.  The table is 16-byte aligned because the rows are used as
; direct memory operands of pmaddwd.
;static short dct_matrix_sse2[4][8]=
align 16
dct_matrix_sse2:
    ; row 0
    times 8 dw  23170

    ; row 1
    dw  30274
    dw  12540
    dw -12540
    dw -30274
    dw  30274
    dw  12540
    dw -12540
    dw -30274

    ; row 2
    dw  23170
    times 2 dw -23170
    times 2 dw  23170
    times 2 dw -23170
    dw  23170

    ; row 3
    dw  12540
    dw -30274
    dw  30274
    dw -12540
    dw  12540
    dw -30274
    dw  30274
    dw -12540