ref: 9d0b2cbbceaa672ec6307a6362cb041e8788c067
dir: /vp8/common/x86/postproc_mmx.c/
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * Post-processing kernels written as MSVC 32-bit inline assembly
 * (__asm blocks, MMX "mm*" and SSE2 "xmm*" registers).
 *
 * Reading notes that apply to the whole file:
 *  - Many of the per-instruction ';' comments were copied from one tap to
 *    the next and name the wrong register, row or pixel range (for
 *    example "movq mm3, [esi] ; mm4 = r0 p0..p7").  In the *_xmm routines
 *    they still say "mm" and "p0..p3" although the code works on xmm
 *    registers and 8 pixels.  Trust the instructions, not those comments.
 *  - None of the routines that touch mm registers executes emms on exit.
 *    NOTE(review): presumably the callers restore the x87 state
 *    themselves -- confirm against the call sites.
 *  - All column/row loops are bottom-tested, so each runs at least once
 *    even when the count is zero or negative.
 */

#include <math.h>
#include <stdlib.h>
#include "vpx_scale/yv12config.h"
#include "pragmas.h"

/* The five Blur taps used below sum to 128, i.e. a right shift by 7. */
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT  7

/* static constants */
/*
 * Filter taps, 8 shorts (16 bytes) per tap.  The MMX code reads the first
 * four words of the groups at byte offsets 0, 16, 32, 48 and 64, i.e. the
 * weights 16, 16, 64, 16, 16 for rows/pixels -2, -1, 0, +1, +2.
 * The last (all zero) group is not referenced by the code in this file.
 */
__declspec(align(16))
const static short  Blur[48] =
{
    16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16,
    64, 64, 64, 64, 64, 64, 64, 64,
    16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16,
    0,  0,  0,  0,  0,  0,  0,  0,
};

/*
 * Rounding constants, written as macros so that they can be emitted either
 * at file scope (default) or as locals inside each function when
 * RELOCATEABLE is defined.
 *   rd   : four words of 64 (0x0040), added before the >> VP8_FILTER_SHIFT.
 *   rd42 : eight words of 4 (0x0004), added before the >> 3 in the xmm code.
 */
#define RD   __declspec(align(16)) __int64 rd  = 0x0040004000400040;
#define R4D2 __declspec(align(16)) __int64 rd42[2]  = {0x0004000400040004,0x0004000400040004};

#ifndef RELOCATEABLE
const static RD;
const static R4D2;
#endif

/* external references */
extern double vp8_gaussian(double sigma, double mu, double x);
extern short vp8_rv[];
extern int vp8_q2mbl(int x) ;

/*
 * vp8_post_proc_down_and_across_mmx
 *
 * Thresholded 5-tap smoothing, vertical then horizontal, 4 pixels per
 * iteration.  For each of `rows` rows:
 *
 *   1. Down pass (label nextcol): for every group of 4 pixels, read source
 *      rows -2..+2 around src_ptr (stride src_pixels_per_line) and compute
 *          (16*r-2 + 16*r-1 + 64*r0 + 16*r+1 + 16*r+2 + 64) >> 7.
 *      If the absolute difference between the centre pixel and any of the
 *      four neighbours is greater than flimit, the unfiltered centre pixel
 *      is written to dst_ptr instead of the blurred value.
 *   2. Across pass (label acrossnextcol): the same filter and threshold
 *      applied horizontally (pixels -2..+2), in place on the destination
 *      row just written.  Each group's result is kept in eax and stored
 *      one iteration later, so the next group still reads unfiltered
 *      pixels to its left.
 *
 * Parameters:
 *   src_ptr              first pixel of the first source row to filter
 *   dst_ptr              first pixel of the first destination row
 *   src_pixels_per_line  source stride in bytes
 *   dst_pixels_per_line  destination stride in bytes
 *   rows                 number of rows to process
 *   cols                 number of pixels per row; processed in steps of 4
 *   flimit               threshold; only its low 16 bits are used (they are
 *                        broadcast to the four words of mm2)
 *
 * Memory touched outside the nominal rows x cols area (visible in the
 * addressing below): source rows -2 and -1 above and +1, +2 below each
 * row; 8-byte loads although only 4 pixels are consumed per load; the four
 * destination bytes at dst[-4..-1] are read and written back unchanged.
 * NOTE(review): this presumably relies on frame-buffer borders -- confirm.
 */
void vp8_post_proc_down_and_across_mmx
(
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
    int src_pixels_per_line,
    int dst_pixels_per_line,
    int rows,
    int cols,
    int flimit
)
{
#ifdef RELOCATEABLE
    RD
    R4D2
#endif

    __asm
    {
        push        ebx
        lea         ebx, Blur
        movd        mm2, flimit
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2

        mov         esi,        src_ptr
        mov         edi,        dst_ptr

        mov         ecx, DWORD PTR rows
        mov         eax, src_pixels_per_line ; destination pitch?
        pxor        mm0, mm0              ; mm0 = 00000000

        nextrow:

        xor         edx,        edx       ; clear out edx for use as loop counter
        nextcol:

        pxor        mm7, mm7              ; mm7 = 00000000
        movq        mm6, [ebx + 32 ]      ; mm6 = kernel 2 taps
        movq        mm3, [esi]            ; mm4 = r0 p0..p7
        punpcklbw   mm3, mm0              ; mm3 = p0..p3
        movq        mm1, mm3              ; mm1 = p0..p3
        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers

        movq        mm6, [ebx + 48]       ; mm6 = kernel 3 taps
        movq        mm5, [esi + eax]      ; mm4 = r1 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm7, mm1              ; mm7 = r0 p0..p3
        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
        pcmpgtw     mm7, mm2

        movq        mm6, [ebx + 64 ]      ; mm6 = kernel 4 modifiers
        movq        mm5, [esi + 2*eax]    ; mm4 = r2 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r2 p0..p3
        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        // eax is negated here to address the rows above; it is negated
        // back before the loop branch.
        neg         eax
        movq        mm6, [ebx ]           ; kernel 0 taps
        movq        mm5, [esi+2*eax]      ; mm4 = r-2 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        movq        mm6, [ebx + 16]       ; kernel 1 taps
        movq        mm4, [esi+eax]        ; mm4 = r-1 p0..p7
        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     mm4, mm1              ; mm5 = r-1 p0..p3 - p0..p3
        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        paddusw     mm3, rd               ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 /= 128

        // mm7 is all ones in every word where some neighbour exceeded
        // flimit: keep the source pixel there, the blurred one elsewhere.
        pand        mm1, mm7              ; mm1 select vals > thresh from source
        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
        paddusw     mm1, mm7              ; combination

        packuswb    mm1, mm0              ; pack to bytes

        movd        [edi], mm1            ;
        neg         eax                   ; pitch is positive

        add         esi, 4
        add         edi, 4
        add         edx, 4

        cmp         edx, cols
        jl          nextcol
        // done with all the cols, start the across filtering in place
        sub         esi, edx
        sub         edi, edx

        // eax is borrowed as the one-group store delay register; the
        // source pitch is saved on the stack until the row is finished.
        push        eax
        xor         edx,    edx
        mov         eax,    [edi-4];

        acrossnextcol:
        pxor        mm7, mm7              ; mm7 = 00000000
        movq        mm6, [ebx + 32 ]      ;
        movq        mm4, [edi+edx]        ; mm4 = p0..p7
        movq        mm3, mm4              ; mm3 = p0..p7
        punpcklbw   mm3, mm0              ; mm3 = p0..p3
        movq        mm1, mm3              ; mm1 = p0..p3
        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers

        movq        mm6, [ebx + 48]
        psrlq       mm4, 8                ; mm4 = p1..p7
        movq        mm5, mm4              ; mm5 = p1..p7
        punpcklbw   mm5, mm0              ; mm5 = p1..p4
        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm7, mm1              ; mm7 = p0..p3
        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm7, mm2

        movq        mm6, [ebx + 64 ]
        psrlq       mm4, 8                ; mm4 = p2..p7
        movq        mm5, mm4              ; mm5 = p2..p7
        punpcklbw   mm5, mm0              ; mm5 = p2..p5
        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        movq        mm6, [ebx ]
        movq        mm4, [edi+edx-2]      ; mm4 = p-2..p5
        movq        mm5, mm4              ; mm5 = p-2..p5
        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        movq        mm6, [ebx + 16]
        psrlq       mm4, 8                ; mm4 = p-1..p5
        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
        paddusw     mm3, mm6              ; mm3 += mm5

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm4              ; mm6 = p0..p3 - p1..p4
        psubusw     mm4, mm1              ; mm5 = p1..p4 - p0..p3
        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        paddusw     mm3, rd               ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 /= 128

        pand        mm1, mm7              ; mm1 select vals > thresh from source
        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
        paddusw     mm1, mm7              ; combination

        packuswb    mm1, mm0              ; pack to bytes
        mov         DWORD PTR [edi+edx-4],  eax   ; store previous four bytes
        movd        eax,    mm1

        add         edx, 4
        cmp         edx, cols
        jl          acrossnextcol;

        // flush the result of the last group
        mov         DWORD PTR [edi+edx-4],  eax
        pop         eax

        // done with this row
        add         esi, eax              ; next line
        mov         eax, dst_pixels_per_line ; destination pitch?
        add         edi, eax              ; next destination
        mov         eax, src_pixels_per_line ; destination pitch?

        dec         ecx                   ; decrement count
        jnz         nextrow               ; next row
        pop         ebx
    }
}

/*
 * vp8_post_proc_down_and_across_xmm
 *
 * SSE2 counterpart of vp8_post_proc_down_and_across_mmx, processing 8
 * pixels per iteration.  The weights are applied as
 *     (r-2 + r-1 + 4*r0 + r+1 + r+2 + 4) >> 3
 * (centre shifted left by 2, rounding constant rd42), and the same
 * "any neighbour differs from the centre by more than flimit -> keep the
 * source pixel" selection is used.  flimit's low 16 bits are broadcast to
 * the eight words of xmm2.
 *
 * The across pass runs in place on the destination row; the previous
 * group's 8 result bytes are held in mm0 and stored one iteration later.
 * dst[-8..-1] is read and written back unchanged, and each across
 * iteration reads 12 bytes starting at pixel -2 of the group.
 * cols is processed in steps of 8.  Parameters are as for the MMX routine.
 */
void vp8_post_proc_down_and_across_xmm
(
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
    int src_pixels_per_line,
    int dst_pixels_per_line,
    int rows,
    int cols,
    int flimit
)
{
#ifdef RELOCATEABLE
    R4D2
#endif

    __asm
    {
        movd        xmm2, flimit
        punpcklwd   xmm2, xmm2
        punpckldq   xmm2, xmm2
        punpcklqdq  xmm2, xmm2

        mov         esi,        src_ptr
        mov         edi,        dst_ptr

        mov         ecx, DWORD PTR rows
        mov         eax, src_pixels_per_line ; destination pitch?
        pxor        xmm0, xmm0              ; mm0 = 00000000

        nextrow:

        xor         edx,        edx       ; clear out edx for use as loop counter
        nextcol:
        movq        xmm3, QWORD PTR [esi]         ; mm4 = r0 p0..p7
        punpcklbw   xmm3, xmm0                    ; mm3 = p0..p3
        movdqa      xmm1, xmm3                    ; mm1 = p0..p3
        psllw       xmm3, 2                       ;

        movq        xmm5, QWORD PTR [esi + eax]   ; mm4 = r1 p0..p7
        punpcklbw   xmm5, xmm0                    ; mm5 = r1 p0..p3
        paddusw     xmm3, xmm5                    ; mm3 += mm6

        ; thresholding
        movdqa      xmm7, xmm1                    ; mm7 = r0 p0..p3
        psubusw     xmm7, xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
        psubusw     xmm5, xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
        paddusw     xmm7, xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
        pcmpgtw     xmm7, xmm2

        movq        xmm5, QWORD PTR [esi + 2*eax] ; mm4 = r2 p0..p7
        punpcklbw   xmm5, xmm0                    ; mm5 = r2 p0..p3
        paddusw     xmm3, xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6, xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
        psubusw     xmm5, xmm1                    ; mm5 = r2 p0..p3 - r2 p0..p3
        paddusw     xmm6, xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        neg         eax
        movq        xmm5, QWORD PTR [esi+2*eax]   ; mm4 = r-2 p0..p7
        punpcklbw   xmm5, xmm0                    ; mm5 = r-2 p0..p3
        paddusw     xmm3, xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6, xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     xmm5, xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
        paddusw     xmm6, xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        movq        xmm4, QWORD PTR [esi+eax]     ; mm4 = r-1 p0..p7
        punpcklbw   xmm4, xmm0                    ; mm4 = r-1 p0..p3
        paddusw     xmm3, xmm4                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6, xmm4                    ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     xmm4, xmm1                    ; mm5 = r-1 p0..p3 - p0..p3
        paddusw     xmm6, xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        paddusw     xmm3, rd42                    ; mm3 += round value
        psraw       xmm3, 3                       ; mm3 /= 8

        pand        xmm1, xmm7                    ; mm1 select vals > thresh from source
        pandn       xmm7, xmm3                    ; mm7 select vals < thresh from blurred result
        paddusw     xmm1, xmm7                    ; combination

        packuswb    xmm1, xmm0                    ; pack to bytes
        movq        QWORD PTR [edi], xmm1         ;

        neg         eax                           ; pitch is positive
        add         esi, 8
        add         edi, 8

        add         edx, 8
        cmp         edx, cols

        jl          nextcol

        // done with all the cols, start the across filtering in place
        sub         esi, edx
        sub         edi, edx

        xor         edx,    edx
        movq        mm0,    QWORD PTR [edi-8];

        acrossnextcol:
        // Build p-2..p9 in the low 12 bytes of xmm4: 8 bytes from pixel
        // -2 plus 4 bytes from pixel +6.
        movq        xmm7, QWORD PTR [edi +edx -2]
        movd        xmm4, DWORD PTR [edi +edx +6]

        pslldq      xmm4, 8
        por         xmm4, xmm7

        movdqa      xmm3, xmm4
        psrldq      xmm3, 2
        punpcklbw   xmm3, xmm0                    ; mm3 = p0..p3
        movdqa      xmm1, xmm3                    ; mm1 = p0..p3
        psllw       xmm3, 2

        movdqa      xmm5, xmm4
        psrldq      xmm5, 3
        punpcklbw   xmm5, xmm0                    ; mm5 = p1..p4
        paddusw     xmm3, xmm5                    ; mm3 += mm6

        ; thresholding
        movdqa      xmm7, xmm1                    ; mm7 = p0..p3
        psubusw     xmm7, xmm5                    ; mm7 = p0..p3 - p1..p4
        psubusw     xmm5, xmm1                    ; mm5 = p1..p4 - p0..p3
        paddusw     xmm7, xmm5                    ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm7, xmm2

        movdqa      xmm5, xmm4
        psrldq      xmm5, 4
        punpcklbw   xmm5, xmm0                    ; mm5 = p2..p5
        paddusw     xmm3, xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = p0..p3
        psubusw     xmm6, xmm5                    ; mm6 = p0..p3 - p1..p4
        psubusw     xmm5, xmm1                    ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6, xmm5                    ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        movdqa      xmm5, xmm4                    ; mm5 = p-2..p5
        punpcklbw   xmm5, xmm0                    ; mm5 = p-2..p1
        paddusw     xmm3, xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = p0..p3
        psubusw     xmm6, xmm5                    ; mm6 = p0..p3 - p1..p4
        psubusw     xmm5, xmm1                    ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6, xmm5                    ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        psrldq      xmm4, 1                       ; mm4 = p-1..p5
        punpcklbw   xmm4, xmm0                    ; mm4 = p-1..p2
        paddusw     xmm3, xmm4                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1                    ; mm6 = p0..p3
        psubusw     xmm6, xmm4                    ; mm6 = p0..p3 - p1..p4
        psubusw     xmm4, xmm1                    ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6, xmm4                    ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6                    ; accumulate thresholds

        paddusw     xmm3, rd42                    ; mm3 += round value
        psraw       xmm3, 3                       ; mm3 /= 8

        pand        xmm1, xmm7                    ; mm1 select vals > thresh from source
        pandn       xmm7, xmm3                    ; mm7 select vals < thresh from blurred result
        paddusw     xmm1, xmm7                    ; combination

        packuswb    xmm1, xmm0                    ; pack to bytes
        movq        QWORD PTR [edi+edx-8],  mm0   ; store previous four bytes
        movdq2q     mm0,    xmm1

        add         edx, 8
        cmp         edx, cols
        jl          acrossnextcol;

        // last 8 pixels
        movq        QWORD PTR [edi+edx-8],  mm0

        // done with this row
        add         esi, eax                      ; next line
        mov         eax, dst_pixels_per_line      ; destination pitch?
        add         edi, eax                      ; next destination
        mov         eax, src_pixels_per_line      ; destination pitch?

        dec         ecx                           ; decrement count
        jnz         nextrow                       ; next row
    }
}

/*
 * vp8_mbpost_proc_down_mmx
 *
 * Vertical, variance-gated smoothing applied in place, one strip of 4
 * columns at a time.
 *
 * For every strip a running sum (mm5, four words) and a running sum of
 * squares (mm6/mm7, four dwords) are kept over a 15-row window.  The
 * window is primed with rows -8..+6 and then, per iteration, the row at
 * +7 is added and the row at -8 is removed, which centres it on the
 * current row.  For each pixel:
 *
 *     if (15 * sumsq - sum * sum - flimit < 0)
 *         pixel = (pixel + sum + vp8_rv[(row & 127) + lane]) >> 4;
 *
 * where flimit has first been mapped through vp8_q2mbl() and "lane" is the
 * pixel's position 0..3 within the strip.
 *
 * Results are parked in the small ring buffer d[] and written back 8
 * iterations later, to the row that is leaving the window, so the window
 * always sees unfiltered input.  `rows` is increased by 8 to flush those
 * delayed stores.
 *
 * Consequences visible in the code (rows numbered relative to dst, using
 * the caller's value of rows):
 *   - rows -8 .. rows+14 are read;
 *   - rows -8 .. rows-1 are written.  The first 8 stores read d[] slots
 *     that have not been filled yet, so rows -8..-1 receive uninitialised
 *     stack contents.
 *     NOTE(review): presumably those rows are border rows whose contents
 *     do not matter -- confirm.
 *
 * Parameters:
 *   dst     first pixel of the plane (filtered in place)
 *   pitch   stride in bytes
 *   rows    number of rows
 *   cols    number of columns; processed in strips of 4
 *   flimit  value passed to vp8_q2mbl() to obtain the variance limit
 */
void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
    int c, i;
    __declspec(align(16))
    int flimit2[2];
    __declspec(align(16))
    unsigned char d[16][8];

    flimit = vp8_q2mbl(flimit);

    for (i = 0; i < 2; i++)
        flimit2[i] = flimit;

    rows += 8;

    for (c = 0; c < cols; c += 4)
    {
        unsigned char *s = &dst[c];

        __asm
        {
            mov         esi,        s           ;
            pxor        mm0,        mm0     ;

            mov         eax,        pitch       ;
            neg         eax                                     // eax = -pitch

            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
            neg         eax

            pxor        mm5,        mm5
            pxor        mm6,        mm6     ;

            pxor        mm7,        mm7     ;
            mov         edi,        esi

            mov         ecx,        15          ;

            // prime sum / sum of squares with the 15 rows s[-8*pitch]..s[6*pitch]
            loop_initvar:
            movd        mm1,        DWORD PTR [edi];
            punpcklbw   mm1,        mm0     ;

            paddw       mm5,        mm1     ;
            pmullw      mm1,        mm1     ;

            movq        mm2,        mm1     ;
            punpcklwd   mm1,        mm0     ;

            punpckhwd   mm2,        mm0     ;
            paddd       mm6,        mm1     ;

            paddd       mm7,        mm2     ;
            lea         edi,        [edi+eax]   ;

            dec         ecx
            jne         loop_initvar
            //save the var and sum
            xor         edx,        edx
            loop_row:
            movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
            movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2
            psubw       mm5,        mm1

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2

            // mm3/mm4 = 15 * sumsq (x*16 - x) for the low/high pixel pair
            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1

            pmulhw      mm4,        mm4
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4
            punpckhwd   mm2,        mm4

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            // sign bit -> all-ones mask where 15*sumsq - sum*sum < flimit
            psrad       mm3,        31
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0

            movd        mm1,        DWORD PTR [esi+eax*8]

            movq        mm2,        mm1
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5
            mov         ecx,        edx

            and         ecx,        127
            movq        mm4,        vp8_rv[ecx*2]

            paddw       mm1,        mm4
            //paddw     xmm1,       eight8s
            psraw       mm1,        4

            packuswb    mm1,        mm0
            pand        mm1,        mm3

            pandn       mm3,        mm2
            por         mm1,        mm3

            and         ecx,        15
            movd        DWORD PTR   d[ecx*4], mm1

            // write back the result computed 8 iterations ago
            mov         ecx,        edx
            sub         ecx,        8

            and         ecx,        15
            movd        mm1,        DWORD PTR d[ecx*4]

            movd        [esi],      mm1
            lea         esi,        [esi+eax]

            lea         edi,        [edi+eax]
            add         edx,        1

            cmp         edx,        rows
            jl          loop_row

        }

    }
}

/*
 * vp8_mbpost_proc_down_xmm
 *
 * SSE2 counterpart of vp8_mbpost_proc_down_mmx working on strips of 8
 * columns: xmm5 holds the eight running sums, xmm6/xmm7 the running sums
 * of squares for the low/high four pixels.  Window, test, replacement
 * value ((pixel + sum + vp8_rv[...]) >> 4) and the 8-row delayed store
 * through d[] are the same; the delayed store itself goes through mm0.
 * As in the MMX routine, the first 8 delayed stores read d[] slots that
 * have not been written yet and place them in rows -8..-1.
 */
void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
    int c, i;
    __declspec(align(16))
    int flimit4[4];
    __declspec(align(16))
    unsigned char d[16][8];

    flimit = vp8_q2mbl(flimit);

    for (i = 0; i < 4; i++)
        flimit4[i] = flimit;

    rows += 8;

    for (c = 0; c < cols; c += 8)
    {
        unsigned char *s = &dst[c];

        __asm
        {
            mov         esi,        s           ;
            pxor        xmm0,       xmm0        ;

            mov         eax,        pitch       ;
            neg         eax                                     // eax = -pitch

            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
            neg         eax

            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;

            pxor        xmm7,       xmm7        ;
            mov         edi,        esi

            mov         ecx,        15          ;

            loop_initvar:
            movq        xmm1,       QWORD PTR [edi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         edi,        [edi+eax]   ;

            dec         ecx
            jne         loop_initvar
            //save the var and sum
            xor         edx,        edx
            loop_row:
            movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
            movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [esi+eax*8]

            movq        xmm2,       xmm1
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         ecx,        edx

            and         ecx,        127
            movdqu      xmm4,       vp8_rv[ecx*2]

            paddw       xmm1,       xmm4
            //paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3

            pandn       xmm3,       xmm2
            por         xmm1,       xmm3

            and         ecx,        15
            movq        QWORD PTR   d[ecx*8], xmm1

            mov         ecx,        edx
            sub         ecx,        8

            and         ecx,        15
            movq        mm0,        d[ecx*8]

            movq        [esi],      mm0
            lea         esi,        [esi+eax]

            lea         edi,        [edi+eax]
            add         edx,        1

            cmp         edx,        rows
            jl          loop_row

        }

    }
}
#if 0
/****************************************************************************
 *
 *  ROUTINE       : plane_add_noise_wmt
 *
 *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
 *                                  noise to
 *                  unsigned int Width    width of plane
 *                  unsigned int Height   height of plane
 *                  int  Pitch    distance between subsequent lines of frame
 *                  int  q        quantizer used to determine amount of noise
 *                                  to add
 *                  int  a        offset added to the sigma of the gaussian
 *                                  (see the computation of sigma below)
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void.
 *
 *  FUNCTION      : adds gaussian noise to a plane of pixels
 *
 *  SPECIAL NOTES : Disabled (#if 0).  This variant builds its noise table
 *                  on every call; the enabled variant further down reads
 *                  the external tables an[][][] and cd[][] instead.
 *
 ****************************************************************************/
void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
    unsigned int i;

    __declspec(align(16)) unsigned char blackclamp[16];
    __declspec(align(16)) unsigned char whiteclamp[16];
    __declspec(align(16)) unsigned char bothclamp[16];
    char char_dist[300];
    char Rand[2048];
    double sigma;
//    return;
    __asm emms
    sigma = a + .5 + .6 * (63 - q) / 63.0;

    // set up a lookup table of 256 entries that matches
    // a gaussian distribution with sigma determined by q.
    //
    {
        double i;
        int next, j;

        next = 0;

        for (i = -32; i < 32; i++)
        {
            double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
            int a = (int)(g + .5);

            if (a)
            {
                for (j = 0; j < a; j++)
                {
                    char_dist[next+j] = (char) i;
                }

                next = next + j;
            }

        }

        for (next = next; next < 256; next++)
            char_dist[next] = 0;
    }

    for (i = 0; i < 2048; i++)
    {
        Rand[i] = char_dist[rand() & 0xff];
    }

    for (i = 0; i < 16; i++)
    {
        blackclamp[i] = -char_dist[0];
        whiteclamp[i] = -char_dist[0];
        bothclamp[i] = -2 * char_dist[0];
    }

    for (i = 0; i < Height; i++)
    {
        unsigned char *Pos = Start + i * Pitch;
        char  *Ref = Rand + (rand() & 0xff);

        __asm
        {
            mov ecx, [Width]
            mov esi, Pos
            mov edi, Ref
            xor         eax, eax

            nextset:
            movdqu      xmm1, [esi+eax]        // get the source

            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
            paddusb     xmm1, bothclamp
            psubusb     xmm1, whiteclamp

            movdqu      xmm2, [edi+eax]        // get the noise for this line
            paddb       xmm1, xmm2             // add it in
            movdqu      [esi+eax], xmm1        // store the result

            add         eax, 16                // advance to the next 16 bytes

            cmp         eax, ecx
            jl          nextset


        }

    }
}
#endif
/* Rounding term (8 per dword) added before the >> 4 in the routine below. */
__declspec(align(16))
static const int four8s[4] = { 8, 8, 8, 8};

/*
 * vp8_mbpost_proc_across_ip_xmm
 *
 * Horizontal, variance-gated smoothing applied in place ("ip"), one row at
 * a time, 4 pixels per asm iteration.
 *
 * For each row the C loop primes sum and sumsq over s[-8..6].  The asm
 * loop then slides that 15-pixel window: for each of the 4 pixels it adds
 * the pixel entering at +7 and removes the one leaving at -8 (the change
 * in sumsq is formed as (new + old) * (new - old)), producing a per-pixel
 * sum in xmm6 and sum of squares in xmm7.  For each pixel:
 *
 *     if (15 * sumsq - sum * sum - flimit < 0)
 *         pixel = (pixel + sum + 8) >> 4;
 *
 * with flimit first mapped through vp8_q2mbl().
 *
 * Stores are delayed by two iterations (8 pixels) through mm0/mm1 so the
 * window keeps reading unfiltered pixels; the loop therefore runs until
 * ecx reaches cols + 8.  mm0 and mm1 start out as zero, so the first two
 * stores write zeros to s[-8..-1] of every row.  Each iteration also
 * reads 4 bytes starting at s[ecx + 7], i.e. past the end of the row.
 * NOTE(review): presumably both rely on frame-buffer borders -- confirm.
 *
 * Parameters:
 *   src     first pixel of the plane (filtered in place)
 *   pitch   stride in bytes
 *   rows    number of rows
 *   cols    number of columns; processed 4 at a time
 *   flimit  value passed to vp8_q2mbl() to obtain the variance limit
 */
void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
{
    int r, i;
    __declspec(align(16))
    int flimit4[4];
    unsigned char *s = src;
    int sumsq;
    int sum;


    flimit = vp8_q2mbl(flimit);
    flimit4[0] =
        flimit4[1] =
            flimit4[2] =
                flimit4[3] = flimit;

    for (r = 0; r < rows; r++)
    {


        sumsq = 0;
        sum = 0;

        for (i = -8; i <= 6; i++)
        {
            sumsq += s[i] * s[i];
            sum   += s[i];
        }

        __asm
        {
            mov         eax,    sumsq
            movd        xmm7,   eax

            mov         eax,    sum
            movd        xmm6,   eax

            mov         esi,    s
            xor         ecx,    ecx

            mov         edx,    cols
            add         edx,    8
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
            nextcol4:

            movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5
            movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10

            punpcklbw   xmm1,   xmm0                    // expanding
            punpcklbw   xmm2,   xmm0                    // expanding

            punpcklwd   xmm1,   xmm0                    // expanding to dwords
            punpcklwd   xmm2,   xmm0                    // expanding to dwords

            psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5
            paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2

            paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5
            pmaddwd     xmm1,   xmm2                    // squared of 7+-8   8+-7   9+-6 10+-5

            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               // duplicate the last ones
            pshufd      xmm7,   xmm7,   0               // duplicate the last ones

            psrldq      xmm1,       4                   // 8--7   9--6 10--5  0000
            psrldq      xmm2,       4                   // 8--7   9--6 10--5  0000

            pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 squared
            pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 squared

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 squared
            pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 squared

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   8--7   8--7 squared
            pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   8--7   8--7 squared

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7
            psubd       xmm5,   xmm3

            psubd       xmm5,   flimit4
            psrad       xmm5,   31

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [esi+ecx]
            movq        xmm2,   xmm1

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   four8s

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5

            pandn       xmm5,   xmm2
            por         xmm5,   xmm1

            // two-stage delay line: store the result from two iterations ago
            movd        [esi+ecx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            // carry the last pixel's sum / sumsq into lane 0 for the next group
            psrldq      xmm7,   12

            psrldq      xmm6,   12
            add         ecx,    4

            cmp         ecx,    edx
            jl          nextcol4

        }
        s += pitch;
    }
}

#if 0
/****************************************************************************
 *
 *  ROUTINE       : plane_add_noise_mmx
 *
 *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
 *                                  noise to
 *                  unsigned int Width    width of plane
 *                  unsigned int Height   height of plane
 *                  int  Pitch    distance between subsequent lines of frame
 *                  int  q        quantizer used to determine amount of noise
 *                                  to add
 *                  int  a        offset added to the sigma of the gaussian
 *                                  (see the computation of sigma below)
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void.
 *
 *  FUNCTION      : adds gaussian noise to a plane of pixels
 *
 *  SPECIAL NOTES : Disabled (#if 0).  Pitch4, noise_amount, noise_adder
 *                  and the local "sum" are declared but never used.
 *
 ****************************************************************************/
void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
    unsigned int i;
    int Pitch4 = Pitch * 4;
    const int noise_amount = 2;
    const int noise_adder = 2 * noise_amount + 1;

    __declspec(align(16)) unsigned char blackclamp[16];
    __declspec(align(16)) unsigned char whiteclamp[16];
    __declspec(align(16)) unsigned char bothclamp[16];

    char char_dist[300];
    char Rand[2048];

    double sigma;
    __asm emms
    sigma = a + .5 + .6 * (63 - q) / 63.0;

    // set up a lookup table of 256 entries that matches
    // a gaussian distribution with sigma determined by q.
    //
    {
        double i, sum = 0;
        int next, j;

        next = 0;

        for (i = -32; i < 32; i++)
        {
            int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));

            if (a)
            {
                for (j = 0; j < a; j++)
                {
                    char_dist[next+j] = (char) i;
                }

                next = next + j;
            }

        }

        for (next = next; next < 256; next++)
            char_dist[next] = 0;

    }

    for (i = 0; i < 2048; i++)
    {
        Rand[i] = char_dist[rand() & 0xff];
    }

    for (i = 0; i < 16; i++)
    {
        blackclamp[i] = -char_dist[0];
        whiteclamp[i] = -char_dist[0];
        bothclamp[i] = -2 * char_dist[0];
    }

    for (i = 0; i < Height; i++)
    {
        unsigned char *Pos = Start + i * Pitch;
        char  *Ref = Rand + (rand() & 0xff);

        __asm
        {
            mov ecx, [Width]
            mov esi, Pos
            mov edi, Ref
            xor         eax, eax

            nextset:
            movq        mm1, [esi+eax]        // get the source

            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
            paddusb     mm1, bothclamp
            psubusb     mm1, whiteclamp

            movq        mm2, [edi+eax]        // get the noise for this line
            paddb       mm1, mm2              // add it in
            movq        [esi+eax], mm1        // store the result

            add         eax, 8                // advance to the next 8 bytes

            cmp         eax, ecx
            jl          nextset


        }

    }
}
#else
/*
 * Externally defined tables, indexed [a][q] by the routines below:
 *   an[a][q] : bytes that are added to the pixels as noise
 *   cd[a][q] : value whose negation is used as the clamp amount
 * NOTE(review): presumably precomputed equivalents of the Rand[] /
 * char_dist[0] tables built by the disabled variants -- confirm where they
 * are defined.
 */
extern char an[8][64][3072];
extern int cd[8][64];

/*
 * vp8_plane_add_noise_mmx
 *
 * Adds table-driven noise to a plane in place, 8 bytes per iteration.
 * Each row picks a starting offset (rand() & 0xff) into an[a][q].  Before
 * the noise is added with a wrapping byte add, every pixel is squeezed
 * with saturating arithmetic (- clamp, + 2*clamp, - clamp, where
 * clamp = -cd[a][q]).
 *
 *   Start   first pixel of the plane
 *   Width   bytes per row to process; the loop steps by 8, runs at least
 *           once and compares eax against Width as a signed value
 *   Height  number of rows
 *   Pitch   stride in bytes
 *   q, a    indices into cd[][] and an[][][] (not range checked here)
 */
void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
    unsigned int i;
    __declspec(align(16)) unsigned char blackclamp[16];
    __declspec(align(16)) unsigned char whiteclamp[16];
    __declspec(align(16)) unsigned char bothclamp[16];


    __asm emms

    for (i = 0; i < 16; i++)
    {
        blackclamp[i] = -cd[a][q];
        whiteclamp[i] = -cd[a][q];
        bothclamp[i] = -2 * cd[a][q];
    }

    for (i = 0; i < Height; i++)
    {
        unsigned char *Pos = Start + i * Pitch;
        char  *Ref = an[a][q] + (rand() & 0xff);

        __asm
        {
            mov ecx, [Width]
            mov esi, Pos
            mov edi, Ref
            xor         eax, eax

            nextset:
            movq        mm1, [esi+eax]        // get the source

            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
            paddusb     mm1, bothclamp
            psubusb     mm1, whiteclamp

            movq        mm2, [edi+eax]        // get the noise for this line
            paddb       mm1, mm2              // add it in
            movq        [esi+eax], mm1        // store the result

            add         eax, 8                // advance to the next 8 bytes

            cmp         eax, ecx
            jl          nextset
        }
    }
}


/*
 * vp8_plane_add_noise_wmt
 *
 * SSE2 counterpart of vp8_plane_add_noise_mmx: identical clamp-then-add
 * sequence, 16 bytes per iteration with unaligned loads and stores for the
 * pixels and the noise.  Parameters are as for the MMX routine.
 */
void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
{
    unsigned int i;

    __declspec(align(16)) unsigned char blackclamp[16];
    __declspec(align(16)) unsigned char whiteclamp[16];
    __declspec(align(16)) unsigned char bothclamp[16];

    __asm emms

    for (i = 0; i < 16; i++)
    {
        blackclamp[i] = -cd[a][q];
        whiteclamp[i] = -cd[a][q];
        bothclamp[i] = -2 * cd[a][q];
    }

    for (i = 0; i < Height; i++)
    {
        unsigned char *Pos = Start + i * Pitch;
        char *Ref = an[a][q] + (rand() & 0xff);

        __asm
        {
            mov ecx,    [Width]
            mov esi,    Pos
            mov edi,    Ref
            xor         eax, eax

            nextset:
            movdqu      xmm1, [esi+eax]        // get the source

            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
            paddusb     xmm1, bothclamp
            psubusb     xmm1, whiteclamp

            movdqu      xmm2, [edi+eax]        // get the noise for this line
            paddb       xmm1, xmm2             // add it in
            movdqu      [esi+eax], xmm1        // store the result

            add         eax, 16                // advance to the next 16 bytes

            cmp         eax, ecx
            jl          nextset
        }
    }
}

#endif