ref: 9fdfb8e92823abadc54d825d0603a7e9fd344520
dir: /vp8/encoder/x86/preproc_mmx.c/
/* * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "memory.h" #include "preproc.h" #include "pragmas.h" /**************************************************************************** * Macros ****************************************************************************/ #define FRAMECOUNT 7 #define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) /**************************************************************************** * Imports ****************************************************************************/ extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); /**************************************************************************** * Exported Global Variables ****************************************************************************/ void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); /**************************************************************************** * * ROUTINE : temp_filter_wmt * * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. * unsigned char *s : Pointer to source frame. * unsigned char *d : Pointer to destination frame. * int bytes : Number of bytes to filter. * int strength : Strength of filter to apply. * * OUTPUTS : None. * * RETURNS : void * * FUNCTION : Performs a closesness adjusted temporarl blur * * SPECIAL NOTES : Destination frame can be same as source frame. * ****************************************************************************/ void temp_filter_wmt ( pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength ) { int byte = 0; unsigned char *frameptr = ppi->frame_buffer; __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; if (ppi->frame == 0) { do { int i; int frame = 0; do { for (i = 0; i < 8; i++) { *frameptr = s[byte+i]; ++frameptr; } ++frame; } while (frame < FRAMECOUNT); for (i = 0; i < 8; i++) d[byte+i] = s[byte+i]; byte += 8; } while (byte < bytes); } else { int i; int offset2 = (ppi->frame % FRAMECOUNT); do { __declspec(align(16)) unsigned short counts[8]; __declspec(align(16)) unsigned short sums[8]; __asm { mov eax, offset2 mov edi, s // source pixels pxor xmm1, xmm1 // accumulator pxor xmm7, xmm7 mov esi, frameptr // accumulator pxor xmm2, xmm2 // count movq xmm3, QWORD PTR [edi] movq QWORD PTR [esi+8*eax], xmm3 punpcklbw xmm3, xmm2 // xmm3 source pixels mov ecx, FRAMECOUNT next_frame: movq xmm4, QWORD PTR [esi] // get frame buffer values punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels movdqa xmm6, xmm4 // save the pixel values psubsw xmm4, xmm3 // subtracted pixel values pmullw xmm4, xmm4 // square xmm4 movd xmm5, strength psrlw xmm4, xmm5 // should be strength pmullw xmm4, threes // 3 * modifier movdqa xmm5, sixteens // 16s psubusw xmm5, xmm4 // 16 - modifiers movdqa xmm4, xmm5 // save the modifiers pmullw xmm4, xmm6 // multiplier values paddusw xmm1, xmm4 // accumulator paddusw xmm2, xmm5 // count add esi, 8 // next frame dec ecx // next set of eight pixels jnz next_frame movdqa counts, xmm2 psrlw xmm2, 1 // divide count by 2 for rounding paddusw xmm1, xmm2 // rounding added in mov frameptr, esi movdqa sums, xmm1 } for (i = 0; i < 8; i++) { int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; blurvalue >>= 16; d[i] = blurvalue; } s += 8; d += 8; byte += 8; } while (byte < bytes); } ++ppi->frame; __asm emms } /**************************************************************************** * * ROUTINE : temp_filter_mmx * * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. * unsigned char *s : Pointer to source frame. * unsigned char *d : Pointer to destination frame. * int bytes : Number of bytes to filter. * int strength : Strength of filter to apply. * * OUTPUTS : None. * * RETURNS : void * * FUNCTION : Performs a closesness adjusted temporarl blur * * SPECIAL NOTES : Destination frame can be same as source frame. * ****************************************************************************/ void temp_filter_mmx ( pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength ) { int byte = 0; unsigned char *frameptr = ppi->frame_buffer; __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; if (ppi->frame == 0) { do { int i; int frame = 0; do { for (i = 0; i < 4; i++) { *frameptr = s[byte+i]; ++frameptr; } ++frame; } while (frame < FRAMECOUNT); for (i = 0; i < 4; i++) d[byte+i] = s[byte+i]; byte += 4; } while (byte < bytes); } else { int i; int offset2 = (ppi->frame % FRAMECOUNT); do { __declspec(align(16)) unsigned short counts[8]; __declspec(align(16)) unsigned short sums[8]; __asm { mov eax, offset2 mov edi, s // source pixels pxor mm1, mm1 // accumulator pxor mm7, mm7 mov esi, frameptr // accumulator pxor mm2, mm2 // count movd mm3, DWORD PTR [edi] movd DWORD PTR [esi+4*eax], mm3 punpcklbw mm3, mm2 // mm3 source pixels mov ecx, FRAMECOUNT next_frame: movd mm4, DWORD PTR [esi] // get frame buffer values punpcklbw mm4, mm7 // mm4 frame buffer pixels movq mm6, mm4 // save the pixel values psubsw mm4, mm3 // subtracted pixel values pmullw mm4, mm4 // square mm4 movd mm5, strength psrlw mm4, mm5 // should be strength pmullw mm4, threes // 3 * modifier movq mm5, sixteens // 16s psubusw mm5, mm4 // 16 - modifiers movq mm4, mm5 // save the modifiers pmullw mm4, mm6 // multiplier values paddusw mm1, mm4 // accumulator paddusw mm2, mm5 // count add esi, 4 // next frame dec ecx // next set of eight pixels jnz next_frame movq counts, mm2 psrlw mm2, 1 // divide count by 2 for rounding paddusw mm1, mm2 // rounding added in mov frameptr, esi movq sums, mm1 } for (i = 0; i < 4; i++) { int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; blurvalue >>= 16; d[i] = blurvalue; } s += 4; d += 4; byte += 4; } while (byte < bytes); } ++ppi->frame; __asm emms }