ref: 927f29a64495fecc244b1eec760f9d2b24d952ad
dir: /vpx_scale/intel_linux/scaleopt.c/
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
/****************************************************************************
*
*   Module Title :     scaleopt.c
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"
/****************************************************************************
*  Module Statics
****************************************************************************/
#if 0
/* Compiled out: file-scope copies of the fixed-point weight tables.
 * The routines below declare their own local copies of the tables
 * they use, so nothing in this region is referenced while it is disabled.
 * 51, 102, 154 and 205 are the 1/5 .. 4/5 weights scaled by 256; 128 is the
 * rounding term added before the >> 8. */
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
#endif
#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : First pixel of the input line.
 *                  unsigned int source_width    : Number of input pixels.
 *                  unsigned char *dest         : First pixel of the output line.
 *                  unsigned int dest_width      : Unused.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : Every group of 3 input pixels yields 5 output pixels.
 *                  Interpolation weights are n/5 scaled by 256 (51, 102,
 *                  154, 205) with +128 rounding before the >> 8.
 *
 *                  The loop body runs once before its bound is tested and
 *                  the exit section then handles one more group, so the
 *                  code as written always consumes at least two groups
 *                  (6 input pixels, 10 output pixels). Presumably callers
 *                  pass a source_width that is a multiple of 3 -- TODO
 *                  confirm.
 *
 *                  Input is fetched one DWORD at a time. The exit section
 *                  only uses three of the four bytes it loads, i.e. it reads
 *                  one byte beyond the last pixel it needs.
 *
 *                  NOTE(review): no EMMS is issued here; presumably the
 *                  caller clears the MMX state -- confirm.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    /* Per-lane weights: out = (left * const35_1 + right * const35_2 + 128) >> 8 */
    __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
    __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    (void) dest_width;
    __asm
    {
        push ebx
        mov         esi,    source
        mov         edi,    dest
        mov         ecx,    source_width
        lea         edx,    [esi+ecx-3];    // edx = start of the last group
        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // mm7 = 0, used for unpacking
        horiz_line_3_5_loop:
        // Register byte diagrams below are in memory order (lowest byte first).
        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov        ebx,    eax
        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03
        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx
        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03 (right neighbours)
        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
        or          ebx,    ecx             // ebx = 00 01 01 02 (left neighbours)
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             // right * weight
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             // left * weight
        mov         [edi],  ebx             // byte 0 is output 0; bytes 1-3 are overwritten below
        add         esi,    3
        add         edi,    5
        paddw       mm0,    mm1
        paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8
        cmp         esi,    edx
        packuswb    mm0,    mm7
        movd        DWORD Ptr [edi-4], mm0  // outputs 1..4 of the group just processed
        jb          horiz_line_3_5_loop     // pointers are unsigned: jb, not jl
//Exit:
        // Last group: there is no pixel 03, so pixel 02 is replicated.
        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov         ebx,    eax
        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03
        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx
        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03
        shl         eax,    8               // eax = xx 01 02 02
        and         eax,    0xffff0000      // eax = xx xx 02 02
        or          eax,    ebx             // eax = 01 02 02 02
        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
        or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             // right * weight
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             // left * weight
        mov         [edi],  ebx             // byte 0 is output 0; bytes 1-3 are overwritten below
        paddw       mm0,    mm1
        paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8
        packuswb    mm0,    mm7
        movd        DWORD Ptr [edi+1], mm0  // outputs 1..4; output 4 equals pixel 02
        pop ebx
    }
    /*
    Reference C implementation kept for comparison with the assembly above:

    const unsigned char *src = source;
    unsigned char *des = dest;
    unsigned int a, b, c ;
    unsigned int i;
    (void) dest_width;
    for ( i=0; i<source_width-3; i+=3 )
    {
        a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        // 2 * left + 3 * right /5
        des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
        c = src[2] ;
        // 4 * left + 1 * right /5
        des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
        // 1 * left + 4 * right /5
        des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
        a = src[3];
        // 3 * left + 2 * right /5
        des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
        src += 3;
        des += 5;
    }
    a = src[0];
    b = src[1];
    des [0] = (UINT8) (a);
    // 2 * left + 3 * right /5
    des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
    c = src[2] ;
    // 4 * left + 1 * right /5
    des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
    // 1 * left + 4 * right /5
    des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
    des [4] = (UINT8) (c);
    */
}
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : First pixel of the input line.
 *                  unsigned int source_width    : Number of input pixels.
 *                  unsigned char *dest         : First pixel of the output line.
 *                  unsigned int dest_width      : Unused.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : The assembly handles two 4-pixel groups at a time: each
 *                  pass reads 8 input pixels (plus the first pixel of the
 *                  following group) and writes 10 output pixels.
 *                  Interpolation weights are n/5 scaled by 256 (51, 102,
 *                  154, 205) with +128 rounding before the >> 8.
 *
 *                  The loop body runs once before its bound is tested and
 *                  the exit section then handles 8 more pixels, so the code
 *                  as written always consumes at least 16 input pixels and
 *                  produces at least 20 output pixels. Presumably callers
 *                  pass a source_width that is a multiple of 8 -- TODO
 *                  confirm.
 *
 *                  NOTE(review): no EMMS is issued here; presumably the
 *                  caller clears the MMX state -- confirm.
 *
 ****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    /* Per-lane weights: out = (left * const45_1 + right * const45_2 + 128) >> 8 */
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
    __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
    /* Selects byte 6 of a quadword; used to replicate the last pixel. */
    __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
    (void)dest_width;
    __asm
    {
        mov         esi,    source
        mov         edi,    dest
        mov         ecx,    source_width
        lea         edx,    [esi+ecx-8];    // edx = start of the last 8 pixels
        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // mm7 = 0, used for unpacking
        horiz_line_4_5_loop:
        // Register byte diagrams below are in memory order (lowest byte first).
        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
        movd        DWORD PTR [edi],  mm0             // output 0 = pixel 00; bytes 1-3 overwritten below
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
        movd        DWORD PTR [edi+5], mm2            // output 5 = pixel 04; bytes 6-8 overwritten below
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
        paddw       mm0,    mm1             // left + right contributions
        paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7
        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        add         edi,    10
        add         esi,    8
        paddw       mm2,    mm3             // left + right contributions
        paddw       mm2,    mm4             // + 128
        cmp         esi,    edx
        psrlw       mm2,    8
        packuswb    mm2,    mm7
        movd        DWORD PTR [edi-4], mm2 // write output 06 07 08 09
        jb         horiz_line_4_5_loop     // pointers are unsigned: jb, not jl
//Exit:
        // Last 8 pixels: there is no pixel 08, so pixel 07 is replicated.
        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
        movq        mm3,    mm1
        movd        DWORD PTR [edi],  mm0   // output 0 = pixel 00; bytes 1-3 overwritten below
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
        movd        DWORD PTR [edi+5], mm2  // output 5 = pixel 04; bytes 6-8 overwritten below
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
        paddw       mm0,    mm1             // left + right contributions
        paddw       mm0,    mm4             // + 128
        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        paddw       mm2,    mm3             // left + right contributions
        paddw       mm2,    mm4             // + 128
        psrlw       mm2,    8
        packuswb    mm2,    mm7
        movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09; output 09 equals pixel 07
    }
    /*
        Reference C implementation kept for comparison with the assembly above:

        const unsigned char *src = source;
        unsigned char *des = dest;
        unsigned int a, b, c ;
        unsigned i;
        (void) dest_width;
        for ( i=0; i<source_width-4; i+=4 )
        {
            a = src[0];
            b = src[1];
            des [0] = (UINT8) a;
            des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
            c = src[2] * 154;
            a = src[3];
            des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
            des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
            b = src[4];
            des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
            src += 4;
            des += 5;
        }
        a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
        c = src[2] * 154;
        a = src[3];
        des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
        des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
        des [4] = (UINT8) (a);
    */
}
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Top row of the band; rows are
 *                                           read and rewritten in place.
 *                  unsigned int dest_pitch : Byte offset from one row to
 *                                           the next.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : Rows 0..3 (dest + n * dest_pitch) hold the four input
 *                  rows a, b, c, d and row 5 is read as the first row of
 *                  the band below. Rows 1..4 are overwritten with the
 *                  interpolated output; row 0 is left untouched.
 *                  Eight columns are processed per iteration and the loop
 *                  continues while the remaining count is positive, so a
 *                  width that is not a multiple of 8 is rounded up.
 *                  Weights are n/5 scaled by 256 with +128 rounding.
 *                  The function also has a "C" only version.
 *                  NOTE(review): no EMMS is issued here; presumably the
 *                  caller clears the MMX state -- confirm.
 *
 ****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
        mov         ecx,    dest_pitch               // ecx = pitch
        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // edi = row 3
        pxor        mm7,    mm7                     // mm7 = 0 for unpacking
        mov         edx,    dest_width               // Loop counter (columns remaining)
        vs_4_5_loop:
        movq        mm0,    QWORD ptr [esi]         // a = src[0]
        movq        mm1,    QWORD ptr [esi+ecx]     // b = src[1]
        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word
        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word
        pmullw      mm0,    mm5                     // a * 1/5
        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word
        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constant 4/5
        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5
        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b
        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]
        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b is still held unpacked in mm1/mm3)
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
        // mm1, mm3 --- Src[1] (low/high words)
        // mm0 --- Src[2]
        // mm7 for unpacking
        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy
        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths
        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5
        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word
        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2
        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8
        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c is still held unpacked in mm0/mm2)
        movq        mm1,    [edi]                   // mm1=Src[3];
        // mm0, mm2 --- Src[2] (low/high words)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking
        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5
        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5
        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy
        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5
        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5
        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3] (d is still held unpacked in mm1/mm3)
        //  mm1, mm3 --- Src[3] (low/high words)
        //  mm7 -- cleared for unpacking
        movq        mm0,    [edi+ecx*2]             // mm0 = row 5, Src[0] of the next group ("an")
        movq        mm5,    four_fifths              // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5
        movq        mm6,    one_fifth                // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy
        pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low
        pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high
        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5
        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8
        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]
        movq        QWORD ptr [edi+ecx], mm1        // write des[4] (row 4)
        add         edi,    8                       // advance 8 columns
        add         esi,    8
        sub         edx,    8
        jg         vs_4_5_loop                      // loop while columns remain
    }
}
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Top row of the band; rows are
 *                                           read and rewritten in place.
 *                  unsigned int dest_pitch : Byte offset from one row to
 *                                           the next.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
 *  SPECIAL NOTES : Rows 0..3 (dest + n * dest_pitch) hold the four input
 *                  rows a, b, c, d. Rows 1..3 are overwritten with the
 *                  interpolated output and row 4 receives an unmodified
 *                  copy of input row 3. Unlike vertical_band_4_5_scale_mmx,
 *                  this routine does not read a row from the band below.
 *                  Eight columns are processed per iteration and the loop
 *                  continues while the remaining count is positive, so a
 *                  width that is not a multiple of 8 is rounded up.
 *                  The function also has a "C" only version.
 *                  NOTE(review): no EMMS is issued here; presumably the
 *                  caller clears the MMX state -- confirm.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
        mov         ecx,    dest_pitch               // ecx = pitch
        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // edi = row 3
        pxor        mm7,    mm7                     // mm7 = 0 for unpacking
        mov         edx,    dest_width               // Loop counter (columns remaining)
        last_vs_4_5_loop:
        movq        mm0,    QWORD ptr [esi]         // a = src[0]
        movq        mm1,    QWORD ptr [esi+ecx]     // b = src[1]
        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word
        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word
        pmullw      mm0,    mm5                     // a * 1/5
        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word
        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constant 4/5
        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5
        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b
        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]
        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b is still held unpacked in mm1/mm3)
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
        // mm1, mm3 --- Src[1] (low/high words)
        // mm0 --- Src[2]
        // mm7 for unpacking
        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy
        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths
        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5
        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word
        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2
        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8
        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c is still held unpacked in mm0/mm2)
        movq        mm1,    [edi]                   // mm1=Src[3];
        movq        QWORD ptr [edi+ecx], mm1        // des[4] = unmodified copy of Src[3] (no band below)
        // mm0, mm2 --- Src[2] (low/high words)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking
        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5
        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5
        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy
        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5
        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5
        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]
        add         edi,    8                       // advance 8 columns
        add         esi,    8
        sub         edx,    8
        jg          last_vs_4_5_loop                // loop while columns remain
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Top row of the band; rows are
 *                                           read and rewritten in place.
 *                  unsigned int dest_pitch : Byte offset from one row to
 *                                           the next.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : Rows 0..2 (dest + n * dest_pitch) hold the three input
 *                  rows a, b, c and row 5 is read as the first row of the
 *                  band below. Rows 1..4 are overwritten with the
 *                  interpolated output; row 0 is left untouched.
 *                  Eight columns are processed per iteration and the loop
 *                  continues while the remaining count is positive, so a
 *                  width that is not a multiple of 8 is rounded up.
 *                  Weights are n/5 scaled by 256 with +128 rounding.
 *                  The function also has a "C" only version.
 *                  NOTE(review): no EMMS is issued here; presumably the
 *                  caller clears the MMX state -- confirm.
 *
 ****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,    dest                    // esi = row 0 (source and destination share the buffer)
        mov         ecx,    dest_pitch               // ecx = pitch
        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // edi = row 3
        pxor        mm7,    mm7                     // mm7 = 0 for unpacking
        mov         edx,    dest_width               // Loop counter (columns remaining)
        vs_3_5_loop:
        movq        mm0,    QWORD ptr [esi]         // a = src[0]
        movq        mm1,    QWORD ptr [esi+ecx]     // b = src[1]
        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word
        movq        mm5,    two_fifths               // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word
        pmullw      mm0,    mm5                     // a * 2/5
        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word
        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths             // mm6 = 3/5
        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5
        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b
        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]
        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b is still held unpacked in mm1/mm3)
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
        // mm1, mm3 --- Src[1] (low/high words)
        // mm0 --- Src[2]
        // mm7 for unpacking
        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths              // b * 4/5 low
        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths              // b * 4/5 high
        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth                // b * 1/5
        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth                // b * 1/5
        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high
        pmullw      mm6,    one_fifth                // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high (mm7 becomes scratch; re-cleared below)
        pmullw      mm7,    one_fifth                // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low
        pmullw      mm6,    four_fifths              // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high
        pmullw      mm7,    four_fifths              // c * 4/5 high
        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8
        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c is still held unpacked in mm0/mm2)
        paddw       mm4,    round_values             // + 128
        paddw       mm5,    round_values             // + 128
        psrlw       mm4,    8
        psrlw       mm5,    8
        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [edi], mm4            // write des[3]
        //  mm0, mm2 --- Src[2] (c, low/high words)
        pxor        mm7,    mm7                     // clear mm7 for unpacking
        movq        mm1,    [edi+ecx*2]             // mm1 = row 5, Src[0] of the next group ("an")
        movq        mm5,    three_fifths             // mm5 = 3/5
        pmullw      mm0,    mm5                     // c * 3/5
        movq        mm6,    two_fifths                // mm6 = 2/5
        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm5                     // c * 3/5
        punpcklbw   mm1,    mm7                     // unpack low
        pmullw      mm1,    mm6                     // an * 2/5
        punpckhbw   mm3,    mm7                     // unpack high
        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
        pmullw      mm3,    mm6                     // an * 2/5
        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8
        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[4]
        movq        QWORD ptr [edi+ecx], mm0        // write des[4] (row 4)
        add         edi,    8                       // advance 8 columns
        add         esi,    8
        sub         edx,    8
        jg          vs_3_5_loop                     // loop while columns remain
    }
}
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    /* Blend weights n/5 scaled by 256, plus the rounding term added before >> 8. */
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    /*
     * Works in place on five lines starting at dest, eight columns per pass.
     * With a, b, c the lines at dest, dest + pitch and dest + 2 * pitch:
     *
     *     des[1] = (a * 102 + b * 154 + 128) >> 8
     *     des[2] = (b * 205 + c *  51 + 128) >> 8
     *     des[3] = (b *  51 + c * 205 + 128) >> 8
     *     des[4] = c
     *
     * des[0] (line a) is left as it is.  The loop body always runs at least
     * once and advances eight columns at a time.
     */
    __asm
    {
        mov         esi,    dest                    // line 0 of the band (source and destination)
        mov         ecx,    dest_pitch              // pitch in bytes

        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // three lines below

        mov         edx,    dest_width              // columns left to process

        last_vs_3_5_loop:

        // mm7 is reused as a scratch register further down, so it has to be
        // zeroed again on every pass before it is used for unpacking.
        pxor        mm7,    mm7                     // clear mm7 for unpacking

        movq        mm0,    QWORD ptr [esi]         // src[0]
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1]

        movq        mm2,    mm0                     // make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths              // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths            // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- src[1] (low / high words)
        // mm0      --- src[2]
        // mm7      --- zero, for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths             // b * 4/5 low

        movq        QWORD ptr [edi+ecx], mm0        // write des[4] (copy of src[2])

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths             // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth               // b * 1/5 low

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth               // b * 1/5 high

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth               // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero)

        pmullw      mm7,    one_fifth               // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths             // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths             // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values            // + 128
        paddw       mm3,    round_values            // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

        paddw       mm4,    round_values            // + 128
        paddw       mm5,    round_values            // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [edi], mm4            // write des[3]

        add         edi,    8                       // next eight columns
        add         esi,    8

        sub         edx,    8
        jg          last_vs_3_5_loop
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    /* Word-wise +1 so that the shift right by one below rounds to nearest. */
    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
    /*
     * Fills the line at dest + dest_pitch with the rounded average of the
     * lines at dest and dest + 2 * dest_pitch:
     *
     *     mid = (a + b + 1) >> 1
     *
     * Eight columns are handled per pass; the loop body always executes at
     * least once and keeps going while the remaining column count is > 0.
     */
    __asm
    {
        mov         esi,    dest                    // line a; also the base for the written line
        mov         ecx,    dest_pitch               // pitch in bytes
        pxor        mm7,    mm7                     // zero, used for byte -> word unpacking
        mov         edx,    dest_width               // columns left to process
        vs_1_2_loop:
        movq        mm0,    [esi]                   // a: eight pixels of the line at dest
        movq        mm1,    [esi + ecx * 2]         // b: eight pixels two lines below
        movq        mm2,    mm0                     // copy of a for the high half
        movq        mm3,    mm1                     // copy of b for the high half
        punpcklbw   mm0,    mm7                     // low a as words
        movq        mm6,    four_ones                // mm6 = 1, 1, 1, 1 (reloaded every pass)
        punpcklbw   mm1,    mm7                     // low b as words
        paddw       mm0,    mm1                     // low (a + b)
        punpckhbw   mm2,    mm7                     // high a as words
        paddw       mm0,    mm6                     // low (a + b + 1)
        punpckhbw   mm3,    mm7                     // high b as words
        paddw       mm2,    mm3                     // high (a + b)
        psraw       mm0,    1                       // low (a + b + 1) / 2
        paddw       mm2,    mm6                     // high (a + b + 1)
        psraw       mm2,    1                       // high (a + b + 1) / 2
        packuswb    mm0,    mm2                     // back to eight bytes
        movq        [esi+ecx], mm0                  // write the line between a and b
        add         esi,    8                       // next eight columns
        sub         edx,    8
        jg          vs_1_2_loop
    }
}
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
 *
 *  SPECIAL NOTES : This routine does not read the band below the current
 *                  band; it copies the line at dest to the line at
 *                  dest + dest_pitch. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    /*
     * Duplicates the line at dest into the line at dest + dest_pitch, eight
     * bytes per pass.  The loop body always executes at least once and keeps
     * going while the remaining column count is > 0.
     */
    __asm
    {
        mov         esi,    dest                    // line to duplicate
        mov         ecx,    dest_pitch               // pitch in bytes
        mov         edx,    dest_width               // columns left to process
        last_vs_1_2_loop:
        movq        mm0,    [esi]                   // eight pixels of the line at dest
        movq        [esi+ecx], mm0                  // copy them to the line below
        add         esi,    8                       // next eight columns
        sub         edx,    8
        jg         last_vs_1_2_loop
    }
}
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_1_2_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    /* Word-wise +1 so that the shift right by one below rounds to nearest. */
    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};

    /*
     * Doubles a line horizontally.  For every source pixel s[i] two bytes are
     * written:
     *
     *     dest[2 * i]     = s[i]
     *     dest[2 * i + 1] = (s[i] + s[i + 1] + 1) >> 1
     *
     * The main loop handles eight source pixels per pass and reads s[i + 1]
     * straight from memory.  The final eight pixels are handled separately:
     * there the right-hand neighbour of the last pixel is the pixel itself,
     * so nothing beyond the eighth byte of the final group is read.
     *
     * When the line is only one group wide (source_width <= 8) the main loop
     * is skipped; running it first would process a second group that lies
     * past the end of both buffers.
     */
    (void) dest_width;
    __asm
    {
        mov         esi,    source
        mov         edi,    dest
        pxor        mm7,    mm7                     // zero, for byte -> word unpacking
        movq        mm6,    four_ones               // rounding term
        mov         ecx,    source_width            // source pixels left to process
        cmp         ecx,    8
        jle         hs_1_2_last                     // single group: only the tail applies
        hs_1_2_loop:
        movq        mm0,    [esi]                   // s[0..7]
        movq        mm1,    [esi+1]                 // s[1..8]
        movq        mm2,    mm0
        movq        mm3,    mm1
        movq        mm4,    mm0                     // keep the original pixels for interleaving
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        paddw       mm0,    mm1
        paddw       mm0,    mm6                     // low  (s[i] + s[i+1] + 1)
        punpckhbw   mm2,    mm7
        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3
        paddw       mm2,    mm6                     // high (s[i] + s[i+1] + 1)
        psraw       mm0,    1
        psraw       mm2,    1
        packuswb    mm0,    mm2                     // eight interpolated pixels
        movq        mm2,    mm4
        punpcklbw   mm2,    mm0                     // s0 i0 s1 i1 s2 i2 s3 i3
        movq        [edi],  mm2
        punpckhbw   mm4,    mm0                     // s4 i4 s5 i5 s6 i6 s7 i7
        movq        [edi+8], mm4
        add         esi,    8
        add         edi,    16
        sub         ecx,    8
        cmp         ecx,    8
        jg          hs_1_2_loop                     // leave the final eight pixels to the tail
        hs_1_2_last:
        // last eight pixels
        movq        mm0,    [esi]                   // s[0..7]
        movq        mm1,    mm0
        movq        mm2,    mm0
        movq        mm3,    mm1
        psrlq       mm1,    8                       // s[1..7], top byte cleared
        psrlq       mm3,    56
        psllq       mm3,    56                      // s[7] alone, in the top byte
        por         mm1,    mm3                     // s[1..7], s[7]: edge pixel repeated
        movq        mm3,    mm1
        movq        mm4,    mm0
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        paddw       mm0,    mm1
        paddw       mm0,    mm6
        punpckhbw   mm2,    mm7
        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3
        paddw       mm2,    mm6
        psraw       mm0,    1
        psraw       mm2,    1
        packuswb    mm0,    mm2
        movq        mm2,    mm4
        punpcklbw   mm2,    mm0
        movq        [edi],  mm2
        punpckhbw   mm4,    mm0
        movq        [edi+8], mm4
    }
}
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_5_4_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width      : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling down from 5 pixels to 4.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_5_4_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
    __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    /*
    unsigned i;
    unsigned int a, b, c, d, e;
    unsigned char *des = dest;
    const unsigned char *src = source;
    (void) dest_width;
    for ( i=0; i<source_width; i+=5 )
    {
        a = src[0];
        b = src[1];
        c = src[2];
        d = src[3];
        e = src[4];
        des[0] = a;
        des[1] = ((b*192 + c* 64 + 128)>>8);
        des[2] = ((c*128 + d*128 + 128)>>8);
        des[3] = ((d* 64 + e*192 + 128)>>8);
        src += 5;
        des += 4;
    }
    */
    __asm
    {
        mov         esi,        source              ;
        mov         edi,        dest                ;
        mov         ecx,        source_width         ;
        movq        mm5,        const54_1           ;
        pxor        mm7,        mm7                 ;
        movq        mm6,        const54_2           ;
        movq        mm4,        round_values         ;
        lea         edx,        [esi+ecx]           ;
        horizontal_line_5_4_loop:
        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07
        psrlq       mm0,        8                   ;
        01 02 03 04 05 06 07 xx
        punpcklbw   mm1,        mm7                 ;
        xx 00 xx 01 xx 02 xx 03
        punpcklbw   mm0,        mm7                 ;
        xx 01 xx 02 xx 03 xx 04
        pmullw      mm1,        mm5
        pmullw      mm0,        mm6
        add         esi,        5
        add         edi,        4
        paddw       mm1,        mm0
        paddw       mm1,        mm4
        psrlw       mm1,        8
        cmp         esi,        edx
        packuswb    mm1,        mm7
        movd        DWORD PTR [edi-4], mm1
        jl          horizontal_line_5_4_loop
    }
}
/*
 * Builds four destination lines from five source lines, four columns per
 * pass.  With r0..r4 the lines at source, source + src_pitch, ...,
 * source + 4 * src_pitch, and d0..d3 the lines at dest, dest + dest_pitch,
 * ..., dest + 3 * dest_pitch:
 *
 *     d0 = r0
 *     d1 = (r1 * 192 + r2 *  64 + 128) >> 8
 *     d2 = (r2 * 128 + r3 * 128 + 128) >> 8
 *     d3 = (r3 *  64 + r4 * 192 + 128) >> 8
 *
 * The loop body always executes at least once and keeps going while the
 * remaining column count is > 0.
 */
static
void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    /* Blend weights n/4 scaled by 256, plus the rounding term added before >> 8. */
    __declspec(align(16)) const unsigned short one_fourths[]   = {  64,  64,  64, 64  };
    __declspec(align(16)) const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
    __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        push        ebx                             // ebx is used as the column counter; restored at the end
        mov         esi,    source                    // first source line
        mov         ecx,    src_pitch               // source pitch in bytes
        mov         edi,    dest                    // first destination line
        pxor        mm7,    mm7                     // zero, for unpacking / packing
        mov         edx,    dest_pitch               // destination pitch in bytes
        mov         ebx,    dest_width              // columns left to process
        vs_5_4_loop:
        movd        mm0,    DWORD ptr [esi]         // r0
        movd        mm1,    DWORD ptr [esi+ecx]     // r1
        movd        mm2,    DWORD ptr [esi+ecx*2]   // r2
        lea         eax,    [esi+ecx*2]             // eax -> r2, base for r3 / r4
        punpcklbw   mm1,    mm7                     // r1 as words
        punpcklbw   mm2,    mm7                     // r2 as words
        movq        mm3,    mm2                     // second copy of r2
        pmullw      mm1,    three_fourths           // r1 * 3/4
        pmullw      mm2,    one_fourths             // r2 * 1/4
        movd        mm4,    [eax+ecx]               // r3
        pmullw      mm3,    two_fourths             // r2 * 2/4
        punpcklbw   mm4,    mm7                     // r3 as words
        movq        mm5,    mm4                     // second copy of r3
        pmullw      mm4,    two_fourths             // r3 * 2/4
        paddw       mm1,    mm2                     // r1 * 3/4 + r2 * 1/4
        movd        mm6,    [eax+ecx*2]             // r4
        pmullw      mm5,    one_fourths             // r3 * 1/4
        paddw       mm1,    round_values;           // + 128
        paddw       mm3,    mm4                     // r2 * 2/4 + r3 * 2/4
        psrlw       mm1,    8
        punpcklbw   mm6,    mm7                     // r4 as words
        paddw       mm3,    round_values            // + 128
        pmullw      mm6,    three_fourths           // r4 * 3/4
        psrlw       mm3,    8
        packuswb    mm1,    mm7                     // d1
        packuswb    mm3,    mm7                     // d2
        movd        DWORD PTR [edi], mm0            // write d0 (straight copy of r0)
        movd        DWORD PTR [edi+edx], mm1        // write d1
        paddw       mm5,    mm6                     // r3 * 1/4 + r4 * 3/4
        movd        DWORD PTR [edi+edx*2], mm3      // write d2
        lea         eax,    [edi+edx*2]             // eax -> d2, taken before edi advances
        paddw       mm5,    round_values            // + 128
        psrlw       mm5,    8
        add         edi,    4                       // next four columns (destination)
        packuswb    mm5,    mm7                     // d3
        movd        DWORD PTR [eax+edx], mm5        // write d3
        add         esi,    4                       // next four columns (source)
        sub         ebx,    4
        jg         vs_5_4_loop
        pop         ebx
    }
}
static
void horizontal_line_5_3_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
    __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,        source              ;
        mov         edi,        dest                ;
        mov         ecx,        source_width         ;
        movq        mm5,        const53_1           ;
        pxor        mm7,        mm7                 ;
        movq        mm6,        const53_2           ;
        movq        mm4,        round_values         ;
        lea         edx,        [esi+ecx-5]         ;
        horizontal_line_5_3_loop:
        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07
        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx
        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx
        pmullw      mm0,        mm6
        pmullw      mm1,        mm5
        add         esi,        5
        add         edi,        3
        paddw       mm1,        mm0
        paddw       mm1,        mm4
        psrlw       mm1,        8
        cmp         esi,        edx
        packuswb    mm1,        mm7
        movd        DWORD PTR [edi-3], mm1
        jl          horizontal_line_5_3_loop
//exit condition
        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07
        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx
        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx
        pmullw      mm0,        mm6
        pmullw      mm1,        mm5
        paddw       mm1,        mm0
        paddw       mm1,        mm4
        psrlw       mm1,        8
        packuswb    mm1,        mm7
        movd        eax,        mm1
        mov         edx,        eax
        shr         edx,        16
        mov         WORD PTR[edi],   ax
        mov         BYTE PTR[edi+2], dl
    }
}
/*
 * Builds three destination lines from five source lines, four columns per
 * pass.  With r0..r4 the lines at source, source + src_pitch, ...,
 * source + 4 * src_pitch, and d0..d2 the lines at dest, dest + dest_pitch,
 * dest + 2 * dest_pitch:
 *
 *     d0 = r0
 *     d1 = (r1 *  85 + r2 * 171 + 128) >> 8
 *     d2 = (r3 * 171 + r4 *  85 + 128) >> 8
 *
 * The loop body always executes at least once and keeps going while the
 * remaining column count is > 0.
 */
static
void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    /* Blend weights n/3 scaled by 256, plus the rounding term added before >> 8. */
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __declspec(align(16)) const unsigned short one_thirds[] = {  85,  85,  85,  85 };
    __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };
    __asm
    {
        push        ebx                             // ebx is used as the column counter; restored at the end
        mov         esi,    source                    // first source line
        mov         ecx,    src_pitch               // source pitch in bytes
        mov         edi,    dest                    // first destination line
        pxor        mm7,    mm7                     // zero, for unpacking / packing
        mov         edx,    dest_pitch               // destination pitch in bytes
        movq        mm5,    one_thirds              // mm5 = 1/3
        movq        mm6,    two_thirds              // mm6 = 2/3
        mov         ebx,    dest_width;             // columns left to process
        vs_5_3_loop:
        movd        mm0,    DWORD ptr [esi]         // r0
        movd        mm1,    DWORD ptr [esi+ecx]     // r1
        movd        mm2,    DWORD ptr [esi+ecx*2]   // r2
        lea         eax,    [esi+ecx*2]             // eax -> r2, base for r3 / r4
        punpcklbw   mm1,    mm7                     // r1 as words
        punpcklbw   mm2,    mm7                     // r2 as words
        pmullw      mm1,    mm5                     // r1 * 1/3
        pmullw      mm2,    mm6                     // r2 * 2/3
        movd        mm3,    DWORD ptr [eax+ecx]     // r3
        movd        mm4,    DWORD ptr [eax+ecx*2]   // r4
        punpcklbw   mm3,    mm7                     // r3 as words
        punpcklbw   mm4,    mm7                     // r4 as words
        pmullw      mm3,    mm6                     // r3 * 2/3
        pmullw      mm4,    mm5                     // r4 * 1/3
        movd        DWORD PTR [edi], mm0            // write d0 (straight copy of r0)
        paddw       mm1,    mm2                     // r1 * 1/3 + r2 * 2/3
        paddw       mm1,    round_values            // + 128
        psrlw       mm1,    8
        packuswb    mm1,    mm7                     // d1
        paddw       mm3,    mm4                     // r3 * 2/3 + r4 * 1/3
        paddw       mm3,    round_values            // + 128
        movd        DWORD PTR [edi+edx], mm1        // write d1
        psrlw       mm3,    8
        packuswb    mm3,    mm7                     // d2
        movd        DWORD PTR [edi+edx*2], mm3      // write d2
        add         edi,    4                       // next four columns (destination)
        add         esi,    4                       // next four columns (source)
        sub         ebx,    4
        jg          vs_5_3_loop
        pop         ebx
    }
}
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_2_1_scale
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_2_1_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    /*
     * Halves a line horizontally by keeping every other source pixel:
     *
     *     dest[i] = source[2 * i]
     *
     * Four output pixels are produced per pass from an eight-byte load.  The
     * loop is bounded by dest_width only; source_width is not consulted.  The
     * loop body always executes at least once.
     */
    (void) source_width;
    __asm
    {
        mov         esi,    source
        mov         edi,    dest
        pxor        mm7,    mm7                     // zero, for packing
        mov         ecx,    dest_width              // number of output pixels
        xor         edx,    edx                     // output index
        hs_2_1_loop:
        movq        mm0,    [esi+edx*2]             // eight source pixels
        psllw       mm0,    8                       // drop the odd pixel of each pair ...
        psrlw       mm0,    8                       // ... leaving the even one as a word
        packuswb    mm0,    mm7                     // four words -> four bytes
        movd        DWORD PTR [edi+edx], mm0        // write four output pixels
        add         edx,    4
        cmp         edx,    ecx
        jl          hs_2_1_loop
    }
}
/*
 * Copies dest_width bytes from source to dest.  Neither pitch is consulted.
 */
static
void vertical_band_2_1_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
/*
 * Produces one destination line as a 3-tap vertical filter over the source
 * line and its two neighbours, four columns per pass.  With p, c, n the
 * lines at source - src_pitch, source and source + src_pitch:
 *
 *     dest = (p * 48 + c * 160 + n * 48 + 128) >> 8
 *
 * The line above source is read, so source must not be the first line of its
 * buffer.  The loop body always executes at least once.
 */
static
void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    /* Filter taps 3/16 and 10/16 scaled by 256, plus the rounding term. */
    __declspec(align(16)) const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
    __declspec(align(16)) const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    (void) dest_pitch;

    __asm
    {
        mov         esi,        source
        mov         edi,        dest
        mov         eax,        src_pitch
        mov         edx,        dest_width
        pxor        mm7,        mm7             // zero, for unpacking / packing
        sub         esi,        eax             // back one line
        lea         ecx,        [esi+edx]       // end of the line above
        movq        mm6,        round_values
        movq        mm5,        three_sixteenths
        movq        mm4,        ten_sixteenths
        vs_2_1_i_loop:
        movd        mm0,        [esi]           // p: line above
        movd        mm1,        [esi+eax]       // c: current line
        movd        mm2,        [esi+eax*2]     // n: line below
        punpcklbw   mm0,        mm7
        pmullw      mm0,        mm5             // p * 3/16
        punpcklbw   mm1,        mm7
        pmullw      mm1,        mm4             // c * 10/16
        punpcklbw   mm2,        mm7
        pmullw      mm2,        mm5             // n * 3/16
        paddw       mm0,        mm6             // + 128 (mm6 already holds round_values)
        paddw       mm1,        mm2
        paddw       mm0,        mm1
        psrlw       mm0,        8
        packuswb    mm0,        mm7
        movd        DWORD PTR [edi],        mm0
        add         esi,        4
        add         edi,        4
        cmp         esi,        ecx
        jb          vs_2_1_i_loop               // addresses are unsigned: compare them that way
    }
}
void
register_mmxscalers(void)
{
    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
    vp8_vertical_band_5_4_scale          = vertical_band_5_4_scale_mmx;
    vp8_vertical_band_5_3_scale          = vertical_band_5_3_scale_mmx;
    vp8_vertical_band_2_1_scale          = vertical_band_2_1_scale_mmx;
    vp8_vertical_band_2_1_scale_i        = vertical_band_2_1_scale_i_mmx;
    vp8_horizontal_line_2_1_scale        = horizontal_line_2_1_scale_mmx;
    vp8_horizontal_line_5_3_scale        = horizontal_line_5_3_scale_mmx;
    vp8_horizontal_line_5_4_scale        = horizontal_line_5_4_scale_mmx;
}