ref: 2e5f1b213478bfe830ad846088d6ad14c1400dd5
dir: /codec/encoder/core/mips/score_mmi.c/
/*! * \copy * Copyright (c) 2009-2018, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * * \file score_mmi.c * * \brief Loongson optimization * * \date 21/07/2018 Created * ************************************************************************************* */ #include <stdint.h> #include "asmdefs_mmi.h" unsigned char nozero_count_table[] __attribute__((aligned(16))) = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; int32_t WelsGetNoneZeroCount_mmi(int16_t *level) { int ret_val = 0; __asm__ volatile( ".set arch=loongson3a \n\t" "gslqc1 $f2, $f0, 0x0(%[level]) \n\t" "gslqc1 $f6, $f4, 0x10(%[level]) \n\t" "xor $f8, $f8, $f8 \n\t" "pcmpeqh $f0, $f0, $f8 \n\t" "pcmpeqh $f2, $f2, $f8 \n\t" "pcmpeqh $f4, $f4, $f8 \n\t" "pcmpeqh $f6, $f6, $f8 \n\t" "packsshb $f4, $f4, $f6 \n\t" "packsshb $f6, $f0, $f2 \n\t" "pmovmskb $f0, $f4 \n\t" "pmovmskb $f2, $f6 \n\t" "dmfc1 $8, $f0 \n\t" "dmfc1 $9, $f2 \n\t" "xor $8, 0xFF \n\t" "xor $9, 0xFF \n\t" PTR_ADDU "$10, $8, %[nozero_count_table] \n\t" "lbu $8, 0x0($10) \n\t" PTR_ADDU "$10, $9, %[nozero_count_table] \n\t" "lbu $9, 0x0($10) \n\t" PTR_ADDU "%[ret_val], $8, $9 \n\t" : [ret_val] "=r"((int)ret_val) : [level] "r"((unsigned char *)level), [nozero_count_table] "r"((unsigned char *)nozero_count_table) : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8" ); return ret_val; } void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) { BACKUP_REG; __asm__ volatile( ".set arch=loongson3a \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t" "dli $8, 0x3 \n\t" "dmtc1 $8, $f22 \n\t" "dli $8, 0x2 \n\t" "dmtc1 $8, $f24 \n\t" "dli $8, 0x1 \n\t" "dmtc1 $8, $f26 \n\t" "dmtc1 $0, $f28 \n\t" "pextrh $f18, $f2, $f22 \n\t" "pextrh $f20, $f4, $f24 \n\t" "pextrh $f16, $f2, $f26 \n\t" "pinsrh_2 $f4, $f4, $f18 \n\t" "pinsrh_3 $f2, $f2, $f16 \n\t" "pextrh $f18, $f4, $f28 \n\t" "pinsrh_1 $f2, $f2, $f18 \n\t" "pinsrh_0 $f4, $f4, $f20 \n\t" "dli $8, 0x93 \n\t" "dmtc1 $8, $f22 \n\t" "dli $8, 0x39 \n\t" "dmtc1 $8, $f24 \n\t" "punpckhwd $f10, $f0, $f2 \n\t" "punpcklwd $f8, $f0, $f2 \n\t" "punpckhwd $f14, $f4, $f6 \n\t" "punpcklwd $f12, $f4, $f6 \n\t" "mov.d $f0, $f8 \n\t" "pshufh $f2, $f10, $f22 \n\t" "pshufh $f4, $f12, $f24 \n\t" "mov.d $f6, $f14 \n\t" "gssqc1 $f2, $f0, 0x0(%[level]) \n\t" "gssqc1 $f6, $f4, 0x10(%[level]) \n\t" : : [level] "r"((short *)level), [pDct] "r"((short *)pDct) : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28" ); RECOVER_REG; } void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) { BACKUP_REG; __asm__ volatile( ".set arch=loongson3a \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t" "mov.d $f8, $f2 \n\t" "mov.d $f2, $f4 \n\t" "mov.d $f10, $f6 \n\t" "mov.d $f12, $f2 \n\t" "punpckhwd $f2, $f0, $f8 \n\t" "punpcklwd $f0, $f0, $f8 \n\t" "punpckhwd $f14, $f12, $f10 \n\t" "punpcklwd $f12, $f12, $f10 \n\t" "dmtc1 $0, $f20 \n\t" "dli $8, 0x10 \n\t" "dmtc1 $8, $f22 \n\t" "dli $8, 0x30 \n\t" "dmtc1 $8, $f24 \n\t" "dli $8, 0x3 \n\t" "dmtc1 $8, $f26 \n\t" "dli $8, 0x93 \n\t" "dmtc1 $8, $f28 \n\t" "dli $8, 0x39 \n\t" "dmtc1 $8, $f30 \n\t" "pextrh $f16, $f0, $f26 \n\t" "pextrh $f18, $f2, $f26 \n\t" "pinsrh_3 $f2, $f2, $f16 \n\t" "pextrh $f16, $f14, $f20 \n\t" "pinsrh_0 $f14, $f14, $f18 \n\t" "pextrh $f18, $f12, $f20 \n\t" "pinsrh_0 $f12, $f12, $f16 \n\t" "pinsrh_3 $f0, $f0, $f18 \n\t" "mov.d $f4, $f0 \n\t" "pshufh $f6, $f2, $f28 \n\t" "pshufh $f8, $f12, $f30 \n\t" "mov.d $f10, $f14 \n\t" "mov.d $f12, $f8 \n\t" "mov.d $f14, $f10 \n\t" "dsrl $f4, $f4, $f22 \n\t" "pinsrh_3 $f4, $f4, $f6 \n\t" "dsrl $f6, $f6, $f22 \n\t" "dsll $f14, $f12, $f24 \n\t" "xor $f12, $f12, $f12 \n\t" "or $f4, $f4, $f12 \n\t" "or $f6, $f6, $f14 \n\t" "dsrl $f8, $f8, $f22 \n\t" "pinsrh_3 $f8, $f8, $f10 \n\t" "dsrl $f10, $f10, $f22 \n\t" "gssqc1 $f6, $f4, 0x0(%[zig_value]) \n\t" "gssqc1 $f10, $f8, 0x10(%[zig_value]) \n\t" : : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct) : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } unsigned char i_ds_table[]__attribute__((aligned(16))) = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; unsigned char high_mask_table[]__attribute__((aligned(16))) = { 0, 0, 0, 3, 0, 2, 3, 6, 0, 2, 2, 5, 3, 5, 6, 9, 0, 1, 2, 5, 2, 4, 5, 8, 3, 5, 5, 8, 6, 8, 9,12, 0, 1, 1, 4, 2, 4, 5, 8, 2, 4, 4, 7, 5, 7, 8,11, 3, 4, 5, 8, 5, 7, 8,11, 6, 8, 8,11, 9,11,12,15, 0, 1, 1, 4, 1, 3, 4, 7, 2, 4, 4, 7, 5, 7, 8,11, 2, 3, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14, 3, 4, 4, 7, 5, 7, 8,11, 5, 7, 7,10, 8,10, 11,14, 6, 7, 8,11, 8,10,11,14, 9,11,11,14,12,14,15,18, 0, 0, 1, 4, 1, 3, 4, 7, 1, 3, 3, 6, 4, 6, 7,10, 2, 3, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14, 2, 3, 3, 6, 4, 6, 7,10, 4, 6, 6, 9, 7, 9,10,13, 5, 6, 7,10, 7, 9,10,13, 8,10,10,13,11,13, 14,17, 3, 4, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14, 5, 6, 7,10, 7, 9,10,13, 8,10,10,13, 11,13,14,17, 6, 7, 7,10, 8,10, 11,14, 8,10,10,13,11,13,14,17, 9,10,11,14,11,13,14,17,12,14, 14,17,15,17,18,21}; unsigned char low_mask_table[]__attribute__((aligned(16))) = { 0, 3, 2, 6, 2, 5, 5, 9, 1, 5, 4, 8, 5, 8, 8,12, 1, 4, 4, 8, 4, 7, 7,11, 4, 8, 7,11, 8,11, 11,15, 1, 4, 3, 7, 4, 7, 7,11, 3, 7, 6,10, 7,10,10,14, 4, 7, 7,11, 7,10,10,14, 7,11,10,14, 11,14,14,18, 0, 4, 3, 7, 3, 6, 6,10, 3, 7, 6,10, 7,10,10,14, 3, 6, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17, 4, 7, 6,10, 7,10,10,14, 6,10, 9,13,10,13, 13,17, 7,10,10,14,10,13,13,17, 10,14,13,17,14,17,17,21, 0, 3, 3, 7, 3, 6, 6,10, 2, 6, 5, 9, 6, 9, 9,13, 3, 6, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17, 3, 6, 5, 9, 6, 9, 9,13, 5, 9, 8,12, 9,12,12,16, 6, 9, 9,13, 9,12,12,16, 9,13,12,16,13,16, 16,20, 3, 7, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17, 6, 9, 9,13, 9,12,12,16, 9,13,12,16, 13,16,16,20, 7,10, 9,13,10,13, 13,17, 9,13,12,16,13,16,16,20, 10,13,13,17,13,16,16,20,13,17, 16,20,17,20,20,24}; int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) { int32_t iSingleCtr = 0; __asm__ volatile( ".set arch=loongson3a \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t" "packsshb $f0, $f0, $f2 \n\t" "packsshb $f2, $f4, $f6 \n\t" "xor $f10, $f10, $f10 \n\t" "xor $f8, $f8, $f8 \n\t" "pcmpeqb $f0, $f0, $f8 \n\t" "pcmpeqb $f2, $f2, $f8 \n\t" "pmovmskb $f10, $f0 \n\t" "pmovmskb $f12, $f2 \n\t" "punpcklbh $f10, $f10, $f12 \n\t" "dmfc1 $12, $f10 \n\t" "dli $8, 0xffff \n\t" "xor $12, $12, $8 \n\t" "xor %[pDct], %[pDct], %[pDct] \n\t" "dli $8, 0x80 \n\t" "dli $9, 0x7 \n\t" "dli $10, 0x100 \n\t" "dli $11, 0x8 \n\t" "1: \n\t" "and $13, $12, $8 \n\t" "bnez $13, 2f \n\t" "nop \n\t" "daddiu $9, -0x1 \n\t" "dsrl $8, 1 \n\t" "bnez $9, 1b \n\t" "nop \n\t" "2: \n\t" "and $13, $12, $10 \n\t" "bnez $13, 3f \n\t" "nop \n\t" "daddiu $11, 0x1 \n\t" "dsll $10, 1 \n\t" "daddiu $13, $11, -0x10 \n\t" "bltz $13, 2b \n\t" "nop \n\t" "3: \n\t" "dsubu $11, $11, $9 \n\t" "daddiu $11, -0x1 \n\t" PTR_ADDU "$8, %[i_ds_table], $11 \n\t" "lb $10, 0x0($8) \n\t" PTR_ADDU "%[pDct], %[pDct], $10 \n\t" "move $11, $12 \n\t" "dli $10, 0xff \n\t" "and $12, $10 \n\t" "dsrl $11, 0x8 \n\t" "and $11, $10 \n\t" PTR_ADDU "$8, %[low_mask_table], $12 \n\t" "lb $10, 0x0($8) \n\t" PTR_ADDU "%[pDct], %[pDct], $10 \n\t" PTR_ADDU "$8, %[high_mask_table], $11 \n\t" "lb $10, 0x0($8) \n\t" PTR_ADDU "%[iSingleCtr], %[pDct], $10 \n\t" : [iSingleCtr] "=r"(iSingleCtr) : [pDct] "r"((short *)pDct), [i_ds_table] "r"((unsigned char *)i_ds_table), [high_mask_table] "r"((unsigned char *)high_mask_table), [low_mask_table] "r"((unsigned char *)low_mask_table) : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12" ); return iSingleCtr; }