shithub: openh264

ref: 0237194f6e2c1f72463bced4259f4d283ddbc2dd
parent: c930424642b5a0bf3289c4c93d3278c19c9ba764
author: Guangwei Wang <guangwwa@cisco.com>
date: Tue Jun 9 06:19:09 EDT 2015

bugfix: save NEON registers v8-v15
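Background for the fix (a minimal sketch, not part of the patch): under the AAPCS64 calling convention the low 64 bits of v8-v15 (d8-d15) are callee-saved, so NEON code that clobbers them must spill and restore them itself, while v0-v7 and v16-v31 are caller-saved scratch registers. The hunks below therefore move most temporaries onto scratch registers and wrap the remaining uses of v8/v9 in an stp/ldp pair on the stack; saving the full q registers, as the patch does, preserves more than strictly required and is also safe. The fragment below only illustrates that prologue/epilogue idiom; the symbol name is hypothetical and the body is placeholder code.

    .text
    .globl  ExampleUsesV8V9_AArch64_neon    // hypothetical symbol, for illustration only
ExampleUsesV8V9_AArch64_neon:
    stp     q8, q9, [sp, #-32]!             // prologue: spill callee-saved v8/v9 (full 128 bits)
    movi    v8.8h, #20                      // body may now clobber v8/v9 freely
    movi    v9.8h, #5
    // ... filtering work that uses v8/v9 as constants ...
    ldp     q8, q9, [sp], #32               // epilogue: restore v8/v9 before returning
    ret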

--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -314,53 +314,56 @@
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
     sub x0, x0, #2
-    movi v30.8h, #20, lsl #0
-    movi v31.8h, #5, lsl #0
+    stp q8,q9, [sp,#-32]!
+    movi v8.8h, #20, lsl #0
+    movi v9.8h, #5, lsl #0
 w8_h_mc_luma_loop:
-    VEC4_LD1_8BITS_16ELEMENT x0, x1, v0, v4, v8, v12    //load src[-2] in v0,v4,v8,v12 for 4 row; only use 13(8+5);
+    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
 
     //1st row:
-    ext v1.16b, v0.16b, v0.16b, #5  //src[3]
-    ext v2.16b, v0.16b, v0.16b, #1  //src[-1]
-    ext v3.16b, v0.16b, v0.16b, #4  //src[2]
+    ext v17.16b, v16.16b, v16.16b, #5  //src[3]
+    ext v18.16b, v16.16b, v16.16b, #1  //src[-1]
+    ext v19.16b, v16.16b, v16.16b, #4  //src[2]
     //2nd row:
-    ext v5.16b, v4.16b, v4.16b, #5  //src[3]
-    ext v6.16b, v4.16b, v4.16b, #1  //src[-1]
-    ext v7.16b, v4.16b, v4.16b, #4  //src[2]
+    ext v21.16b, v20.16b, v20.16b, #5  //src[3]
+    ext v22.16b, v20.16b, v20.16b, #1  //src[-1]
+    ext v23.16b, v20.16b, v20.16b, #4  //src[2]
     //3rd row:
-    ext v9.16b, v8.16b, v8.16b, #5  //src[3]
-    ext v10.16b, v8.16b, v8.16b, #1  //src[-1]
-    ext v11.16b, v8.16b, v8.16b, #4  //src[2]
+    ext v25.16b, v24.16b, v24.16b, #5  //src[3]
+    ext v26.16b, v24.16b, v24.16b, #1  //src[-1]
+    ext v27.16b, v24.16b, v24.16b, #4  //src[2]
     //4th row:
-    ext v13.16b, v12.16b, v12.16b, #5  //src[3]
-    ext v14.16b, v12.16b, v12.16b, #1  //src[-1]
-    ext v15.16b, v12.16b, v12.16b, #4  //src[2]
+    ext v29.16b, v28.16b, v28.16b, #5  //src[3]
+    ext v30.16b, v28.16b, v28.16b, #1  //src[-1]
+    ext v31.16b, v28.16b, v28.16b, #4  //src[2]
 
-    VEC4_UADDL_8BITS v0, v1, v4, v5, v8, v9, v12, v13, v16, v18, v20, v22   //v16/v18/v20/v22=src[-2]+src[3]
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -= 5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6   //v0/v2/v4/v6=src[-2]+src[3]
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6  //v0/v2/v4/v6 -= 5*(src[-1]+src[2])
 
     //1st row:
-    ext v2.16b, v0.16b, v0.16b, #2  //src[0]
-    ext v3.16b, v0.16b, v0.16b, #3  //src[1]
+    ext v18.16b, v16.16b, v16.16b, #2  //src[0]
+    ext v19.16b, v16.16b, v16.16b, #3  //src[1]
     //2nd row:
-    ext v6.16b, v4.16b, v4.16b, #2  //src[0]
-    ext v7.16b, v4.16b, v4.16b, #3  //src[1]
+    ext v22.16b, v20.16b, v20.16b, #2  //src[0]
+    ext v23.16b, v20.16b, v20.16b, #3  //src[1]
     //3rd row:
-    ext v10.16b, v8.16b, v8.16b, #2  //src[0]
-    ext v11.16b, v8.16b, v8.16b, #3  //src[1]
+    ext v26.16b, v24.16b, v24.16b, #2  //src[0]
+    ext v27.16b, v24.16b, v24.16b, #3  //src[1]
     //4th row:
-    ext v14.16b, v12.16b, v12.16b, #2  //src[0]
-    ext v15.16b, v12.16b, v12.16b, #3  //src[1]
+    ext v30.16b, v28.16b, v28.16b, #2  //src[0]
+    ext v31.16b, v28.16b, v28.16b, #3  //src[1]
 
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22+=20*(src[0]+src[1])
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
+    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6  //v0/v2/v4/v6+=20*(src[0]+src[1])
 
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
     cbnz x4, w8_h_mc_luma_loop
+
+    ldp q8,q9,[sp],#32
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
@@ -422,55 +425,58 @@
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
     sub x0, x0, #2
-    movi v30.8h, #20, lsl #0
-    movi v31.8h, #5, lsl #0
+    stp q8,q9, [sp,#-32]!
+    movi v8.8h, #20, lsl #0
+    movi v9.8h, #5, lsl #0
 w8_xy_10_mc_luma_loop:
-    VEC4_LD1_8BITS_16ELEMENT x0, x1, v0, v4, v8, v12    //load src[-2] in v0,v4,v8,v12 for 4 row; only use 13(8+5);
+    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
 
     //1st row:
-    ext v1.16b, v0.16b, v0.16b, #5  //src[3]
-    ext v2.16b, v0.16b, v0.16b, #1  //src[-1]
-    ext v3.16b, v0.16b, v0.16b, #4  //src[2]
+    ext v17.16b, v16.16b, v16.16b, #5  //src[3]
+    ext v18.16b, v16.16b, v16.16b, #1  //src[-1]
+    ext v19.16b, v16.16b, v16.16b, #4  //src[2]
     //2nd row:
-    ext v5.16b, v4.16b, v4.16b, #5  //src[3]
-    ext v6.16b, v4.16b, v4.16b, #1  //src[-1]
-    ext v7.16b, v4.16b, v4.16b, #4  //src[2]
+    ext v21.16b, v20.16b, v20.16b, #5  //src[3]
+    ext v22.16b, v20.16b, v20.16b, #1  //src[-1]
+    ext v23.16b, v20.16b, v20.16b, #4  //src[2]
     //3rd row:
-    ext v9.16b, v8.16b, v8.16b, #5  //src[3]
-    ext v10.16b, v8.16b, v8.16b, #1  //src[-1]
-    ext v11.16b, v8.16b, v8.16b, #4  //src[2]
+    ext v25.16b, v24.16b, v24.16b, #5  //src[3]
+    ext v26.16b, v24.16b, v24.16b, #1  //src[-1]
+    ext v27.16b, v24.16b, v24.16b, #4  //src[2]
     //4th row:
-    ext v13.16b, v12.16b, v12.16b, #5  //src[3]
-    ext v14.16b, v12.16b, v12.16b, #1  //src[-1]
-    ext v15.16b, v12.16b, v12.16b, #4  //src[2]
+    ext v29.16b, v28.16b, v28.16b, #5  //src[3]
+    ext v30.16b, v28.16b, v28.16b, #1  //src[-1]
+    ext v31.16b, v28.16b, v28.16b, #4  //src[2]
 
-    VEC4_UADDL_8BITS v0, v1, v4, v5, v8, v9, v12, v13, v16, v18, v20, v22   //v16/v18/v20/v22=src[-2]+src[3]
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -= 5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6   //v0/v2/v4/v6=src[-2]+src[3]
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6  //v0/v2/v4/v6 -= 5*(src[-1]+src[2])
 
     //1st row:
-    ext v2.16b, v0.16b, v0.16b, #2  //src[0]
-    ext v3.16b, v0.16b, v0.16b, #3  //src[1]
+    ext v18.16b, v16.16b, v16.16b, #2  //src[0]
+    ext v19.16b, v16.16b, v16.16b, #3  //src[1]
     //2nd row:
-    ext v6.16b, v4.16b, v4.16b, #2  //src[0]
-    ext v7.16b, v4.16b, v4.16b, #3  //src[1]
+    ext v22.16b, v20.16b, v20.16b, #2  //src[0]
+    ext v23.16b, v20.16b, v20.16b, #3  //src[1]
     //3rd row:
-    ext v10.16b, v8.16b, v8.16b, #2  //src[0]
-    ext v11.16b, v8.16b, v8.16b, #3  //src[1]
+    ext v26.16b, v24.16b, v24.16b, #2  //src[0]
+    ext v27.16b, v24.16b, v24.16b, #3  //src[1]
     //4th row:
-    ext v14.16b, v12.16b, v12.16b, #2  //src[0]
-    ext v15.16b, v12.16b, v12.16b, #3  //src[1]
+    ext v30.16b, v28.16b, v28.16b, #2  //src[0]
+    ext v31.16b, v28.16b, v28.16b, #3  //src[1]
 
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22+=20*(src[0]+src[1])
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
+    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6  //v0/v2/v4/v6+=20*(src[0]+src[1])
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
-    VEC4_UADDL_8BITS v17, v2, v19, v6, v21, v10, v23, v14, v16, v18, v20, v22   //average with arc[0]
-    VEC4_RSHRN_16BITS_SHIFT1 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v1, v18, v3, v22, v5, v26, v7, v30, v0, v2, v4, v6   //average with src[0]
+    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
     cbnz x4, w8_xy_10_mc_luma_loop
+
+    ldp q8,q9,[sp],#32
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
@@ -533,55 +539,58 @@
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
     sub x0, x0, #2
-    movi v30.8h, #20, lsl #0
-    movi v31.8h, #5, lsl #0
+    stp q8,q9, [sp,#-32]!
+    movi v8.8h, #20, lsl #0
+    movi v9.8h, #5, lsl #0
 w8_xy_30_mc_luma_loop:
-    VEC4_LD1_8BITS_16ELEMENT x0, x1, v0, v4, v8, v12    //load src[-2] in v0,v4,v8,v12 for 4 row; only use 13(8+5);
+    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
 
     //1st row:
-    ext v1.16b, v0.16b, v0.16b, #5  //src[3]
-    ext v2.16b, v0.16b, v0.16b, #1  //src[-1]
-    ext v3.16b, v0.16b, v0.16b, #4  //src[2]
+    ext v17.16b, v16.16b, v16.16b, #5  //src[3]
+    ext v18.16b, v16.16b, v16.16b, #1  //src[-1]
+    ext v19.16b, v16.16b, v16.16b, #4  //src[2]
     //2nd row:
-    ext v5.16b, v4.16b, v4.16b, #5  //src[3]
-    ext v6.16b, v4.16b, v4.16b, #1  //src[-1]
-    ext v7.16b, v4.16b, v4.16b, #4  //src[2]
+    ext v21.16b, v20.16b, v20.16b, #5  //src[3]
+    ext v22.16b, v20.16b, v20.16b, #1  //src[-1]
+    ext v23.16b, v20.16b, v20.16b, #4  //src[2]
     //3rd row:
-    ext v9.16b, v8.16b, v8.16b, #5  //src[3]
-    ext v10.16b, v8.16b, v8.16b, #1  //src[-1]
-    ext v11.16b, v8.16b, v8.16b, #4  //src[2]
+    ext v25.16b, v24.16b, v24.16b, #5  //src[3]
+    ext v26.16b, v24.16b, v24.16b, #1  //src[-1]
+    ext v27.16b, v24.16b, v24.16b, #4  //src[2]
     //4th row:
-    ext v13.16b, v12.16b, v12.16b, #5  //src[3]
-    ext v14.16b, v12.16b, v12.16b, #1  //src[-1]
-    ext v15.16b, v12.16b, v12.16b, #4  //src[2]
+    ext v29.16b, v28.16b, v28.16b, #5  //src[3]
+    ext v30.16b, v28.16b, v28.16b, #1  //src[-1]
+    ext v31.16b, v28.16b, v28.16b, #4  //src[2]
 
-    VEC4_UADDL_8BITS v0, v1, v4, v5, v8, v9, v12, v13, v16, v18, v20, v22   //v16/v18/v20/v22=src[-2]+src[3]
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -= 5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6   //v0/v2/v4/v6=src[-2]+src[3]
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6  //v0/v2/v4/v6 -= 5*(src[-1]+src[2])
 
     //1st row:
-    ext v2.16b, v0.16b, v0.16b, #2  //src[0]
-    ext v3.16b, v0.16b, v0.16b, #3  //src[1]
+    ext v18.16b, v16.16b, v16.16b, #2  //src[0]
+    ext v19.16b, v16.16b, v16.16b, #3  //src[1]
     //2nd row:
-    ext v6.16b, v4.16b, v4.16b, #2  //src[0]
-    ext v7.16b, v4.16b, v4.16b, #3  //src[1]
+    ext v22.16b, v20.16b, v20.16b, #2  //src[0]
+    ext v23.16b, v20.16b, v20.16b, #3  //src[1]
     //3rd row:
-    ext v10.16b, v8.16b, v8.16b, #2  //src[0]
-    ext v11.16b, v8.16b, v8.16b, #3  //src[1]
+    ext v26.16b, v24.16b, v24.16b, #2  //src[0]
+    ext v27.16b, v24.16b, v24.16b, #3  //src[1]
     //4th row:
-    ext v14.16b, v12.16b, v12.16b, #2  //src[0]
-    ext v15.16b, v12.16b, v12.16b, #3  //src[1]
+    ext v30.16b, v28.16b, v28.16b, #2  //src[0]
+    ext v31.16b, v28.16b, v28.16b, #3  //src[1]
 
-    VEC4_UADDL_8BITS v2, v3, v6, v7, v10, v11, v14, v15, v17, v19, v21, v23 //v17/v19/v21/v23=src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22+=20*(src[0]+src[1])
+    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
+    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6  //v0/v2/v4/v6+=20*(src[0]+src[1])
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
-    VEC4_UADDL_8BITS v17, v3, v19, v7, v21, v11, v23, v15, v16, v18, v20, v22   //average with arc[1]
-    VEC4_RSHRN_16BITS_SHIFT1 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v1, v19, v3, v23, v5, v27, v7, v31, v0, v2, v4, v6   //average with src[1]
+    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
     cbnz x4, w8_xy_30_mc_luma_loop
+
+    ldp q8,q9,[sp],#32
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
@@ -707,36 +716,36 @@
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
 
-    ld1 {v0.8b}, [x0], x1 // v0=src[-2*stride]
-    ld1 {v1.8b}, [x0], x1 // v1=src[-1*stride]
-    ld1 {v2.8b}, [x0], x1 // v2=src[0*stride]
-    ld1 {v3.8b}, [x0], x1 // v3=src[1*stride]
-    ld1 {v4.8b}, [x0], x1 // v4=src[2*stride]
+    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
+    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
+    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
+    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
+    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
 
 w8_xy_01_mc_luma_loop:
-    ld1 {v5.8b}, [x0], x1 // v5=src[3*stride]
-    ld1 {v6.8b}, [x0], x1 // v6=src[4*stride]
-    ld1 {v7.8b}, [x0], x1 // v7=src[5*stride]
-    ld1 {v8.8b}, [x0], x1 // v8=src[6*stride]
+    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
+    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
+    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
+    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
 
-    VEC4_UADDL_8BITS v0, v5, v1, v6, v2, v7, v3, v8, v16, v18, v20, v22 //v16/v18/v20/v22 =src[-2]+src[3]
-    VEC4_UADDL_8BITS v1, v4, v2, v5, v3, v6, v4, v7, v17, v19, v21, v23 //v17/v19/v21/v23 =src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -=5*(src[-1]+src[2])
-    VEC4_UADDL_8BITS v2, v3, v3, v4, v4, v5, v5, v6, v17, v19, v21, v23 //v17/v19/v21/v23 =src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22 += 20*(src[0]+src[1])
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
+    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
+    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_UADDL_8BITS v17, v2, v19, v3, v21, v4, v23, v5, v16, v18, v20, v22 //v16/v18/v20/v22 = average with src[0]
-    VEC4_RSHRN_16BITS_SHIFT1 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v1, v18, v3, v19, v5, v20, v7, v21, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[0]
+    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23  //store 8bytes*4row
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
 
     sub x4, x4, #4
-    mov v0.16b, v4.16b
-    mov v1.16b, v5.16b
-    mov v2.16b, v6.16b
-    mov v3.16b, v7.16b
-    mov v4.16b, v8.16b
+    mov v16.16b, v20.16b
+    mov v17.16b, v21.16b
+    mov v18.16b, v22.16b
+    mov v19.16b, v23.16b
+    mov v20.16b, v24.16b
 
     cbnz x4, w8_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -884,36 +893,36 @@
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
 
-    ld1 {v0.8b}, [x0], x1 // v0=src[-2*stride]
-    ld1 {v1.8b}, [x0], x1 // v1=src[-1*stride]
-    ld1 {v2.8b}, [x0], x1 // v2=src[0*stride]
-    ld1 {v3.8b}, [x0], x1 // v3=src[1*stride]
-    ld1 {v4.8b}, [x0], x1 // v4=src[2*stride]
+    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
+    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
+    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
+    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
+    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
 
 w8_xy_03_mc_luma_loop:
-    ld1 {v5.8b}, [x0], x1 // v5=src[3*stride]
-    ld1 {v6.8b}, [x0], x1 // v6=src[4*stride]
-    ld1 {v7.8b}, [x0], x1 // v7=src[5*stride]
-    ld1 {v8.8b}, [x0], x1 // v8=src[6*stride]
+    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
+    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
+    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
+    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
 
-    VEC4_UADDL_8BITS v0, v5, v1, v6, v2, v7, v3, v8, v16, v18, v20, v22 //v16/v18/v20/v22 =src[-2]+src[3]
-    VEC4_UADDL_8BITS v1, v4, v2, v5, v3, v6, v4, v7, v17, v19, v21, v23 //v17/v19/v21/v23 =src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -=5*(src[-1]+src[2])
-    VEC4_UADDL_8BITS v2, v3, v3, v4, v4, v5, v5, v6, v17, v19, v21, v23 //v17/v19/v21/v23 =src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22 += 20*(src[0]+src[1])
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
+    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
+    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_UADDL_8BITS v17, v3, v19, v4, v21, v5, v23, v6, v16, v18, v20, v22 //v16/v18/v20/v22 = average with src[1]
-    VEC4_RSHRN_16BITS_SHIFT1 v16, v18, v20, v22, v17, v19, v21, v23
+    VEC4_UADDL_8BITS v1, v19, v3, v20, v5, v21, v7, v22, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[1]
+    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
 
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23  //store 8bytes*4row
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
 
     sub x4, x4, #4
-    mov v0.16b, v4.16b
-    mov v1.16b, v5.16b
-    mov v2.16b, v6.16b
-    mov v3.16b, v7.16b
-    mov v4.16b, v8.16b
+    mov v16.16b, v20.16b
+    mov v17.16b, v21.16b
+    mov v18.16b, v22.16b
+    mov v19.16b, v23.16b
+    mov v20.16b, v24.16b
 
     cbnz x4, w8_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1061,32 +1070,32 @@
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
 
-    ld1 {v0.8b}, [x0], x1 // v0=src[-2*stride]
-    ld1 {v1.8b}, [x0], x1 // v1=src[-1*stride]
-    ld1 {v2.8b}, [x0], x1 // v2=src[0*stride]
-    ld1 {v3.8b}, [x0], x1 // v3=src[1*stride]
-    ld1 {v4.8b}, [x0], x1 // v4=src[2*stride]
+    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
+    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
+    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
+    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
+    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
 
 w8_xy_02_mc_luma_loop:
-    ld1 {v5.8b}, [x0], x1 // v5=src[3*stride]
-    ld1 {v6.8b}, [x0], x1 // v6=src[4*stride]
-    ld1 {v7.8b}, [x0], x1 // v7=src[5*stride]
-    ld1 {v8.8b}, [x0], x1 // v8=src[6*stride]
+    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
+    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
+    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
+    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
 
-    VEC4_UADDL_8BITS v0, v5, v1, v6, v2, v7, v3, v8, v16, v18, v20, v22 //v16/v18/v20/v22 =src[-2]+src[3]
-    VEC4_UADDL_8BITS v1, v4, v2, v5, v3, v6, v4, v7, v17, v19, v21, v23 //v17/v19/v21/v23 =src[-1]+src[2]
-    VEC4_MLS_16BITS v17, v31, v19, v31, v21, v31, v23, v31, v16, v18, v20, v22  //v16/v18/v20/v22 -=5*(src[-1]+src[2])
-    VEC4_UADDL_8BITS v2, v3, v3, v4, v4, v5, v5, v6, v17, v19, v21, v23 //v17/v19/v21/v23 =src[0]+src[1]
-    VEC4_MLA_16BITS v17, v30, v19, v30, v21, v30, v23, v30, v16, v18, v20, v22  //v16/v18/v20/v22 += 20*(src[0]+src[1])
-    VEC4_SQRSHRUN_16BITS_SHIFT5 v16, v18, v20, v22, v17, v19, v21, v23
-    VEC4_ST1_8BITS_8ELEMENT x2, x3, v17, v19, v21, v23  //store 8bytes*4row
+    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
+    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
+    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
+    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
+    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
+    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7
+    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
 
     sub x4, x4, #4
-    mov v0.16b, v4.16b
-    mov v1.16b, v5.16b
-    mov v2.16b, v6.16b
-    mov v3.16b, v7.16b
-    mov v4.16b, v8.16b
+    mov v16.16b, v20.16b
+    mov v17.16b, v21.16b
+    mov v18.16b, v22.16b
+    mov v19.16b, v23.16b
+    mov v20.16b, v24.16b
 
     cbnz x4, w8_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1667,50 +1676,56 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
-    ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x4] //load A/B/C/D
-    ld1 {v0.16b}, [x0], x1  // src[x]
-    ext v1.16b, v0.16b, v0.16b, #1  // src[x+1]
+    ld4r {v28.8b, v29.8b, v30.8b, v31.8b}, [x4] //load A/B/C/D
+    ld1 {v16.16b}, [x0], x1  // src[x]
+    ext v17.16b, v16.16b, v16.16b, #1  // src[x+1]
 w8_mc_chroma_loop:
-    ld1 {v2.16b}, [x0], x1  // src[x+stride]
-    ext v3.16b, v2.16b, v2.16b, #1  // src[x+stride+1]
-    ld1 {v4.16b}, [x0], x1  // src[x+2*stride]
-    ext v5.16b, v4.16b, v4.16b, #1  // src[x+2*stride+1]
-    ld1 {v6.16b}, [x0], x1  // src[x+3*stride]
-    ext v7.16b, v6.16b, v6.16b, #1  // src[x+3*stride+1]
-    ld1 {v30.16b}, [x0], x1  // src[x+4*stride]
-    ext v31.16b, v30.16b, v30.16b, #1  // src[x+4*stride+1]
+    ld1 {v18.16b}, [x0], x1  // src[x+stride]
+    ext v19.16b, v18.16b, v18.16b, #1  // src[x+stride+1]
 
-    umull v8.8h, v0.8b, v16.8b
-    umull v10.8h, v2.8b, v16.8b
-    umull v12.8h, v4.8b, v16.8b
-    umull v14.8h, v6.8b, v16.8b
+    ld1 {v20.16b}, [x0], x1  // src[x+2*stride]
+    ext v21.16b, v20.16b, v20.16b, #1  // src[x+2*stride+1]
 
-    umlal v8.8h, v1.8b, v17.8b
-    umlal v10.8h, v3.8b, v17.8b
-    umlal v12.8h, v5.8b, v17.8b
-    umlal v14.8h, v7.8b, v17.8b
+    ld1 {v22.16b}, [x0], x1  // src[x+3*stride]
+    ext v23.16b, v22.16b, v22.16b, #1  // src[x+3*stride+1]
 
-    umlal v8.8h, v2.8b, v18.8b
-    umlal v10.8h, v4.8b, v18.8b
-    umlal v12.8h, v6.8b, v18.8b
-    umlal v14.8h, v30.8b, v18.8b
+    ld1 {v24.16b}, [x0], x1  // src[x+4*stride]
+    ext v25.16b, v24.16b, v24.16b, #1  // src[x+4*stride+1]
 
-    umlal v8.8h, v3.8b, v19.8b
-    umlal v10.8h, v5.8b, v19.8b
-    umlal v12.8h, v7.8b, v19.8b
-    umlal v14.8h, v31.8b, v19.8b
+    umull v0.8h, v16.8b, v28.8b
+    umull v2.8h, v18.8b, v28.8b
+    umull v4.8h, v20.8b, v28.8b
+    umull v6.8h, v22.8b, v28.8b
 
-    rshrn v9.8b, v8.8h, #6
-    st1 {v9.8b}, [x2], x3
-    rshrn v11.8b, v10.8h, #6
-    st1 {v11.8b}, [x2], x3
-    rshrn v13.8b, v12.8h, #6
-    st1 {v13.8b}, [x2], x3
-    rshrn v15.8b, v14.8h, #6
-    st1 {v15.8b}, [x2], x3
+    umlal v0.8h, v17.8b, v29.8b
+    umlal v2.8h, v19.8b, v29.8b
+    umlal v4.8h, v21.8b, v29.8b
+    umlal v6.8h, v23.8b, v29.8b
 
-    mov v0.16b, v30.16b
-    mov v1.16b, v31.16b
+    umlal v0.8h, v18.8b, v30.8b
+    umlal v2.8h, v20.8b, v30.8b
+    umlal v4.8h, v22.8b, v30.8b
+    umlal v6.8h, v24.8b, v30.8b
+
+    umlal v0.8h, v19.8b, v31.8b
+    umlal v2.8h, v21.8b, v31.8b
+    umlal v4.8h, v23.8b, v31.8b
+    umlal v6.8h, v25.8b, v31.8b
+
+    rshrn v1.8b, v0.8h, #6
+    st1 {v1.8b}, [x2], x3
+
+    rshrn v3.8b, v2.8h, #6
+    st1 {v3.8b}, [x2], x3
+
+    rshrn v5.8b, v4.8h, #6
+    st1 {v5.8b}, [x2], x3
+
+    rshrn v7.8b, v6.8h, #6
+    st1 {v7.8b}, [x2], x3
+
+    mov v16.16b, v24.16b
+    mov v17.16b, v25.16b
     sub x5, x5, #4
     cbnz x5, w8_mc_chroma_loop
 WELS_ASM_AARCH64_FUNC_END