ref: a1e3f35842de92b526422af05360c84cf233f07f
parent: efd852af30ff160ecea04674713d9810c5370644
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Tue Apr 9 08:55:19 EDT 2019
arm: mc: NEON implementation of blend, blend_h and blend_v functions

Benchmark numbers, Cortex-A73 and Cortex-A53 (lower is better):

                           A73      A53
blend_h_w2_8bpc_c:       149.3    246.8
blend_h_w2_8bpc_neon:     74.6    137
blend_h_w4_8bpc_c:       251.6    409.8
blend_h_w4_8bpc_neon:     66      146.6
blend_h_w8_8bpc_c:       446.6    844.1
blend_h_w8_8bpc_neon:     68.6    131.2
blend_h_w16_8bpc_c:      830     1513
blend_h_w16_8bpc_neon:    85.9    192
blend_h_w32_8bpc_c:     1605.2   2847.8
blend_h_w32_8bpc_neon:   149.8    357.6
blend_h_w64_8bpc_c:     3304.8   5515.5
blend_h_w64_8bpc_neon:   262.8    629.5
blend_h_w128_8bpc_c:    7895.1  13260.6
blend_h_w128_8bpc_neon:  577     1402
blend_v_w2_8bpc_c:       241.2    410.8
blend_v_w2_8bpc_neon:    122.1    196.8
blend_v_w4_8bpc_c:       874.4   1418.2
blend_v_w4_8bpc_neon:    248.5    375.9
blend_v_w8_8bpc_c:      1550.5   2514.7
blend_v_w8_8bpc_neon:    210.8    376
blend_v_w16_8bpc_c:     2925.3   5086
blend_v_w16_8bpc_neon:   253.4    608.3
blend_v_w32_8bpc_c:     5686.7   9470.5
blend_v_w32_8bpc_neon:   348.2    994.8
blend_w4_8bpc_c:         201.5    309.3
blend_w4_8bpc_neon:       38.6     99.2
blend_w8_8bpc_c:         531.3    944.8
blend_w8_8bpc_neon:       55.1    125.8
blend_w16_8bpc_c:       1992.8   3349.8
blend_w16_8bpc_neon:     150.1    344
blend_w32_8bpc_c:       4982     8165.9
blend_w32_8bpc_neon:     360.4    910.9
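
All three kernels vectorize the same per-pixel operation. As a scalar
sketch (hand-written for this message, not the exact C in mc_tmpl.c),
each output byte is a weighted average of dst and tmp with 6-bit mask
weights:

    #include <stdint.h>

    /* Sketch of the 8bpc blend step, assuming mask weights m in
     * 0..64 as dav1d uses for blend/blend_h/blend_v. The NEON code
     * maps this onto vmull.u8 (tmp*m), vmlal.u8 (+= dst*(64-m)) and
     * vrshrn.i16 #6 for the (+32) >> 6 rounding narrow. */
    static inline uint8_t blend_px(uint8_t dst, uint8_t tmp, unsigned m) {
        return (uint8_t)((tmp * m + dst * (64 - m) + 32) >> 6);
    }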
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -439,6 +439,421 @@
pop {r4-r11,pc}
endfunc
+function blend_8bpc_neon, export=1
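+ // Argument layout (per dav1d's blend prototype): r0 = dst,
+ // r1 = dst_stride, r2 = tmp, r3 = w; after the push below, h sits
+ // at [sp, #24] and the mask pointer at [sp, #28].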
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ ldr r5, [sp, #28]
+ clz r6, r3
+ adr r7, L(blend_tbl)
+ sub r6, r6, #26
+ ldr r6, [r7, r6, lsl #2]
+ add r7, r7, r6
+ bx r7
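+ // Dispatch on log2(w): clz(w) is 29..26 for w = 4..32, so
+ // clz(w) - 26 indexes the table below back to front. Each entry is
+ // an offset relative to L(blend_tbl), with CONFIG_THUMB compensating
+ // for the Thumb bit. blend_h and blend_v below use the same scheme
+ // with a different bias for their width ranges.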
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
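+ // The narrow-width paths blend two rows per iteration: r0 walks the
+ // even rows, r12 the odd rows, with the stride doubled above.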
+4:
+ vld1.32 {d2[]}, [r5], r3
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d0[]}, [r0]
+ subs r4, r4, #2
+ vld1.32 {d2[1]}, [r5], r3
+ vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r12]
+ vsub.i8 d3, d22, d2
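+ // d2 holds the mask m, d3 = 64 - m; the blend
+ // (tmp*m + dst*(64 - m) + 32) >> 6 is done as a widening
+ // multiply-accumulate plus a rounding narrow by 6.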
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d3
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0], r1
+ vst1.32 {d20[1]}, [r12], r1
+ bgt 4b
+ pop {r4-r8,pc}
+80:
+ vmov.i8 d16, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.u8 {d2}, [r5], r3
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d0}, [r0]
+ vsub.i8 d17, d16, d2
+ vld1.u8 {d3}, [r5], r3
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d1}, [r12]
+ subs r4, r4, #2
+ vsub.i8 d18, d16, d3
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d18
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0], r1
+ vst1.u8 {d23}, [r12], r1
+ bgt 8b
+ pop {r4-r8,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.u8 {q2}, [r5], r3
+ vld1.u8 {q1}, [r2], r3
+ vld1.u8 {q0}, [r0]
+ subs r4, r4, #2
+ vsub.i8 q11, q12, q2
+ vld1.u8 {q15}, [r5], r3
+ vld1.u8 {q14}, [r2], r3
+ vld1.u8 {q13}, [r12]
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d22
+ vmull.u8 q8, d3, d5
+ vmlal.u8 q8, d1, d23
+ vsub.i8 q11, q12, q15
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d28, d30
+ vmlal.u8 q3, d26, d22
+ vmull.u8 q8, d29, d31
+ vmlal.u8 q8, d27, d23
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {q9}, [r0], r1
+ vst1.u8 {q10}, [r12], r1
+ bgt 16b
+ pop {r4-r8,pc}
+
+320:
+ vmov.i8 q10, #64
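+ // w == 32 already fills two q registers per input, so this path
+ // processes one row per iteration instead of two.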
+32:
+ vld1.u8 {q2, q3}, [r5], r3
+ vld1.u8 {q8, q9}, [r2], r3
+ vld1.u8 {q0, q1}, [r0]
+ subs r4, r4, #1
+ vsub.i8 q11, q10, q2
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vsub.i8 q11, q10, q3
+ vrshrn.i16 d24, q15, #6
+ vrshrn.i16 d25, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d22
+ vmull.u8 q14, d19, d7
+ vmlal.u8 q14, d3, d23
+ vrshrn.i16 d26, q15, #6
+ vrshrn.i16 d27, q14, #6
+ vst1.u8 {q12, q13}, [r0], r1
+ bgt 32b
+ pop {r4-r8,pc}
+endfunc
+
+function blend_h_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4
+ sub r4, r4, r4, lsr #2
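+ // r5 = &obmc_masks[h], one mask byte per row. r4 = h - h/4: the
+ // bottom quarter of the rows keeps dst unchanged (the C reference
+ // computes h*3 >> 2; for h == 2 the extra row processed here is a
+ // no-op, since its mask byte is 0).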
+ clz r6, r3
+ adr r7, L(blend_h_tbl)
+ sub r6, r6, #24
+ ldr r6, [r7, r6, lsl #2]
+ add r7, r7, r6
+ bx r7
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
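+ // Two mask bytes (one per row) are loaded splatted into d2/d3 and
+ // zipped, so d2 = m0,m0,m1,m1,... matches the two pixels of each
+ // row gathered into d1 below.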
+2:
+ vld1.16 {d2[], d3[]}, [r5]!
+ vld1.16 {d1[]}, [r2], r3
+ subs r4, r4, #2
+ vld1.16 {d0[]}, [r0]
+ vzip.8 d2, d3
+ vld1.16 {d1[1]}, [r2], r3
+ vsub.i8 d4, d22, d2
+ vld1.16 {d0[1]}, [r12]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d4
+ vrshrn.i16 d20, q8, #6
+ vst1.16 {d20[0]}, [r0], r1
+ vst1.16 {d20[1]}, [r12], r1
+ bgt 2b
+ pop {r4-r8,pc}
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.u8 {d2[]}, [r5]!
+ vld1.32 {d1[]}, [r2], r3
+ subs r4, r4, #2
+ vld1.u8 {d6[]}, [r5]!
+ vld1.32 {d1[1]}, [r2], r3
+ vext.u8 d2, d2, d6, #4
+ vld1.32 {d0[]}, [r0]
+ vsub.i8 d3, d22, d2
+ vld1.32 {d0[1]}, [r12]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d3
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0], r1
+ vst1.32 {d20[1]}, [r12], r1
+ bgt 4b
+ pop {r4-r8,pc}
+80:
+ vmov.i8 d16, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.u8 {d2[]}, [r5]!
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d0}, [r0]
+ vsub.i8 d17, d16, d2
+ vld1.u8 {d3[]}, [r5]!
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d1}, [r12]
+ subs r4, r4, #2
+ vsub.i8 d18, d16, d3
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d18
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0], r1
+ vst1.u8 {d23}, [r12], r1
+ bgt 8b
+ pop {r4-r8,pc}
+160:
+ vmov.i8 d24, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.u8 {d4[]}, [r5]!
+ vld1.u8 {q1}, [r2], r3
+ vsub.i8 d5, d24, d4
+ vld1.u8 {q0}, [r0]
+ subs r4, r4, #2
+ vld1.u8 {d30[]}, [r5]!
+ vld1.u8 {q14}, [r2], r3
+ vsub.i8 d31, d24, d30
+ vld1.u8 {q13}, [r12]
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d5
+ vmull.u8 q8, d3, d4
+ vmlal.u8 q8, d1, d5
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d28, d30
+ vmlal.u8 q3, d26, d31
+ vmull.u8 q8, d29, d30
+ vmlal.u8 q8, d27, d31
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {q9}, [r0], r1
+ vst1.u8 {q10}, [r12], r1
+ bgt 16b
+ pop {r4-r8,pc}
+320:
+640:
+1280:
+ vmov.i8 d20, #64
+ sub r1, r1, r3
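+ // Wide blend_h (w >= 32): one mask byte is splatted per row and the
+ // inner loop consumes the row in 32-pixel chunks; r1 was reduced by
+ // w above so the post-incremented r0 lands on the next row.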
+321:
+ vld1.u8 {d6[]}, [r5]!
+ vsub.i8 d7, d20, d6
+ mov r8, r3
+32:
+ vld1.u8 {q8, q9}, [r2]!
+ vld1.u8 {q0, q1}, [r0]
+ vmull.u8 q15, d16, d6
+ vmlal.u8 q15, d0, d7
+ vmull.u8 q14, d17, d6
+ vmlal.u8 q14, d1, d7
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d7
+ vmull.u8 q14, d19, d6
+ vmlal.u8 q14, d3, d7
+ vrshrn.i16 d2, q15, #6
+ vrshrn.i16 d3, q14, #6
+ vst1.u8 {q0, q1}, [r0]!
+ subs r8, r8, #32
+ bgt 32b
+ add r0, r0, r1
+ subs r4, r4, #1
+ bgt 321b
+ pop {r4-r8,pc}
+endfunc
+
+function blend_v_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldr r4, [sp, #24]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r3
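+ // blend_v masks depend only on the column: r5 = &obmc_masks[w],
+ // loaded once per width path outside the row loop. Only the left
+ // 3*w/4 pixels of each row are blended; the stores deliberately
+ // leave the right quarter untouched.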
+ clz r8, r3
+ adr r7, L(blend_v_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ add r7, r7, r8
+ bx r7
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ vld1.8 {d2[]}, [r5]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d3, d22, d2
+2:
+ vld1.8 {d1[]}, [r2], r3
+ vld1.8 {d0[]}, [r0]
+ subs r4, r4, #2
+ vld1.8 {d1[1]}, [r2], r3
+ vld1.8 {d0[1]}, [r12]
+ vmull.u8 q2, d1, d2
+ vmlal.u8 q2, d0, d3
+ vrshrn.i16 d6, q2, #6
+ vst1.8 {d6[0]}, [r0], r1
+ vst1.8 {d6[1]}, [r12], r1
+ bgt 2b
+ pop {r4-r8,pc}
+40:
+ vmov.i8 d22, #64
+ vld1.32 {d4[]}, [r5]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d5, d22, d4
+4:
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d0[]}, [r0]
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r12]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d5
+ vrshrn.i16 d20, q3, #6
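+ // Store only 3 of the 4 pixels per row (w*3/4): one 16-bit lane
+ // plus one byte, then rewind and step to the next row pair.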
+ vst1.16 {d20[0]}, [r0]!
+ vst1.16 {d20[2]}, [r12]!
+ vst1.8 {d20[2]}, [r0]!
+ vst1.8 {d20[6]}, [r12]!
+ sub r0, r0, #3
+ sub r12, r12, #3
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 4b
+ pop {r4-r8,pc}
+80:
+ vmov.i8 d16, #64
+ vld1.u8 {d2}, [r5]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d17, d16, d2
+8:
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d0}, [r0]
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d1}, [r12]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d2, d5
+ vmlal.u8 q10, d1, d17
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.32 {d22[0]}, [r0]!
+ vst1.32 {d23[0]}, [r12]!
+ vst1.16 {d22[2]}, [r0]!
+ vst1.16 {d23[2]}, [r12]!
+ sub r0, r0, #6
+ sub r12, r12, #6
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 8b
+ pop {r4-r8,pc}
+160:
+ vmov.i8 q12, #64
+ vld1.u8 {q2}, [r5]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 q11, q12, q2
+16:
+ vld1.u8 {q1}, [r2], r3
+ vld1.u8 {q0}, [r0]
+ subs r4, r4, #2
+ vld1.u8 {q14}, [r2], r3
+ vld1.u8 {q13}, [r12]
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d22
+ vmull.u8 q8, d3, d5
+ vmlal.u8 q8, d1, d23
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d28, d4
+ vmlal.u8 q3, d26, d22
+ vmull.u8 q8, d29, d5
+ vmlal.u8 q8, d27, d23
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {d18}, [r0]!
+ vst1.u8 {d20}, [r12]!
+ vst1.32 {d19[0]}, [r0]!
+ vst1.32 {d21[0]}, [r12]!
+ sub r0, r0, #12
+ sub r12, r12, #12
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 16b
+ pop {r4-r8,pc}
+320:
+ vmov.i8 q10, #64
+ vld1.u8 {q2, q3}, [r5]
+ vsub.i8 q11, q10, q2
+ vsub.i8 q12, q10, q3
+32:
+ vld1.u8 {q8, q9}, [r2], r3
+ vld1.u8 {q0, q1}, [r0]
+ subs r4, r4, #1
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d24
+ vrshrn.i16 d2, q15, #6
+ vst1.u8 {d0, d1, d2}, [r0], r1
+ bgt 32b
+ pop {r4-r8,pc}
+endfunc
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
vld1.\wd {\d0[]}, [\s0], \strd
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -55,6 +55,9 @@
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
+decl_blend_fn(dav1d_blend_8bpc_neon);
+decl_blend_dir_fn(dav1d_blend_h_8bpc_neon);
+decl_blend_dir_fn(dav1d_blend_v_8bpc_neon);
decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
@@ -97,6 +100,10 @@
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
+#elif ARCH_ARM
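+ // no AArch64 versions of the blend functions yet, hence ARM-only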
+ c->blend = dav1d_blend_8bpc_neon;
+ c->blend_h = dav1d_blend_h_8bpc_neon;
+ c->blend_v = dav1d_blend_v_8bpc_neon;
#endif
#endif
}