ref: 9484bf7f57f9908d87f0447b093443056a77d009
dir: /vp8/common/arm/armv6/simpleloopfilter_v6.asm/
; ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source ; tree. An additional intellectual property rights grant can be found ; in the file PATENTS. All contributing project authors may ; be found in the AUTHORS file in the root of the source tree. ; EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| AREA |.text|, CODE, READONLY ; name this block of code MACRO TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 ; a0: 03 02 01 00 ; a1: 13 12 11 10 ; a2: 23 22 21 20 ; a3: 33 32 31 30 ; b3 b2 b1 b0 uxtb16 $b1, $a1 ; xx 12 xx 10 uxtb16 $b0, $a0 ; xx 02 xx 00 uxtb16 $b3, $a3 ; xx 32 xx 30 uxtb16 $b2, $a2 ; xx 22 xx 20 orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 MEND src RN r0 pstep RN r1 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, ;r2 const char *flimit, ;r3 const char *limit, ;stack const char *thresh, ;stack int count ;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed ;for flimit. Same way applies to limit and thresh. ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines ldr r12, [r3], #4 ; limit ldr r3, [src], pstep ; p1 ldr r9, [sp, #36] ; count for 8-in-parallel ldr r4, [src], pstep ; p0 ldr r7, [r2], #4 ; flimit ldr r5, [src], pstep ; q0 ldr r2, c0x80808080 ldr r6, [src] ; q1 uadd8 r7, r7, r7 ; flimit * 2 mov r9, r9, lsl #1 ; 4-in-parallel uadd8 r12, r7, r12 ; flimit * 2 + limit |simple_hnext8| ; vp8_simple_filter_mask() function uqsub8 r7, r3, r6 ; p1 - q1 uqsub8 r8, r6, r3 ; q1 - p1 uqsub8 r10, r4, r5 ; p0 - q0 uqsub8 r11, r5, r4 ; q0 - p0 orr r8, r8, r7 ; abs(p1 - q1) ldr lr, c0x7F7F7F7F ; 01111111 mask orr r10, r10, r11 ; abs(p0 - q0) and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2 uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 mvn lr, #0 ; r10 == -1 uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 ; STALL waiting on r10 :( uqsub8 r10, r10, r12 ; compare to flimit mov r8, #0 usub8 r10, r8, r10 ; use usub8 instead of ssub8 ; STALL (maybe?) when are flags set? :/ sel r10, lr, r8 ; filter mask: lr cmp r10, #0 beq simple_hskip_filter ; skip filtering ;vp8_simple_filter() function eor r3, r3, r2 ; p1 offset to convert to a signed value eor r6, r6, r2 ; q1 offset to convert to a signed value eor r4, r4, r2 ; p0 offset to convert to a signed value eor r5, r5, r2 ; q0 offset to convert to a signed value qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1) qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0)) qadd8 r3, r3, r6 ldr r8, c0x03030303 ; r8 = 3 qadd8 r3, r3, r6 ldr r7, c0x04040404 qadd8 r3, r3, r6 and r3, r3, lr ; vp8_filter &= mask; ;save bottom 3 bits so that we round one side +4 and the other +3 qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4) mov r7, #0 shadd8 r8 , r8 , r7 ; Filter2 >>= 3 shadd8 r3 , r3 , r7 ; Filter1 >>= 3 shadd8 r8 , r8 , r7 shadd8 r3 , r3 , r7 shadd8 r8 , r8 , r7 ; r8: Filter2 shadd8 r3 , r3 , r7 ; r7: filter1 ;calculate output sub src, src, pstep, lsl #1 qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2) qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1) eor r4, r4, r2 ; *op0 = u^0x80 str r4, [src], pstep ; store op0 result eor r5, r5, r2 ; *oq0 = u^0x80 str r5, [src], pstep ; store oq0 result |simple_hskip_filter| add src, src, #4 sub src, src, pstep sub src, src, pstep, lsl #1 subs r9, r9, #1 ;pld [src] ;pld [src, pstep] ;pld [src, pstep, lsl #1] ldrne r3, [src], pstep ; p1 ldrne r4, [src], pstep ; p0 ldrne r5, [src], pstep ; q0 ldrne r6, [src] ; q1 bne simple_hnext8 ldmia sp!, {r4 - r11, pc} ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_vertical_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} ldr r12, [r2], #4 ; r12: flimit ldr r2, c0x80808080 ldr r7, [r3], #4 ; limit ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] ldrh r4, [src], pstep uadd8 r12, r12, r12 ; flimit * 2 ldrh r5, [src, #-2] ldrh r6, [src], pstep uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] ldrh r4, [src], pstep ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] ldrh r6, [src], pstep mov r11, r11, lsl #1 ; 4-in-parallel |simple_vnext8| ; vp8_simple_filter_mask() function pkhbt r9, r3, r4, lsl #16 pkhbt r10, r5, r6, lsl #16 ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 uqsub8 r7, r3, r6 ; p1 - q1 uqsub8 r8, r6, r3 ; q1 - p1 uqsub8 r9, r4, r5 ; p0 - q0 uqsub8 r10, r5, r4 ; q0 - p0 orr r7, r7, r8 ; abs(p1 - q1) orr r9, r9, r10 ; abs(p0 - q0) ldr lr, c0x7F7F7F7F ; 0111 1111 mask uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2 mov r8, #0 uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 mvn r10, #0 ; r10 == -1 uqsub8 r7, r7, r12 ; compare to flimit usub8 r7, r8, r7 sel r7, r10, r8 ; filter mask: lr cmp lr, #0 beq simple_vskip_filter ; skip filtering ;vp8_simple_filter() function eor r3, r3, r2 ; p1 offset to convert to a signed value eor r6, r6, r2 ; q1 offset to convert to a signed value eor r4, r4, r2 ; p0 offset to convert to a signed value eor r5, r5, r2 ; q0 offset to convert to a signed value qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1) qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0)) qadd8 r3, r3, r6 ldr r8, c0x03030303 ; r8 = 3 qadd8 r3, r3, r6 ldr r7, c0x04040404 qadd8 r3, r3, r6 and r3, r3, lr ; vp8_filter &= mask ;save bottom 3 bits so that we round one side +4 and the other +3 qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4) mov r7, #0 shadd8 r8 , r8 , r7 ; Filter2 >>= 3 shadd8 r3 , r3 , r7 ; Filter1 >>= 3 shadd8 r8 , r8 , r7 shadd8 r3 , r3 , r7 shadd8 r8 , r8 , r7 ; r8: filter2 shadd8 r3 , r3 , r7 ; r7: filter1 ;calculate output sub src, src, pstep, lsl #2 qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2) qsub8 r5, r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1) eor r4, r4, r2 ; *op0 = u^0x80 eor r5, r5, r2 ; *oq0 = u^0x80 strb r4, [src, #-1] ; store the result mov r4, r4, lsr #8 strb r5, [src], pstep mov r5, r5, lsr #8 strb r4, [src, #-1] mov r4, r4, lsr #8 strb r5, [src], pstep mov r5, r5, lsr #8 strb r4, [src, #-1] mov r4, r4, lsr #8 strb r5, [src], pstep mov r5, r5, lsr #8 strb r4, [src, #-1] strb r5, [src], pstep |simple_vskip_filter| subs r11, r11, #1 ;pld [src] ;pld [src, pstep] ;pld [src, pstep, lsl #1] ; load soure data to r7, r8, r9, r10 ldrneh r3, [src, #-2] ldrneh r4, [src], pstep ldrneh r5, [src, #-2] ldrneh r6, [src], pstep pkhbt r7, r3, r4, lsl #16 ldrneh r3, [src, #-2] ldrneh r4, [src], pstep pkhbt r8, r5, r6, lsl #16 ldrneh r5, [src, #-2] ldrneh r6, [src], pstep bne simple_vnext8 ldmia sp!, {r4 - r12, pc} ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6| ; Constant Pool c0x80808080 DCD 0x80808080 c0x03030303 DCD 0x03030303 c0x04040404 DCD 0x04040404 c0x01010101 DCD 0x01010101 c0x7F7F7F7F DCD 0x7F7F7F7F END