shithub: libvpx

ref: b02fdf098a4140f87b03b4ce80dd3ab7998a3a94
parent: cc63deba313e2e281bb1fdbe041ae8be13f1a3e0
parent: 17c754fc00881a78e71488e7c544d05cf2f6927f
author: John Koleszar <jkoleszar@google.com>
date: Tue Sep 20 20:05:04 EDT 2011

Merge remote branch 'origin/master' into experimental

Change-Id: I4e515276d197e1dfb1f3e75edfa9823d08c9b366
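
For reference, the 4x4 forward DCT implemented by the new vp8_short_fdct4x4_armv6 and vp8_short_fdct4x4_neon routines in this patch can be sketched in C as below. The sketch is reconstructed from the comments in those assembly files (the 5352/2217 multipliers, the 14500/7500 rounding of the row pass, and the +7 / 12000 / 51000 rounding of the column pass); the function name is illustrative and this is only a sketch, not the project's reference C implementation.

/* Rough C sketch of the 4x4 forward DCT performed by the new
 * vp8_short_fdct4x4_armv6/_neon code, reconstructed from the comments
 * in those files. Illustrative only. */
void short_fdct4x4_sketch(short *input, short *output, int pitch)
{
    int i, a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++)                 /* row pass */
    {
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;                    /* pitch is in bytes */
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++)                 /* column pass */
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}

The 8x4 variants simply apply this transform to two adjacent 4x4 blocks, as the updated vp8_short_fdct8x4_armv6 wrapper in dct_arm.c below shows.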

--- a/libs.mk
+++ b/libs.mk
@@ -281,17 +281,17 @@
 
 ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
     $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
-	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+	grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
 
     $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
-	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+	grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
 
     $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
-	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+	grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
 else
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -58,10 +58,10 @@
 
         /*cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
 
-        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_armv6;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_armv6;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_armv6;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_armv6;
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
 
         /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
@@ -107,8 +107,8 @@
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
         cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_neon;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_neon;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_neon;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_neon;
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;
 
         /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -53,10 +53,10 @@
     sub     r7, r5, #1                  ; range-1
 
     cmp     r1, #0
-    mul     r4, r4, r7                  ; ((range-1) * probability)
+    mul     r6, r4, r7                  ; ((range-1) * probability)
 
     mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * probability) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
 
     addne   r2, r2, r4                  ; if  (bit) lowvalue += split
     subne   r4, r5, r4                  ; if  (bit) range = range-split
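
For context, the value being computed in this hunk (and in the vp8_packtokens_* hunks that follow) is the arithmetic coder's split point. A rough C sketch of that per-bit step, based on the comments above, is given below; the names are illustrative, and carry propagation / renormalisation are omitted.

/* Sketch of the boolean-encoder step the armv5te code above performs,
 * reconstructed from its comments. Illustrative only. */
void encode_bool_sketch(unsigned int *lowvalue, unsigned int *range,
                        int bit, unsigned int probability)
{
    unsigned int split = 1 + (((*range - 1) * probability) >> 8);

    if (bit)
    {
        *lowvalue += split;     /* if (bit) lowvalue += split   */
        *range    -= split;     /* if (bit) range = range-split */
    }
    else
    {
        *range = split;         /* !bit case: assumed, not shown in the excerpt above */
    }
    /* renormalisation of range/lowvalue follows in the real code */
}
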
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -71,7 +71,7 @@
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -79,7 +79,7 @@
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -172,12 +172,12 @@
     ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -93,7 +93,7 @@
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -101,7 +101,7 @@
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -194,12 +194,12 @@
     ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -123,7 +123,7 @@
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -131,7 +131,7 @@
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -224,12 +224,12 @@
     ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
--- a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+++ /dev/null
@@ -1,262 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_fast_fdct4x4_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_fast_fdct4x4_armv6| PROC
-
-    stmfd       sp!, {r4 - r12, lr}
-
-    ; PART 1
-
-    ; coeffs 0-3
-    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
-
-    ldr         r10, c7500
-    ldr         r11, c14500
-    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
-    ldr         lr, c0x00080008
-    ror         r5, r5, #16         ; [i2 | i3]
-
-    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
-    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
-
-    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
-
-    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
-
-    str         r6, [r1, #4]
-
-    ; coeffs 4-7
-    ror         r9, r9, #16         ; [i6 | i7]
-
-    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
-    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
-    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
-
-    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
-
-    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
-
-    str         r6, [r1, #12]
-
-    ; coeffs 8-11
-    ror         r5, r5, #16         ; [i10 | i11]
-
-    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
-    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
-
-    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
-
-    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
-
-    str         r6, [r1, #20]
-
-    ; coeffs 12-15
-    ror         r5, r5, #16         ; [i14 | i15]
-
-    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
-    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
-    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
-
-    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
-
-    str         r6, [r1, #28]
-
-
-    ; PART 2 -------------------------------------------------
-    ldr         r11, c12000
-    ldr         r10, c51000
-    ldr         lr, c0x00070007
-
-    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
-    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
-    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
-    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    add         r0, r11, #0x10000   ; add (d!=0)
-
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    ldr         r12, c0x08a914e8    ; [2217 | 5352]
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #0]        ; [     o1 |      o0]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #16]       ; [     o9 |      o8]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    ldr         r3, [r1, #4]        ; [i3 | i2]
-
-    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
-
-    str         r9, [r1, #8]        ; [o5 | 04]
-
-    ldr         r9, [r1, #12]       ; [i7 | i6]
-    ldr         r8, [r1, #28]       ; [i15|i14]
-    ldr         r2, [r1, #20]       ; [i11|i10]
-    str         r5, [r1, #24]       ; [o13|o12]
-
-    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
-    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #4]        ; [     o3 |      o2]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #20]       ; [    o11 |     o10]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    str         r9, [r1, #12]       ; [o7 | o6]
-    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
-
-    str         r5, [r1, #28]       ; [o15|o14]
-
-    ldmfd       sp!, {r4 - r12, pc}
-
-    ENDP
-
-; Used constants
-c7500
-    DCD     7500
-c14500
-    DCD     14500
-c0x22a453a0
-    DCD     0x22a453a0
-c0x00080008
-    DCD     0x00080008
-c12000
-    DCD     12000
-c51000
-    DCD     51000
-c0x00070007
-    DCD     0x00070007
-c0x08a914e8
-    DCD     0x08a914e8
-
-    END
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -1,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr}
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; add (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | 04]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc}
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END
--- a/vp8/encoder/arm/armv6/walsh_v6.asm
+++ b/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -17,129 +17,196 @@
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
 |vp8_short_walsh4x4_armv6| PROC
 
     stmdb       sp!, {r4 - r11, lr}
 
-    mov         r12, r2              ; ugh. not clean
-    ldr         r2, [r0]             ; [1  |  0]
-    ldr         r3, [r0, #4]         ; [3  |  2]
-    ldr         r4, [r0, r12]!       ; [5  |  4]
-    ldr         r5, [r0, #4]         ; [7  |  6]
-    ldr         r6, [r0, r12]!       ; [9  |  8]
-    ldr         r7, [r0, #4]         ; [11 | 10]
-    ldr         r8, [r0, r12]!       ; [13 | 12]
-    ldr         r9, [r0, #4]         ; [15 | 14]
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
 
-    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
-    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
-    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
-    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
 
-    qaddsubx    r2, r10, r11         ; [1 | 2] [c1+d1 | a1-b1]
-    qaddsubx    r3, r11, r10         ; [0 | 3] [b1+a1 | d1-c1]
-    qaddsubx    r4, r12, lr          ; [5 | 6] [c1+d1 | a1-b1]
-    qaddsubx    r5, lr, r12          ; [4 | 7] [b1+a1 | d1-c1]
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
 
-    qsubaddx    r10, r6, r7          ; [c1|a1] [9-10  |  8+11]
-    qaddsubx    r11, r6, r7          ; [b1|d1] [9+10  |  8-11]
-    qsubaddx    r12, r8, r9          ; [c1|a1] [13-14 | 12+15]
-    qaddsubx    lr, r8, r9           ; [b1|d1] [13+14 | 12-15]
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
 
-    qaddsubx    r6, r10, r11         ; [9 |10] [c1+d1 | a1-b1]
-    qaddsubx    r7, r11, r10         ; [8 |11] [b1+a1 | d1-c1]
-    qaddsubx    r8, r12, lr          ; [13|14] [c1+d1 | a1-b1]
-    qaddsubx    r9, lr, r12          ; [12|15] [b1+a1 | d1-c1]
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
 
-    ; first transform complete
 
-    qadd16      r10, r3, r9          ; a1 [0+12  |  3+15]
-    qadd16      r11, r5, r7          ; b1 [4+8   |  7+11]
-    qsub16      r12, r5, r7          ; c1 [4-8   |  7-11]
-    qsub16      lr, r3, r9           ; d1 [0-12  |  3-15]
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
 
-    qadd16      r3, r10, r11         ; a2 [a1+b1] [0 | 3]
-    qadd16      r5, r12, lr          ; b2 [c1+d1] [4 | 7]
-    qsub16      r7, r10, r11         ; c2 [a1-b1] [8 |11]
-    qsub16      r9, lr, r12          ; d2 [d1-c1] [12|15]
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
 
-    qadd16      r10, r2, r8          ; a1 [1+13  |  2+14]
-    qadd16      r11, r4, r6          ; b1 [5+9   |  6+10]
-    qsub16      r12, r4, r6          ; c1 [5-9   |  6-10]
-    qsub16      lr, r2, r8           ; d1 [1-13  |  2-14]
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
 
-    qadd16      r2, r10, r11         ; a2 [a1+b1] [1 | 2]
-    qadd16      r4, r12, lr          ; b2 [c1+d1] [5 | 6]
-    qsub16      r6, r10, r11         ; c2 [a1-b1] [9 |10]
-    qsub16      r8, lr, r12          ; d2 [d1-c1] [13|14]
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
 
-    ; [a-d]2 += ([a-d]2 > 0)
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
 
-    asrs        r10, r3, #16
-    addpl       r10, r10, #1         ; [~0]
-    asrs        r11, r2, #16
-    addpl       r11, r11, #1         ; [~1]
-    lsl         r11, r11, #15        ; [1  |  x]
-    pkhtb       r10, r11, r10, asr #1; [1  |  0]
-    str         r10, [r1], #4
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
 
-    lsls        r11, r2, #16
-    addpl       r11, r11, #0x10000   ; [~2]
-    lsls        r12, r3, #16
-    addpl       r12, r12, #0x10000   ; [~3]
-    asr         r12, r12, #1         ; [3  |  x]
-    pkhtb       r11, r12, r11, asr #17; [3  |  2]
-    str         r11, [r1], #4
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
 
-    asrs        r2, r5, #16
-    addpl       r2, r2, #1           ; [~4]
-    asrs        r3, r4, #16
-    addpl       r3, r3, #1           ; [~5]
-    lsl         r3, r3, #15          ; [5  |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [5  |  4]
-    str         r2, [r1], #4
+    addmi       r0, r0, #1          ; += a2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
 
-    lsls        r2, r4, #16
-    addpl       r2, r2, #0x10000     ; [~6]
-    lsls        r3, r5, #16
-    addpl       r3, r3, #0x10000     ; [~7]
-    asr         r3, r3, #1           ; [7  |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [7  |  6]
-    str         r2, [r1], #4
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
 
-    asrs        r2, r7, #16
-    addpl       r2, r2, #1           ; [~8]
-    asrs        r3, r6, #16
-    addpl       r3, r3, #1           ; [~9]
-    lsl         r3, r3, #15          ; [9  |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [9  |  8]
-    str         r2, [r1], #4
+    addmi       r0, r0, #1          ; += a2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
 
-    lsls        r2, r6, #16
-    addpl       r2, r2, #0x10000     ; [~10]
-    lsls        r3, r7, #16
-    addpl       r3, r3, #0x10000     ; [~11]
-    asr         r3, r3, #1           ; [11 |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [11 | 10]
-    str         r2, [r1], #4
 
-    asrs        r2, r9, #16
-    addpl       r2, r2, #1           ; [~12]
-    asrs        r3, r8, #16
-    addpl       r3, r3, #1           ; [~13]
-    lsl         r3, r3, #15          ; [13 |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [13 | 12]
-    str         r2, [r1], #4
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
 
-    lsls        r2, r8, #16
-    addpl       r2, r2, #0x10000     ; [~14]
-    lsls        r3, r9, #16
-    addpl       r3, r3, #0x10000     ; [~15]
-    asr         r3, r3, #1           ; [15 |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [15 | 14]
-    str         r2, [r1]
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
 
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
     ldmia       sp!, {r4 - r11, pc}
     ENDP        ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+    DCD         0x00040004
 
     END
--- a/vp8/encoder/arm/dct_arm.c
+++ b/vp8/encoder/arm/dct_arm.c
@@ -13,12 +13,10 @@
 
 #if HAVE_ARMV6
 
-void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
 {
-    vp8_fast_fdct4x4_armv6(input,   output,    pitch);
-    vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_armv6(input,   output,    pitch);
+    vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
 }
 
 #endif /* HAVE_ARMV6 */
-
-
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -14,18 +14,24 @@
 
 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct8x4_armv6);
+extern prototype_fdct(vp8_short_fdct4x4_armv6);
+extern prototype_fdct(vp8_short_fdct8x4_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
 
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6
+
 #undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6
 
 #undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6
 #endif
 
 #endif /* HAVE_ARMV6 */
@@ -45,10 +51,10 @@
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
 
 #undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon
 
 #undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon
 
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
--- a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+++ /dev/null
@@ -1,124 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_fdct4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct4x4_neon| PROC
-    vld1.16         {d2}, [r0], r2              ;load input
-    ldr             r12, _ffdct_coeff_
-    vld1.16         {d3}, [r0], r2
-    vld1.16         {d4}, [r0], r2
-    vld1.16         {d0}, [r12]
-    vld1.16         {d5}, [r0], r2
-
-    ;First for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
-    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
-    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
-    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
-    vshl.i16        q3, q3, #1              ; a1, b1
-    vshl.i16        q4, q4, #1              ; c1, d1
-
-    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
-    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
-
-    vqdmulh.s16     q6, q5, d0[1]
-    vqdmulh.s16     q8, q4, d0[0]
-    vqdmulh.s16     q7, q4, d0[2]
-
-    vshr.s16        q6, q6, #1
-    vshr.s16        q8, q8, #1
-    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
-    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
-    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2
-
-    ;Second for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
-    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
-    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
-
-    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
-    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
-
-
-    vqdmulh.s16     q6, q5, d0[1]
-    vqdmulh.s16     q8, q4, d0[0]
-    vqdmulh.s16     q7, q4, d0[2]
-
-    vshr.s16        q6, q6, #1
-    vshr.s16        q8, q8, #1
-    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
-    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
-    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2
-
-    vclt.s16        q3, q1, #0
-    vclt.s16        q4, q2, #0
-
-    vsub.s16        q1, q1, q3
-    vsub.s16        q2, q2, q4
-
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
-
-    vst1.16         {q1, q2}, [r1]
-
-    bx              lr
-
-    ENDP
-
-;-----------------
-
-_ffdct_coeff_
-    DCD     ffdct_coeff
-ffdct_coeff
-; 60547 =  0xEC83
-; 46341 =  0xB505
-; 25080 =  0x61F8
-    DCD     0xB505EC83, 0x000061F8
-
-    END
--- a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+++ /dev/null
@@ -1,177 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct8x4_neon| PROC
-    vld1.16         {q1}, [r0], r2              ;load input
-    ldr             r12, _ffdct8_coeff_
-    vld1.16         {q2}, [r0], r2
-    vld1.16         {q3}, [r0], r2
-    vld1.16         {d0}, [r12]
-    vld1.16         {q4}, [r0], r2
-
-    ;First for-loop
-    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
-    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
-    vtrn.32         d2, d6
-    vtrn.32         d3, d7
-    vtrn.32         d4, d8
-    vtrn.32         d5, d9
-    vtrn.16         d2, d4
-    vtrn.16         d3, d5
-    vtrn.16         d6, d8
-    vtrn.16         d7, d9
-
-    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
-    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
-    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
-    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
-    vadd.s16        d22, d3, d9
-    vadd.s16        d23, d5, d7
-    vsub.s16        d24, d5, d7
-    vsub.s16        d25, d3, d9
-
-    vshl.i16        q5, q5, #1              ; a1, b1
-    vshl.i16        q6, q6, #1              ; c1, d1
-    vshl.i16        q1, q11, #1
-    vshl.i16        q2, q12, #1
-
-    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
-    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
-    vadd.s16        d24, d2, d3
-    vsub.s16        d25, d2, d3
-
-    vqdmulh.s16     q8, q7, d0[1]
-    vqdmulh.s16     q13, q12, d0[1]
-    vqdmulh.s16     q10, q6, d0[0]
-    vqdmulh.s16     q15, q2, d0[0]
-    vqdmulh.s16     q9, q6, d0[2]
-    vqdmulh.s16     q14, q2, d0[2]
-
-    vshr.s16        q8, q8, #1
-    vshr.s16        q13, q13, #1
-    vshr.s16        q10, q10, #1
-    vshr.s16        q15, q15, #1
-    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
-    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
-    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
-    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
-    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
-    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
-    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2
-
-    ;Second for-loop
-    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
-    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
-    vtrn.32         d2, d6
-    vtrn.32         d3, d7
-    vtrn.32         d4, d8
-    vtrn.32         d5, d9
-    vtrn.16         d2, d4
-    vtrn.16         d3, d5
-    vtrn.16         d6, d8
-    vtrn.16         d7, d9
-
-    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
-    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
-    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
-    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
-    vadd.s16        d2, d3, d9
-    vadd.s16        d4, d5, d7
-    vsub.s16        d24, d5, d7
-    vsub.s16        d25, d3, d9
-
-    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
-    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
-    vadd.s16        d22, d2, d4
-    vsub.s16        d23, d2, d4
-
-    vqdmulh.s16     q8, q7, d0[1]
-    vqdmulh.s16     q13, q11, d0[1]
-    vqdmulh.s16     q10, q6, d0[0]
-    vqdmulh.s16     q15, q12, d0[0]
-    vqdmulh.s16     q9, q6, d0[2]
-    vqdmulh.s16     q14, q12, d0[2]
-
-    vshr.s16        q8, q8, #1
-    vshr.s16        q13, q13, #1
-    vshr.s16        q10, q10, #1
-    vshr.s16        q15, q15, #1
-    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
-    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
-    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
-    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
-    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
-    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
-    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2
-
-    vclt.s16        q5, q1, #0
-    vclt.s16        q6, q2, #0
-    vclt.s16        q7, q3, #0
-    vclt.s16        q8, q4, #0
-
-    vsub.s16        q1, q1, q5
-    vsub.s16        q2, q2, q6
-    vsub.s16        q3, q3, q7
-    vsub.s16        q4, q4, q8
-
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vst1.16         {q1, q2}, [r1]!
-    vst1.16         {q3, q4}, [r1]
-
-    bx              lr
-
-    ENDP
-
-;-----------------
-
-_ffdct8_coeff_
-    DCD     ffdct8_coeff
-ffdct8_coeff
-; 60547 =  0xEC83
-; 46341 =  0xB505
-; 25080 =  0x61F8
-    DCD     0xB505EC83, 0x000061F8
-
-    END
--- /dev/null
+++ b/vp8/encoder/arm/neon/picklpf_arm.c
@@ -1,0 +1,50 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+    unsigned char *src_y, *dst_y;
+    int yheight;
+    int ystride;
+    int border;
+    int yoffset;
+    int linestocopy;
+
+    border   = src_ybc->border;
+    yheight  = src_ybc->y_height;
+    ystride  = src_ybc->y_stride;
+
+    linestocopy = (yheight >> (Fraction + 4));
+
+    if (linestocopy < 1)
+        linestocopy = 1;
+
+    linestocopy <<= 4;
+
+    yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+    src_y = src_ybc->y_buffer + yoffset;
+    dst_y = dst_ybc->y_buffer + yoffset;
+
+    //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+    vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16)));
+}
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -11,134 +11,211 @@
 
     EXPORT  |vp8_short_fdct4x4_neon|
     EXPORT  |vp8_short_fdct8x4_neon|
+
     ARM
     REQUIRE8
     PRESERVE8
 
+    AREA ||.text||, CODE, READONLY, ALIGN=4
 
-    AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; r0    short *input
-; r1    short *output
-; r2    int pitch
-; Input has a pitch, output is contiguous
+    ALIGN 16    ; enable use of @128 bit aligned loads
+coeff
+    DCW      5352,  5352,  5352, 5352
+    DCW      2217,  2217,  2217, 2217
+    DCD     14500, 14500, 14500, 14500
+    DCD      7500,  7500,  7500, 7500
+    DCD     12000, 12000, 12000, 12000
+    DCD     51000, 51000, 51000, 51000
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
 |vp8_short_fdct4x4_neon| PROC
-    ldr             r12, _dct_matrix_
-    vld1.16         d0, [r0], r2
-    vld1.16         d1, [r0], r2
-    vld1.16         d2, [r0], r2
-    vld1.16         d3, [r0]
-    vld1.16         {q2, q3}, [r12]
 
-;first stage
-    vmull.s16       q11, d4, d0[0]              ;i=0
-    vmull.s16       q12, d4, d1[0]              ;i=1
-    vmull.s16       q13, d4, d2[0]              ;i=2
-    vmull.s16       q14, d4, d3[0]              ;i=3
+    ; Part one
+    vld1.16         {d0}, [r0@64], r2
+    adr             r12, coeff
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {d2}, [r0@64], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {d3}, [r0@64], r2
 
-    vmlal.s16       q11, d5, d0[1]
-    vmlal.s16       q12, d5, d1[1]
-    vmlal.s16       q13, d5, d2[1]
-    vmlal.s16       q14, d5, d3[1]
+    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
 
-    vmlal.s16       q11, d6, d0[2]
-    vmlal.s16       q12, d6, d1[2]
-    vmlal.s16       q13, d6, d2[2]
-    vmlal.s16       q14, d6, d3[2]
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
+    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
+    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
 
-    vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
-    vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
-    vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
-    vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3
+    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
+    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
 
-    ; rounding
-    vrshrn.i32      d22, q11, #14
-    vrshrn.i32      d24, q12, #14
-    vrshrn.i32      d26, q13, #14
-    vrshrn.i32      d28, q14, #14
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
+    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
 
-;second stage
-    vmull.s16       q4, d22, d4[0]              ;i=0
-    vmull.s16       q5, d22, d4[1]              ;i=1
-    vmull.s16       q6, d22, d4[2]              ;i=2
-    vmull.s16       q7, d22, d4[3]              ;i=3
+    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
+    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
+    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
 
-    vmlal.s16       q4, d24, d5[0]
-    vmlal.s16       q5, d24, d5[1]
-    vmlal.s16       q6, d24, d5[2]
-    vmlal.s16       q7, d24, d5[3]
+    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
 
-    vmlal.s16       q4, d26, d6[0]
-    vmlal.s16       q5, d26, d6[1]
-    vmlal.s16       q6, d26, d6[2]
-    vmlal.s16       q7, d26, d6[3]
 
-    vmlal.s16       q4, d28, d7[0]              ;sumtemp for i=0
-    vmlal.s16       q5, d28, d7[1]              ;sumtemp for i=1
-    vmlal.s16       q6, d28, d7[2]              ;sumtemp for i=2
-    vmlal.s16       q7, d28, d7[3]              ;sumtemp for i=3
+    ; Part two
 
-    vrshr.s32       q0, q4, #16
-    vrshr.s32       q1, q5, #16
-    vrshr.s32       q2, q6, #16
-    vrshr.s32       q3, q7, #16
+    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
 
-    vmovn.i32       d0, q0
-    vmovn.i32       d1, q1
-    vmovn.i32       d2, q2
-    vmovn.i32       d3, q3
+    vmov.s16        d26, #7
 
-    vst1.16         {q0, q1}, [r1]
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
+    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
+    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
+    vadd.s16        d4, d4, d26     ; a1 + 7
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
 
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
+    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
+
+    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
+    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
+
+    vceq.s16        d4, d7, #0
+
+    vshr.s16        d0, d0, #4
+    vshr.s16        d2, d2, #4
+
+    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        d4, d4
+    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
+    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+    vst1.16         {q0, q1}, [r1@128]
+
     bx              lr
 
     ENDP
 
-; r0    short *input
-; r1    short *output
-; r2    int pitch
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
 |vp8_short_fdct8x4_neon| PROC
-    ; Store link register and input before calling
-    ;  first 4x4 fdct.  Do not need to worry about
-    ;  output or pitch because those pointers are not
-    ;  touched in the 4x4 fdct function
-    stmdb           sp!, {r0, lr}
 
-    bl              vp8_short_fdct4x4_neon
+    ; Part one
 
-    ldmia           sp!, {r0, lr}
+    vld1.16         {q0}, [r0@128], r2
+    adr             r12, coeff
+    vld1.16         {q1}, [r0@128], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {q2}, [r0@128], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {q3}, [r0@128], r2
 
-    ; Move to the next block of data.
-    add             r0, r0, #8
-    add             r1, r1, #32
+    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+    vtrn.32         q0, q2          ; [A0|B0]
+    vtrn.32         q1, q3          ; [A1|B1]
+    vtrn.16         q0, q1          ; [A2|B2]
+    vtrn.16         q2, q3          ; [A3|B3]
 
-    ; Second time through do not store off the
-    ;  link register, just return from the 4x4 fdtc
-    b               vp8_short_fdct4x4_neon
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
+    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
+    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
 
-    ; Should never get to this.
-    bx              lr
+    vshl.s16        q11, q11, #3    ; a1 << 3
+    vshl.s16        q12, q12, #3    ; b1 << 3
+    vshl.s16        q13, q13, #3    ; c1 << 3
+    vshl.s16        q14, q14, #3    ; d1 << 3
 
-    ENDP
+    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
+    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
 
-;-----------------
+    vmov.s16        q11, q9         ; 14500
+    vmov.s16        q12, q10        ; 7500
 
-_dct_matrix_
-    DCD     dct_matrix
-dct_matrix
-;   DCW     23170,  30274,  23170, 12540
-;   DCW     23170,  12540, -23170,-30274
-;   DCW     23170, -12540, -23170, 30274
-;   DCW     23170, -30274,  23170,-12540
-; 23170 =  0x5a82
-; -23170 =  0xa57e
-; 30274 =  0x7642
-; -30274 =  0x89be
-; 12540 =  0x30fc
-; -12540 = 0xcf04
-    DCD     0x76425a82, 0x30fc5a82
-    DCD     0x30fc5a82, 0x89bea57e
-    DCD     0xcf045a82, 0x7642a57e
-    DCD     0x89be5a82, 0xcf045a82
+    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
+    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
+    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
+    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
 
+    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
+    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
+    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
+
+    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+    vtrn.32         q0, q2          ; q0=[A0 | B0]
+    vtrn.32         q1, q3          ; q1=[A4 | B4]
+    vtrn.16         q0, q1          ; q2=[A8 | B8]
+    vtrn.16         q2, q3          ; q3=[A12|B12]
+
+    vmov.s16        q15, #7
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
+    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
+    vadd.s16        q11, q11, q15   ; a1 + 7
+    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
+
+    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
+    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
+
+    vmov.s16        q11, q9         ; 12000
+    vmov.s16        q12, q10        ; 51000
+
+    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d2, d2, #4      ; A[8] = (a1 - b1 + 7)>>4
+    vshr.s16        d6, d3, #4      ; B[8] = (a1 - b1 + 7)>>4
+
+
+    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
+    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
+    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
+    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
+
+    vceq.s16        q14, q14, #0
+
+    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
+    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        q14, q14
+
+    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
+
+    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
+
+    vst1.16         {q0, q1}, [r1@128]! ; block A
+    vst1.16         {q2, q3}, [r1@128]! ; block B
+
+    bx              lr
+
+    ENDP
+
     END
+
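For reference, the per-instruction comments in the rewritten |vp8_short_fdct4x4_neon| above spell out the arithmetic performed on each 4x4 block; the sketch below reconstructs that arithmetic as scalar C (the helper name short_fdct4x4_sketch is ours, purely illustrative). The 8x4 variant runs two such blocks side by side, one in the A lanes and one in the B lanes.

    /* Scalar sketch of the 4x4 forward DCT implied by the comments above:
     * constants 5352/2217, pass-one rounding 14500/7500 with >>12,
     * pass-two rounding 12000/51000 with >>16 plus the (d1 != 0) fixup. */
    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)            /* pass one: rows */
        {
            a1 = (ip[0] + ip[3]) << 3;
            b1 = (ip[1] + ip[2]) << 3;
            c1 = (ip[1] - ip[2]) << 3;
            d1 = (ip[0] - ip[3]) << 3;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

            ip += pitch / 2;               /* pitch is in bytes */
            op += 4;
        }

        ip = output;
        op = output;

        for (i = 0; i < 4; i++)            /* pass two: columns */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0] = (a1 + b1 + 7) >> 4;
            op[8] = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }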
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -16,58 +16,85 @@
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
-
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0   short *input,
+; r1   short *output,
+; r2   int pitch
 |vp8_short_walsh4x4_neon| PROC
-    vld1.16         {d2}, [r0], r2              ;load input
-    vld1.16         {d3}, [r0], r2
-    vld1.16         {d4}, [r0], r2
-    vld1.16         {d5}, [r0], r2
 
+    vld1.16         {d0}, [r0@64], r2   ; load input
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {d2}, [r0@64], r2
+    vld1.16         {d3}, [r0@64]
+
     ;First for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+
+    vmov.s32        q15, #3             ; add 3 to all values
+
+    vtrn.16         d0, d1
     vtrn.16         d2, d3
-    vtrn.16         d4, d5
 
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[3]
-    vadd.s16        d7, d3, d4              ;b1 = ip[1]+ip[2]
-    vsub.s16        d8, d3, d4              ;c1 = ip[1]-ip[2]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[3]
+    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
+    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
+    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
+    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
 
-    vadd.s16        d2, d6, d7             ;op[0] = a1 + b1
-    vsub.s16        d4, d6, d7             ;op[2] = a1 - b1
-    vadd.s16        d3, d8, d9             ;op[1] = c1 + d1
-    vsub.s16        d5, d9, d8             ;op[3] = d1 - c1
+    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
+    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
+    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
+    vceq.s16        d16, d4, #0         ; a1 == 0
+    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
 
+    vadd.s16        d0, d4, d5          ; a1 + d1
+    vmvn            d16, d16            ; a1 != 0
+    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
+    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
+    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
+    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
+
     ;Second for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
+    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d1, d3
+    vtrn.32         d0, d2
     vtrn.16         d2, d3
-    vtrn.16         d4, d5
+    vtrn.16         d0, d1
 
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
-    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
-    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
+    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
+    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
+    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
 
-    vadd.s16        d2, d6, d7              ;a2 = a1 + b1;
-    vsub.s16        d4, d6, d7              ;c2 = a1 - b1;
-    vadd.s16        d3, d8, d9              ;b2 = c1 + d1;
-    vsub.s16        d5, d9, d8              ;d2 = d1 - c1;
+    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
+    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
+    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
+    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
 
-    vcgt.s16        q3, q1, #0
-    vcgt.s16        q4, q2, #0
+    vclt.s32        q8, q0, #0
+    vclt.s32        q9, q1, #0
+    vclt.s32        q10, q2, #0
+    vclt.s32        q11, q3, #0
 
-    vsub.s16        q1, q1, q3
-    vsub.s16        q2, q2, q4
+    ; subtract -1 (or 0)
+    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
+    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
+    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
+    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
 
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
+    vadd.s32        q8, q0, q15         ; a2 + 3
+    vadd.s32        q9, q1, q15         ; b2 + 3
+    vadd.s32        q10, q2, q15        ; c2 + 3
+    vadd.s32        q11, q3, q15        ; d2 + 3
 
-    vst1.16         {q1, q2}, [r1]
+    ; vrshrn? would add 1 << (3-1) = 4; we only want to add 3
+    vshrn.s32       d0, q8, #3
+    vshrn.s32       d1, q9, #3
+    vshrn.s32       d2, q10, #3
+    vshrn.s32       d3, q11, #3
+
+    vst1.16         {q0, q1}, [r1@128]
 
     bx              lr
 
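Similarly, the reworked |vp8_short_walsh4x4_neon| above now scales its inputs by << 2 and keeps 32-bit intermediates through the second pass. A scalar sketch reconstructed from its comments (again, the helper name is ours and the sketch is illustrative rather than authoritative):

    static void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        int a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)            /* pass one: rows */
        {
            a1 = (ip[0] + ip[2]) << 2;
            d1 = (ip[1] + ip[3]) << 2;
            c1 = (ip[1] - ip[3]) << 2;
            b1 = (ip[0] - ip[2]) << 2;

            op[0] = a1 + d1 + (a1 != 0);
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;

            ip += pitch / 2;               /* pitch is in bytes */
            op += 4;
        }

        ip = output;
        op = output;

        for (i = 0; i < 4; i++)            /* pass two: columns, 32-bit math */
        {
            a1 = ip[0] + ip[8];
            d1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            b1 = ip[0] - ip[8];

            a2 = a1 + d1;
            b2 = b1 + c1;
            c2 = b1 - c1;
            d2 = a1 - d1;

            a2 += a2 < 0;                  /* nudge negatives up before >> 3 */
            b2 += b2 < 0;
            c2 += c2 < 0;
            d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;

            ip++;
            op++;
        }
    }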
--- a/vp8/encoder/arm/picklpf_arm.c
+++ /dev/null
@@ -1,50 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vp8/encoder/quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-
-extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-
-
-void
-vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
-{
-    unsigned char *src_y, *dst_y;
-    int yheight;
-    int ystride;
-    int border;
-    int yoffset;
-    int linestocopy;
-
-    border   = src_ybc->border;
-    yheight  = src_ybc->y_height;
-    ystride  = src_ybc->y_stride;
-
-    linestocopy = (yheight >> (Fraction + 4));
-
-    if (linestocopy < 1)
-        linestocopy = 1;
-
-    linestocopy <<= 4;
-
-    yoffset  = ystride * ((yheight >> 5) * 16 - 8);
-    src_y = src_ybc->y_buffer + yoffset;
-    dst_y = dst_ybc->y_buffer + yoffset;
-
-    //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
-    vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16)));
-}
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -18,7 +18,6 @@
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.h
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/encodemb_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.h
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.c
@@ -36,7 +35,7 @@
 #File list for armv6
 # encoder
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
@@ -49,9 +48,8 @@
 
 #File list for neon
 # encoder
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastfdct4x4_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastfdct8x4_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/shortfdct_neon$(ASM)
--- /dev/null
+++ b/vpx_scale/arm/neon/yv12extend_arm.c
@@ -1,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+
+void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+
+void
+vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
+    //printf("Border:%d; plane_stride:%d; plane_height:%d; plane_width:%d\n",dst_ybc->border,dst_ybc->y_stride,dst_ybc->y_height,dst_ybc->y_width);
+
+    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
+}
--- a/vpx_scale/arm/yv12extend_arm.c
+++ /dev/null
@@ -1,25 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpxscale.h"
-
-void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-
-void
-vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-    vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
-    //printf("Border:%d; plane_stride:%d; plane_height:%d; plane_width:%d\n",dst_ybc->border,dst_ybc->y_stride,dst_ybc->y_height,dst_ybc->y_width);
-
-    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
-}
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -11,7 +11,6 @@
 
 #arm
 SCALE_SRCS-$(HAVE_ARMV7)         += arm/scalesystemdependent.c
-SCALE_SRCS-$(HAVE_ARMV7)         += arm/yv12extend_arm.c
 SCALE_SRCS_REMOVE-$(HAVE_ARMV7)  += generic/scalesystemdependent.c
 
 #neon
@@ -19,5 +18,6 @@
 SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copyframeyonly_neon$(ASM)
 SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
 SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/yv12extend_arm.c
 
 SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)