shithub: libvpx

Download patch

ref: b71962fdc98c3fb9fdf95d74191452cd51cbc2b5
parent: e81e30c25de3a75e8b02b38d55c65134874f83aa
author: Timothy B. Terriberry <tterribe@xiph.org>
date: Wed Oct 20 11:39:11 EDT 2010

Add runtime CPU detection support for ARM.

The primary goal is to allow a binary to be built which supports
 NEON, but can fall back to non-NEON routines, since some Android
 devices do not have NEON, even if they are otherwise ARMv7 (e.g.,
 Tegra).
The configure-generated flags HAVE_ARMV7, etc., are used to decide
 which versions of each function to build, and when
 CONFIG_RUNTIME_CPU_DETECT is enabled, the correct version is chosen
 at run time.
In order for this to work, the CFLAGS must be set to something
 appropriate (e.g., without -mfpu=neon for ARMv7, and with
 appropriate -march and -mcpu for even earlier configurations), or
 the native C code will not be able to run.
The ASFLAGS must remain set for the most advanced instruction set
 required at build time, since the ARM assembler will refuse to emit
 them otherwise.
I have not attempted to make any changes to configure to do this
 automatically.
Doing so will probably require the addition of new configure options.

Many of the hooks for RTCD on ARM were already there, but a lot of
 the code had bit-rotted, and a good deal of the ARM-specific code
 is not integrated into the RTCD structs at all.
I did not try to resolve the latter, merely to add the minimal amount
 of protection around them to allow RTCD to work.
Those functions that were called based on an ifdef at the calling
 site were expanded to check the RTCD flags at that site, but they
 should be added to an RTCD struct somewhere in the future.
The functions invoked with global function pointers still are, but
 these should be moved into an RTCD struct for thread safety (I
 believe every platform currently supported has atomic pointer
 stores, but this is not guaranteed).

The encoder's boolhuff functions did not even have _c and armv7
 suffixes, and the correct version was resolved at link time.
The token packing functions did have appropriate suffixes, but the
 version was selected with a define, with no associated RTCD struct.
However, for both of these, the only armv7 instruction they actually
 used was rbit, and this was completely superfluous, so I reworked
 them to avoid it.
The only non-ARMv4 instruction remaining in them is clz, which is
 ARMv5 (not even ARMv5TE is required).
Considering that there are no ARM-specific configs which are not at
 least ARMv5TE, I did not try to detect these at runtime, and simply
 enable them for ARMv5 and above.

Finally, the NEON register saving code was completely non-reentrant,
 since it saved the registers to a global, static variable.
I moved the storage for this onto the stack.
A single binary built with this code was tested on an ARM11 (ARMv6)
 and a Cortex A8 (ARMv7 w/NEON), for both the encoder and decoder,
 and produced identical output, while using the correct accelerated
 functions on each.
I did not test on any earlier processors.

Change-Id: I45cbd63a614f4554c3b325c45d46c0806f009eaa

diff: cannot open b/vp8/encoder/arm/armv5te//null: file does not exist: 'b/vp8/encoder/arm/armv5te//null'
--- a/libs.mk
+++ b/libs.mk
@@ -93,6 +93,7 @@
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
 endif
+CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
 CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
--- /dev/null
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -1,0 +1,134 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "g_common.h"
+#include "pragmas.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
+
+extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+
+void vp8_arch_arm_common_init(VP8_COMMON *ctx)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
+    int flags = arm_cpu_caps();
+    int has_edsp = flags & HAS_EDSP;
+    int has_media = flags & HAS_MEDIA;
+    int has_neon = flags & HAS_NEON;
+    rtcd->flags = flags;
+
+    /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV6
+    if (has_media)
+    {
+        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
+        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
+        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_armv6;
+        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_armv6;
+        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
+        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_armv6;
+        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_armv6;
+        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;
+
+        rtcd->idct.idct1        = vp8_short_idct4x4llm_1_v6;
+        rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
+        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_v6;
+        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_v6;
+
+        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
+        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
+        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
+        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
+
+        rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
+        rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
+        rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
+        rtcd->recon.recon       = vp8_recon_b_armv6;
+        rtcd->recon.recon2      = vp8_recon2b_armv6;
+        rtcd->recon.recon4      = vp8_recon4b_armv6;
+    }
+#endif
+
+#if HAVE_ARMV7
+    if (has_neon)
+    {
+        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
+        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
+        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_neon;
+        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_neon;
+        rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
+        rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_neon;
+        rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_neon;
+        rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;
+
+        rtcd->idct.idct1        = vp8_short_idct4x4llm_1_neon;
+        rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
+        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;
+        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;
+
+        rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
+        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_neon;
+        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
+        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_neon;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
+        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_neon;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
+        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_neon;
+
+        rtcd->recon.copy16x16   = vp8_copy_mem16x16_neon;
+        rtcd->recon.copy8x8     = vp8_copy_mem8x8_neon;
+        rtcd->recon.copy8x4     = vp8_copy_mem8x4_neon;
+        rtcd->recon.recon       = vp8_recon_b_neon;
+        rtcd->recon.recon2      = vp8_recon2b_neon;
+        rtcd->recon.recon4      = vp8_recon4b_neon;
+    }
+#endif
+
+#endif
+
+#if HAVE_ARMV6
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (has_media)
+#endif
+    {
+        vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
+        vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
+    }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (has_neon)
+#endif
+    {
+        vp8_build_intra_predictors_mby_ptr =
+         vp8_build_intra_predictors_mby_neon;
+        vp8_build_intra_predictors_mby_s_ptr =
+         vp8_build_intra_predictors_mby_s_neon;
+    }
+#endif
+}
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -19,6 +19,7 @@
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
 extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_idct_idct1
 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
 
@@ -34,6 +35,7 @@
 #undef  vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_idct(vp8_short_idct4x4llm_1_neon);
@@ -42,6 +44,7 @@
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
 extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_idct_idct1
 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
 
@@ -56,6 +59,7 @@
 
 #undef  vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
+#endif
 #endif
 
 #endif
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -22,6 +22,7 @@
 extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
 #define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
 
@@ -46,6 +47,7 @@
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
@@ -57,6 +59,7 @@
 extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
 #define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
 
@@ -80,6 +83,7 @@
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
+#endif
 #endif
 
 #endif
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -21,6 +21,7 @@
 extern prototype_copy_block(vp8_copy_mem8x4_v6);
 extern prototype_copy_block(vp8_copy_mem16x16_v6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon
 #define vp8_recon_recon vp8_recon_b_armv6
 
@@ -39,6 +40,7 @@
 #undef  vp8_recon_copy16x16
 #define vp8_recon_copy16x16 vp8_copy_mem16x16_v6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_recon_block(vp8_recon_b_neon);
@@ -49,6 +51,7 @@
 extern prototype_copy_block(vp8_copy_mem8x4_neon);
 extern prototype_copy_block(vp8_copy_mem16x16_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon
 #define vp8_recon_recon vp8_recon_b_neon
 
@@ -66,6 +69,7 @@
 
 #undef  vp8_recon_copy16x16
 #define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
+#endif
 #endif
 
 #endif
--- a/vp8/common/arm/subpixel_arm.h
+++ b/vp8/common/arm/subpixel_arm.h
@@ -22,6 +22,7 @@
 extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6);
 extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_subpix_sixtap16x16
 #define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6
 
@@ -46,6 +47,7 @@
 #undef  vp8_subpix_bilinear4x4
 #define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon);
@@ -57,6 +59,7 @@
 extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon);
 extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_subpix_sixtap16x16
 #define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon
 
@@ -80,6 +83,7 @@
 
 #undef  vp8_subpix_bilinear4x4
 #define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon
+#endif
 #endif
 
 #endif
--- a/vp8/common/arm/systemdependent.c
+++ /dev/null
@@ -1,149 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "g_common.h"
-#include "pragmas.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-
-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
-
-void vp8_machine_specific_config(VP8_COMMON *ctx)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
-
-#if HAVE_ARMV7
-    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
-    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
-    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_neon;
-    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_neon;
-    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
-    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_neon;
-    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_neon;
-    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;
-
-    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_neon;
-    rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
-    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;
-    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;
-
-    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
-    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_neon;
-    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
-    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_neon;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
-    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_neon;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
-    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_neon;
-
-    rtcd->recon.copy16x16   = vp8_copy_mem16x16_neon;
-    rtcd->recon.copy8x8     = vp8_copy_mem8x8_neon;
-    rtcd->recon.copy8x4     = vp8_copy_mem8x4_neon;
-    rtcd->recon.recon       = vp8_recon_b_neon;
-    rtcd->recon.recon2      = vp8_recon2b_neon;
-    rtcd->recon.recon4      = vp8_recon4b_neon;
-#elif HAVE_ARMV6
-
-    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
-    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
-    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_armv6;
-    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_armv6;
-    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
-    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_armv6;
-    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_armv6;
-    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;
-
-    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_v6;
-    rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
-    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_armv6;
-    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_armv6;
-
-    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
-    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
-    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
-    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
-    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
-    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
-
-    rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
-    rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
-    rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
-    rtcd->recon.recon       = vp8_recon_b_armv6;
-    rtcd->recon.recon2      = vp8_recon2b_armv6;
-    rtcd->recon.recon4      = vp8_recon4b_armv6;
-#else
-//pure c
-    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_c;
-    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
-    rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
-    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
-    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_c;
-
-    rtcd->recon.copy16x16   = vp8_copy_mem16x16_c;
-    rtcd->recon.copy8x8     = vp8_copy_mem8x8_c;
-    rtcd->recon.copy8x4     = vp8_copy_mem8x4_c;
-    rtcd->recon.recon      = vp8_recon_b_c;
-    rtcd->recon.recon2      = vp8_recon2b_c;
-    rtcd->recon.recon4     = vp8_recon4b_c;
-
-    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
-    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
-    rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_c;
-    rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_c;
-    rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
-    rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_c;
-    rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_c;
-    rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_c;
-
-    rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
-    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
-    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
-    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
-    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
-    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
-#endif
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
-    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
-    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
-    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
-    rtcd->postproc.addnoise    = vp8_plane_add_noise_c;
-#endif
-#endif
-
-#if HAVE_ARMV7
-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon;
-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s_neon;
-#elif HAVE_ARMV6
-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-#else
-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-
-#endif
-
-}
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -18,6 +18,7 @@
 #include "onyxc_int.h"
 
 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
+extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
 
 void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
 extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
@@ -75,6 +76,10 @@
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_common_init(ctx);
+#endif
+
+#if ARCH_ARM
+    vp8_arch_arm_common_init(ctx);
 #endif
 
 }
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -74,6 +74,7 @@
     vp8_subpix_rtcd_vtable_t      subpix;
     vp8_loopfilter_rtcd_vtable_t  loopfilter;
     vp8_postproc_rtcd_vtable_t    postproc;
+    int                           flags;
 #else
     int unused;
 #endif
--- /dev/null
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -1,0 +1,66 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "blockd.h"
+#include "pragmas.h"
+#include "postproc.h"
+#include "dboolhuff.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    int flags = pbi->common.rtcd.flags;
+    int has_edsp = flags & HAS_EDSP;
+    int has_media = flags & HAS_MEDIA;
+    int has_neon = flags & HAS_NEON;
+
+#if HAVE_ARMV6
+    if (has_media)
+    {
+        pbi->dequant.block               = vp8_dequantize_b_v6;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_v6;
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
+#if 0 //For use with RTCD, when implemented
+        pbi->dboolhuff.start             = vp8dx_start_decode_c;
+        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
+        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
+        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
+#endif
+    }
+#endif
+
+#if HAVE_ARMV7
+    if (has_neon)
+    {
+        pbi->dequant.block               = vp8_dequantize_b_neon;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;
+        /*This is not used: NEON always dequants two blocks at once.
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_neon;*/
+        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
+        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
+        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
+#if 0 //For use with RTCD, when implemented
+        pbi->dboolhuff.start             = vp8dx_start_decode_c;
+        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
+        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
+        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
+#endif
+    }
+#endif
+#endif
+}
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -20,6 +20,7 @@
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_v6
 
@@ -38,6 +39,7 @@
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_dequant_block(vp8_dequantize_b_neon);
@@ -47,6 +49,7 @@
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_neon
 
@@ -64,6 +67,7 @@
 
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+#endif
 #endif
 
 #endif
--- a/vp8/decoder/arm/dsystemdependent.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "pragmas.h"
-#include "postproc.h"
-#include "dboolhuff.h"
-#include "dequantize.h"
-#include "onyxd_int.h"
-
-void vp8_dmachine_specific_config(VP8D_COMP *pbi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd         = &pbi->common.rtcd;
-#if HAVE_ARMV7
-    pbi->dequant.block   = vp8_dequantize_b_neon;
-    pbi->dboolhuff.start = vp8dx_start_decode_c;
-    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-
-#elif HAVE_ARMV6
-    pbi->dequant.block   = vp8_dequantize_b_v6;
-    pbi->dboolhuff.start = vp8dx_start_decode_c;
-    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
-#endif
-}
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -14,6 +14,7 @@
 #include "onyxd_int.h"
 
 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
 
 void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 {
@@ -36,5 +37,9 @@
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_decode_init(pbi);
+#endif
+
+#if ARCH_ARM
+    vp8_arch_arm_decode_init(pbi);
 #endif
 }
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -30,6 +30,9 @@
 #include "systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
 
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
@@ -224,7 +227,6 @@
 #if HAVE_ARMV7
 extern void vp8_push_neon(INT64 *store);
 extern void vp8_pop_neon(INT64 *store);
-static INT64 dx_store_reg[8];
 #endif
 
 static int get_free_fb (VP8_COMMON *cm)
@@ -312,6 +314,9 @@
 
 int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, INT64 time_stamp)
 {
+#if HAVE_ARMV7
+    INT64 dx_store_reg[8];
+#endif
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
     VP8_COMMON *cm = &pbi->common;
     int retcode = 0;
@@ -327,10 +332,27 @@
 
     pbi->common.error.error_code = VPX_CODEC_OK;
 
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+        vp8_push_neon(dx_store_reg);
+    }
+#endif
+
     cm->new_fb_idx = get_free_fb (cm);
 
     if (setjmp(pbi->common.error.jmp))
     {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
+#endif
+        {
+            vp8_pop_neon(dx_store_reg);
+        }
+#endif
         pbi->common.error.setjmp = 0;
         if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
           cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
@@ -339,10 +361,6 @@
 
     pbi->common.error.setjmp = 1;
 
-#if HAVE_ARMV7
-    vp8_push_neon(dx_store_reg);
-#endif
-
     vpx_usec_timer_start(&timer);
 
     //cm->current_video_frame++;
@@ -354,8 +372,13 @@
     if (retcode < 0)
     {
 #if HAVE_ARMV7
-        vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_pop_neon(dx_store_reg);
+        }
+#endif
         pbi->common.error.error_code = VPX_CODEC_ERROR;
         pbi->common.error.setjmp = 0;
         if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
@@ -367,6 +390,14 @@
     {
         if (swap_frame_buffers (cm))
         {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
+#endif
+            {
+                vp8_pop_neon(dx_store_reg);
+            }
+#endif
             pbi->common.error.error_code = VPX_CODEC_ERROR;
             pbi->common.error.setjmp = 0;
             return -1;
@@ -375,6 +406,14 @@
     {
         if (swap_frame_buffers (cm))
         {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
+#endif
+            {
+                vp8_pop_neon(dx_store_reg);
+            }
+#endif
             pbi->common.error.error_code = VPX_CODEC_ERROR;
             pbi->common.error.setjmp = 0;
             return -1;
@@ -455,7 +494,12 @@
 #endif
 
 #if HAVE_ARMV7
-    vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+        vp8_pop_neon(dx_store_reg);
+    }
 #endif
     pbi->common.error.setjmp = 0;
     return retcode;
--- /dev/null
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -1,0 +1,136 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
/* Populate the encoder's RTCD (run-time CPU detection) function tables with
 * ARM-optimized implementations, according to the capability flags detected
 * at startup.  Entries left commented out have no ARM implementation hooked
 * up yet and keep the portable C defaults installed by the generic
 * machine-specific config code.
 */
void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = cpi->common.rtcd.flags;
    int has_edsp = flags & HAS_EDSP;   /* ARMv5TE DSP extensions (currently unused here) */
    int has_media = flags & HAS_MEDIA; /* ARMv6 media/SIMD instructions */
    int has_neon = flags & HAS_NEON;   /* ARMv7 NEON */

#if HAVE_ARMV6
    if (has_media)
    {
        /* Only the Walsh transform has an ARMv6 implementation wired in;
         * everything else below remains on the C defaults. */
        /*cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;*/

        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
        cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;*/

        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;*/

        /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/

        /*cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/

        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;*/
        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;

        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;*/

        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;*/
    }
#endif

#if HAVE_ARMV7
    if (has_neon)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_neon;
        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_neon;
        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_neon;
        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_neon;
        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_neon;

        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_neon;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_neon;
        cpi->rtcd.variance.var16x8               = vp8_variance16x8_neon;
        cpi->rtcd.variance.var16x16              = vp8_variance16x16_neon;

        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_neon;
        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_neon;

        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;
        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/

        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_neon;
        /*cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;*/
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;

        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_neon;
        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_neon;
        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;

        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_neon;
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;

        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;*/
        /* The neon quantizer has not been updated to match the new exact
         * quantizer introduced in commit e04e2935
         */
        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;*/
    }
#endif

    /* This global function pointer is not part of the RTCD structs yet;
     * see the commit message -- it should eventually move there for
     * thread safety. */
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
    if (has_neon)
#endif
    {
        vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
    }
#endif
/* NOTE(review): this final #endif closes the CONFIG_RUNTIME_CPU_DETECT block
 * opened at the top of the function, so the entire body -- including the
 * global pointer assignment above -- compiles away in non-RTCD builds.
 * Presumably static builds bind the optimized functions at compile time
 * instead; confirm against the non-RTCD configuration. */
#endif
}
--- /dev/null
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_start_encode|
+    EXPORT |vp8_encode_bool|
+    EXPORT |vp8_stop_encode|
+    EXPORT |vp8_encode_value|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
; Initialize the boolean entropy coder state and attach the output buffer.
; r0 BOOL_CODER *br
; r1 unsigned char *source
|vp8_start_encode| PROC
    mov     r12, #0                     ; lowvalue = value = pos = 0
    mov     r3,  #255                   ; initial range = 255
    mvn     r2,  #23                    ; count = ~23 = -24
    str     r12, [r0, #vp8_writer_lowvalue]
    str     r3,  [r0, #vp8_writer_range]
    str     r12, [r0, #vp8_writer_value]
    str     r2,  [r0, #vp8_writer_count]
    str     r12, [r0, #vp8_writer_pos]
    str     r1,  [r0, #vp8_writer_buffer]
    bx      lr
    ENDP
+
; Encode a single boolean with the given probability, renormalizing the
; range and writing completed bytes into the coder's output buffer.
; r0 BOOL_CODER *br
; r1 int bit
; r2 int probability
|vp8_encode_bool| PROC
    push    {r4-r9, lr}

    mov     r4, r2

    ldr     r2, [r0, #vp8_writer_lowvalue]
    ldr     r5, [r0, #vp8_writer_range]
    ldr     r3, [r0, #vp8_writer_count]

    sub     r7, r5, #1                  ; range-1

    cmp     r1, #0
    mul     r4, r4, r7                  ; ((range-1) * probability)

    mov     r7, #1
    add     r4, r7, r4, lsr #8          ; split = 1 + (((range-1) * probability) >> 8)

    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
    subne   r4, r5, r4                  ; if  (bit) range = range-split

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set

    ; Carry propagation: walk back over any 0xff bytes already written,
    ; zeroing them, then increment the first non-0xff byte.
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start
token_zero_while_loop
    mov     r9, #0
    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ldrb    r1, [r7, r4]
    cmpge   r1, #0xff
    beq     token_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r9, [r7, r4]                ; w->buffer[x]
    add     r9, r9, #1
    strb    r9, [r7, r4]                ; w->buffer[x] + 1
token_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r9, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r1, r4, #1                  ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r1, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8
    strb    r7, [r9, r4]                ; w->buffer[w->pos++]

token_count_lt_zero
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    str     r2, [r0, #vp8_writer_lowvalue]
    str     r5, [r0, #vp8_writer_range]
    str     r3, [r0, #vp8_writer_count]
    pop     {r4-r9, pc}
    ENDP
+
; Flush the boolean coder: emit 32 trailing bits, each coded with a split of
; ((range-1)*128)>>8 (probability one half), forcing all buffered state out.
; r0 BOOL_CODER *br
|vp8_stop_encode| PROC
    push    {r4-r10, lr}

    ldr     r2, [r0, #vp8_writer_lowvalue]
    ldr     r5, [r0, #vp8_writer_range]
    ldr     r3, [r0, #vp8_writer_count]

    mov     r10, #32                    ; loop counter: 32 flush bits

stop_encode_loop
    sub     r7, r5, #1                  ; range-1

    mov     r4, r7, lsl #7              ; ((range-1) * 128)

    mov     r7, #1
    add     r4, r7, r4, lsr #8          ; split = 1 + (((range-1) * 128) >> 8)

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero_se      ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set_se

    ; Carry propagation into already-written bytes (see vp8_encode_bool).
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start_se
token_zero_while_loop_se
    mov     r9, #0
    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start_se
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ldrb    r1, [r7, r4]
    cmpge   r1, #0xff
    beq     token_zero_while_loop_se

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r9, [r7, r4]                ; w->buffer[x]
    add     r9, r9, #1
    strb    r9, [r7, r4]                ; w->buffer[x] + 1
token_high_bit_not_set_se
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r9, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r1, r4, #1                  ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r1, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8
    strb    r7, [r9, r4]                ; w->buffer[w->pos++]

token_count_lt_zero_se
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r10, r10, #1
    bne     stop_encode_loop

    str     r2, [r0, #vp8_writer_lowvalue]
    str     r5, [r0, #vp8_writer_range]
    str     r3, [r0, #vp8_writer_count]
    pop     {r4-r10, pc}

    ENDP
+
; Encode the low 'bits' bits of 'data' through the boolean coder, MSB first,
; each bit coded with a split of ((range-1)*128)>>8 (probability one half).
; r0 BOOL_CODER *br
; r1 int data
; r2 int bits
|vp8_encode_value| PROC
    push    {r4-r11, lr}

    mov     r10, r2                     ; n = bits (loop counter)

    ldr     r2, [r0, #vp8_writer_lowvalue]
    ldr     r5, [r0, #vp8_writer_range]
    ldr     r3, [r0, #vp8_writer_count]

    rsb     r4, r10, #32                 ; 32-n

    ; v is kept in r1 during the token pack loop
    lsl     r1, r1, r4                  ; r1 = v << 32 - n

encode_value_loop
    sub     r7, r5, #1                  ; range-1

    ; Decisions are made based on the bit value shifted
    ; off of v, so set a flag here based on this.
    ; This value is referred to as "bb"
    lsls    r1, r1, #1                  ; bit = v >> n
    mov     r4, r7, lsl #7              ; ((range-1) * 128)

    mov     r7, #1
    add     r4, r7, r4, lsr #8          ; split = 1 + (((range-1) * 128) >> 8)

    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bit) range = range-split

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero_ev      ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set_ev

    ; Carry propagation into already-written bytes (see vp8_encode_bool).
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start_ev
token_zero_while_loop_ev
    mov     r9, #0
    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start_ev
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff
    beq     token_zero_while_loop_ev

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r9, [r7, r4]                ; w->buffer[x]
    add     r9, r9, #1
    strb    r9, [r7, r4]                ; w->buffer[x] + 1
token_high_bit_not_set_ev
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r9, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8
    strb    r7, [r9, r4]                ; w->buffer[w->pos++]

token_count_lt_zero_ev
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r10, r10, #1
    bne     encode_value_loop

    str     r2, [r0, #vp8_writer_lowvalue]
    str     r5, [r0, #vp8_writer_range]
    str     r3, [r0, #vp8_writer_count]
    pop     {r4-r11, pc}
    ENDP
+
+    END
--- /dev/null
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -1,0 +1,293 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_armv5|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
; Pack an array of xcount TOKENEXTRA entries through the boolean entropy
; coder: for each token, walk its coding tree emitting tree bits, then any
; extra bits, then the sign bit.
; r0 vp8_writer *w
; r1 const TOKENEXTRA *p
; r2 int xcount
; r3 vp8_coef_encodings
; s0 vp8_extra_bits
; s1 vp8_coef_tree
|vp8cx_pack_tokens_armv5| PROC
    push    {r4-r11, lr}

    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
    ;  sizeof (TOKENEXTRA) is 20
    add     r2, r2, r2, lsl #2          ; xcount * 5
    sub     sp, sp, #12                 ; locals: [sp,#0]=stop, [sp,#4]=b->tree, [sp,#8]=vp8_coef_encodings
    add     r2, r1, r2, lsl #2          ; stop = p + xcount
    str     r2, [sp, #0]
    str     r3, [sp, #8]                ; save vp8_coef_encodings
    ldr     r2, [r0, #vp8_writer_lowvalue]
    ldr     r5, [r0, #vp8_writer_range]
    ldr     r3, [r0, #vp8_writer_count]
    b       check_p_lt_stop

while_p_lt_stop
    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #8]                ; vp8_coef_encodings
    mov     lr, #0                      ; i = 0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

    ldr     r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n

    ; vp8 specific skip_eob_node
    cmp     r7, #0
    movne   lr, #2                      ; i = 2
    subne   r8, r8, #1                  ; --n

    rsb     r4, r8, #32                 ; 32-n
    ldr     r10, [sp, #52]              ; vp8_coef_tree

    ; v is kept in r12 during the token pack loop
    lsl     r12, r6, r4                ; r12 = v << 32 - n

; loop start
token_loop
    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
    sub     r7, r5, #1                  ; range-1

    ; Decisions are made based on the bit value shifted
    ; off of v, so set a flag here based on this.
    ; This value is referred to as "bb"
    lsls    r12, r12, #1                ; bb = v >> n
    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))

    ; bb can only be 0 or 1.  So only execute this statement
    ; if bb == 1, otherwise it will act like i + 0
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)

    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set

    ; Carry propagation: zero trailing 0xff bytes, then bump the first
    ; non-0xff byte already in the buffer.
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start
token_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff
    beq     token_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]               ; w->buffer[x]
    add     r10, r10, #1
    strb    r10, [r7, r4]               ; w->buffer[x] + 1
token_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8
    strb    r7, [r10, r4]               ; w->buffer[w->pos++]

    ; r10 is used earlier in the loop, but r10 is used as
    ; temp variable here.  So after r10 is used, reload
    ; vp8_coef_tree_dcd into r10
    ldr     r10, [sp, #52]              ; vp8_coef_tree

token_count_lt_zero
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r8, r8, #1                  ; --n
    bne     token_loop

    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #48]               ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
    ;  element.  sizeof (vp8_extra_bit_struct) is 20
    add     r6, r6, r6, lsl #2          ; t * 5
    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
    beq     skip_extra_bits

;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
    asr     r7, lr, #1                  ; v=e>>1

    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
    str     r10, [sp, #4]               ; b->tree

    rsb     r4, r8, #32                 ; 32-L
    lsl     r12, r7, r4                 ; v << (32-L)

    mov     lr, #0                      ; i = 0

extra_bits_loop
    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
    sub     r7, r5, #1                  ; range-1
    lsls    r12, r12, #1                ; v >> n
    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)

    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     extra_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset= shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     extra_high_bit_not_set

    ; Carry propagation (same scheme as the token loop above).
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos - 1
    b       extra_zero_while_start
extra_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
extra_zero_while_start
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff
    beq     extra_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]
    add     r10, r10, #1
    strb    r10, [r7, r4]
extra_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8
    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
    ldr     r10, [sp, #4]               ; b->tree
extra_count_lt_zero
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r8, r8, #1                  ; --n
    bne     extra_bits_loop             ; while (n)

no_extra_bits
    ; Emit the sign bit (e & 1) with probability one half.
    ; NOTE(review): raw offset 4 assumed to equal tokenextra_extra -- confirm
    ldr     lr, [r1, #4]                ; e = p->Extra
    add     r4, r5, #1                  ; range + 1
    tst     lr, #1
    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
    addne   r2, r2, r4                  ; lowvalue += split
    subne   r4, r5, r4                  ; range = range-split
    tst     r2, #0x80000000             ; lowvalue & 0x80000000
    lsl     r5, r4, #1                  ; range <<= 1
    beq     end_high_bit_not_set

    ; Carry propagation for the sign bit.
    ldr     r4, [r0, #vp8_writer_pos]
    mov     r7, #0
    sub     r4, r4, #1                  ; x = w->pos - 1
    b       end_zero_while_start
end_zero_while_loop
    strb    r7, [r6, r4]                ; w->buffer[x] = 0
    sub     r4, r4, #1                  ; x--
end_zero_while_start
    cmp     r4, #0
    ldrge   r6, [r0, #vp8_writer_buffer]
    ldrb    r12, [r6, r4]
    cmpge   r12, #0xff
    beq     end_zero_while_loop

    ldr     r6, [r0, #vp8_writer_buffer]
    ldrb    r7, [r6, r4]
    add     r7, r7, #1                  ; w->buffer[x] + 1
    strb    r7, [r6, r4]
end_high_bit_not_set
    adds    r3, r3, #1                  ; ++count
    lsl     r2, r2, #1                  ; lowvalue  <<= 1
    bne     end_count_zero

    ; count reached zero: flush one byte and reset count to -8.
    ldr     r4, [r0, #vp8_writer_pos]
    mvn     r3, #7                      ; count = ~7 = -8
    ldr     r7, [r0, #vp8_writer_buffer]
    lsr     r6, r2, #24                 ; lowvalue >> 24
    add     r12, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    ; NOTE(review): raw offset 0x10 assumed to equal vp8_writer_pos -- confirm
    str     r12, [r0, #0x10]
    strb    r6, [r7, r4]
end_count_zero
skip_extra_bits
    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
check_p_lt_stop
    ldr     r4, [sp, #0]                ; stop
    cmp     r1, r4                      ; while( p < stop)
    bcc     while_p_lt_stop

    str     r2, [r0, #vp8_writer_lowvalue]
    str     r5, [r0, #vp8_writer_range]
    str     r3, [r0, #vp8_writer_count]
    add     sp, sp, #12
    pop     {r4-r11, pc}
    ENDP
+
+    END
--- /dev/null
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -1,0 +1,328 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+; Packs the tokens of every macroblock row into the writer |w| with the
+; VP8 boolean arithmetic encoder.  The writer state (lowvalue/range/count)
+; is kept in r2/r5/r3 for the whole frame and written back to the struct
+; on exit.  Stack layout: [sp,#0]=stop, [sp,#8]=vp8_extra_bits,
+; [sp,#12]=mb_rows counter, [sp,#16]=tokenlist ptr, [sp,#20]=coef_encodings.
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r2, [sp, #20]               ; save vp8_coef_encodings
+    str     r5, [sp, #12]               ; save mb_rows
+    str     r3, [sp, #8]                ; save vp8_extra_bits
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+
+    mov     r0, r1                      ; keep same as other loops
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #20]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldr     r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #60]              ; vp8_coef_tree (first stack argument)
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    ; (mul/mov/ldrsb/add below do not alter the carry set by lsls,
+    ;  so the later addcs/subcs pair reuses the same bb flag)
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree_dcd into r10
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #8]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  Here vp8_extra_bit_struct == 20
+    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra (reload; lr was clobbered in the loop)
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #vp8_writer_pos]  ; store updated w->pos (was hard-coded #0x10)
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, #1
+    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
+    str     r6, [sp, #12]
+    bne     mb_row_loop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    add     sp, sp, #24
+    pop     {r4-r11, pc}
+    ENDP
+
+; Literal pool for the struct-offset constants used above.  The symbols
+; are presumably emitted by the included vpx_vp8_enc_asm_offsets.asm
+; (generated from the C struct layouts) -- verify against that file.
+; Each DCD holds a byte offset added to a struct base pointer, not an
+; absolute address (see the "add r4, r0, r4" uses above).
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+
+    END
--- /dev/null
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -1,0 +1,464 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *,
+
+; Packs the frame's tokens into |num_part| data partitions.  Macroblock
+; rows are striped across partitions (partition i takes rows i,
+; i+num_part, ...).  Each partition is terminated by flushing 32 zero
+; bits (stop_encode_loop), and a 3-byte little-endian size prefix is
+; written for every partition except the last.  Stack layout:
+; [sp,#0]=stop, [sp,#8]=*size, [sp,#12]=row counter, [sp,#16]=tokenlist,
+; [sp,#20]=num_part, [sp,#24]=cx_data, [sp,#28]=i, [sp,#32]=tp_list,
+; [sp,#36]=mb_rows, [sp,#40]=output ptr.
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #44
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r5, [sp, #36]               ; save mb_rows
+    str     r1, [sp, #24]               ; save cx_data
+    str     r2, [sp, #20]               ; save num_part
+    str     r3, [sp, #8]                ; save *size
+
+    ; *size = 3*(num_part -1 );
+    sub     r2, r2, #1                  ; num_part - 1
+    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
+    str     r2, [r3]
+
+    add     r2, r2, r1                  ; cx_data + *size
+    str     r2, [sp, #40]               ; ptr
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+    str     r7, [sp, #32]               ; store start of cpi->tp_list
+
+    ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
+    add     r0, r0, r11
+
+    mov     r11, #0
+    str     r11, [sp, #28]              ; i
+
+numparts_loop
+    ldr     r10, [sp, #40]              ; ptr
+    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    str     r5,  [sp, #12]
+
+    ; Reset all of the VP8 Writer data for each partition that
+    ; is processed.
+    ; start_encode
+    mov     r2, #0                      ; vp8_writer_lowvalue
+    mov     r5, #255                    ; vp8_writer_range
+    mvn     r3, #23                     ; vp8_writer_count
+
+    str     r2,  [r0, #vp8_writer_value]
+    str     r2,  [r0, #vp8_writer_pos]
+    str     r10, [r0, #vp8_writer_buffer]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #80]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldr     r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    ; (mul/mov/ldrsb/add below do not alter the carry set by lsls,
+    ;  so the later addcs/subcs pair reuses the same bb flag)
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree_dcd into r10
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #84]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  Here vp8_extra_bit_struct == 20
+    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra (reload; lr was clobbered in the loop)
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #vp8_writer_pos]  ; store updated w->pos (was hard-coded #0x10)
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r10, [sp, #20]              ; num_parts
+    mov     r1, #TOKENLIST_SZ
+    mul     r1, r10, r1
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, r10                 ; mb_rows -= num_part (rows are striped across partitions)
+    add     r7, r7, r1                  ; next element in the array
+    str     r6, [sp, #12]
+    bgt     mb_row_loop
+
+    mov     r12, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r12, r12, #1
+    bne     stop_encode_loop
+
+    ldr     r10, [sp, #8]               ; *size
+    ldr     r11, [r10]
+    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
+    add     r11, r11, r4                ; *size += w->pos
+    str     r11, [r10]
+
+    ldr     r9, [sp, #20]               ; num_parts
+    sub     r9, r9, #1
+    ldr     r10, [sp, #28]              ; i
+    cmp     r10, r9                     ; if(i<(num_part - 1))
+    bge     skip_write_partition
+
+    ldr     r12, [sp, #40]              ; ptr
+    add     r12, r12, r4                ; ptr += w->pos
+    str     r12, [sp, #40]
+
+    ; Write the 3-byte little-endian partition size into cx_data.
+    ldr     r9, [sp, #24]               ; cx_data
+    mov     r8, r4, asr #8
+    strb    r4, [r9, #0]
+    strb    r8, [r9, #1]
+    mov     r4, r4, asr #16
+    strb    r4, [r9, #2]
+
+    add     r9, r9, #3                  ; cx_data += 3
+    str     r9, [sp, #24]
+
+skip_write_partition
+
+    ldr     r11, [sp, #28]              ; i
+    ldr     r10, [sp, #20]              ; num_parts
+
+    add     r11, r11, #1                ; i++
+    str     r11, [sp, #28]
+
+    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
+    mov     r1, #TOKENLIST_SZ
+    add     r7, r7, r1                  ; next element in cpi->tp_list
+    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
+
+    cmp     r10, r11
+    bgt     numparts_loop
+
+
+    add     sp, sp, #44
+    pop     {r4-r11, pc}
+    ENDP
+
+; Literal pool for the struct-offset constants used above.  The symbols
+; are presumably emitted by the included vpx_vp8_enc_asm_offsets.asm
+; (generated from the C struct layouts) -- verify against that file.
+; Each DCD holds a byte offset added to a struct base pointer, not an
+; absolute address (e.g. "add r0, r0, r11" to reach cpi->bc2).
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+_VP8_COMP_bc2_
+    DCD     vp8_comp_bc2
+
+    END
--- a/vp8/encoder/arm/csystemdependent.c
+++ /dev/null
@@ -1,164 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "variance.h"
-#include "onyx_int.h"
-
-void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-
-void vp8_cmachine_specific_config(VP8_COMP *cpi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    cpi->rtcd.common                         = &cpi->common.rtcd;
-
-#if HAVE_ARMV7
-    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_neon;
-    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_neon;
-    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_neon;
-    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_neon;
-    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_neon;
-
-    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-    cpi->rtcd.variance.var8x8                = vp8_variance8x8_neon;
-    cpi->rtcd.variance.var8x16               = vp8_variance8x16_neon;
-    cpi->rtcd.variance.var16x8               = vp8_variance16x8_neon;
-    cpi->rtcd.variance.var16x16              = vp8_variance16x16_neon;
-
-    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_neon;
-    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_neon;
-
-    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;
-    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
-
-    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_neon;
-    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
-    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;
-
-    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
-    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
-    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_neon;
-    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_neon;
-    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;
-
-    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_neon;
-    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
-    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;
-
-    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-    /* The neon quantizer has not been updated to match the new exact
-     * quantizer introduced in commit e04e2935
-     */
-    /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;*/
-#elif HAVE_ARMV6
-    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
-    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
-    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
-    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
-    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
-
-    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
-    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
-
-    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
-    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
-
-    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
-    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
-
-    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
-    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
-    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
-
-    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
-    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;
-    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
-
-    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
-    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
-    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
-
-    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-#else
-    //pure c
-    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
-    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
-    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
-    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
-    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
-
-    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
-    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
-
-    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
-    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
-
-    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
-    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
-
-    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
-    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
-    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
-
-    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
-    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;
-    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
-
-    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
-    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
-    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
-
-    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-#endif
-#endif
-
-#if HAVE_ARMV7
-    vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
-#else
-    vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-#endif
-}
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -15,9 +15,11 @@
 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
 #endif
+#endif
 
 #if HAVE_ARMV7
 extern prototype_fdct(vp8_short_fdct4x4_neon);
@@ -26,6 +28,7 @@
 extern prototype_fdct(vp8_fast_fdct8x4_neon);
 extern prototype_fdct(vp8_short_walsh4x4_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
 
@@ -40,6 +43,7 @@
 
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
+#endif
 
 #endif
 
--- a/vp8/encoder/arm/encodemb_arm.h
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -30,6 +30,7 @@
 //#undef  vp8_encodemb_mbuverr
 //#define vp8_encodemb_mbuverr vp8_mbuverror_c
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_encodemb_subb
 #define vp8_encodemb_subb vp8_subtract_b_neon
 
@@ -38,6 +39,7 @@
 
 #undef  vp8_encodemb_submbuv
 #define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
+#endif
 
 #endif
 
--- a/vp8/encoder/arm/neon/boolhuff_armv7.asm
+++ /dev/null
@@ -1,293 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_start_encode|
-    EXPORT |vp8_encode_bool|
-    EXPORT |vp8_stop_encode|
-    EXPORT |vp8_encode_value|
-
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-
-|vp8_start_encode| PROC
-    mov     r12, #0
-    mov     r3,  #255
-    mvn     r2,  #23
-    str     r12, [r0, #vp8_writer_lowvalue]
-    str     r3,  [r0, #vp8_writer_range]
-    str     r12, [r0, #vp8_writer_value]
-    str     r2,  [r0, #vp8_writer_count]
-    str     r12, [r0, #vp8_writer_pos]
-    str     r1,  [r0, #vp8_writer_buffer]
-    bx      lr
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp8_encode_bool| PROC
-    push    {r4-r9, lr}
-
-    mov     r4, r2
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    sub     r7, r5, #1                  ; range-1
-
-    cmp     r1, #0
-    mul     r4, r4, r7                  ; ((range-1) * probability)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * probability) >> 8)
-
-    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subne   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r9, pc}
-    ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
-    push    {r4-r10, lr}
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    mov     r10, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     stop_encode_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r10, pc}
-
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
-    push    {r4-r11, lr}
-
-    mov     r10, r2
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    ; reverse the stream of bits to be packed.  Normally
-    ; the most significant bit is peeled off and compared
-    ; in the form of (v >> --n) & 1.  ARM architecture has
-    ; the ability to set a flag based on the value of the
-    ; bit shifted off the bottom of the register.  To make
-    ; that happen the bitstream is reversed.
-    rbit    r11, r1
-    rsb     r4, r10, #32                 ; 32-n
-
-    ; v is kept in r1 during the token pack loop
-    lsr     r1, r11, r4                 ; v >>= 32 - n
-
-encode_value_loop
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is refered to as "bb"
-    lsrs    r1, r1, #1                  ; bit = v >> n
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_ev      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_ev
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_ev
-token_zero_while_loop_ev
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_ev
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_ev
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_ev
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     encode_value_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
--- a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
+++ /dev/null
@@ -1,301 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_armv7|
-
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 vp8_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv7| PROC
-    push    {r4-r11, lr}
-
-    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
-    ;  sizeof (TOKENEXTRA) is 20
-    add     r2, r2, r2, lsl #2          ; xcount
-    sub     sp, sp, #12
-    add     r2, r1, r2, lsl #2          ; stop = p + xcount
-    str     r2, [sp, #0]
-    str     r3, [sp, #8]                ; save vp8_coef_encodings
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-    b       check_p_lt_stop
-
-while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #8]                ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldr     r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    ; reverse the stream of bits to be packed.  Normally
-    ; the most significant bit is peeled off and compared
-    ; in the form of (v >> --n) & 1.  ARM architecture has
-    ; the ability to set a flag based on the value of the
-    ; bit shifted off the bottom of the register.  To make
-    ; that happen the bitstream is reversed.
-    rbit    r12, r6
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsr     r12, r12, r4                ; v >>= 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is refered to as "bb"
-    lsrs    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but r10 is used as
-    ; temp variable here.  So after r10 is used, reload
-    ; vp8_coef_tree_dcd into r10
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #48]               ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rbit    r12, r7                     ; reverse v
-    rsb     r4, r8, #32
-    lsr     r12, r12, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsrs    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    add     sp, sp, #12
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
--- a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
+++ /dev/null
@@ -1,336 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_mb_row_tokens_armv7|
-
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 vp8_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv7| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #24
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r2, [sp, #20]               ; save vp8_coef_encodings
-    str     r5, [sp, #12]               ; save mb_rows
-    str     r3, [sp, #8]                ; save vp8_extra_bits
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-
-    mov     r0, r1                      ; keep same as other loops
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actuall work gets done here!
-
-while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #20]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldr     r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    ; reverse the stream of bits to be packed.  Normally
-    ; the most significant bit is peeled off and compared
-    ; in the form of (v >> --n) & 1.  ARM architecture has
-    ; the ability to set a flag based on the value of the
-    ; bit shifted off the bottom of the register.  To make
-    ; that happen the bitstream is reversed.
-    rbit    r12, r6
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsr     r12, r12, r4                ; v >>= 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is refered to as "bb"
-    lsrs    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but r10 is used as
-    ; temp variable here.  So after r10 is used, reload
-    ; vp8_coef_tree_dcd into r10
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #8]                ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rbit    r12, r7                     ; reverse v
-    rsb     r4, r8, #32
-    lsr     r12, r12, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsrs    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, #1
-    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
-    str     r6, [sp, #12]
-    bne     mb_row_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    add     sp, sp, #24
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-
-    END
--- a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
+++ /dev/null
@@ -1,472 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_into_partitions_armv7|
-
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *,
-
-|vp8cx_pack_tokens_into_partitions_armv7| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #44
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r5, [sp, #36]               ; save mb_rows
-    str     r1, [sp, #24]               ; save cx_data
-    str     r2, [sp, #20]               ; save num_part
-    str     r3, [sp, #8]                ; save *size
-
-    ; *size = 3*(num_part -1 );
-    sub     r2, r2, #1                  ; num_part - 1
-    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
-    str     r2, [r3]
-
-    add     r2, r2, r1                  ; cx_data + *size
-    str     r2, [sp, #40]               ; ptr
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-    str     r7, [sp, #32]               ; store start of cpi->tp_list
-
-    ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
-    add     r0, r0, r11
-
-    mov     r11, #0
-    str     r11, [sp, #28]              ; i
-
-numparts_loop
-    ldr     r10, [sp, #40]              ; ptr
-    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
-    str     r5,  [sp, #12]
-
-    ; Reset all of the VP8 Writer data for each partition that
-    ; is processed.
-    ; start_encode
-    mov     r2, #0                      ; vp8_writer_lowvalue
-    mov     r5, #255                    ; vp8_writer_range
-    mvn     r3, #23                     ; vp8_writer_count
-
-    str     r2,  [r0, #vp8_writer_value]
-    str     r2,  [r0, #vp8_writer_pos]
-    str     r10, [r0, #vp8_writer_buffer]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #80]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldr     r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    ; reverse the stream of bits to be packed.  Normally
-    ; the most significant bit is peeled off and compared
-    ; in the form of (v >> --n) & 1.  ARM architecture has
-    ; the ability to set a flag based on the value of the
-    ; bit shifted off the bottom of the register.  To make
-    ; that happen the bitstream is reversed.
-    rbit    r12, r6
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsr     r12, r12, r4                ; v >>= 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is refered to as "bb"
-    lsrs    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but r10 is used as
-    ; temp variable here.  So after r10 is used, reload
-    ; vp8_coef_tree_dcd into r10
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldr     r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #84]                ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rbit    r12, r7                     ; reverse v
-    rsb     r4, r8, #32
-    lsr     r12, r12, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsrs    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r10, [sp, #20]              ; num_parts
-    mov     r1, #TOKENLIST_SZ
-    mul     r1, r10, r1
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, r10
-    add     r7, r7, r1                  ; next element in the array
-    str     r6, [sp, #12]
-    bgt     mb_row_loop
-
-    mov     r12, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r12, r12, #1
-    bne     stop_encode_loop
-
-    ldr     r10, [sp, #8]               ; *size
-    ldr     r11, [r10]
-    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
-    add     r11, r11, r4                ; *size += w->pos
-    str     r11, [r10]
-
-    ldr     r9, [sp, #20]               ; num_parts
-    sub     r9, r9, #1
-    ldr     r10, [sp, #28]              ; i
-    cmp     r10, r9                     ; if(i<(num_part - 1))
-    bge     skip_write_partition
-
-    ldr     r12, [sp, #40]              ; ptr
-    add     r12, r12, r4                ; ptr += w->pos
-    str     r12, [sp, #40]
-
-    ldr     r9, [sp, #24]               ; cx_data
-    mov     r8, r4, asr #8
-    strb    r4, [r9, #0]
-    strb    r8, [r9, #1]
-    mov     r4, r4, asr #16
-    strb    r4, [r9, #2]
-
-    add     r9, r9, #3                  ; cx_data += 3
-    str     r9, [sp, #24]
-
-skip_write_partition
-
-    ldr     r11, [sp, #28]              ; i
-    ldr     r10, [sp, #20]              ; num_parts
-
-    add     r11, r11, #1                ; i++
-    str     r11, [sp, #28]
-
-    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
-    mov     r1, #TOKENLIST_SZ
-    add     r7, r7, r1                  ; next element in cpi->tp_list
-    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
-
-    cmp     r10, r11
-    bgt     numparts_loop
-
-
-    add     sp, sp, #44
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-_VP8_COMP_bc2_
-    DCD     vp8_comp_bc2
-
-    END
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -38,6 +38,7 @@
 //extern prototype_variance2(vp8_get16x16var_c);
 extern prototype_sad(vp8_get4x4sse_cs_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_variance_sad4x4
 #define vp8_variance_sad4x4 vp8_sad4x4_neon
 
@@ -100,6 +101,7 @@
 
 #undef  vp8_variance_get4x4sse_cs
 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
+#endif
 
 #endif
 
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -12,25 +12,25 @@
 #ifndef __INC_BITSTREAM_H
 #define __INC_BITSTREAM_H
 
-#if HAVE_ARMV7
-void vp8cx_pack_tokens_armv7(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+#if HAVE_ARMV5TE
+void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
                              vp8_token *,
                              vp8_extra_bit_struct *,
                              const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv7(VP8_COMP *, unsigned char *, int , int *,
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *,
         vp8_token *,
         vp8_extra_bit_struct *,
         const vp8_tree_index *);
-void vp8cx_pack_mb_row_tokens_armv7(VP8_COMP *cpi, vp8_writer *w,
+void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
                                     vp8_token *,
                                     vp8_extra_bit_struct *,
                                     const vp8_tree_index *);
 # define pack_tokens(a,b,c)                  \
-    vp8cx_pack_tokens_armv7(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+    vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 # define pack_tokens_into_partitions(a,b,c,d)  \
-    vp8cx_pack_tokens_into_partitions_armv7(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+    vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 # define pack_mb_row_tokens(a,b)               \
-    vp8cx_pack_mb_row_tokens_armv7(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+    vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 #else
 # define pack_tokens(a,b,c)                  pack_tokens_c(a,b,c)
 # define pack_tokens_into_partitions(a,b,c,d)  pack_tokens_into_partitions_c(a,b,c,d)
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -15,6 +15,7 @@
 
 
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
+void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
 
 
 void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
@@ -92,6 +93,10 @@
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_encoder_init(cpi);
+#endif
+
+#if ARCH_ARM
+    vp8_arch_arm_encoder_init(cpi);
 #endif
 
 }
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -31,6 +31,9 @@
 #include "vpx_ports/vpx_timer.h"
 #include "vpxerrors.h"
 #include "temporal_filter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
 
 #include <math.h>
 #include <stdio.h>
@@ -2106,8 +2109,8 @@
     CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA)));
     CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
 
-    vp8_cmachine_specific_config(cpi);
     vp8_create_common(&cpi->common);
+    vp8_cmachine_specific_config(cpi);
 
     vp8_init_config((VP8_PTR)cpi, oxcf);
 
@@ -2852,10 +2855,21 @@
     {
         //vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
 #if HAVE_ARMV7
-        vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
-#else
-        vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
+        }
+#if CONFIG_RUNTIME_CPU_DETECT
+        else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+        {
+            vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+        }
+#endif
 
         cpi->Source = &cpi->scaled_source;
     }
@@ -4624,10 +4638,10 @@
 #if HAVE_ARMV7
 extern void vp8_push_neon(INT64 *store);
 extern void vp8_pop_neon(INT64 *store);
-static INT64 store_reg[8];
 #endif
 int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
 {
+    INT64 store_reg[8];
     VP8_COMP *cpi = (VP8_COMP *) ptr;
     VP8_COMMON *cm = &cpi->common;
     struct vpx_usec_timer  timer;
@@ -4636,8 +4650,13 @@
         return -1;
 
 #if HAVE_ARMV7
-    vp8_push_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
 #endif
+    {
+        vp8_push_neon(store_reg);
+    }
+#endif
 
     vpx_usec_timer_start(&timer);
 
@@ -4645,8 +4664,13 @@
     if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames)
     {
 #if HAVE_ARMV7
-        vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_pop_neon(store_reg);
+        }
+#endif
         return -1;
     }
 
@@ -4686,10 +4710,21 @@
         s->source_time_stamp = time_stamp;
         s->source_frame_flags = frame_flags;
 #if HAVE_ARMV7
-        vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
-#else
-        vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
+        }
+#if CONFIG_RUNTIME_CPU_DETECT
+        else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+        {
+            vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+        }
+#endif
         cpi->source_buffer_count = 1;
     }
 
@@ -4697,14 +4732,19 @@
     cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
 #if HAVE_ARMV7
-    vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
 #endif
+    {
+        vp8_pop_neon(store_reg);
+    }
+#endif
 
     return 0;
 }
 int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
 {
-
+    INT64 store_reg[8];
     VP8_COMP *cpi = (VP8_COMP *) ptr;
     VP8_COMMON *cm = &cpi->common;
     struct vpx_usec_timer  tsctimer;
@@ -4715,8 +4755,13 @@
         return -1;
 
 #if HAVE_ARMV7
-    vp8_push_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
 #endif
+    {
+        vp8_push_neon(store_reg);
+    }
+#endif
 
     vpx_usec_timer_start(&cmptimer);
 
@@ -4867,8 +4912,13 @@
 #endif
 
 #if HAVE_ARMV7
-        vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_pop_neon(store_reg);
+        }
+#endif
         return -1;
     }
 
@@ -4910,8 +4960,13 @@
     if (!cpi)
     {
 #if HAVE_ARMV7
-        vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+        if (cm->rtcd.flags & HAS_NEON)
 #endif
+        {
+            vp8_pop_neon(store_reg);
+        }
+#endif
         return 0;
     }
 
@@ -5099,7 +5154,12 @@
 #endif
 
 #if HAVE_ARMV7
-    vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+        vp8_pop_neon(store_reg);
+    }
 #endif
 
     return 0;
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -16,6 +16,9 @@
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "alloccommon.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
 
 extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
 extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val, int sharpness_lvl);
@@ -306,10 +309,21 @@
 
     //  Make a copy of the unfiltered / processed recon buffer
 #if HAVE_ARMV7
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
-#else
-    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
 #endif
+    {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+    }
+#if CONFIG_RUNTIME_CPU_DETECT
+    else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+    {
+        vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+    }
+#endif
 
     if (cm->frame_type == KEY_FRAME)
         cm->sharpness_level = 0;
@@ -343,10 +357,21 @@
 
     //  Re-instate the unfiltered frame
 #if HAVE_ARMV7
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
-    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
 #endif
+    {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+    }
+#if CONFIG_RUNTIME_CPU_DETECT
+    else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+    {
+        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+    }
+#endif
 
     while (filter_step > 0)
     {
@@ -372,10 +397,21 @@
 
             //  Re-instate the unfiltered frame
 #if HAVE_ARMV7
-            vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
-            vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
 #endif
+            {
+                vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+            }
+#if CONFIG_RUNTIME_CPU_DETECT
+            else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+            {
+                vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+            }
+#endif
 
             // If value is close to the best so far then bias towards a lower loop filter value.
             if ((filt_err - Bias) < best_err)
@@ -401,9 +437,20 @@
 
             //  Re-instate the unfiltered frame
 #if HAVE_ARMV7
-            vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
-            vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+            if (cm->rtcd.flags & HAS_NEON)
+#endif
+            {
+                vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+            }
+#if CONFIG_RUNTIME_CPU_DETECT
+            else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+            {
+                vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+            }
 #endif
 
             // Was it better than the previous best?
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -112,6 +112,8 @@
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
 
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
+
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
@@ -119,14 +121,7 @@
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/recon_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra4x4_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/systemdependent.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/vpx_asm_offsets.c
-
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6)  += common/filter_c.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6)  += common/idctllm.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6)  += common/recon.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6)  += common/reconintra4x4.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6)  += common/generic/systemdependent.c
 
 # common (armv6)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -13,18 +13,23 @@
 
 #File list for arm
 # encoder
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/csystemdependent.c
+VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
 
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/boolhuff_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/mcomp_arm.c
 
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV6)  += encoder/generic/csystemdependent.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV7)  += encoder/boolhuff.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV7)  += encoder/mcomp.c
+VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
 
+#File list for armv5te
+# encoder
+VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
+
 #File list for armv6
 # encoder
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
@@ -44,10 +49,6 @@
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_packtokens_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_packtokens_mbrow_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_packtokens_partitions_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/boolhuff_armv7$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/vpx_vp8_enc_asm_offsets.c
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -11,11 +11,9 @@
 
 #VP8_DX_SRCS list is modified according to different platforms.
 
+VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/arm_dsystemdependent.c
+
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/dsystemdependent.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6)  += decoder/generic/dsystemdependent.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6)  += decoder/dequantize.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6)  += decoder/idct_blk.c
 VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK)  += decoder/arm/detokenize$(ASM)
 
 #File list for armv6
--- /dev/null
+++ b/vpx_ports/arm.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_ARM_H
+#define VPX_PORTS_ARM_H
+#include <stdlib.h>
+#include "config.h"
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP  0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON  0x04
+
+int arm_cpu_caps(void);
+
+#endif
+
--- /dev/null
+++ b/vpx_ports/arm_cpudetect.c
@@ -1,0 +1,190 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "arm.h"
+
+static int arm_cpu_env_flags(int *flags)
+{
+    char *env;
+    env = getenv("VPX_SIMD_CAPS");
+    if (env && *env)
+    {
+        *flags = (int)strtol(env, NULL, 0);
+        return 0;
+    }
+    *flags = 0;
+    return -1;
+}
+
+static int arm_cpu_env_mask(void)
+{
+    char *env;
+    env = getenv("VPX_SIMD_CAPS_MASK");
+    return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+
+#if defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int arm_cpu_caps(void)
+{
+    int flags;
+    int mask;
+    if (!arm_cpu_env_flags(&flags))
+    {
+        return flags;
+    }
+    mask = arm_cpu_env_mask();
+    /* MSVC has no inline __asm support for ARM, but it does let you __emit
+     *  instructions via their assembled hex code.
+     * All of these instructions should be essentially nops.
+     */
+#if HAVE_ARMV5TE
+    if (mask & HAS_EDSP)
+    {
+        __try
+        {
+            /*PLD [r13]*/
+            __emit(0xF5DDF000);
+            flags |= HAS_EDSP;
+        }
+        __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+        {
+            /*Ignore exception.*/
+        }
+    }
+#if HAVE_ARMV6
+    if (mask & HAS_MEDIA) {
+        __try
+        {
+            /*SHADD8 r3,r3,r3*/
+            __emit(0xE6333F93);
+            flags |= HAS_MEDIA;
+        }
+        __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+        {
+            /*Ignore exception.*/
+        }
+    }
+#if HAVE_ARMV7
+    if (mask & HAS_NEON)
+    {
+        __try
+        {
+            /*VORR q0,q0,q0*/
+            __emit(0xF2200150);
+            flags |= HAS_NEON;
+        }
+        __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+        {
+            /*Ignore exception.*/
+        }
+    }
+#endif
+#endif
+#endif
+    return flags & mask;
+}
+
+#elif defined(__linux__)
+#include <stdio.h>
+
+int arm_cpu_caps(void)
+{
+    FILE *fin;
+    int flags;
+    int mask;
+    if (!arm_cpu_env_flags(&flags))
+    {
+        return flags;
+    }
+    mask = arm_cpu_env_mask();
+    /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+     *  on Android.
+     * This also means that detection will fail in Scratchbox.
+     */
+    fin = fopen("/proc/cpuinfo","r");
+    if(fin != NULL)
+    {
+        /* 512 should be enough for anybody (it's even enough for all the flags
+         * that x86 has accumulated... so far).
+         */
+        char buf[512];
+        while (fgets(buf, sizeof(buf), fin) != NULL)
+        {
+#if HAVE_ARMV5TE || HAVE_ARMV7
+            if (memcmp(buf, "Features", 8) == 0)
+            {
+                char *p;
+#if HAVE_ARMV5TE
+                p=strstr(buf, " edsp");
+                if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
+                {
+                    flags |= HAS_EDSP;
+                }
+#if HAVE_ARMV7
+                p = strstr(buf, " neon");
+                if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
+                {
+                    flags |= HAS_NEON;
+                }
+#endif
+#endif
+            }
+#endif
+#if HAVE_ARMV6
+            if (memcmp(buf, "CPU architecture:",17) == 0){
+                int version;
+                version = atoi(buf+17);
+                if (version >= 6)
+                {
+                    flags |= HAS_MEDIA;
+                }
+            }
+#endif
+        }
+        fclose(fin);
+    }
+    return flags & mask;
+}
+
+#elif !CONFIG_RUNTIME_CPU_DETECT
+
+int arm_cpu_caps(void)
+{
+    int flags;
+    int mask;
+    if (!arm_cpu_env_flags(&flags))
+    {
+        return flags;
+    }
+    mask = arm_cpu_env_mask();
+#if HAVE_ARMV5TE
+    flags |= HAS_EDSP;
+#endif
+#if HAVE_ARMV6
+    flags |= HAS_MEDIA;
+#endif
+#if HAVE_ARMV7
+    flags |= HAS_NEON;
+#endif
+    return flags & mask;
+}
+
+#else
+#error "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+ "available for your platform. Reconfigure without --enable-runtime-cpu-detect."
+#endif
--- a/vpx_scale/arm/scalesystemdependant.c
+++ b/vpx_scale/arm/scalesystemdependant.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
 #include "vpx_scale/vpxscale.h"
 
 
@@ -47,6 +48,9 @@
  ****************************************************************************/
 void vp8_scale_machine_specific_config()
 {
+#if HAVE_ARMV7 && CONFIG_RUNTIME_CPU_DETECT
+    int flags;
+#endif
     /*
     vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_armv4;
     vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_armv4;
@@ -73,14 +77,20 @@
     vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
     */
 
-#if HAVE_ARMV7
-    vp8_yv12_extend_frame_borders_ptr      = vp8_yv12_extend_frame_borders_neon;
-    vp8_yv12_copy_frame_yonly_ptr          = vp8_yv12_copy_frame_yonly_neon;
-    vp8_yv12_copy_frame_ptr               = vp8_yv12_copy_frame_neon;
-#else
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
     vp8_yv12_extend_frame_borders_ptr      = vp8_yv12_extend_frame_borders;
     vp8_yv12_copy_frame_yonly_ptr          = vp8_yv12_copy_frame_yonly;
     vp8_yv12_copy_frame_ptr           = vp8_yv12_copy_frame;
 #endif
-
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    flags = arm_cpu_caps();
+    if (flags & HAS_NEON)
+#endif
+    {
+        vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders_neon;
+        vp8_yv12_copy_frame_yonly_ptr     = vp8_yv12_copy_frame_yonly_neon;
+        vp8_yv12_copy_frame_ptr           = vp8_yv12_copy_frame_neon;
+    }
+#endif
 }