shithub: libvpx

Download patch

ref: df0d3a415216340a44953c6ed936bc2a4d7a1175
parent: f74e0229d5575af20498a9dbf48f1d305dfc8719
parent: c56ab7d0c6f3fb215d571db3dacc0cc908c1b53c
author: Sigrid Solveig Haflínudóttir <ftrvxmtrx@gmail.com>
date: Tue Oct 26 21:43:39 EDT 2021

Merge remote-tracking branch 'upstream/master'

--- a/.mailmap
+++ b/.mailmap
@@ -12,6 +12,8 @@
 Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Fyodor Kyslov <kyslov@google.com>
+Gregor Jasny <gjasny@gmail.com>
+Gregor Jasny <gjasny@gmail.com> <gjasny@googlemail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hui Su <huisu@google.com>
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,6 +3,7 @@
 
 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam B. Goode <adam.mckee84@gmail.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
 Aidan Welch <aidansw@yahoo.com>
@@ -25,6 +26,7 @@
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 Birk Magnussen <birk.magnussen@googlemail.com>
+Bohan Li <bohanli@google.com>
 Brian Foley <bpfoley@google.com>
 Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
@@ -34,9 +36,11 @@
 chm <chm@rock-chips.com>
 Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
+Chunbo Hua <chunbo.hua@intel.com>
 Clement Courbet <courbet@google.com>
 Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
+Daniel Sommermann <dcsommer@gmail.com>
 Dan Zhu <zxdan@google.com>
 Deb Mukherjee <debargha@google.com>
 Deepa K G <deepa.kg@ittiam.com>
@@ -67,6 +71,7 @@
 Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
 Hien Ho <hienho@google.com>
+Hirokazu Honda <hiroh@chromium.org>
 Hui Su <huisu@google.com>
 Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
@@ -73,6 +78,7 @@
 Jacek Caban <cjacek@gmail.com>
 Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Touton <bekenn@gmail.com>
 James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
 Jan Gerber <j@mailb.org>
@@ -82,16 +88,20 @@
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jeremy Leconte <jleconte@google.com>
 Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
+jinbo <jinbo-hf@loongson.cn>
 Jingning Han <jingning@google.com>
+Joel Fernandes <joelaf@google.com>
 Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
+Jonathan Wright <jonathan.wright@arm.com>
 Jon Kunkee <jkunkee@microsoft.com>
 Jorge E. Moreira <jemoreira@google.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
@@ -141,6 +151,7 @@
 Peter Boström <pbos@chromium.org>
 Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
+Peter Kasting <pkasting@chromium.org>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,59 @@
+2021-09-27 v1.11.0 "Smew Duck"
+  This maintenance release adds support for VBR mode in VP9 rate control
+  interface, new codec controls to get quantization parameters and loop filter
+  levels, and includes several improvements to NEON and numerous bug fixes.
+
+  - Upgrading:
+    New codec control is added to get quantization parameters and loop filter
+    levels.
+
+    VBR mode is supported in VP9 rate control library.
+
+  - Enhancement:
+    Numerous improvements for Neon optimizations.
+    Code clean-up and refactoring.
+    Calculation of rd multiplier is changed with BDRATE gains.
+
+  - Bug fixes:
+    Fix to overflow on duration.
+    Fix to several instances of -Wunused-but-set-variable.
+    Fix to avoid chroma resampling for 420mpeg2 input.
+    Fix to overflow in calc_iframe_target_size.
+    Fix to disallow skipping transform and quantization.
+    Fix some -Wsign-compare warnings in simple_encode.
+    Fix input file path in simple_encode_test.
+    Fix valid range for under/over_shoot pct.
+
+2021-03-09 v1.10.0 "Ruddy Duck"
+  This maintenance release adds support for darwin20 and new codec controls, as
+  well as numerous bug fixes.
+
+  - Upgrading:
+    New codec control is added to disable loopfilter for VP9.
+
+    New encoder control is added to disable feature to increase Q on overshoot
+    detection for CBR.
+
+    Configure support for darwin20 is added.
+
+    New codec control is added for VP9 rate control. The control ID of this
+    interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized
+    external rate control model, users will have to implement each callback
+    function in vpx_rc_funcs_t and register them using libvpx API
+    vpx_codec_control_() with the control ID.
+
+  - Enhancement:
+    Use -std=gnu++11 instead of -std=c++11 for c++ files.
+
+  - Bug fixes:
+    Override assembler with --as option of configure for MSVS.
+    Fix several compilation issues with gcc 4.8.5.
+    Fix to resetting rate control for temporal layers.
+    Fix to the rate control stats of SVC example encoder when number of spatial
+    layers is 1.
+    Fix to reusing motion vectors from the base spatial layer in SVC.
+    2 pass related flags removed from SVC example encoder.
+
 2020-07-29 v1.9.0 "Quacking Duck"
   This release adds support for NV12, a separate library for rate control, as
   well as incremental improvements.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,10 +19,9 @@
 
 All submissions, including submissions by project members, require review. We
 use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
-https://chromium-review.googlesource.com for this purpose.
-
-See https://www.webmproject.org/code/contribute/submitting-patches for an
-example of a typical gerrit workflow.
+https://chromium-review.googlesource.com for this purpose. See the
+[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/)
+for additional details.
 
 ## Community Guidelines
 
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README - 20 July 2020
+README - 08 March 2021
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -10,14 +10,14 @@
   1. Prerequisites
 
     * All x86 targets require the Yasm[1] assembler be installed[2].
-    * All Windows builds require that Cygwin[3] be installed.
-    * Building the documentation requires Doxygen[4]. If you do not
+    * All Windows builds require that Cygwin[3] or MSYS2[4] be installed.
+    * Building the documentation requires Doxygen[5]. If you do not
       have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+    * Downloading the data for the unit tests requires curl[6] and sha1sum.
       sha1sum is provided via the GNU coreutils, installed by default on
       many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
       available, a compatible version of sha1sum can be built from
-      source[6]. These requirements are optional if not running the unit
+      source[7]. These requirements are optional if not running the unit
       tests.
 
     [1]: http://www.tortall.net/projects/yasm
@@ -26,9 +26,10 @@
          yasm-<version>-<arch>.exe to yasm.exe and place it in:
          Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/
     [3]: http://www.cygwin.com
-    [4]: http://www.doxygen.org
-    [5]: http://curl.haxx.se
-    [6]: http://www.microbrew.org/tools/md5sha1sum/
+    [4]: http://www.msys2.org/
+    [5]: http://www.doxygen.org
+    [6]: http://curl.haxx.se
+    [7]: http://www.microbrew.org/tools/md5sha1sum/
 
   2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
--- a/args.c
+++ b/args.c
@@ -18,8 +18,10 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
 
-#if defined(__GNUC__) && __GNUC__
-extern void die(const char *fmt, ...) __attribute__((noreturn));
+#if defined(__GNUC__)
+__attribute__((noreturn)) extern void die(const char *fmt, ...);
+#elif defined(_MSC_VER)
+__declspec(noreturn) extern void die(const char *fmt, ...);
 #else
 extern void die(const char *fmt, ...);
 #endif
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -166,6 +166,9 @@
     -I$(ASM_CNV_PATH)/libvpx
 
 LOCAL_MODULE := libvpx
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1296,10 +1296,6 @@
           enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
           ;;
         vs*)
-          # When building with Microsoft Visual Studio the assembler is
-          # invoked directly. Checking at configure time is unnecessary.
-          # Skip the check by setting AS arbitrarily
-          AS=msvs
           msvs_arch_dir=x86-msvs
           case ${tgt_cc##vs} in
             14)
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -157,6 +157,8 @@
         ;;
         --lib) proj_kind="lib"
         ;;
+        --as=*) as="${optval}"
+        ;;
         --src-path-bare=*)
             src_path_bare=$(fix_path "$optval")
             src_path_bare=${src_path_bare%/}
@@ -247,13 +249,13 @@
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     arm64*)
         platforms[0]="ARM64"
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -9,7 +9,8 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
+shell_name="$(uname -o 2>/dev/null)"
+if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \
    && cygpath --help >/dev/null 2>&1; then
     FIXPATH='cygpath -m'
 else
--- a/configure
+++ b/configure
@@ -731,17 +731,17 @@
             soft_enable libyuv
         ;;
         *-android-*)
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             soft_enable libyuv
             # GTestLog must be modified to use Android logging utilities.
         ;;
         *-darwin-*)
-            check_add_cxxflags -std=c++11
+            check_add_cxxflags -std=gnu++11
             # iOS/ARM builds do not work with gtest. This does not match
             # x86 targets.
         ;;
         *-iphonesimulator-*)
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             soft_enable libyuv
         ;;
         *-win*)
@@ -748,7 +748,7 @@
             # Some mingw toolchains don't have pthread available by default.
             # Treat these more like visual studio where threading in gtest
             # would be disabled for the same reason.
-            check_add_cxxflags -std=c++11 && soft_enable unit_tests \
+            check_add_cxxflags -std=gnu++11 && soft_enable unit_tests \
               && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
@@ -755,9 +755,9 @@
 EOF
         ;;
         *)
-            enabled pthread_h && check_add_cxxflags -std=c++11 \
+            enabled pthread_h && check_add_cxxflags -std=gnu++11 \
               && soft_enable unit_tests
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
--- a/examples.mk
+++ b/examples.mk
@@ -376,6 +376,7 @@
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -66,12 +66,6 @@
     ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
-static const arg_def_t passes_arg =
-    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
-static const arg_def_t pass_arg =
-    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
-static const arg_def_t fpf_name_arg =
-    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
 static const arg_def_t min_q_arg =
     ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
 static const arg_def_t max_q_arg =
@@ -125,9 +119,6 @@
                                        &spatial_layers_arg,
                                        &kf_dist_arg,
                                        &scale_factors_arg,
-                                       &passes_arg,
-                                       &pass_arg,
-                                       &fpf_name_arg,
                                        &min_q_arg,
                                        &max_q_arg,
                                        &min_bitrate_arg,
@@ -173,8 +164,6 @@
   uint32_t frames_to_skip;
   struct VpxInputContext input_ctx;
   stats_io_t rc_stats;
-  int passes;
-  int pass;
   int tune_content;
   int inter_layer_pred;
 } AppInput;
@@ -197,9 +186,6 @@
   char **argi = NULL;
   char **argj = NULL;
   vpx_codec_err_t res;
-  int passes = 0;
-  int pass = 0;
-  const char *fpf_file_name = NULL;
   unsigned int min_bitrate = 0;
   unsigned int max_bitrate = 0;
   char string_options[1024] = { 0 };
@@ -289,18 +275,6 @@
               sizeof(string_options) - strlen(string_options) - 1);
       strncat(string_options, arg.val,
               sizeof(string_options) - strlen(string_options) - 1);
-    } else if (arg_match(&arg, &passes_arg, argi)) {
-      passes = arg_parse_uint(&arg);
-      if (passes < 1 || passes > 2) {
-        die("Error: Invalid number of passes (%d)\n", passes);
-      }
-    } else if (arg_match(&arg, &pass_arg, argi)) {
-      pass = arg_parse_uint(&arg);
-      if (pass < 1 || pass > 2) {
-        die("Error: Invalid pass selected (%d)\n", pass);
-      }
-    } else if (arg_match(&arg, &fpf_name_arg, argi)) {
-      fpf_file_name = arg.val;
     } else if (arg_match(&arg, &min_q_arg, argi)) {
       strncat(string_options, " min-quantizers=",
               sizeof(string_options) - strlen(string_options) - 1);
@@ -355,36 +329,8 @@
   if (strlen(string_options) > 0)
     vpx_svc_set_options(svc_ctx, string_options + 1);
 
-  if (passes == 0 || passes == 1) {
-    if (pass) {
-      fprintf(stderr, "pass is ignored since there's only one pass\n");
-    }
-    enc_cfg->g_pass = VPX_RC_ONE_PASS;
-  } else {
-    if (pass == 0) {
-      die("pass must be specified when passes is 2\n");
-    }
+  enc_cfg->g_pass = VPX_RC_ONE_PASS;
 
-    if (fpf_file_name == NULL) {
-      die("fpf must be specified when passes is 2\n");
-    }
-
-    if (pass == 1) {
-      enc_cfg->g_pass = VPX_RC_FIRST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
-        fatal("Failed to open statistics store");
-      }
-    } else {
-      enc_cfg->g_pass = VPX_RC_LAST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
-        fatal("Failed to open statistics store");
-      }
-      enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
-    }
-    app_input->passes = passes;
-    app_input->pass = pass;
-  }
-
   if (enc_cfg->rc_target_bitrate > 0) {
     if (min_bitrate > 0) {
       enc_cfg->rc_2pass_vbr_minsection_pct =
@@ -1004,13 +950,11 @@
   info.time_base.numerator = enc_cfg.g_timebase.num;
   info.time_base.denominator = enc_cfg.g_timebase.den;
 
-  if (!(app_input.passes == 2 && app_input.pass == 1)) {
-    // We don't save the bitstream for the 1st pass on two pass rate control
-    writer =
-        vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
-    if (!writer)
-      die("Failed to open %s for writing\n", app_input.output_filename);
-  }
+  writer =
+      vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing\n", app_input.output_filename);
+
 #if OUTPUT_RC_STATS
   // Write out spatial layer stream.
   // TODO(marpan/jianj): allow for writing each spatial and temporal stream.
@@ -1230,7 +1174,6 @@
 #endif
   if (vpx_codec_destroy(&encoder))
     die_codec(&encoder, "Failed to destroy codec");
-  if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1);
   if (writer) {
     vpx_video_writer_close(writer);
   }
--- a/examples/vpx_dec_fuzzer.cc
+++ b/examples/vpx_dec_fuzzer.cc
@@ -41,7 +41,7 @@
    $make -j32
 
  * Build vp9 fuzzer
-   $ $CXX $CXXFLAGS -std=c++11 -DDECODER=vp9 \
+   $ $CXX $CXXFLAGS -std=gnu++11 -DDECODER=vp9 \
    -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \
    ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \
    ./libvpx.a -Wl,--end-group
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -831,6 +831,7 @@
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     memset(&svc_params, 0, sizeof(svc_params));
+    vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0);
     vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
@@ -930,6 +931,7 @@
           // Update for short-time encoding bitrate states, for moving window
           // of size rc->window, shifted by rc->window / 2.
           // Ignore first window segment, due to key frame.
+          if (rc.window_size == 0) rc.window_size = 15;
           if (frame_cnt > rc.window_size) {
             sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
             if (frame_cnt % rc.window_size == 0) {
--- a/libs.mk
+++ b/libs.mk
@@ -94,15 +94,28 @@
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h
   CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
 
-  RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
-  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
-  RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h
+RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h
 endif
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  VP8_PREFIX=vp8/
+  RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+endif
 
 ifeq ($(CONFIG_VP9_DECODER),yes)
   VP9_PREFIX=vp9/
@@ -126,7 +139,7 @@
 ifeq ($(CONFIG_MSVS),yes)
 CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
-RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vp9rcmt,vp9rcmd)
+RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd)
 # This variable uses deferred expansion intentionally, since the results of
 # $(wildcard) may change during the course of the Make.
 VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
@@ -232,6 +245,7 @@
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
+            --as=$(AS) \
             $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
@@ -248,20 +262,21 @@
 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
 
-vp9rc.$(VCPROJ_SFX): \
+vpxrc.$(VCPROJ_SFX): \
     VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)
 
-vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
+vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             $(if $(CONFIG_SHARED),--dll,--lib) \
             --target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vp9rc \
+            --name=vpxrc \
             --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
+            --as=$(AS) \
             $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \
@@ -273,10 +288,10 @@
               $(VCPROJ_SRCS)) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
-PROJECTS-yes += vp9rc.$(VCPROJ_SFX)
+PROJECTS-yes += vpxrc.$(VCPROJ_SFX)
 
-vp9rc.$(VCPROJ_SFX): vpx_config.asm
-vp9rc.$(VCPROJ_SFX): $(RTCD)
+vpxrc.$(VCPROJ_SFX): vpx_config.asm
+vpxrc.$(VCPROJ_SFX): $(RTCD)
 
 endif # ifeq ($(CONFIG_MSVS),yes)
 else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
@@ -285,8 +300,20 @@
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 6
-SO_VERSION_MINOR := 3
+# Updating version info.
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+# For libtool: c=<current>, a=<age>, r=<revision>
+# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# libvpx library file is generated as libvpx.so.<MAJOR>.<MINOR>.<PATCH>
+# MAJOR = c-a, MINOR = a, PATCH = r
+#
+# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
+# SO_VERSION_* then follow the rules in the link to determine the new version
+# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
+SO_VERSION_MAJOR := 7
+SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
@@ -384,12 +411,11 @@
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
 CLEAN-OBJS += vpx.pc
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
+ifeq ($(CONFIG_ENCODERS),yes)
   RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
-  RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
   OBJS-yes += $(RC_RTC_OBJS)
-  LIBS-yes += $(BUILD_PFX)libvp9rc.a $(BUILD_PFX)libvp9rc_g.a
-  $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS)
+  LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a
+  $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS)
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes)
@@ -479,10 +505,12 @@
                            $(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
 TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 
+ifeq ($(CONFIG_ENCODERS),yes)
 RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX)
 RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\
                        $(call enabled,RC_INTERFACE_TEST_SRCS))
 RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS)))
+endif
 
 SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX)
 SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\
@@ -536,6 +564,7 @@
             --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             -D_VARIADIC_MAX=10 \
             --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src"
@@ -552,6 +581,7 @@
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
@@ -574,6 +604,7 @@
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
@@ -580,10 +611,11 @@
             -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
 endif  # TEST_INTRA_PRED_SPEED
 
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX)
 test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
-	vp9rc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             --exe \
@@ -592,6 +624,7 @@
             -D_VARIADIC_MAX=10 \
             --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \
             --ver=$(CONFIG_VS_VERSION) \
+            --as=$(AS) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
@@ -598,6 +631,7 @@
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
 endif  # RC_INTERFACE_TEST
+endif  # CONFIG_ENCODERS
 endif
 else
 
@@ -639,6 +673,7 @@
               -L. -lvpx -lgtest $(extralibs) -lm))
 endif  # TEST_INTRA_PRED_SPEED
 
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \
   CXXFLAGS += $(GTEST_INCLUDES)
@@ -645,11 +680,12 @@
 OBJS-yes += $(RC_INTERFACE_TEST_OBJS)
 BINS-yes += $(RC_INTERFACE_TEST_BIN)
 
-$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvp9rc.a
+$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a
 $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \
               $(RC_INTERFACE_TEST_OBJS) \
-              -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm))
+              -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm))
 endif  # RC_INTERFACE_TEST
+endif  # CONFIG_ENCODERS
 
 ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),)
 $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -34,6 +34,9 @@
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(BUILD_STATIC_LIBRARY)
 
 #libvpx_test
@@ -48,6 +51,9 @@
   LOCAL_STATIC_LIBRARIES += vpx
 endif
 
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <climits>
+#include <cstring>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
@@ -18,6 +21,12 @@
 
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
+bool IsVP9(const vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
 TEST(EncodeAPI, InvalidParams) {
   static const vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_ENCODER
@@ -184,15 +193,120 @@
     }
 
     // VP9 should report incapable, VP8 invalid for all configurations.
-    const char kVP9Name[] = "WebM Project VP9";
-    const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
-                                sizeof(kVP9Name) - 1) == 0;
-    EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+    EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
               vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
 
     for (int i = 0; i < 2; i++) {
       vpx_codec_destroy(&enc[i]);
     }
+  }
+}
+
+TEST(EncodeAPI, SetRoi) {
+  static struct {
+    const vpx_codec_iface_t *iface;
+    int ctrl_id;
+  } kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP },
+#endif
+#if CONFIG_VP9_ENCODER
+    { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP },
+#endif
+  };
+  constexpr int kWidth = 64;
+  constexpr int kHeight = 64;
+
+  for (const auto &codec : kCodecs) {
+    SCOPED_TRACE(vpx_codec_iface_name(codec.iface));
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK);
+    cfg.g_w = kWidth;
+    cfg.g_h = kHeight;
+    EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK);
+
+    vpx_roi_map_t roi = {};
+    uint8_t roi_map[kWidth * kHeight] = {};
+    if (IsVP9(codec.iface)) {
+      roi.rows = (cfg.g_w + 7) >> 3;
+      roi.cols = (cfg.g_h + 7) >> 3;
+    } else {
+      roi.rows = (cfg.g_w + 15) >> 4;
+      roi.cols = (cfg.g_h + 15) >> 4;
+    }
+    EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+
+    roi.roi_map = roi_map;
+    // VP8 only. This value isn't range checked.
+    roi.static_threshold[1] = 1000;
+    roi.static_threshold[2] = INT_MIN;
+    roi.static_threshold[3] = INT_MAX;
+
+    for (const auto delta : { -63, -1, 0, 1, 63 }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        roi.delta_lf[i] = delta;
+        // VP9 only.
+        roi.skip[i] ^= 1;
+        roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+      }
+    }
+
+    vpx_codec_err_t expected_error;
+    for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) {
+      expected_error = VPX_CODEC_INVALID_PARAM;
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        // The max segment count for VP8 is 4, the remainder of the entries are
+        // ignored.
+        if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK;
+
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_q[" << i << "]: " << delta;
+        roi.delta_q[i] = 0;
+
+        roi.delta_lf[i] = delta;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_lf[" << i << "]: " << delta;
+        roi.delta_lf[i] = 0;
+      }
+    }
+
+    // VP8 should ignore skip[] and ref_frame[] values.
+    expected_error =
+        IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK;
+    for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.skip[i] = skip;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "skip[" << i << "]: " << skip;
+        roi.skip[i] = 0;
+      }
+    }
+
+    // VP9 allows negative values to be used to disable segmentation.
+    for (int ref_frame = -3; ref_frame < 0; ++ref_frame) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
   }
 }
 
--- a/test/ratectrl_rtc_test.cc
+++ /dev/null
@@ -1,229 +1,0 @@
-/*
- *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vp9/ratectrl_rtc.h"
-
-#include <fstream>  // NOLINT
-#include <string>
-
-#include "./vpx_config.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/video_source.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/bitops.h"
-
-namespace {
-
-const size_t kNumFrame = 850;
-
-struct FrameInfo {
-  friend std::istream &operator>>(std::istream &is, FrameInfo &info) {
-    is >> info.frame_id >> info.spatial_id >> info.temporal_id >> info.base_q >>
-        info.target_bandwidth >> info.buffer_level >> info.filter_level_ >>
-        info.bytes_used;
-    return is;
-  }
-  int frame_id;
-  int spatial_id;
-  int temporal_id;
-  // Base QP
-  int base_q;
-  size_t target_bandwidth;
-  size_t buffer_level;
-  // Loopfilter level
-  int filter_level_;
-  // Frame size for current frame, used for pose encode update
-  size_t bytes_used;
-};
-
-// This test runs the rate control interface and compare against ground truth
-// generated by encoders.
-// Settings for the encoder:
-// For 1 layer:
-//
-// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9
-//    1280 720 1 30 7 0 0 1 0 1000
-//
-// For SVC (3 temporal layers, 3 spatial layers):
-//
-// examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3
-// -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1
-// --lag-in-frames=0 --passes=1 --speed=7 --threads=1
-// --temporal-layering-mode=3 -aq 1 -rcstat 1
-// gipsrec_motion1.1280_720.yuv -o out.webm
-//
-// - AQ_Mode 0
-// - Disable golden refresh
-// - Bitrate x 2 at frame/superframe 200
-// - Bitrate / 4 at frame/superframe 400
-//
-// The generated file includes:
-// frame number, spatial layer ID, temporal layer ID, base QP, target
-// bandwidth, buffer level, loopfilter level, encoded frame size
-// TODO(jianj): Remove golden files, and run actual encoding in this test.
-class RcInterfaceTest : public ::testing::Test {
- public:
-  explicit RcInterfaceTest() {}
-
-  virtual ~RcInterfaceTest() {}
-
- protected:
-  void RunOneLayer() {
-    SetConfigOneLayer();
-    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
-    FrameInfo frame_info;
-    libvpx::VP9FrameParamsQpRTC frame_params;
-    frame_params.frame_type = KEY_FRAME;
-    frame_params.spatial_layer_id = 0;
-    frame_params.temporal_layer_id = 0;
-    std::ifstream one_layer_file;
-    one_layer_file.open(libvpx_test::GetDataPath() +
-                        "/rc_interface_test_one_layer");
-    ASSERT_TRUE(one_layer_file.good());
-    for (size_t i = 0; i < kNumFrame; i++) {
-      one_layer_file >> frame_info;
-      if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME;
-      if (frame_info.frame_id == 200) {
-        rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth * 2;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      } else if (frame_info.frame_id == 400) {
-        rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth / 4;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      }
-      ASSERT_EQ(frame_info.spatial_id, 0);
-      ASSERT_EQ(frame_info.temporal_id, 0);
-      rc_api_->ComputeQP(frame_params);
-      ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q);
-      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_);
-      rc_api_->PostEncodeUpdate(frame_info.bytes_used);
-    }
-  }
-
-  void RunSVC() {
-    SetConfigSVC();
-    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
-    FrameInfo frame_info;
-    libvpx::VP9FrameParamsQpRTC frame_params;
-    frame_params.frame_type = KEY_FRAME;
-    std::ifstream svc_file;
-    svc_file.open(std::string(std::getenv("LIBVPX_TEST_DATA_PATH")) +
-                  "/rc_interface_test_svc");
-    ASSERT_TRUE(svc_file.good());
-    for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) {
-      svc_file >> frame_info;
-      if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME;
-      if (frame_info.frame_id == 200 * rc_cfg_.ss_number_layers) {
-        for (int layer = 0;
-             layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers;
-             layer++)
-          rc_cfg_.layer_target_bitrate[layer] *= 2;
-        rc_cfg_.target_bandwidth *= 2;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      } else if (frame_info.frame_id == 400 * rc_cfg_.ss_number_layers) {
-        for (int layer = 0;
-             layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers;
-             layer++)
-          rc_cfg_.layer_target_bitrate[layer] /= 4;
-        rc_cfg_.target_bandwidth /= 4;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      }
-      frame_params.spatial_layer_id = frame_info.spatial_id;
-      frame_params.temporal_layer_id = frame_info.temporal_id;
-      rc_api_->ComputeQP(frame_params);
-      ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q);
-      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_);
-      rc_api_->PostEncodeUpdate(frame_info.bytes_used);
-    }
-  }
-
- private:
-  void SetConfigOneLayer() {
-    rc_cfg_.width = 1280;
-    rc_cfg_.height = 720;
-    rc_cfg_.max_quantizer = 52;
-    rc_cfg_.min_quantizer = 2;
-    rc_cfg_.target_bandwidth = 1000;
-    rc_cfg_.buf_initial_sz = 600;
-    rc_cfg_.buf_optimal_sz = 600;
-    rc_cfg_.buf_sz = 1000;
-    rc_cfg_.undershoot_pct = 50;
-    rc_cfg_.overshoot_pct = 50;
-    rc_cfg_.max_intra_bitrate_pct = 1000;
-    rc_cfg_.framerate = 30.0;
-    rc_cfg_.ss_number_layers = 1;
-    rc_cfg_.ts_number_layers = 1;
-    rc_cfg_.scaling_factor_num[0] = 1;
-    rc_cfg_.scaling_factor_den[0] = 1;
-    rc_cfg_.layer_target_bitrate[0] = 1000;
-    rc_cfg_.max_quantizers[0] = 52;
-    rc_cfg_.min_quantizers[0] = 2;
-  }
-
-  void SetConfigSVC() {
-    rc_cfg_.width = 1280;
-    rc_cfg_.height = 720;
-    rc_cfg_.max_quantizer = 56;
-    rc_cfg_.min_quantizer = 2;
-    rc_cfg_.target_bandwidth = 1600;
-    rc_cfg_.buf_initial_sz = 500;
-    rc_cfg_.buf_optimal_sz = 600;
-    rc_cfg_.buf_sz = 1000;
-    rc_cfg_.undershoot_pct = 50;
-    rc_cfg_.overshoot_pct = 50;
-    rc_cfg_.max_intra_bitrate_pct = 900;
-    rc_cfg_.framerate = 30.0;
-    rc_cfg_.ss_number_layers = 3;
-    rc_cfg_.ts_number_layers = 3;
-
-    rc_cfg_.scaling_factor_num[0] = 1;
-    rc_cfg_.scaling_factor_den[0] = 4;
-    rc_cfg_.scaling_factor_num[1] = 2;
-    rc_cfg_.scaling_factor_den[1] = 4;
-    rc_cfg_.scaling_factor_num[2] = 4;
-    rc_cfg_.scaling_factor_den[2] = 4;
-
-    rc_cfg_.ts_rate_decimator[0] = 4;
-    rc_cfg_.ts_rate_decimator[1] = 2;
-    rc_cfg_.ts_rate_decimator[2] = 1;
-
-    rc_cfg_.layer_target_bitrate[0] = 100;
-    rc_cfg_.layer_target_bitrate[1] = 140;
-    rc_cfg_.layer_target_bitrate[2] = 200;
-    rc_cfg_.layer_target_bitrate[3] = 250;
-    rc_cfg_.layer_target_bitrate[4] = 350;
-    rc_cfg_.layer_target_bitrate[5] = 500;
-    rc_cfg_.layer_target_bitrate[6] = 450;
-    rc_cfg_.layer_target_bitrate[7] = 630;
-    rc_cfg_.layer_target_bitrate[8] = 900;
-
-    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
-      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
-        const int i = sl * rc_cfg_.ts_number_layers + tl;
-        rc_cfg_.max_quantizers[i] = 56;
-        rc_cfg_.min_quantizers[i] = 2;
-      }
-    }
-  }
-
-  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
-  libvpx::VP9RateControlRtcConfig rc_cfg_;
-};
-
-TEST_F(RcInterfaceTest, OneLayer) { RunOneLayer(); }
-
-TEST_F(RcInterfaceTest, SVC) { RunSVC(); }
-}  // namespace
-
-int main(int argc, char **argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -26,6 +26,10 @@
 #include "vpx_ports/msvc.h"
 #include "vpx_ports/vpx_timer.h"
 
+// const[expr] should be sufficient for DECLARE_ALIGNED but early
+// implementations of c++11 appear to have some issues with it.
+#define kDataAlignment 32
+
 template <typename Function>
 struct TestParams {
   TestParams(int w, int h, Function f, int bd = -1)
@@ -117,9 +121,6 @@
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   // crbug.com/webm/1660
-  // const[expr] should be sufficient for DECLARE_ALIGNED but early
-  // implementations of c++11 appear to have some issues with it.
-  enum { kDataAlignment = 32 };
   static const int kDataBlockSize = 64 * 128;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
--- a/test/simple_encode_test.cc
+++ b/test/simple_encode_test.cc
@@ -13,6 +13,7 @@
 #include <string>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/video_source.h"
 #include "vp9/simple_encode.h"
 
 namespace vp9 {
@@ -36,7 +37,8 @@
   const int frame_rate_den_ = 1;
   const int target_bitrate_ = 1000;
   const int num_frames_ = 17;
-  const std::string in_file_path_str_ = "bus_352x288_420_f20_b8.yuv";
+  const std::string in_file_path_str_ =
+      libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv";
 };
 
 TEST_F(SimpleEncodeTest, ComputeFirstPassStats) {
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -84,6 +84,7 @@
       prev_frame_width[i] = 320;
       prev_frame_height[i] = 240;
     }
+    ksvc_flex_noupd_tlenh_ = false;
   }
   virtual void BeginPassHook(unsigned int /*pass*/) {}
 
@@ -91,9 +92,10 @@
   // bypass/flexible mode. The pattern corresponds to the pattern
   // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
   // non-flexible mode, except that we disable inter-layer prediction.
-  void set_frame_flags_bypass_mode(
-      int tl, int num_spatial_layers, int is_key_frame,
-      vpx_svc_ref_frame_config_t *ref_frame_config) {
+  void set_frame_flags_bypass_mode(int tl, int num_spatial_layers,
+                                   int is_key_frame,
+                                   vpx_svc_ref_frame_config_t *ref_frame_config,
+                                   int noupdate_tlenh) {
     for (int sl = 0; sl < num_spatial_layers; ++sl)
       ref_frame_config->update_buffer_slot[sl] = 0;
 
@@ -151,9 +153,12 @@
           ref_frame_config->reference_last[sl] = 1;
           ref_frame_config->reference_golden[sl] = 0;
           ref_frame_config->reference_alt_ref[sl] = 0;
-          ref_frame_config->update_buffer_slot[sl] |=
-              1 << ref_frame_config->alt_fb_idx[sl];
+          // Non-reference frame on top temporal, top spatial layer.
+          ref_frame_config->update_buffer_slot[sl] = 0;
         }
+        // Force no update on all spatial layers for temporal enhancement layer
+        // frames.
+        if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0;
       }
     }
   }
@@ -244,6 +249,22 @@
       }
     }
 
+    if (ksvc_flex_noupd_tlenh_) {
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0, &ref_frame_config,
+                                  1);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
+    }
+
     if (update_pattern_ && video->frame() >= 100) {
       vpx_svc_layer_id_t layer_id;
       if (video->frame() == 100) {
@@ -254,11 +275,14 @@
       layer_id.spatial_layer_id = 0;
       layer_id.temporal_layer_id = (video->frame() % 2 != 0);
       temporal_layer_id_ = layer_id.temporal_layer_id;
-      for (int i = 0; i < number_spatial_layers_; i++)
+      for (int i = 0; i < number_spatial_layers_; i++) {
         layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config.duration[i] = 1;
+      }
       encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
       set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
-                                  number_spatial_layers_, 0, &ref_frame_config);
+                                  number_spatial_layers_, 0, &ref_frame_config,
+                                  0);
       encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
     }
 
@@ -557,9 +581,14 @@
   }
 
   virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
-    double mismatch_psnr = compute_psnr(img1, img2);
-    mismatch_psnr_ += mismatch_psnr;
-    ++mismatch_nframes_;
+    // TODO(marpan): Look into why an assert is triggered in compute_psnr
+    // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh.
+    // Has to do with dropped frames in bypass/flexible svc mode.
+    if (!ksvc_flex_noupd_tlenh_) {
+      double mismatch_psnr = compute_psnr(img1, img2);
+      mismatch_psnr_ += mismatch_psnr;
+      ++mismatch_nframes_;
+    }
   }
 
   unsigned int GetMismatchFrames() { return mismatch_nframes_; }
@@ -604,6 +633,7 @@
   int num_resize_down_;
   unsigned int prev_frame_width[VPX_MAX_LAYERS];
   unsigned int prev_frame_height[VPX_MAX_LAYERS];
+  bool ksvc_flex_noupd_tlenh_;
 
  private:
   virtual void SetConfig(const int num_temporal_layer) {
@@ -722,8 +752,6 @@
   cfg_.g_threads = 1;
   cfg_.rc_dropframe_thresh = 30;
   cfg_.kf_max_dist = 9999;
-  // Change SVC pattern on the fly.
-  update_pattern_ = 1;
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   top_sl_width_ = 640;
@@ -730,6 +758,8 @@
   top_sl_height_ = 480;
   cfg_.rc_target_bitrate = 800;
   ResetModel();
+  // Change SVC pattern on the fly.
+  update_pattern_ = 1;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
@@ -1104,6 +1134,36 @@
   // encoder will avoid loopfilter on these frames.
   EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
 #endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 2 temporal layers, for KSVC in flexible mode with no update of reference
+// frames for all spatial layers on TL > 0 superframes.
+// Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) {
+  SetSvcConfig(3, 2);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ksvc_flex_noupd_tlenh_ = true;
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
 }
 
 // Params: speed setting, inter-layer prediction mode.
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -27,8 +27,6 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv
 
 # Test vectors
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -869,5 +869,3 @@
 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res
 ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
-03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer
-99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc
--- a/test/test.mk
+++ b/test/test.mk
@@ -193,10 +193,8 @@
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
-ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2)))
 LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc
 endif
-endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
@@ -215,7 +213,14 @@
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
 
-RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += codec_factory.h
 
 endif # CONFIG_SHARED
 
--- /dev/null
+++ b/test/test_rc_interface.cc
@@ -1,0 +1,6 @@
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/test/vp8_denoiser_sse2_test.cc
+++ b/test/vp8_denoiser_sse2_test.cc
@@ -40,7 +40,12 @@
   int increase_denoising_;
 };
 
+// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10.
+#if defined(__GNUC__) && __GNUC__ >= 8
+TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) {
+#else
 TEST_P(VP8DenoiserTest, BitexactCheck) {
+#endif
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 4000;
   const int stride = 16;
@@ -87,7 +92,7 @@
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
 
@@ -103,7 +108,7 @@
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
   }
--- /dev/null
+++ b/test/vp8_ratectrl_rtc_test.cc
@@ -1,0 +1,343 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+struct Vp8RCTestVideo {
+  Vp8RCTestVideo() : name(""), width(0), height(0), frames(0) {}
+  Vp8RCTestVideo(const char *name_, int width_, int height_,
+                 unsigned int frames_)
+      : name(name_), width(width_), height(height_), frames(frames_) {}
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const Vp8RCTestVideo &video) {
+    os << video.name << " " << video.width << " " << video.height << " "
+       << video.frames;
+    return os;
+  }
+  const char *name;     // Source clip file name.
+  int width;            // Frame width passed to the encoder and RC configs.
+  int height;           // Frame height passed to the encoder and RC configs.
+  unsigned int frames;  // Frame limit handed to the video source.
+};
+
+const Vp8RCTestVideo kVp8RCTestVectors[] = {
+  Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470),
+  Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300),
+};
+
+class Vp8RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> {
+ public:
+  Vp8RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {}
+  virtual ~Vp8RcInterfaceTest() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  // Per-frame encode flags; logic taken from error_resilience_test.cc.
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L and G, update G.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (rc_cfg_.ts_number_layers > 1) {
+      const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+      const int frame_flags =
+          SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_params_.temporal_layer_id = layer_id;
+      if (video->frame() > 0) {
+        encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+      }
+    } else {
+      if (video->frame() == 0) {
+        encoder->Control(VP8E_SET_CPUUSED, -6);
+        encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
+        encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      }
+      if (frame_params_.frame_type == INTER_FRAME) {
+        // Disable golden and alt-ref frame updates.
+        frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+        frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+      }
+    }
+    frame_params_.frame_type =
+        video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
+    encoder_exit_ = video->frame() == test_video_.frames;
+  }
+
+  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+    if (encoder_exit_) {
+      return;
+    }
+    int qp;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    rc_api_->ComputeQP(frame_params_);
+    ASSERT_EQ(rc_api_->GetQP(), qp);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void RunOneLayer() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunPeriodicKey() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    key_interval_ = 100;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers2TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(2);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(3);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig() {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.layer_target_bitrate[0] = target_bitrate_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  void SetConfigTemporalLayers(int temporal_layers) {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    if (temporal_layers == 2) {
+      rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (temporal_layers == 3) {
+      rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[2] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    }
+
+    rc_cfg_.ts_number_layers = temporal_layers;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    // Temporal layers only (one spatial layer), CBR mode.
+    cfg_.ss_number_layers = 1;
+    cfg_.ts_number_layers = temporal_layers;
+    if (temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.ts_periodicity = 2;
+      cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    } else if (temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.ts_periodicity = 4;
+      cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    }
+  }
+
+  std::unique_ptr<libvpx::VP8RateControlRTC> rc_api_;
+  libvpx::VP8RateControlRtcConfig rc_cfg_;
+  int key_interval_;
+  int target_bitrate_;
+  Vp8RCTestVideo test_video_;
+  libvpx::VP8FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+};
+
+TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); }
+
+VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest,
+                           ::testing::Values(200, 400, 1000),
+                           ::testing::ValuesIn(kVp8RCTestVectors));
+
+}  // namespace
--- a/test/vp9_end_to_end_test.cc
+++ b/test/vp9_end_to_end_test.cc
@@ -31,7 +31,7 @@
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
   { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 },
-  { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
+  { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
 };
 
 typedef struct {
@@ -342,7 +342,7 @@
 VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12,
                            ::testing::Values(::libvpx_test::kRealTime),
                            ::testing::ValuesIn(kTestVectorsNv12),
-                           ::testing::ValuesIn({ 6, 7, 8 }));
+                           ::testing::Values(6, 7, 8));
 
 VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh,
                            ::testing::Values(5, 6, 7), ::testing::Values(8, 9));
--- a/test/vp9_ext_ratectrl_test.cc
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <cstdint>
 #include <new>
 
 #include "test/codec_factory.h"
@@ -20,7 +21,7 @@
 namespace {
 
 constexpr int kModelMagicNumber = 51396;
-constexpr unsigned int PrivMagicNumber = 5566;
+constexpr uintptr_t PrivMagicNumber = 5566;
 constexpr int kFrameNum = 5;
 constexpr int kLosslessCodingIndex = 2;
 
@@ -73,6 +74,7 @@
   EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
 
   if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
     EXPECT_EQ(encode_frame_info->gop_index, 0);
     EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/);
     EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
@@ -84,6 +86,7 @@
   }
 
   if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 4);
     EXPECT_EQ(encode_frame_info->gop_index, 1);
     EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/);
     EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
@@ -104,6 +107,7 @@
   }
 
   if (encode_frame_info->coding_index == 5) {
+    EXPECT_EQ(encode_frame_info->show_index, 4);
     EXPECT_EQ(encode_frame_info->gop_index, 0);
     EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/);
     EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
@@ -125,6 +129,7 @@
   } else {
     frame_decision->q_index = 100;
   }
+  frame_decision->max_frame_size = 0;
   return VPX_RC_OK;
 }
 
@@ -139,6 +144,11 @@
   EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
   if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
     EXPECT_EQ(encode_frame_result->sse, 0);
+  }
+  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
+  } else {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
   }
   return VPX_RC_OK;
 }
--- /dev/null
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -1,0 +1,373 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/ratectrl_rtc.h"
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+const size_t kNumFrames = 300;
+
+const int kTemporalId[4] = { 0, 2, 1, 2 };
+
+class RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, vpx_rc_mode> {
+ public:
+  RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        encoder_exit_(false) {}
+
+  virtual ~RcInterfaceTest() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+    }
+    frame_params_.frame_type =
+        video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
+    if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
+      // Disable golden and alt-ref frame updates.
+      frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+      frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+    }
+    encoder_exit_ = video->frame() == kNumFrames;
+  }
+
+  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+    if (encoder_exit_) {
+      return;
+    }
+    int loopfilter_level, qp;
+    encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    rc_api_->ComputeQP(frame_params_);
+    ASSERT_EQ(rc_api_->GetQP(), qp);
+    ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void RunOneLayer() {
+    SetConfig(GET_PARAM(2));
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerVBRPeriodicKey() {
+    if (GET_PARAM(2) != VPX_VBR) return;
+    key_interval_ = 100;
+    SetConfig(VPX_VBR);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig(vpx_rc_mode rc_mode) {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 52;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1000;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 1;
+    rc_cfg_.ts_number_layers = 1;
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 1;
+    rc_cfg_.layer_target_bitrate[0] = 1000;
+    rc_cfg_.max_quantizers[0] = 52;
+    rc_cfg_.min_quantizers[0] = 2;
+    rc_cfg_.rc_mode = rc_mode;
+    rc_cfg_.aq_mode = aq_mode_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = rc_mode;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  int aq_mode_;
+  int key_interval_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+};
+
+class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
+                           public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {}
+  virtual ~RcInterfaceSvcTest() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+    }
+
+    frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME;
+    if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
+      // Disable golden and alt-ref frame updates.
+      frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+      frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+    }
+    encoder_exit_ = video->frame() == kNumFrames;
+    current_superframe_ = video->frame();
+  }
+
+  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+    ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                           pkt->data.frame.sz);
+      for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+        frame_params_.spatial_layer_id = sl;
+        frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4];
+        rc_api_->ComputeQP(frame_params_);
+        frame_params_.frame_type = INTER_FRAME;
+        rc_api_->PostEncodeUpdate(sizes_[sl]);
+      }
+    }
+    if (!encoder_exit_) {
+      int loopfilter_level, qp;
+      encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+      encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+      ASSERT_EQ(rc_api_->GetQP(), qp);
+      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+    }
+  }
+  // This method needs to be overridden because non-reference frames are
+  // expected to be mismatched frames as the encoder will avoid loopfilter on
+  // these frames.
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {}
+
+  void RunSvc() {
+    SetConfigSvc();
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderSvc();
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) {
+    uint8_t marker = *(data + data_sz - 1);
+    if ((marker & 0xe0) == 0xc0) {
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frames;
+      // This chunk is marked as having a superframe index but doesn't have
+      // enough data for it, thus it's an invalid superframe index.
+      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+      {
+        const uint8_t marker2 = *(data + data_sz - index_sz);
+        // This chunk is marked as having a superframe index but doesn't have
+        // the matching marker byte at the front of the index therefore it's an
+        // invalid chunk.
+        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+      }
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+      for (uint32_t i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+        sizes_[i] = this_sz;
+      }
+    }
+    return VPX_CODEC_OK;
+  }
+
+  void SetEncoderSvc() {
+    cfg_.ss_number_layers = 3;
+    cfg_.ts_number_layers = 3;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    svc_params_.scaling_factor_num[0] = 72;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 144;
+    svc_params_.scaling_factor_den[1] = 288;
+    svc_params_.scaling_factor_num[2] = 288;
+    svc_params_.scaling_factor_den[2] = 288;
+    for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 56;
+      svc_params_.min_quantizers[i] = 2;
+      svc_params_.speed_per_layer[i] = 7;
+    }
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    // 3 temporal layers
+    cfg_.ts_rate_decimator[0] = 4;
+    cfg_.ts_rate_decimator[1] = 2;
+    cfg_.ts_rate_decimator[2] = 1;
+    cfg_.temporal_layering_mode = 3;
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.g_threads = 1;
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_target_bitrate = 1600;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_undershoot_pct = 50;
+
+    cfg_.layer_target_bitrate[0] = 100;
+    cfg_.layer_target_bitrate[1] = 140;
+    cfg_.layer_target_bitrate[2] = 200;
+    cfg_.layer_target_bitrate[3] = 250;
+    cfg_.layer_target_bitrate[4] = 350;
+    cfg_.layer_target_bitrate[5] = 500;
+    cfg_.layer_target_bitrate[6] = 450;
+    cfg_.layer_target_bitrate[7] = 630;
+    cfg_.layer_target_bitrate[8] = 900;
+  }
+
+  void SetConfigSvc() {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 56;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1600;
+    rc_cfg_.buf_initial_sz = 500;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 900;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 3;
+    rc_cfg_.ts_number_layers = 3;
+    rc_cfg_.rc_mode = VPX_CBR;
+    rc_cfg_.aq_mode = aq_mode_;
+
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 4;
+    rc_cfg_.scaling_factor_num[1] = 2;
+    rc_cfg_.scaling_factor_den[1] = 4;
+    rc_cfg_.scaling_factor_num[2] = 4;
+    rc_cfg_.scaling_factor_den[2] = 4;
+
+    rc_cfg_.ts_rate_decimator[0] = 4;
+    rc_cfg_.ts_rate_decimator[1] = 2;
+    rc_cfg_.ts_rate_decimator[2] = 1;
+
+    rc_cfg_.layer_target_bitrate[0] = 100;
+    rc_cfg_.layer_target_bitrate[1] = 140;
+    rc_cfg_.layer_target_bitrate[2] = 200;
+    rc_cfg_.layer_target_bitrate[3] = 250;
+    rc_cfg_.layer_target_bitrate[4] = 350;
+    rc_cfg_.layer_target_bitrate[5] = 500;
+    rc_cfg_.layer_target_bitrate[6] = 450;
+    rc_cfg_.layer_target_bitrate[7] = 630;
+    rc_cfg_.layer_target_bitrate[8] = 900;
+
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
+      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
+        const int i = sl * rc_cfg_.ts_number_layers + tl;
+        rc_cfg_.max_quantizers[i] = 56;
+        rc_cfg_.min_quantizers[i] = 2;
+      }
+    }
+  }
+
+  int aq_mode_;
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  vpx_svc_extra_cfg_t svc_params_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int current_superframe_;
+  uint32_t sizes_[8];
+};
+
+TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }
+
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
+                           ::testing::Values(VPX_CBR, VPX_VBR));
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3));
+}  // namespace
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -188,4 +188,55 @@
 
 INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
                          ::testing::ValuesIn(kY4mTestVectors));
+
+static const char kY4MRegularHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, RegularHeader) {
+  libvpx_test::TempOutFile f;
+  fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(0, fseek(f.file(), 0, 0));
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
+// Testing that headers over 100 characters can be parsed.
+static const char kY4MLongHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG "
+    "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, LongHeader) {
+  libvpx_test::TempOutFile f;
+  fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(fseek(f.file(), 0, 0), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
 }  // namespace
--- a/third_party/libwebm/Android.mk
+++ b/third_party/libwebm/Android.mk
@@ -3,7 +3,7 @@
 include $(CLEAR_VARS)
 LOCAL_MODULE:= libwebm
 LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
-LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=gnu++11
 LOCAL_C_INCLUDES:= $(LOCAL_PATH)
 LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
 
--- a/tools.mk
+++ b/tools.mk
@@ -79,6 +79,7 @@
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
--- a/tools_common.h
+++ b/tools_common.h
@@ -110,6 +110,8 @@
 
 #if defined(__GNUC__)
 #define VPX_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define VPX_NO_RETURN __declspec(noreturn)
 #else
 #define VPX_NO_RETURN
 #endif
@@ -117,14 +119,14 @@
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...) VPX_NO_RETURN;
-void fatal(const char *fmt, ...) VPX_NO_RETURN;
+VPX_NO_RETURN void die(const char *fmt, ...);
+VPX_NO_RETURN void fatal(const char *fmt, ...);
 void warn(const char *fmt, ...);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
+VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) VPX_NO_RETURN;
+VPX_NO_RETURN void usage_exit(void);
 
 #undef VPX_NO_RETURN
 
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -10,7 +10,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1
 #include <unistd.h>
 #endif
 #include "onyxd_int.h"
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -866,7 +866,6 @@
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
   vp8_writer *const w = cpi->bc;
 #endif
-  int savings = 0;
 
   vpx_clear_system_state();
 
@@ -940,8 +939,6 @@
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
             vp8_write_literal(w, newp, 8);
 #endif
-
-            savings += s;
           }
 
         } while (++t < ENTROPY_NODES);
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -183,7 +183,7 @@
 extern FILE *vpxlogc;
 #endif
 
-static void save_layer_context(VP8_COMP *cpi) {
+void vp8_save_layer_context(VP8_COMP *cpi) {
   LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
 
   /* Save layer dependent coding state */
@@ -222,7 +222,7 @@
          sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
-static void restore_layer_context(VP8_COMP *cpi, const int layer) {
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   /* Restore layer dependent coding state */
@@ -269,9 +269,9 @@
   return (int)(llval * llnum / llden);
 }
 
-static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int layer,
-                                        double prev_layer_framerate) {
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
@@ -301,9 +301,9 @@
   /* Work out the average size of a frame within this layer */
   if (layer > 0) {
     lc->avg_frame_size_for_layer =
-        (int)((cpi->oxcf.target_bitrate[layer] -
-               cpi->oxcf.target_bitrate[layer - 1]) *
-              1000 / (lc->framerate - prev_layer_framerate));
+        (int)round((cpi->oxcf.target_bitrate[layer] -
+                    cpi->oxcf.target_bitrate[layer - 1]) *
+                   1000 / (lc->framerate - prev_layer_framerate));
   }
 
   lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
@@ -336,12 +336,12 @@
   // We need this to set the layer context for the new layers below.
   if (prev_num_layers == 1) {
     cpi->current_layer = 0;
-    save_layer_context(cpi);
+    vp8_save_layer_context(cpi);
   }
   for (i = 0; i < curr_num_layers; ++i) {
     LAYER_CONTEXT *lc = &cpi->layer_context[i];
     if (i >= prev_num_layers) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
     }
     // The initial buffer levels are set based on their starting levels.
     // We could set the buffer levels based on the previous state (normalized
@@ -356,7 +356,7 @@
     // state (to smooth-out quality dips/rate fluctuation at transition)?
 
     // We need to treat the 1 layer case separately: oxcf.target_bitrate[i]
-    // is not set for 1 layer, and the restore_layer_context/save_context()
+    // is not set for 1 layer, and vp8_{restore,save}_layer_context()
     // are not called in the encoding loop, so we need to call it here to
     // pass the layer context state to |cpi|.
     if (curr_num_layers == 1) {
@@ -364,7 +364,7 @@
       lc->buffer_level =
           cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000;
       lc->bits_off_target = lc->buffer_level;
-      restore_layer_context(cpi, 0);
+      vp8_restore_layer_context(cpi, 0);
     }
     prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i];
   }
@@ -1274,7 +1274,7 @@
   cpi->framerate = framerate;
   cpi->output_framerate = framerate;
   cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+      (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate);
   cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
   cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
                                    cpi->oxcf.two_pass_vbrmin_section / 100);
@@ -1365,7 +1365,7 @@
     double prev_layer_framerate = 0;
 
     for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
       prev_layer_framerate =
           cpi->output_framerate / cpi->oxcf.rate_decimator[i];
     }
@@ -1382,7 +1382,7 @@
 #endif
 }
 
-static void update_layer_contexts(VP8_COMP *cpi) {
+void vp8_update_layer_contexts(VP8_COMP *cpi) {
   VP8_CONFIG *oxcf = &cpi->oxcf;
 
   /* Update snapshots of the layer contexts to reflect new parameters */
@@ -1417,8 +1417,8 @@
       /* Work out the average size of a frame within this layer */
       if (i > 0) {
         lc->avg_frame_size_for_layer =
-            (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
-                  1000 / (lc->framerate - prev_layer_framerate));
+            (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
+                       1000 / (lc->framerate - prev_layer_framerate));
       }
 
       prev_layer_framerate = lc->framerate;
@@ -1910,6 +1910,7 @@
 
   cpi->force_maxqp = 0;
   cpi->frames_since_last_drop_overshoot = 0;
+  cpi->rt_always_update_correction_factor = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -3260,7 +3261,7 @@
 #endif  // !CONFIG_REALTIME_ONLY
     default:
       cpi->per_frame_bandwidth =
-          (int)(cpi->target_bandwidth / cpi->output_framerate);
+          (int)round(cpi->target_bandwidth / cpi->output_framerate);
       break;
   }
 
@@ -3480,7 +3481,7 @@
    * Note that dropping a key frame can be problematic if spatial
    * resampling is also active
    */
-  if (cpi->decimation_factor > 0) {
+  if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) {
     switch (cpi->decimation_factor) {
       case 1:
         cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
@@ -4016,7 +4017,8 @@
     if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
     /* Are we are overshooting and up against the limit of active max Q. */
-    if (((cpi->pass != 2) ||
+    if (!cpi->rt_always_update_correction_factor &&
+        ((cpi->pass != 2) ||
          (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
         (Q == cpi->active_worst_quality) &&
         (cpi->active_worst_quality < cpi->worst_quality) &&
@@ -4514,10 +4516,10 @@
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
 
-  // If the frame dropper is not enabled, don't let the buffer level go below
-  // some threshold, given here by -|maximum_buffer_size|. For now we only do
-  // this for screen content input.
-  if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+  // Don't let the buffer level go below some threshold, given here
+  // by -|maximum_buffer_size|. For now we only do this for
+  // screen content input.
+  if (cpi->oxcf.screen_content_mode &&
       cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
     cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
   }
@@ -4552,8 +4554,8 @@
 
     for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
-      int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
-                                          cpi->projected_frame_size);
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi->projected_frame_size);
 
       lc->bits_off_target += bits_off_for_this_layer;
 
@@ -4919,6 +4921,8 @@
 
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
       last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+      // Cap this to avoid overflow of (this_duration - last_duration) * 10
+      this_duration = VPXMIN(this_duration, INT64_MAX / 10);
       /* do a step update if the duration changes by 10% */
       if (last_duration) {
         step = (int)(((this_duration - last_duration) * 10 / last_duration));
@@ -4988,7 +4992,7 @@
   if (cpi->oxcf.number_of_layers > 1) {
     int layer;
 
-    update_layer_contexts(cpi);
+    vp8_update_layer_contexts(cpi);
 
     /* Restore layer specific context & set frame rate */
     if (cpi->temporal_layer_id >= 0) {
@@ -4998,7 +5002,7 @@
           cpi->oxcf
               .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
     }
-    restore_layer_context(cpi, layer);
+    vp8_restore_layer_context(cpi, layer);
     vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
   }
 
@@ -5129,7 +5133,7 @@
   }
 
   /* Save layer specific state */
-  if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi);
+  if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi);
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@@ -5316,17 +5320,13 @@
     return -1;
   }
 
-  // Range check the delta Q values and convert the external Q range values
-  // to internal ones.
-  if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
-      (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) {
-    return -1;
-  }
-
-  // Range check the delta lf values
-  if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
-      (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) {
-    return -1;
+  for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range ||
+        delta_lf[i] < -range) {
+      return -1;
+    }
   }
 
   // Also disable segmentation if no deltas are specified.
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -702,6 +702,10 @@
   int use_roi_static_threshold;
 
   int ext_refresh_frame_flags_pending;
+
+  // Always update correction factor used for rate control after each frame for
+  // realtime encoding.
+  int rt_always_update_correction_factor;
 } VP8_COMP;
 
 void vp8_initialize_enc(void);
@@ -708,6 +712,12 @@
 
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate);
+void vp8_update_layer_contexts(VP8_COMP *cpi);
+void vp8_save_layer_context(VP8_COMP *cpi);
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -327,7 +327,8 @@
     int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
     /* Boost depends somewhat on frame rate: only used for 1 layer case. */
     if (cpi->oxcf.number_of_layers == 1) {
-      kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+      kf_boost =
+          VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16));
     } else {
       /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
       kf_boost = initial_boost;
@@ -349,8 +350,12 @@
   }
 
   if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate =
-        cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+    unsigned int max_rate;
+    // This product may overflow unsigned int
+    uint64_t product = cpi->per_frame_bandwidth;
+    product *= cpi->oxcf.rc_max_intra_bitrate_pct;
+    product /= 100;
+    max_rate = (unsigned int)VPXMIN(INT_MAX, product);
 
     if (target > max_rate) target = max_rate;
   }
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -152,8 +152,8 @@
   RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
 #endif
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
 
@@ -257,6 +257,23 @@
     ERROR("g_threads cannot be bigger than number of token partitions");
 #endif
 
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -378,6 +395,9 @@
 #endif
 
   oxcf->cpu_used = vp8_cfg.cpu_used;
+  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+    oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used);
+  }
   oxcf->encode_breakout = vp8_cfg.static_thresh;
   oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
   oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
@@ -585,6 +605,17 @@
   return update_extracfg(ctx, &extra_cfg);
 }
 
+/* Non-zero arg: clears cyclic refresh, sets always-update-correction flag. */
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args);
+  if (data) {
+    ctx->cpi->cyclic_refresh_mode_enabled = 0;
+    ctx->cpi->rt_always_update_correction_factor = 1;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
                                          void **mem_loc) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -1223,6 +1254,7 @@
   { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct },
   { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode },
   { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
   { -1, NULL },
 };
 
@@ -1256,7 +1288,7 @@
         VPX_VBR,     /* rc_end_usage */
         { NULL, 0 }, /* rc_twopass_stats_in */
         { NULL, 0 }, /* rc_firstpass_mb_stats_in */
-        256,         /* rc_target_bandwidth */
+        256,         /* rc_target_bitrate */
         4,           /* rc_min_quantizer */
         63,          /* rc_max_quantizer */
         100,         /* rc_undershoot_pct */
@@ -1278,14 +1310,30 @@
 
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
         { 0 },
-        { 0 }, /* ss_target_bitrate */
-        1,     /* ts_number_layers */
-        { 0 }, /* ts_target_bitrate */
-        { 0 }, /* ts_rate_decimator */
-        0,     /* ts_periodicity */
-        { 0 }, /* ts_layer_id */
-        { 0 }, /* layer_target_bitrate */
-        0      /* temporal_layering_mode */
+        { 0 },    /* ss_target_bitrate */
+        1,        /* ts_number_layers */
+        { 0 },    /* ts_target_bitrate */
+        { 0 },    /* ts_rate_decimator */
+        0,        /* ts_periodicity */
+        { 0 },    /* ts_layer_id */
+        { 0 },    /* layer_target_bitrate */
+        0,        /* temporal_layering_mode */
+        0,        /* use_vizier_rc_params */
+        { 1, 1 }, /* active_wq_factor */
+        { 1, 1 }, /* err_per_mb_factor */
+        { 1, 1 }, /* sr_default_decay_limit */
+        { 1, 1 }, /* sr_diff_factor */
+        { 1, 1 }, /* kf_err_per_mb_factor */
+        { 1, 1 }, /* kf_frame_min_boost_factor */
+        { 1, 1 }, /* kf_frame_max_boost_first_factor */
+        { 1, 1 }, /* kf_frame_max_boost_subs_factor */
+        { 1, 1 }, /* kf_max_total_boost_factor */
+        { 1, 1 }, /* gf_max_total_boost_factor */
+        { 1, 1 }, /* gf_frame_max_boost_factor */
+        { 1, 1 }, /* zm_factor */
+        { 1, 1 }, /* rd_mult_inter_qp_fac */
+        { 1, 1 }, /* rd_mult_arf_qp_fac */
+        { 1, 1 }, /* rd_mult_key_qp_fac */
     } },
 };
 
--- /dev/null
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -1,0 +1,347 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <new>
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vp8/encoder/ratectrl.h"
+#include "vpx_ports/system_state.h"
+
+namespace libvpx {
+/* Indexed by the config's min/max_quantizer; yields best/worst allowed q. */
+static const int kQTrans[] = {
+  0,  1,  2,  3,  4,  5,  7,   8,   9,   10,  12,  13,  15,  17,  18,  19,
+  20, 21, 23, 24, 25, 26, 27,  28,  29,  30,  31,  33,  35,  37,  39,  41,
+  43, 45, 47, 49, 51, 53, 55,  57,  59,  61,  64,  67,  70,  73,  76,  79,
+  82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
+  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,  10, 10,
+  10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16,
+  16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+  22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30
+};  // Key frames: active_best_quality, indexed by active_worst_quality.
+
+static const unsigned char inter_minq[QINDEX_RANGE] = {
+  0,  0,  1,  1,  2,  3,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9,  9,  10, 11,
+  11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24,
+  24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+  39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53,
+  54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+  70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86,
+  87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
+};  // Non-key frames: active_best_quality, indexed by active_worst_quality.
+
+// Computes val * num / denom using a 64-bit intermediate product.
+static int rescale(int val, int num, int denom) {
+  const int64_t wide_val = val;
+  const int64_t wide_product = wide_val * num;
+  const int64_t quotient = wide_product / denom;
+  return (int)quotient;
+}
+
+std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
+    const VP8RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP8RateControlRTC> rc_api(new (std::nothrow)
+                                                VP8RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP8_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp8_zero(*rc_api->cpi_);
+  // cpi_ starts out all-zero; InitRateControl() then applies cfg to it.
+  rc_api->InitRateControl(cfg);
+  // nullptr was returned above if either allocation failed.
+  return rc_api;
+}
+
+void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  oxcf->end_usage = USAGE_STREAM_FROM_SERVER;  // fixed defaults start here
+  cpi_->pass = 0;
+  cm->show_frame = 1;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  cpi_->auto_gold = 1;
+  cpi_->key_frame_count = 1;
+  cpi_->rate_correction_factor = 1.0;
+  cpi_->key_frame_rate_correction_factor = 1.0;
+  cpi_->cyclic_refresh_mode_enabled = 0;
+  cpi_->auto_worst_q = 1;
+  cpi_->kf_overspend_bits = 0;
+  cpi_->kf_bitrate_adjustment = 0;
+  cpi_->gf_overspend_bits = 0;
+  cpi_->non_gf_bitrate_adjustment = 0;
+  UpdateRateControl(rc_cfg);  // applies rc_cfg, incl. starting_buffer_level
+  cpi_->buffer_level = oxcf->starting_buffer_level;  // buffer starts here
+  cpi_->bits_off_target = oxcf->starting_buffer_level;
+}
+
+void VP8RateControlRTC::UpdateRateControl(
+    const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  vpx_clear_system_state();
+  cm->Width = rc_cfg.width;
+  cm->Height = rc_cfg.height;
+  oxcf->Width = rc_cfg.width;
+  oxcf->Height = rc_cfg.height;
+  oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer];  // index unchecked
+  oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer];   // index unchecked
+  cpi_->worst_quality = oxcf->worst_allowed_q;
+  cpi_->best_quality = oxcf->best_allowed_q;
+  cpi_->output_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth =
+      static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
+  cpi_->ref_framerate = cpi_->output_framerate;
+  oxcf->fixed_q = -1;
+  oxcf->error_resilient_mode = 1;
+  oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz;
+  oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;  // rescaled below
+  oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;   // rescaled below
+  oxcf->maximum_buffer_size = rc_cfg.buf_sz;            // rescaled below
+  oxcf->number_of_layers = rc_cfg.ts_number_layers;
+  cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) {
+    cpi_->prior_key_frame_distance[i] =
+        static_cast<int>(cpi_->output_framerate);
+  }
+  // Temporal layers: copy per-layer bitrates/decimators, then init each layer.
+  if (oxcf->number_of_layers > 1) {
+    memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
+           sizeof(rc_cfg.layer_target_bitrate));
+    memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
+           sizeof(rc_cfg.ts_rate_decimator));
+    oxcf->periodicity = 2;
+    // Each layer's init receives the previous layer's framerate (0 for layer 0).
+    double prev_layer_framerate = 0;
+    for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+      vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+      prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+    }
+  }
+
+  cpi_->total_actual_bits = 0;
+  cpi_->total_target_vs_actual = 0;
+  // Macroblock grid in 16-pixel units; >> 4 drops any partial edge block.
+  cm->mb_rows = cm->Height >> 4;
+  cm->mb_cols = cm->Width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+  // Scale the buffer levels (also stored above as *_in_ms) by bandwidth / 1000.
+  oxcf->starting_buffer_level =
+      rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+  /* Set or reset optimal and maximum buffer levels. */
+  if (oxcf->optimal_buffer_level == 0) {
+    oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->optimal_buffer_level =
+        rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000);
+  }
+  if (oxcf->maximum_buffer_size == 0) {
+    oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->maximum_buffer_size =
+        rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+  }
+  // Clamp the current buffer state to the (possibly new) maximum size.
+  if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
+    cpi_->bits_off_target = oxcf->maximum_buffer_size;
+    cpi_->buffer_level = cpi_->bits_off_target;
+  }
+
+  vp8_new_framerate(cpi_, cpi_->framerate);
+  vpx_clear_system_state();
+}
+
+void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  if (cpi_->oxcf.number_of_layers > 1) {
+    cpi_->temporal_layer_id = frame_params.temporal_layer_id;
+    const int layer = frame_params.temporal_layer_id;
+    vp8_update_layer_contexts(cpi_);
+    /* Restore layer specific context & set frame rate */
+    vp8_restore_layer_context(cpi_, layer);
+    vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
+  }
+  cm->frame_type = frame_params.frame_type;
+  cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
+    cpi_->common.frame_flags |= FRAMEFLAGS_KEY;
+  }
+  // Runs before this_frame_target and the active quality bounds are read below.
+  vp8_pick_frame_size(cpi_);
+  // Buffered mode with the buffer at/above optimal: lower active_worst_quality.
+  if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level &&
+      cpi_->buffered_mode) {
+    /* Max adjustment is 1/4 */
+    int Adjustment = cpi_->active_worst_quality / 4;
+    if (Adjustment) {
+      int buff_lvl_step;
+      if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) {
+        buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size -
+                               cpi_->oxcf.optimal_buffer_level) /
+                              Adjustment);
+        if (buff_lvl_step) {
+          Adjustment =
+              (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) /
+                    buff_lvl_step);
+        } else {
+          Adjustment = 0;
+        }
+      }
+      cpi_->active_worst_quality -= Adjustment;
+      if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+        cpi_->active_worst_quality = cpi_->active_best_quality;
+      }
+    }
+  }
+  // Once ni_frames exceeds 150, take active_best_quality from the minq tables.
+  if (cpi_->ni_frames > 150) {
+    int q = cpi_->active_worst_quality;
+    if (cm->frame_type == KEY_FRAME) {
+      cpi_->active_best_quality = kf_high_motion_minq[q];
+    } else {
+      cpi_->active_best_quality = inter_minq[q];
+    }
+
+    if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) {
+      cpi_->active_best_quality = cpi_->best_quality;
+
+    } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) {
+      int Fraction =
+          (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) /
+                (cpi_->oxcf.maximum_buffer_size -
+                 cpi_->oxcf.optimal_buffer_level));
+      int min_qadjustment =
+          ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128;
+
+      cpi_->active_best_quality -= min_qadjustment;
+    }
+  }
+
+  /* Clip the active best and worst quality values to limits */
+  if (cpi_->active_worst_quality > cpi_->worst_quality) {
+    cpi_->active_worst_quality = cpi_->worst_quality;
+  }
+  if (cpi_->active_best_quality < cpi_->best_quality) {
+    cpi_->active_best_quality = cpi_->best_quality;
+  }
+  if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+    cpi_->active_worst_quality = cpi_->active_best_quality;
+  }
+  // Store the regulated q in q_ (returned by GetQP()) and pass it on.
+  q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
+  vp8_set_quantizer(cpi_, q_);
+  vpx_clear_system_state();
+}
+
+int VP8RateControlRTC::GetQP() const { return q_; }  // q_ set by ComputeQP()
+
+void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  cpi_->total_byte_count += encoded_frame_size;
+  cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      cpi_->layer_context[i].total_byte_count += encoded_frame_size;
+    }
+  }
+  // projected_frame_size above is the frame size in bits (bytes << 3).
+  vp8_update_rate_correction_factors(cpi_, 2);
+
+  cpi_->last_q[cm->frame_type] = cm->base_qindex;
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_adjust_key_frame_context(cpi_);
+  }
+
+  /* Keep a record of ambient average Q. */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->avg_frame_qindex =
+        (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2;
+  }
+  /* Keep a record from which we can calculate the average Q excluding
+   * key frames.
+   */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->ni_frames++;
+    /* Damp value for first few frames */
+    if (cpi_->ni_frames > 150) {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames);
+    } else {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi =
+          ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2;
+    }
+
+    /* If the average Q is higher than what was used in the last
+     * frame (after going through the recode loop to keep the frame
+     * size within range) then use the last frame value - 1. The -1
+     * is designed to stop Q and hence the data rate, from
+     * progressively falling away during difficult sections, but at
+     * the same time reduce the number of iterations around the
+     * recode loop.
+     */
+    if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1;
+  }
+  // Add the per-frame budget and subtract the bits this frame used.
+  cpi_->bits_off_target +=
+      cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size;
+  if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) {
+    cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size;
+  }
+
+  cpi_->total_actual_bits += cpi_->projected_frame_size;
+  cpi_->buffer_level = cpi_->bits_off_target;
+
+  /* Propagate values to higher temporal layers */
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      LAYER_CONTEXT *lc = &cpi_->layer_context[i];
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size);
+
+      lc->bits_off_target += bits_off_for_this_layer;
+
+      /* Clip buffer level to maximum buffer size for the layer */
+      if (lc->bits_off_target > lc->maximum_buffer_size) {
+        lc->bits_off_target = lc->maximum_buffer_size;
+      }
+
+      lc->total_actual_bits += cpi_->projected_frame_size;
+      lc->total_target_vs_actual += bits_off_for_this_layer;
+      lc->buffer_level = lc->bits_off_target;
+    }
+  }
+
+  cpi_->common.current_video_frame++;
+  cpi_->frames_since_key++;
+
+  if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+  vpx_clear_system_state();
+}
+}  // namespace libvpx
--- /dev/null
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -1,0 +1,63 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_RATECTRL_RTC_H_
+#define VPX_VP8_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/common/common.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+namespace libvpx {
+struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP8RateControlRtcConfig() {
+    vp8_zero(layer_target_bitrate);  // both arrays start out all-zero
+    vp8_zero(ts_rate_decimator);
+  }
+};
+
+struct VP8FrameParamsQpRTC {
+  FRAME_TYPE frame_type;  // copied to cm->frame_type by ComputeQP()
+  int temporal_layer_id;  // read only when number_of_layers > 1
+};
+
+class VP8RateControlRTC {  // VP8 rate control object; owns and frees cpi_.
+ public:
+  static std::unique_ptr<VP8RateControlRTC> Create(
+      const VP8RateControlRtcConfig &cfg);
+  ~VP8RateControlRTC() {
+    if (cpi_) {
+      vpx_free(cpi_->gf_active_flags);
+      vpx_free(cpi_);
+    }
+  }
+
+  void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // Computes the QP for the frame described by frame_params.
+  void ComputeQP(const VP8FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size (in bytes) of the encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  VP8RateControlRTC() : cpi_(nullptr), q_(0) {}  // no indeterminate members
+  void InitRateControl(const VP8RateControlRtcConfig &cfg);
+  VP8_COMP *cpi_;
+  int q_;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP8_RATECTRL_RTC_H_
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,6 +21,9 @@
 
 // Compute the sum of all pixel differences of this MB.
 static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if defined(__aarch64__)
+  return vaddlvq_s8(v_sum_diff_total);
+#else
   const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
   const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
   const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
@@ -28,6 +31,7 @@
                                 vget_low_s64(fedcba98_76543210));
   const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
   return sum_diff;
+#endif
 }
 
 // Denoise a 16x1 vector.
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -48,6 +48,7 @@
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
   cr->counter_encode_maxq_scene_change = 0;
+  cr->content_mode = 1;
   return cr;
 }
 
@@ -326,7 +327,8 @@
   else
     rc->baseline_gf_interval = 40;
   if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
-  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 &&
+      cr->content_mode)
     rc->baseline_gf_interval = 10;
 }
 
@@ -388,7 +390,8 @@
           ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
           : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
   // More aggressive settings for noisy content.
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+      cr->content_mode) {
     consec_zero_mv_thresh = 60;
     qindex_thresh =
         VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
@@ -409,7 +412,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cpi->common.use_highbitdepth) compute_content = 0;
 #endif
-    if (cpi->Last_Source == NULL ||
+    if (cr->content_mode == 0 || cpi->Last_Source == NULL ||
         cpi->Last_Source->y_width != cpi->Source->y_width ||
         cpi->Last_Source->y_height != cpi->Source->y_height)
       compute_content = 0;
@@ -430,7 +433,8 @@
         // reset to 0 later depending on the coding mode.
         if (cr->map[bl_index2] == 0) {
           count_tot++;
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+          if (cr->content_mode == 0 ||
+              cr->last_coded_q_map[bl_index2] > qindex_thresh ||
               cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
             sum_map++;
             count_sel++;
@@ -489,7 +493,8 @@
       rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
       (cpi->use_svc &&
        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
-      (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
+      (!cpi->use_svc && cr->content_mode &&
+       rc->avg_frame_low_motion < thresh_low_motion &&
        rc->frames_since_key > 40) ||
       (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
        rc->frames_since_key > 20)) {
@@ -511,7 +516,8 @@
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
-    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    if (cr->content_mode && cpi->noise_estimate.enabled &&
+        cpi->noise_estimate.level >= kMedium) {
       // Reduce the delta-qp if the estimated source noise is above threshold.
       cr->rate_ratio_qdelta = 1.7;
       cr->rate_boost_fac = 13;
@@ -528,7 +534,7 @@
     cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10;
     // Increase the amount of refresh on scene change that is encoded at max Q,
     // increase for a few cycles of the refresh period (~100 / percent_refresh).
-    if (cr->counter_encode_maxq_scene_change < 30)
+    if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30)
       cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15;
     cr->rate_ratio_qdelta = 2.0;
     cr->rate_boost_fac = 10;
@@ -575,6 +581,12 @@
         (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
         num8x8bl;
   cr->weight_segment = weight_segment;
+  if (cr->content_mode == 0) {
+    cr->actual_num_seg1_blocks =
+        cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+    cr->actual_num_seg2_blocks = 0;
+    cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -70,6 +70,7 @@
   int apply_cyclic_refresh;
   int counter_encode_maxq_scene_change;
   int skip_flat_static_blocks;
+  int content_mode;
 };
 
 struct VP9_COMP;
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -157,6 +157,9 @@
   // skip forward transform and quantization
   uint8_t skip_txfm[MAX_MB_PLANE << 2];
 #define SKIP_TXFM_NONE 0
+// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely
+// since it increases risks of bad perceptual quality.
+// https://crbug.com/webm/1729
 #define SKIP_TXFM_AC_DC 1
 #define SKIP_TXFM_AC_ONLY 2
 
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -159,37 +159,6 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if !CONFIG_REALTIME_ONLY
-static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
-                                                   const struct buf_2d *ref,
-                                                   int mi_row, int mi_col,
-                                                   BLOCK_SIZE bs) {
-  unsigned int sse, var;
-  uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-  assert(last != NULL);
-  last_y =
-      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
-  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
-}
-
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
-                                                   int mi_row, int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(
-      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
-  if (var < 8)
-    return BLOCK_64X64;
-  else if (var < 128)
-    return BLOCK_32X32;
-  else if (var < 2048)
-    return BLOCK_16X16;
-  else
-    return BLOCK_8X8;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
 static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
                               int mi_col, BLOCK_SIZE bsize, int segment_index) {
   VP9_COMMON *const cm = &cpi->common;
@@ -815,8 +784,8 @@
 
 // Check if most of the superblock is skin content, and if so, force split to
 // 32x32, and set x->sb_is_skin for use in mode selection.
-static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
-                         int mi_row, int mi_col, int *force_split) {
+static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row,
+                         int mi_col, int *force_split) {
   VP9_COMMON *const cm = &cpi->common;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) return 0;
@@ -828,11 +797,6 @@
                    mi_row + 8 < cm->mi_rows)) {
     int num_16x16_skin = 0;
     int num_16x16_nonskin = 0;
-    uint8_t *ysignal = x->plane[0].src.buf;
-    uint8_t *usignal = x->plane[1].src.buf;
-    uint8_t *vsignal = x->plane[2].src.buf;
-    int sp = x->plane[0].src.stride;
-    int spuv = x->plane[1].src.stride;
     const int block_index = mi_row * cm->mi_cols + mi_col;
     const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
     const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
@@ -851,13 +815,7 @@
           i = ymis;
           break;
         }
-        ysignal += 16;
-        usignal += 8;
-        vsignal += 8;
       }
-      ysignal += (sp << 4) - 64;
-      usignal += (spuv << 3) - 32;
-      vsignal += (spuv << 3) - 32;
     }
     if (num_16x16_skin > 12) {
       *force_split = 1;
@@ -1534,8 +1492,7 @@
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     if (cpi->use_skin_detection)
-      x->sb_is_skin =
-          skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+      x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split);
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
@@ -1842,7 +1799,8 @@
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     }
@@ -2539,7 +2497,8 @@
 
   if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) {
     // Setting segmentation map for cyclic_refresh.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     } else {
@@ -3119,54 +3078,6 @@
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
-#if CONFIG_FP_MB_STATS
-const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        1, 2, 2, 2, 4, 4 };
-const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        2, 1, 2, 4, 2, 4 };
-const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0,   10,  10, 30, 40,
-                                                        40,  60,  80, 80, 90,
-                                                        100, 100, 120 };
-const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0,  3,  3,  7,  15,
-                                                         15, 30, 40, 40, 60,
-                                                         80, 80, 120 };
-const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1,
-                                                             1, 1, 1, 1, 1,
-                                                             4, 4, 6 };
-
-typedef enum {
-  MV_ZERO = 0,
-  MV_LEFT = 1,
-  MV_UP = 2,
-  MV_RIGHT = 3,
-  MV_DOWN = 4,
-  MV_INVALID
-} MOTION_DIRECTION;
-
-static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
-  if (fp_byte & FPMB_MOTION_ZERO_MASK) {
-    return MV_ZERO;
-  } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
-    return MV_LEFT;
-  } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
-    return MV_RIGHT;
-  } else if (fp_byte & FPMB_MOTION_UP_MASK) {
-    return MV_UP;
-  } else {
-    return MV_DOWN;
-  }
-}
-
-static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
-                                           MOTION_DIRECTION that_mv) {
-  if (this_mv == that_mv) {
-    return 0;
-  } else {
-    return abs(this_mv - that_mv) == 2 ? 2 : 1;
-  }
-}
-#endif
-
 // Calculate prediction based on the given input features and neural net config.
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
@@ -4064,11 +3975,6 @@
   BLOCK_SIZE min_size = x->min_partition_size;
   BLOCK_SIZE max_size = x->max_partition_size;
 
-#if CONFIG_FP_MB_STATS
-  unsigned int src_diff_var = UINT_MAX;
-  int none_complexity = 0;
-#endif
-
   int partition_none_allowed = !force_horz_split && !force_vert_split;
   int partition_horz_allowed =
       !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
@@ -4155,65 +4061,6 @@
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
-                                                  mi_col, bsize);
-  }
-#endif
-
-#if CONFIG_FP_MB_STATS
-  // Decide whether we shall split directly and skip searching NONE by using
-  // the first pass block statistics
-  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
-      partition_none_allowed && src_diff_var > 4 &&
-      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
-    int mb_row = mi_row >> 1;
-    int mb_col = mi_col >> 1;
-    int mb_row_end =
-        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-    int mb_col_end =
-        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-    int r, c;
-
-    // compute a complexity measure, basically measure inconsistency of motion
-    // vectors obtained from the first pass in the current block
-    for (r = mb_row; r < mb_row_end; r++) {
-      for (c = mb_col; c < mb_col_end; c++) {
-        const int mb_index = r * cm->mb_cols + c;
-
-        MOTION_DIRECTION this_mv;
-        MOTION_DIRECTION right_mv;
-        MOTION_DIRECTION bottom_mv;
-
-        this_mv =
-            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
-
-        // to its right
-        if (c != mb_col_end - 1) {
-          right_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
-          none_complexity += get_motion_inconsistency(this_mv, right_mv);
-        }
-
-        // to its bottom
-        if (r != mb_row_end - 1) {
-          bottom_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
-          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
-        }
-
-        // do not count its left and top neighbors to avoid double counting
-      }
-    }
-
-    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
-      partition_none_allowed = 0;
-    }
-  }
-#endif
-
   pc_tree->partitioning = PARTITION_NONE;
 
   if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) {
@@ -4291,53 +4138,6 @@
             }
           }
         }
-
-#if CONFIG_FP_MB_STATS
-        // Check if every 16x16 first pass block statistics has zero
-        // motion and the corresponding first pass residue is small enough.
-        // If that is the case, check the difference variance between the
-        // current frame and the last frame. If the variance is small enough,
-        // stop further splitting in RD optimization
-        if (cpi->use_fp_mb_stats && do_split != 0 &&
-            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
-          int mb_row = mi_row >> 1;
-          int mb_col = mi_col >> 1;
-          int mb_row_end =
-              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-          int mb_col_end =
-              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-          int r, c;
-
-          int skip = 1;
-          for (r = mb_row; r < mb_row_end; r++) {
-            for (c = mb_col; c < mb_col_end; c++) {
-              const int mb_index = r * cm->mb_cols + c;
-              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_MOTION_ZERO_MASK) ||
-                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_ERROR_SMALL_MASK)) {
-                skip = 0;
-                break;
-              }
-            }
-            if (skip == 0) {
-              break;
-            }
-          }
-
-          if (skip) {
-            if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-              src_diff_var = get_sby_perpixel_diff_variance(
-                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
-            }
-            if (src_diff_var < 8) {
-              do_split = 0;
-              do_rect = 0;
-            }
-          }
-        }
-#endif
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -4603,15 +4403,18 @@
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
 #if CONFIG_RATE_CTRL
-    // Store partition, motion vector of the superblock.
-    if (output_enabled) {
-      const int num_unit_rows = get_num_unit_4x4(cpi->frame_info.frame_height);
-      const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width);
-      store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride,
-                            num_4x4_blocks_wide_lookup[BLOCK_64X64],
-                            num_unit_rows, num_unit_cols, mi_row << 1,
-                            mi_col << 1, cpi->partition_info,
-                            cpi->motion_vector_info);
+    if (oxcf->use_simple_encode_api) {
+      // Store partition, motion vector of the superblock.
+      if (output_enabled) {
+        const int num_unit_rows =
+            get_num_unit_4x4(cpi->frame_info.frame_height);
+        const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width);
+        store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride,
+                              num_4x4_blocks_wide_lookup[BLOCK_64X64],
+                              num_unit_rows, num_unit_cols, mi_row << 1,
+                              mi_col << 1, cpi->partition_info,
+                              cpi->motion_vector_info);
+      }
     }
 #endif  // CONFIG_RATE_CTRL
   }
@@ -4700,13 +4503,6 @@
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
-    } else if (cpi->partition_search_skippable_frame) {
-      BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-      bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
@@ -5981,9 +5777,14 @@
         for (i = 0; i < BLOCK_SIZES; ++i) {
           for (j = 0; j < MAX_MODES; ++j) {
             tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+#if CONFIG_RATE_CTRL
+            if (cpi->oxcf.use_simple_encode_api) {
+              tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
+            }
+#endif  // CONFIG_RATE_CTRL
+#if CONFIG_CONSISTENT_RECODE
             tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+#endif  // CONFIG_CONSISTENT_RECODE
             tile_data->mode_map[i][j] = j;
           }
         }
@@ -6072,20 +5873,6 @@
       vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
 }
 
-#if CONFIG_FP_MB_STATS
-static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
-                            VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
-  uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
-                         cm->current_video_frame * cm->MBs * sizeof(uint8_t);
-
-  if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
-
-  *this_frame_mb_stats = mb_stats_in;
-
-  return 1;
-}
-#endif
-
 static int compare_kmeans_data(const void *a, const void *b) {
   if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) {
     return 1;
@@ -6292,13 +6079,6 @@
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
 
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
-                       &cpi->twopass.this_frame_mb_stats);
-    }
-#endif
-
     if (!cpi->row_mt) {
       cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
       cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
@@ -6406,7 +6186,12 @@
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+#if CONFIG_RATE_CTRL
+  if (cpi->oxcf.use_simple_encode_api) {
+    restore_encode_params(cpi);
+  }
+#endif  // CONFIG_RATE_CTRL
+#if CONFIG_CONSISTENT_RECODE
   restore_encode_params(cpi);
 #endif
 
@@ -6703,7 +6488,8 @@
 
     ++td->counts->tx.tx_totals[mi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
-    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode)
       vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
     if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
         (!cpi->use_svc ||
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -654,10 +654,15 @@
 }
 
 static int check_seg_range(int seg_data[8], int range) {
-  return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
-           abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
-           abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
-           abs(seg_data[6]) > range || abs(seg_data[7]) > range);
+  int i;
+  for (i = 0; i < 8; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (seg_data[i] > range || seg_data[i] < -range) {
+      return 0;
+    }
+  }
+  return 1;
 }
 
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
@@ -1022,10 +1027,12 @@
   cpi->mi_ssim_rdmult_scaling_factors = NULL;
 
 #if CONFIG_RATE_CTRL
-  free_partition_info(cpi);
-  free_motion_vector_info(cpi);
-  free_fp_motion_vector_info(cpi);
-  free_tpl_stats_info(cpi);
+  if (cpi->oxcf.use_simple_encode_api) {
+    free_partition_info(cpi);
+    free_motion_vector_info(cpi);
+    free_fp_motion_vector_info(cpi);
+    free_tpl_stats_info(cpi);
+  }
 #endif
 
   vp9_free_ref_frame_buffers(cm->buffer_pool);
@@ -2302,6 +2309,7 @@
       cm, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
+  cpi->compute_frame_low_motion_onepass = 1;
   cpi->use_svc = 0;
   cpi->resize_state = ORIG;
   cpi->external_resize = 0;
@@ -2317,9 +2325,9 @@
   cpi->frame_info = vp9_get_frame_info(oxcf);
 
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+  vp9_init_rd_parameters(cpi);
 
   init_frame_indexes(cm);
-  cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
 
   realloc_segmentation_maps(cpi);
@@ -2360,17 +2368,6 @@
         vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
-#if CONFIG_FP_MB_STATS
-  cpi->use_fp_mb_stats = 0;
-  if (cpi->use_fp_mb_stats) {
-    // a place holder used to store the first pass mb stats in the first pass
-    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
-                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
-  } else {
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
-
   cpi->refresh_alt_ref_frame = 0;
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 
@@ -2464,7 +2461,12 @@
 
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
-  vp9_extrc_init(&cpi->ext_ratectrl);
+  {
+    vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed");
+    }
+  }
 
 #if !CONFIG_REALTIME_ONLY
   if (oxcf->pass == 1) {
@@ -2518,19 +2520,7 @@
       vp9_init_second_pass_spatial_svc(cpi);
     } else {
       int num_frames;
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        const size_t psz = cpi->common.MBs * sizeof(uint8_t);
-        const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
 
-        cpi->twopass.firstpass_mb_stats.mb_stats_start =
-            oxcf->firstpass_mb_stats_in.buf;
-        cpi->twopass.firstpass_mb_stats.mb_stats_end =
-            cpi->twopass.firstpass_mb_stats.mb_stats_start +
-            (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
-      }
-#endif
-
       cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
       cpi->twopass.stats_in = cpi->twopass.stats_in_start;
       cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
@@ -2663,10 +2653,12 @@
 
 #if CONFIG_RATE_CTRL
   encode_command_init(&cpi->encode_command);
-  partition_info_init(cpi);
-  motion_vector_info_init(cpi);
-  fp_motion_vector_info_init(cpi);
-  tpl_stats_info_init(cpi);
+  if (oxcf->use_simple_encode_api) {
+    partition_info_init(cpi);
+    motion_vector_info_init(cpi);
+    fp_motion_vector_info_init(cpi);
+    tpl_stats_info_init(cpi);
+  }
 #endif
 
   return cpi;
@@ -2831,13 +2823,6 @@
     vpx_free(cpi->mbgraph_stats[i].mb_stats);
   }
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vpx_free(cpi->twopass.frame_mb_stats_buf);
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
-
   vp9_extrc_delete(&cpi->ext_ratectrl);
 
   vp9_remove_common(cm);
@@ -3698,6 +3683,10 @@
     cpi->rc.force_max_q = 0;
   }
 
+  if (cpi->use_svc) {
+    cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q;
+  }
+
   if (!frame_is_intra_only(cm)) {
     vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
   }
@@ -4198,7 +4187,7 @@
 
   // Update some stats from cyclic refresh, and check for golden frame update.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-      !frame_is_intra_only(cm))
+      !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode)
     vp9_cyclic_refresh_postencode(cpi);
 
   // Update the skip mb flag probabilities based on the distribution
@@ -4392,11 +4381,24 @@
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
+  int last_q_attempt = 0;
   int enable_acl;
 #ifdef AGGRESSIVE_VBR
   int qrange_adj = 1;
 #endif
 
+  // A flag which indicates whether we are recoding the current frame
+  // when the current frame size is larger than the max frame size in the
+  // external rate control model.
+  // This flag doesn't have any impact when external rate control is not used.
+  int ext_rc_recode = 0;
+  // Maximal frame size allowed by the external rate control.
+  // case: 0, we ignore the max frame size limit, and encode with the qindex
+  // passed in by the external rate control model.
+  // case: -1, we take VP9's decision for the max frame size.
+  int ext_rc_max_frame_size = 0;
+  const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
+
 #if CONFIG_RATE_CTRL
   const FRAME_UPDATE_TYPE update_type =
       cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
@@ -4451,11 +4453,6 @@
       loop_at_this_size = 0;
     }
 
-#if CONFIG_RATE_CTRL
-    if (cpi->encode_command.use_external_target_frame_bits) {
-      q = rq_model_predict_q_index(rq_model, rq_history, rc->this_frame_target);
-    }
-#endif  // CONFIG_RATE_CTRL
     // Decide frame size bounds first time through.
     if (loop_count == 0) {
       vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
@@ -4498,22 +4495,36 @@
 #if CONFIG_RATE_CTRL
     // TODO(angiebird): This is a hack for making sure the encoder use the
     // external_quantize_index exactly. Avoid this kind of hack later.
-    if (cpi->encode_command.use_external_quantize_index) {
-      q = cpi->encode_command.external_quantize_index;
+    if (cpi->oxcf.use_simple_encode_api) {
+      if (cpi->encode_command.use_external_target_frame_bits) {
+        q = rq_model_predict_q_index(rq_model, rq_history,
+                                     rc->this_frame_target);
+      }
+      if (cpi->encode_command.use_external_quantize_index) {
+        q = cpi->encode_command.external_quantize_index;
+      }
     }
-#endif
-    if (cpi->ext_ratectrl.ready) {
+#endif  // CONFIG_RATE_CTRL
+    if (cpi->ext_ratectrl.ready && !ext_rc_recode) {
+      vpx_codec_err_t codec_status;
       const GF_GROUP *gf_group = &cpi->twopass.gf_group;
       vpx_rc_encodeframe_decision_t encode_frame_decision;
       FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
       const int ref_frame_flags = get_ref_frame_flags(cpi);
       RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+      const RefCntBuffer *curr_frame_buf =
+          get_ref_cnt_buffer(cm, cm->new_fb_idx);
       get_ref_frame_bufs(cpi, ref_frame_bufs);
-      vp9_extrc_get_encodeframe_decision(
-          &cpi->ext_ratectrl, cm->current_video_frame,
+      codec_status = vp9_extrc_get_encodeframe_decision(
+          &cpi->ext_ratectrl, curr_frame_buf->frame_index,
           cm->current_frame_coding_index, gf_group->index, update_type,
           ref_frame_bufs, ref_frame_flags, &encode_frame_decision);
+      if (codec_status != VPX_CODEC_OK) {
+        vpx_internal_error(&cm->error, codec_status,
+                           "vp9_extrc_get_encodeframe_decision() failed");
+      }
       q = encode_frame_decision.q_index;
+      ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
     }
 
     vp9_set_quantizer(cpi, q);
@@ -4555,36 +4566,61 @@
     }
 
     if (cpi->ext_ratectrl.ready) {
-      break;
+      last_q_attempt = q;
+      // In general, for the external rate control, we take the qindex provided
+      // as input and encode the frame with this qindex faithfully. However,
+      // in some extreme scenarios, the provided qindex leads to a massive
+      // overshoot of frame size. In this case, we fall back to VP9's decision
+      // to pick a new qindex and recode the frame. We return the new qindex
+      // through the API to the external model.
+      if (ext_rc_max_frame_size == 0) {
+        break;
+      } else if (ext_rc_max_frame_size == -1) {
+        if (rc->projected_frame_size < rc->max_frame_bandwidth) {
+          break;
+        }
+      } else {
+        if (rc->projected_frame_size < ext_rc_max_frame_size) {
+          break;
+        }
+      }
+      rc->max_frame_bandwidth = ext_rc_max_frame_size;
+      // If the current frame size exceeds the ext_rc_max_frame_size,
+      // we adjust the worst qindex to meet the frame size constraint.
+      q_high = 255;
+      ext_rc_recode = 1;
     }
 #if CONFIG_RATE_CTRL
-    // This part needs to be after save_coding_context() because
-    // restore_coding_context will be called in the end of this function.
-    // TODO(angiebird): This is a hack for making sure the encoder use the
-    // external_quantize_index exactly. Avoid this kind of hack later.
-    if (cpi->encode_command.use_external_quantize_index) {
-      break;
-    }
+    if (cpi->oxcf.use_simple_encode_api) {
+      // This part needs to be after save_coding_context() because
+      // restore_coding_context will be called in the end of this function.
+      // TODO(angiebird): This is a hack for making sure the encoder use the
+      // external_quantize_index exactly. Avoid this kind of hack later.
+      if (cpi->encode_command.use_external_quantize_index) {
+        break;
+      }
 
-    if (cpi->encode_command.use_external_target_frame_bits) {
-      const double percent_diff = get_bits_percent_diff(
-          rc->this_frame_target, rc->projected_frame_size);
-      update_rq_history(rq_history, rc->this_frame_target,
-                        rc->projected_frame_size, q);
-      loop_count += 1;
+      if (cpi->encode_command.use_external_target_frame_bits) {
+        const double percent_diff = get_bits_percent_diff(
+            rc->this_frame_target, rc->projected_frame_size);
+        update_rq_history(rq_history, rc->this_frame_target,
+                          rc->projected_frame_size, q);
+        loop_count += 1;
 
-      rq_model_update(rq_history, rc->this_frame_target, rq_model);
+        rq_model_update(rq_history, rc->this_frame_target, rq_model);
 
-      // Check if we hit the target bitrate.
-      if (percent_diff <= cpi->encode_command.target_frame_bits_error_percent ||
-          rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM ||
-          rq_history->q_index_low >= rq_history->q_index_high) {
-        break;
-      }
+        // Check if we hit the target bitrate.
+        if (percent_diff <=
+                cpi->encode_command.target_frame_bits_error_percent ||
+            rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM ||
+            rq_history->q_index_low >= rq_history->q_index_high) {
+          break;
+        }
 
-      loop = 1;
-      restore_coding_context(cpi);
-      continue;
+        loop = 1;
+        restore_coding_context(cpi);
+        continue;
+      }
     }
 #endif  // CONFIG_RATE_CTRL
 
@@ -4754,6 +4790,23 @@
         rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
+    // Special handling of external max frame size constraint
+    if (ext_rc_recode) {
+      // If the largest q is not able to meet the max frame size limit,
+      // do nothing.
+      if (rc->projected_frame_size > ext_rc_max_frame_size &&
+          last_q_attempt == 255) {
+        break;
+      }
+      // If VP9's q selection leads to a smaller q, we force it to use
+      // a larger q to better approximate the external max frame size
+      // constraint.
+      if (rc->projected_frame_size > ext_rc_max_frame_size &&
+          q <= last_q_attempt) {
+        q = VPXMIN(255, last_q_attempt + 1);
+      }
+    }
+
     if (loop) {
       ++loop_count;
       ++loop_at_this_size;
@@ -4767,6 +4820,8 @@
       if (loop) restore_coding_context(cpi);
   } while (loop);
 
+  rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth;
+
 #ifdef AGGRESSIVE_VBR
   if (two_pass_first_group_inter(cpi)) {
     cpi->twopass.active_worst_quality =
@@ -5300,17 +5355,81 @@
 }
 
 #if !CONFIG_REALTIME_ONLY
-static void update_encode_frame_result(
+static void update_encode_frame_result_basic(
+    FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index,
+    ENCODE_FRAME_RESULT *encode_frame_result) {
+  encode_frame_result->show_idx = show_idx;
+  encode_frame_result->update_type = update_type;
+  encode_frame_result->quantize_index = quantize_index;
+}
+
+#if CONFIG_RATE_CTRL
+static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer,
+                                        IMAGE_BUFFER *image_buffer) {
+  const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer,
+                                   yv12_buffer->v_buffer };
+  const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride,
+                                 yv12_buffer->uv_stride };
+  const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width,
+                        yv12_buffer->uv_crop_width };
+  const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height,
+                        yv12_buffer->uv_crop_height };
+  int plane;
+  for (plane = 0; plane < 3; ++plane) {
+    const int src_stride = src_stride_ls[plane];
+    const int w = w_ls[plane];
+    const int h = h_ls[plane];
+    const uint8_t *src_buf = src_buf_ls[plane];
+    uint8_t *dst_buf = image_buffer->plane_buffer[plane];
+    int r;
+    assert(image_buffer->plane_width[plane] == w);
+    assert(image_buffer->plane_height[plane] == h);
+    for (r = 0; r < h; ++r) {
+      memcpy(dst_buf, src_buf, sizeof(*src_buf) * w);
+      src_buf += src_stride;
+      dst_buf += w;
+    }
+  }
+}
+// This function will update extra information specific for simple_encode APIs
+static void update_encode_frame_result_simple_encode(
     int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
     const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
-    RefCntBuffer *ref_frame_buf[MAX_INTER_REF_FRAMES], int quantize_index,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index,
     uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
-#if CONFIG_RATE_CTRL
     const PARTITION_INFO *partition_info,
     const MOTION_VECTOR_INFO *motion_vector_info,
     const TplDepStats *tpl_stats_info,
+    ENCODE_FRAME_RESULT *encode_frame_result) {
+  PSNR_STATS psnr;
+  update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+                                   quantize_index, encode_frame_result);
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth,
+                       input_bit_depth);
+#else   // CONFIG_VP9_HIGHBITDEPTH
+  (void)bit_depth;
+  (void)input_bit_depth;
+  vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index;
+
+  vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+                         encode_frame_result->ref_frame_coding_indexes,
+                         encode_frame_result->ref_frame_valid_list);
+
+  encode_frame_result->psnr = psnr.psnr[0];
+  encode_frame_result->sse = psnr.sse[0];
+  encode_frame_result->frame_counts = *counts;
+  encode_frame_result->partition_info = partition_info;
+  encode_frame_result->motion_vector_info = motion_vector_info;
+  encode_frame_result->tpl_stats_info = tpl_stats_info;
+  if (encode_frame_result->coded_frame.allocated) {
+    yv12_buffer_to_image_buffer(&coded_frame_buf->buf,
+                                &encode_frame_result->coded_frame);
+  }
+}
 #endif  // CONFIG_RATE_CTRL
-    ENCODE_FRAME_RESULT *encode_frame_result);
 #endif  // !CONFIG_REALTIME_ONLY
 
 static void encode_frame_to_data_rate(
@@ -5405,10 +5524,14 @@
   memset(cpi->mode_chosen_counts, 0,
          MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+#if CONFIG_CONSISTENT_RECODE
   // Backup to ensure consistency between recodes
   save_encode_params(cpi);
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+#elif CONFIG_RATE_CTRL
+  if (cpi->oxcf.use_simple_encode_api) {
+    save_encode_params(cpi);
+  }
+#endif
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
     if (!encode_without_recode_loop(cpi, size, dest)) return;
@@ -5487,9 +5610,13 @@
   {
     const RefCntBuffer *coded_frame_buf =
         get_ref_cnt_buffer(cm, cm->new_fb_idx);
-    vp9_extrc_update_encodeframe_result(
+    vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
         &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf,
-        cm->bit_depth, cpi->oxcf.input_bit_depth);
+        cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_update_encodeframe_result() failed");
+    }
   }
 #if CONFIG_REALTIME_ONLY
   (void)encode_frame_result;
@@ -5496,10 +5623,12 @@
   assert(encode_frame_result == NULL);
 #else  // CONFIG_REALTIME_ONLY
   if (encode_frame_result != NULL) {
-    const int ref_frame_flags = get_ref_frame_flags(cpi);
     const RefCntBuffer *coded_frame_buf =
         get_ref_cnt_buffer(cm, cm->new_fb_idx);
     RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+    FRAME_UPDATE_TYPE update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    int quantize_index = vp9_get_quantizer(cpi);
     get_ref_frame_bufs(cpi, ref_frame_bufs);
     // update_encode_frame_result() depends on twopass.gf_group.index and
     // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and
@@ -5517,15 +5646,21 @@
     // This function needs to be called before vp9_update_reference_frames().
     // TODO(angiebird): Improve the codebase to make the update of frame
     // dependent variables more robust.
-    update_encode_frame_result(
-        ref_frame_flags,
-        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
-        cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi),
-        cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts,
+
+    update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+                                     quantize_index, encode_frame_result);
 #if CONFIG_RATE_CTRL
-        cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info,
+    if (cpi->oxcf.use_simple_encode_api) {
+      const int ref_frame_flags = get_ref_frame_flags(cpi);
+      update_encode_frame_result_simple_encode(
+          ref_frame_flags,
+          cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+          cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index,
+          cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts,
+          cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info,
+          encode_frame_result);
+    }
 #endif  // CONFIG_RATE_CTRL
-        encode_frame_result);
   }
 #endif  // CONFIG_REALTIME_ONLY
 
@@ -5591,7 +5726,8 @@
 
   vp9_rc_postencode_update(cpi, *size);
 
-  if (oxcf->pass == 0 && !frame_is_intra_only(cm) &&
+  if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 &&
+      !frame_is_intra_only(cm) &&
       (!cpi->use_svc ||
        (cpi->use_svc &&
         !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -5680,8 +5816,13 @@
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
   if (cpi->common.current_frame_coding_index == 0) {
-    vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl,
-                                   &cpi->twopass.first_pass_info);
+    VP9_COMMON *cm = &cpi->common;
+    const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+        &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_send_firstpass_stats() failed");
+    }
   }
 #if CONFIG_MISMATCH_DEBUG
   mismatch_move_frame_idx_w();
@@ -7440,7 +7581,9 @@
 #endif  // CONFIG_NON_GREEDY_MV
 
 #if CONFIG_RATE_CTRL
-  accumulate_frame_tpl_stats(cpi);
+  if (cpi->oxcf.use_simple_encode_api) {
+    accumulate_frame_tpl_stats(cpi);
+  }
 #endif  // CONFIG_RATE_CTRL
 }
 
@@ -7468,206 +7611,6 @@
   }
 }
 
-#if !CONFIG_REALTIME_ONLY
-#if CONFIG_RATE_CTRL
-static void copy_frame_counts(const FRAME_COUNTS *input_counts,
-                              FRAME_COUNTS *output_counts) {
-  int i, j, k, l, m, n;
-  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
-    for (j = 0; j < INTRA_MODES; ++j) {
-      output_counts->y_mode[i][j] = input_counts->y_mode[i][j];
-    }
-  }
-  for (i = 0; i < INTRA_MODES; ++i) {
-    for (j = 0; j < INTRA_MODES; ++j) {
-      output_counts->uv_mode[i][j] = input_counts->uv_mode[i][j];
-    }
-  }
-  for (i = 0; i < PARTITION_CONTEXTS; ++i) {
-    for (j = 0; j < PARTITION_TYPES; ++j) {
-      output_counts->partition[i][j] = input_counts->partition[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZES; ++i) {
-    for (j = 0; j < PLANE_TYPES; ++j) {
-      for (k = 0; k < REF_TYPES; ++k) {
-        for (l = 0; l < COEF_BANDS; ++l) {
-          for (m = 0; m < COEFF_CONTEXTS; ++m) {
-            output_counts->eob_branch[i][j][k][l][m] =
-                input_counts->eob_branch[i][j][k][l][m];
-            for (n = 0; n < UNCONSTRAINED_NODES + 1; ++n) {
-              output_counts->coef[i][j][k][l][m][n] =
-                  input_counts->coef[i][j][k][l][m][n];
-            }
-          }
-        }
-      }
-    }
-  }
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
-    for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
-      output_counts->switchable_interp[i][j] =
-          input_counts->switchable_interp[i][j];
-    }
-  }
-  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
-    for (j = 0; j < INTER_MODES; ++j) {
-      output_counts->inter_mode[i][j] = input_counts->inter_mode[i][j];
-    }
-  }
-  for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->intra_inter[i][j] = input_counts->intra_inter[i][j];
-    }
-  }
-  for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->comp_inter[i][j] = input_counts->comp_inter[i][j];
-    }
-  }
-  for (i = 0; i < REF_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      for (k = 0; k < 2; ++k) {
-        output_counts->single_ref[i][j][k] = input_counts->single_ref[i][j][k];
-      }
-    }
-  }
-  for (i = 0; i < REF_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->comp_ref[i][j] = input_counts->comp_ref[i][j];
-    }
-  }
-  for (i = 0; i < SKIP_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->skip[i][j] = input_counts->skip[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES; j++) {
-      output_counts->tx.p32x32[i][j] = input_counts->tx.p32x32[i][j];
-    }
-    for (j = 0; j < TX_SIZES - 1; j++) {
-      output_counts->tx.p16x16[i][j] = input_counts->tx.p16x16[i][j];
-    }
-    for (j = 0; j < TX_SIZES - 2; j++) {
-      output_counts->tx.p8x8[i][j] = input_counts->tx.p8x8[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZES; i++) {
-    output_counts->tx.tx_totals[i] = input_counts->tx.tx_totals[i];
-  }
-  for (i = 0; i < MV_JOINTS; i++) {
-    output_counts->mv.joints[i] = input_counts->mv.joints[i];
-  }
-  for (k = 0; k < 2; k++) {
-    nmv_component_counts *const comps = &output_counts->mv.comps[k];
-    const nmv_component_counts *const comps_t = &input_counts->mv.comps[k];
-    for (i = 0; i < 2; i++) {
-      comps->sign[i] = comps_t->sign[i];
-      comps->class0_hp[i] = comps_t->class0_hp[i];
-      comps->hp[i] = comps_t->hp[i];
-    }
-    for (i = 0; i < MV_CLASSES; i++) {
-      comps->classes[i] = comps_t->classes[i];
-    }
-    for (i = 0; i < CLASS0_SIZE; i++) {
-      comps->class0[i] = comps_t->class0[i];
-      for (j = 0; j < MV_FP_SIZE; j++) {
-        comps->class0_fp[i][j] = comps_t->class0_fp[i][j];
-      }
-    }
-    for (i = 0; i < MV_OFFSET_BITS; i++) {
-      for (j = 0; j < 2; j++) {
-        comps->bits[i][j] = comps_t->bits[i][j];
-      }
-    }
-    for (i = 0; i < MV_FP_SIZE; i++) {
-      comps->fp[i] = comps_t->fp[i];
-    }
-  }
-}
-
-static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer,
-                                        IMAGE_BUFFER *image_buffer) {
-  const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer,
-                                   yv12_buffer->v_buffer };
-  const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride,
-                                 yv12_buffer->uv_stride };
-  const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width,
-                        yv12_buffer->uv_crop_width };
-  const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height,
-                        yv12_buffer->uv_crop_height };
-  int plane;
-  for (plane = 0; plane < 3; ++plane) {
-    const int src_stride = src_stride_ls[plane];
-    const int w = w_ls[plane];
-    const int h = h_ls[plane];
-    const uint8_t *src_buf = src_buf_ls[plane];
-    uint8_t *dst_buf = image_buffer->plane_buffer[plane];
-    int r;
-    assert(image_buffer->plane_width[plane] == w);
-    assert(image_buffer->plane_height[plane] == h);
-    for (r = 0; r < h; ++r) {
-      memcpy(dst_buf, src_buf, sizeof(*src_buf) * w);
-      src_buf += src_stride;
-      dst_buf += w;
-    }
-  }
-}
-#endif  // CONFIG_RATE_CTRL
-
-static void update_encode_frame_result(
-    int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
-    const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
-    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index,
-    uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
-#if CONFIG_RATE_CTRL
-    const PARTITION_INFO *partition_info,
-    const MOTION_VECTOR_INFO *motion_vector_info,
-    const TplDepStats *tpl_stats_info,
-#endif  // CONFIG_RATE_CTRL
-    ENCODE_FRAME_RESULT *encode_frame_result) {
-#if CONFIG_RATE_CTRL
-  PSNR_STATS psnr;
-#if CONFIG_VP9_HIGHBITDEPTH
-  vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth,
-                       input_bit_depth);
-#else   // CONFIG_VP9_HIGHBITDEPTH
-  (void)bit_depth;
-  (void)input_bit_depth;
-  vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index;
-
-  vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
-                         encode_frame_result->ref_frame_coding_indexes,
-                         encode_frame_result->ref_frame_valid_list);
-
-  encode_frame_result->psnr = psnr.psnr[0];
-  encode_frame_result->sse = psnr.sse[0];
-  copy_frame_counts(counts, &encode_frame_result->frame_counts);
-  encode_frame_result->partition_info = partition_info;
-  encode_frame_result->motion_vector_info = motion_vector_info;
-  encode_frame_result->tpl_stats_info = tpl_stats_info;
-  if (encode_frame_result->coded_frame.allocated) {
-    yv12_buffer_to_image_buffer(&coded_frame_buf->buf,
-                                &encode_frame_result->coded_frame);
-  }
-#else   // CONFIG_RATE_CTRL
-  (void)ref_frame_flags;
-  (void)bit_depth;
-  (void)input_bit_depth;
-  (void)source_frame;
-  (void)coded_frame_buf;
-  (void)ref_frame_bufs;
-  (void)counts;
-#endif  // CONFIG_RATE_CTRL
-  encode_frame_result->show_idx = coded_frame_buf->frame_index;
-  encode_frame_result->update_type = update_type;
-  encode_frame_result->quantize_index = quantize_index;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
 void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) {
   encode_frame_result->show_idx = -1;  // Actual encoding doesn't happen.
 #if CONFIG_RATE_CTRL
@@ -7861,9 +7804,12 @@
   cm->new_fb_idx = get_free_fb(cm);
 
   if (cm->new_fb_idx == INVALID_IDX) return -1;
-
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
+  // If the frame buffer for current frame is the same as previous frame, MV in
+  // the base layer shouldn't be used as it'll cause data race.
+  if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) {
+    cpi->svc.use_base_mv = 0;
+  }
   // Start with a 0 size frame.
   *size = 0;
 
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -273,10 +273,6 @@
 
   vpx_fixed_buf_t two_pass_stats_in;
 
-#if CONFIG_FP_MB_STATS
-  vpx_fixed_buf_t firstpass_mb_stats_in;
-#endif
-
   vp8e_tuning tuning;
   vp9e_tune_content content;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -291,6 +287,7 @@
   int row_mt;
   unsigned int motion_vector_unit_test;
   int delta_q_uv;
+  int use_simple_encode_api;  // Use SimpleEncode APIs or not
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -710,9 +707,6 @@
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
-  // For a still frame, this flag is set to 1 to skip partition search.
-  int partition_search_skippable_frame;
-
   int scaled_ref_idx[REFS_PER_FRAME];
   int lst_fb_idx;
   int gld_fb_idx;
@@ -746,6 +740,7 @@
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
 
+  RD_CONTROL rd_ctrl;
   RD_OPT rd;
 
   CODING_CONTEXT coding_context;
@@ -804,10 +799,6 @@
   uint64_t time_pick_lpf;
   uint64_t time_encode_sb_row;
 
-#if CONFIG_FP_MB_STATS
-  int use_fp_mb_stats;
-#endif
-
   TWO_PASS twopass;
 
   // Force recalculation of segment_ids for each mode info
@@ -958,6 +949,8 @@
   uint8_t *content_state_sb_fd;
 
   int compute_source_sad_onepass;
+
+  int compute_frame_low_motion_onepass;
 
   LevelConstraint level_constraint;
 
--- a/vp9/encoder/vp9_ext_ratectrl.c
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -13,31 +13,56 @@
 #include "vp9/common/vp9_common.h"
 #include "vpx_dsp/psnr.h"
 
-void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); }
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vp9_zero(*ext_ratectrl);
+  return VPX_CODEC_OK;
+}
 
-void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config,
-                      EXT_RATECTRL *ext_ratectrl) {
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl) {
+  vpx_rc_status_t rc_status;
   vpx_rc_firstpass_stats_t *rc_firstpass_stats;
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
   vp9_extrc_delete(ext_ratectrl);
   ext_ratectrl->funcs = funcs;
   ext_ratectrl->ratectrl_config = ratectrl_config;
-  ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv,
-                                   &ext_ratectrl->ratectrl_config,
-                                   &ext_ratectrl->model);
+  rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv,
+                                               &ext_ratectrl->ratectrl_config,
+                                               &ext_ratectrl->model);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
   rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats;
   rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count;
   rc_firstpass_stats->frame_stats =
       vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) *
                  rc_firstpass_stats->num_frames);
+  if (rc_firstpass_stats->frame_stats == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
   ext_ratectrl->ready = 1;
+  return VPX_CODEC_OK;
 }
 
-void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) {
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
   if (ext_ratectrl->ready) {
-    ext_ratectrl->funcs.delete_model(ext_ratectrl->model);
+    vpx_rc_status_t rc_status =
+        ext_ratectrl->funcs.delete_model(ext_ratectrl->model);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
     vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats);
   }
-  vp9_extrc_init(ext_ratectrl);
+  return vp9_extrc_init(ext_ratectrl);
 }
 
 static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats,
@@ -69,9 +94,13 @@
   rc_frame_stats->count = stats->count;
 }
 
-void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl,
-                                    const FIRST_PASS_INFO *first_pass_info) {
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
   if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status;
     vpx_rc_firstpass_stats_t *rc_firstpass_stats =
         &ext_ratectrl->rc_firstpass_stats;
     int i;
@@ -80,9 +109,13 @@
       gen_rc_firstpass_stats(&first_pass_info->stats[i],
                              &rc_firstpass_stats->frame_stats[i]);
     }
-    ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model,
-                                             rc_firstpass_stats);
+    rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model,
+                                                         rc_firstpass_stats);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
   }
+  return VPX_CODEC_OK;
 }
 
 static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
@@ -102,12 +135,16 @@
   }
 }
 
-void vp9_extrc_get_encodeframe_decision(
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
     EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
     FRAME_UPDATE_TYPE update_type,
     RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
     vpx_rc_encodeframe_decision_t *encode_frame_decision) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
   if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status;
     vpx_rc_encodeframe_info_t encode_frame_info;
     encode_frame_info.show_index = show_index;
     encode_frame_info.coding_index = coding_index;
@@ -118,24 +155,32 @@
                            encode_frame_info.ref_frame_coding_indexes,
                            encode_frame_info.ref_frame_valid_list);
 
-    ext_ratectrl->funcs.get_encodeframe_decision(
+    rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
         ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
   }
+  return VPX_CODEC_OK;
 }
 
-void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl,
-                                         int64_t bit_count,
-                                         const YV12_BUFFER_CONFIG *source_frame,
-                                         const YV12_BUFFER_CONFIG *coded_frame,
-                                         uint32_t bit_depth,
-                                         uint32_t input_bit_depth) {
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+    EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+    const YV12_BUFFER_CONFIG *source_frame,
+    const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+    uint32_t input_bit_depth, const int actual_encoding_qindex) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
   if (ext_ratectrl->ready) {
     PSNR_STATS psnr;
+    vpx_rc_status_t rc_status;
     vpx_rc_encodeframe_result_t encode_frame_result;
     encode_frame_result.bit_count = bit_count;
     encode_frame_result.pixel_count =
-        source_frame->y_width * source_frame->y_height +
-        2 * source_frame->uv_width * source_frame->uv_height;
+        source_frame->y_crop_width * source_frame->y_crop_height +
+        2 * source_frame->uv_crop_width * source_frame->uv_crop_height;
+    encode_frame_result.actual_encoding_qindex = actual_encoding_qindex;
 #if CONFIG_VP9_HIGHBITDEPTH
     vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth,
                          input_bit_depth);
@@ -145,7 +190,11 @@
     vpx_calc_psnr(source_frame, coded_frame, &psnr);
 #endif
     encode_frame_result.sse = psnr.sse[0];
-    ext_ratectrl->funcs.update_encodeframe_result(ext_ratectrl->model,
-                                                  &encode_frame_result);
+    rc_status = ext_ratectrl->funcs.update_encodeframe_result(
+        ext_ratectrl->model, &encode_frame_result);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
   }
+  return VPX_CODEC_OK;
 }
--- a/vp9/encoder/vp9_ext_ratectrl.h
+++ b/vp9/encoder/vp9_ext_ratectrl.h
@@ -22,27 +22,27 @@
   vpx_rc_firstpass_stats_t rc_firstpass_stats;
 } EXT_RATECTRL;
 
-void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl);
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl);
 
-void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config,
-                      EXT_RATECTRL *ext_ratectrl);
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl);
 
-void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
 
-void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl,
-                                    const FIRST_PASS_INFO *first_pass_info);
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info);
 
-void vp9_extrc_get_encodeframe_decision(
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
     EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
     FRAME_UPDATE_TYPE update_type,
     RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
     vpx_rc_encodeframe_decision_t *encode_frame_decision);
 
-void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl,
-                                         int64_t bit_count,
-                                         const YV12_BUFFER_CONFIG *source_frame,
-                                         const YV12_BUFFER_CONFIG *coded_frame,
-                                         uint32_t bit_depth,
-                                         uint32_t input_bit_depth);
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+    EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+    const YV12_BUFFER_CONFIG *source_frame,
+    const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+    uint32_t input_bit_depth, const int actual_encoding_qindex);
 
 #endif  // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -54,6 +54,30 @@
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define LOW_CODED_ERR_PER_MB 10.0
+#define NCOUNT_FRAME_II_THRESH 6.0
+#define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_FRAME_BOOST 96.0
+
+#ifdef AGGRESSIVE_VBR
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 80.0
+#define MAX_KF_TOT_BOOST 4800
+#else
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 96.0
+#define MAX_KF_TOT_BOOST 5400
+#endif
+
+#define DEFAULT_ZM_FACTOR 0.5
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+#define AV_WQ_FACTOR 4.0
+
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
 
 #if ARF_STATS_OUTPUT
@@ -111,17 +135,6 @@
 #endif
 }
 
-#if CONFIG_FP_MB_STATS
-static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
-                              struct vpx_codec_pkt_list *pktlist) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
-  pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-}
-#endif
-
 static void zero_stats(FIRSTPASS_STATS *section) {
   section->frame = 0.0;
   section->weight = 0.0;
@@ -929,10 +942,6 @@
     int level_sample;
     const int mb_index = mb_row * cm->mb_cols + mb_col;
 
-#if CONFIG_FP_MB_STATS
-    const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
-
     (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c);
 
     // Adjust to the next column of MBs.
@@ -1068,13 +1077,6 @@
     // Accumulate the intra error.
     fp_acc_data->intra_error += (int64_t)this_error;
 
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      // initialization
-      cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-    }
-#endif
-
     // Set up limit values for motion vectors to prevent them extending
     // outside the UMV borders.
     x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
@@ -1081,8 +1083,8 @@
     x->mv_limits.col_max =
         ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-    // Other than for the first frame do a motion search.
-    if (cm->current_video_frame > 0) {
+    // Other than for intra-only frame do a motion search.
+    if (!frame_is_intra_only(cm)) {
       int tmp_err, motion_error, this_motion_error, raw_motion_error;
       // Assume 0,0 motion with no mv overhead.
       MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
@@ -1090,8 +1092,10 @@
       vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
 
 #if CONFIG_RATE_CTRL
-      // Store zero mv as default
-      store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+      if (cpi->oxcf.use_simple_encode_api) {
+        // Store zero mv as default
+        store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+      }
 #endif  // CONFIG_RAGE_CTRL
 
       xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
@@ -1159,7 +1163,9 @@
           }
         }
 #if CONFIG_RATE_CTRL
-        store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+        if (cpi->oxcf.use_simple_encode_api) {
+          store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+        }
 #endif  // CONFIG_RAGE_CTRL
 
         // Search in an older reference frame.
@@ -1183,7 +1189,10 @@
 
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
 #if CONFIG_RATE_CTRL
-          store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, 1);
+          if (cpi->oxcf.use_simple_encode_api) {
+            store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME,
+                                   1);
+          }
 #endif  // CONFIG_RAGE_CTRL
 
           if (gf_motion_error < motion_error && gf_motion_error < this_error)
@@ -1213,20 +1222,6 @@
       best_ref_mv->row = 0;
       best_ref_mv->col = 0;
 
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // intra prediction statistics
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
-        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-        if (this_error > FPMB_ERROR_LARGE_TH) {
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-        } else if (this_error < FPMB_ERROR_SMALL_TH) {
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-        }
-      }
-#endif
-
       if (motion_error <= this_error) {
         vpx_clear_system_state();
 
@@ -1271,47 +1266,9 @@
 
         *best_ref_mv = mv;
 
-#if CONFIG_FP_MB_STATS
-        if (cpi->use_fp_mb_stats) {
-          // inter prediction statistics
-          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-          if (this_error > FPMB_ERROR_LARGE_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-          } else if (this_error < FPMB_ERROR_SMALL_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-          }
-        }
-#endif
-
         if (!is_zero_mv(&mv)) {
           ++(fp_acc_data->mvcount);
 
-#if CONFIG_FP_MB_STATS
-          if (cpi->use_fp_mb_stats) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK;
-            // check estimated motion direction
-            if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
-              // right direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_RIGHT_MASK;
-            } else if (mv.as_mv.row < 0 &&
-                       abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
-              // up direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK;
-            } else if (mv.as_mv.col < 0 &&
-                       abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
-              // left direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_LEFT_MASK;
-            } else {
-              // down direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_DOWN_MASK;
-            }
-          }
-#endif
           // Does the row vector point inwards or outwards?
           if (mb_row < cm->mb_rows / 2) {
             if (mv.row > 0)
@@ -1359,7 +1316,9 @@
     } else {
       fp_acc_data->sr_coded_error += (int64_t)this_error;
 #if CONFIG_RATE_CTRL
-      store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0);
+      if (cpi->oxcf.use_simple_encode_api) {
+        store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0);
+      }
 #endif  // CONFIG_RAGE_CTRL
     }
     fp_acc_data->coded_error += (int64_t)this_error;
@@ -1388,9 +1347,11 @@
   vp9_tile_init(tile, cm, 0, 0);
 
 #if CONFIG_RATE_CTRL
-  fp_motion_vector_info_reset(cpi->frame_info.frame_width,
-                              cpi->frame_info.frame_height,
-                              cpi->fp_motion_vector_info);
+  if (cpi->oxcf.use_simple_encode_api) {
+    fp_motion_vector_info_reset(cpi->frame_info.frame_width,
+                                cpi->frame_info.frame_height,
+                                cpi->fp_motion_vector_info);
+  }
 #endif
 
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
@@ -1424,12 +1385,6 @@
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
-  }
-#endif
-
   set_first_pass_params(cpi);
   vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth));
 
@@ -1490,12 +1445,6 @@
     twopass->this_frame_stats = fps;
     output_stats(&twopass->this_frame_stats);
     accumulate_stats(&twopass->total_stats, &fps);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list);
-    }
-#endif
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
@@ -1807,61 +1756,60 @@
   twopass->arnr_strength_adjustment = 0;
 }
 
-#define SR_DIFF_PART 0.0015
-#define INTRA_PART 0.005
-#define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
-#define LOW_CODED_ERR_PER_MB 10.0
-#define NCOUNT_FRAME_II_THRESH 6.0
-
-static double get_sr_decay_rate(const FRAME_INFO *frame_info,
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitrate boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const TWO_PASS *const twopass,
                                 const FIRSTPASS_STATS *frame) {
   double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
-  double modified_pct_inter;
-  double modified_pcnt_intra;
-  const double motion_amplitude_part =
-      frame->pcnt_motion *
-      ((frame->mvc_abs + frame->mvr_abs) /
-       (frame_info->frame_height + frame_info->frame_width));
 
-  modified_pct_inter = frame->pcnt_inter;
-  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
-      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-       (double)NCOUNT_FRAME_II_THRESH)) {
-    modified_pct_inter =
-        frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
-  }
-  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
-
+  // Do nothing if the second ref to last frame error difference is
+  // very small or even negative.
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part -
-               (INTRA_PART * modified_pcnt_intra);
+    const double sr_diff_part =
+        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+    double modified_pct_inter = frame->pcnt_inter;
+    double modified_pcnt_intra;
+
+    if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+        ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+         (double)NCOUNT_FRAME_II_THRESH)) {
+      modified_pct_inter =
+          frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
+    }
+    modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
-  return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+  return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
 }
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const FRAME_INFO *frame_info,
+static double get_zero_motion_factor(const TWO_PASS *const twopass,
                                      const FIRSTPASS_STATS *frame_stats) {
   const double zero_motion_pct =
       frame_stats->pcnt_inter - frame_stats->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(frame_info, frame_stats);
+  double sr_decay = get_sr_decay_rate(twopass, frame_stats);
   return VPXMIN(sr_decay, zero_motion_pct);
 }
 
-#define ZM_POWER_FACTOR 0.75
-
-static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
+static double get_prediction_decay_rate(const TWO_PASS *const twopass,
                                         const FIRSTPASS_STATS *frame_stats) {
-  const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats);
-  const double zero_motion_factor =
-      (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion),
-                  ZM_POWER_FACTOR));
+  const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats);
+  double zero_motion_factor =
+      twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
 
+  // Check that the zero motion factor is valid
+  assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0);
+
   return VPXMAX(zero_motion_factor,
                 (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
 }
@@ -1942,10 +1890,9 @@
   }
 }
 
-#define BASELINE_ERR_PER_MB 12500.0
-#define GF_MAX_BOOST 96.0
 static double calc_frame_boost(const FRAME_INFO *frame_info,
                                const FIRSTPASS_STATS *this_frame,
+                               const TWO_PASS *const twopass,
                                int avg_frame_qindex,
                                double this_frame_mv_in_out) {
   double frame_boost;
@@ -1954,8 +1901,8 @@
   const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
   const double active_area = calculate_active_area(frame_info, this_frame);
 
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * active_area) /
+  // Frame boost is based on inter error.
+  frame_boost = (twopass->err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
 
   // Small adjustment for cases where there is a zoom out
@@ -1965,28 +1912,15 @@
   // Q correction and scalling
   frame_boost = frame_boost * boost_q_correction;
 
-  return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction);
+  return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction);
 }
 
-static double kf_err_per_mb(VP9_COMP *cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  unsigned int screen_area = (cm->width * cm->height);
-
-  // Use a different error per mb factor for calculating boost for
-  //  different formats.
-  if (screen_area < 1280 * 720) {
-    return 2000.0;
-  } else if (screen_area < 1920 * 1080) {
-    return 500.0;
-  }
-  return 250.0;
-}
-
 static double calc_kf_frame_boost(VP9_COMP *cpi,
                                   const FIRSTPASS_STATS *this_frame,
                                   double *sr_accumulator,
                                   double this_frame_mv_in_out,
-                                  double max_boost) {
+                                  double zm_factor) {
+  TWO_PASS *const twopass = &cpi->twopass;
   double frame_boost;
   const double lq = vp9_convert_qindex_to_q(
       cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
@@ -1993,9 +1927,10 @@
   const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
   const double active_area =
       calculate_active_area(&cpi->frame_info, this_frame);
+  double max_boost;
 
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (kf_err_per_mb(cpi) * active_area) /
+  // Frame boost is based on inter error.
+  frame_boost = (twopass->kf_err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
 
   // Update the accumulator for second ref error difference.
@@ -2012,15 +1947,23 @@
   // The 40.0 value here is an experimentally derived baseline minimum.
   // This value is in line with the minimum per frame boost in the alt_ref
   // boost calculation.
-  frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+  frame_boost =
+      (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction;
 
-  return VPXMIN(frame_boost, max_boost * boost_q_correction);
+  // Maximum allowed boost this frame. May be different for first vs subsequent
+  // key frames.
+  max_boost = (cpi->common.current_video_frame == 0)
+                  ? twopass->kf_frame_max_boost_first
+                  : twopass->kf_frame_max_boost_subs;
+  max_boost *= zm_factor * boost_q_correction;
+
+  return VPXMIN(frame_boost, max_boost);
 }
 
 static int compute_arf_boost(const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int arf_show_idx, int f_frames, int b_frames,
-                             int avg_frame_qindex) {
+                             TWO_PASS *const twopass, int arf_show_idx,
+                             int f_frames, int b_frames, int avg_frame_qindex) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -2051,14 +1994,14 @@
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-    boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame,
-                                                        avg_frame_qindex,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator *
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
 
   arf_boost = (int)boost_score;
@@ -2091,14 +2034,14 @@
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-    boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame,
-                                                        avg_frame_qindex,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator *
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
   arf_boost += (int)boost_score;
 
@@ -2114,8 +2057,8 @@
   TWO_PASS *const twopass = &cpi->twopass;
   const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME];
   int arf_show_idx = get_show_idx(twopass);
-  return compute_arf_boost(frame_info, &twopass->first_pass_info, arf_show_idx,
-                           f_frames, b_frames, avg_inter_frame_qindex);
+  return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames,
+                           b_frames, avg_inter_frame_qindex);
 }
 
 // Calculate a section intra ratio used in setting max loop filter.
@@ -2530,6 +2473,9 @@
  * (The following fields will remain unchanged after initialization of encoder.)
  *   rc->static_scene_max_gf_interval
  *   rc->min_gf_interval
+ *   twopass->sr_diff_factor
+ *   twopass->sr_default_decay_limit
+ *   twopass->zm_factor
  *
  * Dynamic fields:
  * (The following fields will be updated before or after coding each frame.)
@@ -2545,9 +2491,10 @@
  */
 static int get_gop_coding_frame_num(
     int *use_alt_ref, const FRAME_INFO *frame_info,
-    const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc,
+    const TWO_PASS *const twopass, const RATE_CONTROL *rc,
     int gf_start_show_idx, const RANGE *active_gf_interval,
     double gop_intra_factor, int lag_in_frames) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   double loop_decay_rate = 1.00;
   double mv_ratio_accumulator = 0.0;
   double this_frame_mv_in_out = 0.0;
@@ -2588,15 +2535,14 @@
 
     // Monitor for static sections.
     if ((rc->frames_since_key + gop_coding_frames - 1) > 1) {
-      zero_motion_accumulator =
-          VPXMIN(zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, next_frame));
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame));
     }
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       double last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame);
+      loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -2656,25 +2602,25 @@
   return gop_coding_frames;
 }
 
-static RANGE get_active_gf_inverval_range(
-    const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
-    int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+static RANGE get_active_gf_inverval_range_simple(int min_gf_interval,
+                                                 int arf_active_or_kf,
+                                                 int frames_to_key) {
   RANGE active_gf_interval;
-#if CONFIG_RATE_CTRL
-  (void)frame_info;
-  (void)gf_start_show_idx;
-  (void)active_worst_quality;
-  (void)last_boosted_qindex;
-  active_gf_interval.min = rc->min_gf_interval + arf_active_or_kf + 2;
-
+  active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2;
   active_gf_interval.max = 16 + arf_active_or_kf;
 
-  if ((active_gf_interval.max <= rc->frames_to_key) &&
-      (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) {
-    active_gf_interval.min = rc->frames_to_key / 2;
-    active_gf_interval.max = rc->frames_to_key / 2;
+  if ((active_gf_interval.max <= frames_to_key) &&
+      (active_gf_interval.max >= (frames_to_key - min_gf_interval))) {
+    active_gf_interval.min = frames_to_key / 2;
+    active_gf_interval.max = frames_to_key / 2;
   }
-#else
+  return active_gf_interval;
+}
+
+static RANGE get_active_gf_inverval_range(
+    const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
+    int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+  RANGE active_gf_interval;
   int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality,
                                                 frame_info->bit_depth));
   int q_term = (gf_start_show_idx == 0)
@@ -2712,7 +2658,6 @@
   }
   active_gf_interval.max =
       VPXMAX(active_gf_interval.max, active_gf_interval.min);
-#endif
   return active_gf_interval;
 }
 
@@ -2773,9 +2718,14 @@
 
   vpx_clear_system_state();
 
-  active_gf_interval = get_active_gf_inverval_range(
-      frame_info, rc, arf_active_or_kf, gf_start_show_idx,
-      twopass->active_worst_quality, rc->last_boosted_qindex);
+  if (oxcf->use_simple_encode_api) {
+    active_gf_interval = get_active_gf_inverval_range_simple(
+        rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+  } else {
+    active_gf_interval = get_active_gf_inverval_range(
+        frame_info, rc, arf_active_or_kf, gf_start_show_idx,
+        twopass->active_worst_quality, rc->last_boosted_qindex);
+  }
 
   if (cpi->multi_layer_arf) {
     int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf,
@@ -2785,25 +2735,21 @@
     gop_intra_factor = 1.0;
   }
 
+  gop_coding_frames = get_gop_coding_frame_num(
+      &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx,
+      &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
+  use_alt_ref &= allow_alt_ref;
 #if CONFIG_RATE_CTRL
-  {
+  // If the external gop_command is on, we will override the decisions
+  // of gop_coding_frames and use_alt_ref.
+  if (cpi->oxcf.use_simple_encode_api) {
     const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command;
     assert(allow_alt_ref == 1);
     if (gop_command->use) {
       gop_coding_frames = gop_command_coding_frame_count(gop_command);
       use_alt_ref = gop_command->use_alt_ref;
-    } else {
-      gop_coding_frames = get_gop_coding_frame_num(
-          &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx,
-          &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
-      use_alt_ref &= allow_alt_ref;
     }
   }
-#else
-  gop_coding_frames = get_gop_coding_frame_num(
-      &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx,
-      &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
-  use_alt_ref &= allow_alt_ref;
 #endif
 
   // Was the group length constrained by the requirement for a new KF?
@@ -2823,8 +2769,8 @@
 
     // Calculate the boost for alt ref.
     rc->gfu_boost =
-        compute_arf_boost(frame_info, first_pass_info, arf_show_idx, f_frames,
-                          b_frames, avg_inter_frame_qindex);
+        compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
     rc->source_alt_ref_pending = 1;
   } else {
     const int f_frames = gop_coding_frames - 1;
@@ -2834,9 +2780,9 @@
     const int gld_show_idx =
         VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info));
     const int arf_boost =
-        compute_arf_boost(frame_info, first_pass_info, gld_show_idx, f_frames,
-                          b_frames, avg_inter_frame_qindex);
-    rc->gfu_boost = VPXMIN(MAX_GF_BOOST, arf_boost);
+        compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
+    rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost);
     rc->source_alt_ref_pending = 0;
   }
 
@@ -2939,7 +2885,9 @@
         cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
         group_av_noise, vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
+        (int)((tmp_q + (twopass->active_worst_quality *
+                        (twopass->active_wq_factor - 1))) /
+              twopass->active_wq_factor);
 
 #if CONFIG_ALWAYS_ADJUST_BPM
     // Reset rolling actual and target bits counters for ARF groups.
@@ -3159,18 +3107,10 @@
 #define MIN_SCAN_FRAMES_FOR_KF_BOOST 32
 #define KF_ABS_ZOOM_THRESH 6.0
 
-#ifdef AGGRESSIVE_VBR
-#define KF_MAX_FRAME_BOOST 80.0
-#define MAX_KF_TOT_BOOST 4800
-#else
-#define KF_MAX_FRAME_BOOST 96.0
-#define MAX_KF_TOT_BOOST 5400
-#endif
-
 int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf,
-                               const FRAME_INFO *frame_info,
-                               const FIRST_PASS_INFO *first_pass_info,
-                               int kf_show_idx, int min_gf_interval) {
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
   int j;
   int frames_to_key;
@@ -3197,7 +3137,7 @@
           break;
 
         // How fast is the prediction quality decaying?
-        loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame);
+        loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
 
         // We want to know something about the recent past... rather than
         // as used elsewhere where we are concerned with decay in prediction
@@ -3283,8 +3223,8 @@
   kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats,
                                      mean_mod_score, av_err);
 
-  rc->frames_to_key = vp9_get_frames_to_next_key(
-      oxcf, frame_info, first_pass_info, kf_show_idx, rc->min_gf_interval);
+  rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx,
+                                                 rc->min_gf_interval);
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
@@ -3366,7 +3306,7 @@
       if (i > 0) {
         zero_motion_accumulator =
             VPXMIN(zero_motion_accumulator,
-                   get_zero_motion_factor(&cpi->frame_info, &next_frame));
+                   get_zero_motion_factor(twopass, &next_frame));
       } else {
         zero_motion_accumulator =
             next_frame.pcnt_inter - next_frame.pcnt_motion;
@@ -3380,8 +3320,8 @@
       // the first key frame or it points to a refernce before the new key
       // frame.
       if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
-                                        KF_MAX_FRAME_BOOST * zm_factor);
+      frame_boost =
+          calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor);
 
       boost_score += frame_boost;
 
@@ -3410,12 +3350,12 @@
   // Special case for static / slide show content but dont apply
   // if the kf group is very short.
   if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
-    rc->kf_boost = MAX_KF_TOT_BOOST;
+    rc->kf_boost = (int)(twopass->kf_max_total_boost);
   } else {
-    // Apply various clamps for min and max boost
+    // Apply various clamps for min and max boost
     rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
     rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
-    rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+    rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost));
   }
 
   // Work out how many bits to allocate for the key frame itself.
@@ -3451,23 +3391,66 @@
   }
 }
 
-static int is_skippable_frame(const VP9_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const TWO_PASS *const twopass = &cpi->twopass;
+// Configure image size specific vizier parameters.
+// Later these will be set via additional command line options
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) {
+  // When |use_vizier_rc_params| is 1, we expect the rc parameters below to
+  // have been initialised on the command line as adjustment factors such
+  // that a factor of 1.0 will match the default behavior when
+  // |use_vizier_rc_params| is 0
+  if (twopass->use_vizier_rc_params) {
+    twopass->active_wq_factor *= AV_WQ_FACTOR;
+    twopass->err_per_mb *= BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT;
+    if (twopass->sr_default_decay_limit > 1.0)  // > 1.0 here makes no sense
+      twopass->sr_default_decay_limit = 1.0;
+    twopass->sr_diff_factor *= 1.0;
+    twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost *= MAX_GF_BOOST;
+    // NOTE: In use max boost has precedence over min boost. So even if min is
+    // somehow set higher than max the final boost value will be clamped to the
+    // appropriate maximum.
+    twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST;
+    twopass->zm_factor *= DEFAULT_ZM_FACTOR;
+    if (twopass->zm_factor > 1.0)  // > 1.0 here makes no sense
+      twopass->zm_factor = 1.0;
 
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_in_start &&
-          twopass->stats_in < twopass->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+    // Correction for the fact that the kf_err_per_mb_factor default is
+    // already different for different video formats and ensures that a passed
+    // in value of 1.0 on the vizier command line will still match the current
+    // default.
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb *= 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb *= 500.0;
+    } else {
+      twopass->kf_err_per_mb *= 250.0;
+    }
+  } else {
+    // When |use_vizier_rc_params| is 0, use defaults.
+    twopass->active_wq_factor = AV_WQ_FACTOR;
+    twopass->err_per_mb = BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
+    twopass->sr_diff_factor = 1.0;
+    twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost = MAX_GF_BOOST;
+    twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
+    twopass->zm_factor = DEFAULT_ZM_FACTOR;
+
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb = 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb = 500.0;
+    } else {
+      twopass->kf_err_per_mb = 250.0;
+    }
+  }
 }
 
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
@@ -3480,6 +3463,13 @@
 
   if (!twopass->stats_in) return;
 
+  // Configure image size specific vizier parameters
+  if (cm->current_video_frame == 0) {
+    unsigned int screen_area = (cm->width * cm->height);
+
+    vp9_init_vizier_params(twopass, screen_area);
+  }
+
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
   if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
@@ -3498,13 +3488,6 @@
 
     cm->frame_type = INTER_FRAME;
 
-    // Do the firstpass stats indicate that this frame is skippable for the
-    // partition search?
-    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-        !cpi->use_svc) {
-      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-    }
-
     // The multiplication by 256 reverses a scaling factor of (>> 8)
     // applied when combining MB error values for the frame.
     twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
@@ -3587,13 +3570,6 @@
 
   vp9_configure_buffer_updates(cpi, gf_group->index);
 
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-      !cpi->use_svc) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-  }
-
   rc->base_frame_target = gf_group->bit_allocation[gf_group->index];
 
   // The multiplication by 256 reverses a scaling factor of (>> 8)
@@ -3605,9 +3581,6 @@
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -3747,8 +3720,7 @@
   *first_is_key_frame = 0;
   if (rc.frames_to_key == 0) {
     rc.frames_to_key = vp9_get_frames_to_next_key(
-        &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info,
-        *first_show_idx, rc.min_gf_interval);
+        &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval);
     rc.frames_since_key = 0;
     *first_is_key_frame = 1;
   }
@@ -3756,18 +3728,18 @@
   if (gop_command->use) {
     *coding_frame_count = gop_command_coding_frame_count(gop_command);
     *use_alt_ref = gop_command->use_alt_ref;
-    assert(*coding_frame_count < rc.frames_to_key);
+    assert(gop_command->show_frame_count <= rc.frames_to_key);
   } else {
     *coding_frame_count = vp9_get_gop_coding_frame_count(
-        &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, &rc,
-        *first_show_idx, multi_layer_arf, allow_alt_ref, *first_is_key_frame,
+        &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx,
+        multi_layer_arf, allow_alt_ref, *first_is_key_frame,
         *last_gop_use_alt_ref, use_alt_ref);
   }
 }
 
 int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
+                                   const TWO_PASS *const twopass,
                                    const FRAME_INFO *frame_info,
-                                   const FIRST_PASS_INFO *first_pass_info,
                                    const RATE_CONTROL *rc, int show_idx,
                                    int multi_layer_arf, int allow_alt_ref,
                                    int first_is_key_frame,
@@ -3775,12 +3747,19 @@
   int frame_count;
   double gop_intra_factor;
   const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame;
-  RANGE active_gf_interval = get_active_gf_inverval_range(
-      frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0,
-      /*last_boosted_qindex=*/0);
+  RANGE active_gf_interval;
+  int arf_layers;
+  if (oxcf->use_simple_encode_api) {
+    active_gf_interval = get_active_gf_inverval_range_simple(
+        rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+  } else {
+    active_gf_interval = get_active_gf_inverval_range(
+        frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0,
+        /*last_boosted_qindex=*/0);
+  }
 
-  const int arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf,
-                                        active_gf_interval.max);
+  arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf,
+                              active_gf_interval.max);
   if (multi_layer_arf) {
     gop_intra_factor = 1.0 + 0.25 * arf_layers;
   } else {
@@ -3787,9 +3766,9 @@
     gop_intra_factor = 1.0;
   }
 
-  frame_count = get_gop_coding_frame_num(
-      use_alt_ref, frame_info, first_pass_info, rc, show_idx,
-      &active_gf_interval, gop_intra_factor, oxcf->lag_in_frames);
+  frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc,
+                                         show_idx, &active_gf_interval,
+                                         gop_intra_factor, oxcf->lag_in_frames);
   *use_alt_ref &= allow_alt_ref;
   return frame_count;
 }
@@ -3797,9 +3776,10 @@
 // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of
 // coding frames (including show frame and alt ref) can be determined.
 int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf,
-                             const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int multi_layer_arf, int allow_alt_ref) {
+                             const TWO_PASS *const twopass,
+                             const FRAME_INFO *frame_info, int multi_layer_arf,
+                             int allow_alt_ref) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int coding_frame_num = 0;
   RATE_CONTROL rc;
   int gop_coding_frame_count;
@@ -3812,14 +3792,14 @@
     int use_alt_ref;
     int first_is_key_frame = 0;
     if (rc.frames_to_key == 0) {
-      rc.frames_to_key = vp9_get_frames_to_next_key(
-          oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval);
+      rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx,
+                                                    rc.min_gf_interval);
       rc.frames_since_key = 0;
       first_is_key_frame = 1;
     }
 
     gop_coding_frame_count = vp9_get_gop_coding_frame_count(
-        oxcf, frame_info, first_pass_info, &rc, show_idx, multi_layer_arf,
+        oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf,
         allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref);
 
     rc.source_alt_ref_active = use_alt_ref;
@@ -3834,9 +3814,8 @@
 }
 
 void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf,
-                           const FRAME_INFO *frame_info,
-                           const FIRST_PASS_INFO *first_pass_info,
-                           int *key_frame_map) {
+                           const TWO_PASS *const twopass, int *key_frame_map) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int show_idx = 0;
   RATE_CONTROL rc;
   vp9_rc_init(oxcf, 1, &rc);
@@ -3849,8 +3828,8 @@
   while (show_idx < first_pass_info->num_frames) {
     int key_frame_group_size;
     key_frame_map[show_idx] = 1;
-    key_frame_group_size = vp9_get_frames_to_next_key(
-        oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval);
+    key_frame_group_size =
+        vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval);
     assert(key_frame_group_size > 0);
     show_idx += key_frame_group_size;
   }
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -21,27 +21,6 @@
 extern "C" {
 #endif
 
-#if CONFIG_FP_MB_STATS
-
-#define FPMB_DCINTRA_MASK 0x01
-
-#define FPMB_MOTION_ZERO_MASK 0x02
-#define FPMB_MOTION_LEFT_MASK 0x04
-#define FPMB_MOTION_RIGHT_MASK 0x08
-#define FPMB_MOTION_UP_MASK 0x10
-#define FPMB_MOTION_DOWN_MASK 0x20
-
-#define FPMB_ERROR_SMALL_MASK 0x40
-#define FPMB_ERROR_LARGE_MASK 0x80
-#define FPMB_ERROR_SMALL_TH 2000
-#define FPMB_ERROR_LARGE_TH 48000
-
-typedef struct {
-  uint8_t *mb_stats_start;
-  uint8_t *mb_stats_end;
-} FIRSTPASS_MB_STATS;
-#endif
-
 #define INVALID_ROW (-1)
 
 #define MAX_ARF_LAYERS 6
@@ -188,12 +167,6 @@
   double mb_av_energy;
   double mb_smooth_pct;
 
-#if CONFIG_FP_MB_STATS
-  uint8_t *frame_mb_stats_buf;
-  uint8_t *this_frame_mb_stats;
-  FIRSTPASS_MB_STATS firstpass_mb_stats;
-#endif
-
   FP_MB_FLOAT_STATS *fp_mb_float_stats;
 
   // An indication of the content type of the current frame
@@ -221,6 +194,24 @@
   int last_qindex_of_arf_layer[MAX_ARF_LAYERS];
 
   GF_GROUP gf_group;
+
+  // Vizier project experimental two pass rate control parameters.
+  // When |use_vizier_rc_params| is 1, the following parameters will
+  // be overwritten by passed in values. Otherwise, they are initialized
+  // to default values.
+  int use_vizier_rc_params;
+  double active_wq_factor;
+  double err_per_mb;
+  double sr_default_decay_limit;
+  double sr_diff_factor;
+  double kf_err_per_mb;
+  double kf_frame_min_boost;
+  double kf_frame_max_boost_first;  // Max for first kf in a chunk.
+  double kf_frame_max_boost_subs;   // Max for subsequent mid chunk kfs.
+  double kf_max_total_boost;
+  double gf_max_total_boost;
+  double gf_frame_max_boost;
+  double zm_factor;
 } TWO_PASS;
 
 struct VP9_COMP;
@@ -239,6 +230,7 @@
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area);
 
 // Post encode update of the rate control parameters for 2-pass
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
@@ -248,9 +240,8 @@
 
 struct VP9EncoderConfig;
 int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf,
-                               const FRAME_INFO *frame_info,
-                               const FIRST_PASS_INFO *first_pass_info,
-                               int kf_show_idx, int min_gf_interval);
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval);
 #if CONFIG_RATE_CTRL
 /* Call this function to get info about the next group of pictures.
  * This function should be called after vp9_create_compressor() when encoding
@@ -265,8 +256,8 @@
 /*!\brief Call this function before coding a new group of pictures to get
  * information about it.
  * \param[in] oxcf                 Encoder config
+ * \param[in] twopass              Twopass info
  * \param[in] frame_info           Frame info
- * \param[in] first_pass_info      First pass stats
  * \param[in] rc                   Rate control state
  * \param[in] show_idx             Show index of the first frame in the group
  * \param[in] multi_layer_arf      Is multi-layer alternate reference used
@@ -279,8 +270,8 @@
  * \return Returns coding frame count
  */
 int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf,
+                                   const TWO_PASS *const twopass,
                                    const FRAME_INFO *frame_info,
-                                   const FIRST_PASS_INFO *first_pass_info,
                                    const RATE_CONTROL *rc, int show_idx,
                                    int multi_layer_arf, int allow_alt_ref,
                                    int first_is_key_frame,
@@ -287,19 +278,17 @@
                                    int last_gop_use_alt_ref, int *use_alt_ref);
 
 int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf,
-                             const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int multi_layer_arf, int allow_alt_ref);
+                             const TWO_PASS *const twopass,
+                             const FRAME_INFO *frame_info, int multi_layer_arf,
+                             int allow_alt_ref);
 
 /*!\brief Compute a key frame binary map indicates whether key frames appear
  * in the corresponding positions. The passed in key_frame_map must point to an
- * integer array with length equal to first_pass_info->num_frames, which is the
- * number of show frames in the video.
+ * integer array with length equal to twopass->first_pass_info.num_frames,
+ * which is the number of show frames in the video.
  */
 void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf,
-                           const FRAME_INFO *frame_info,
-                           const FIRST_PASS_INFO *first_pass_info,
-                           int *key_frame_map);
+                           const TWO_PASS *const twopass, int *key_frame_map);
 #endif  // CONFIG_RATE_CTRL
 
 FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass);
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -82,15 +82,11 @@
  * This function will copy the source image into a new framebuffer with
  * the expected stride/border.
  *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
  * \param[in] ctx         Pointer to the lookahead context
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
  * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -39,9 +39,6 @@
 #define MAX_MB_RATE 250
 #define MAXRATE_1080P 4000000
 
-#define DEFAULT_KF_BOOST 2000
-#define DEFAULT_GF_BOOST 2000
-
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
 #define MIN_BPB_FACTOR 0.005
@@ -280,9 +277,9 @@
         svc->current_superframe > 0) {
       // TODO(marpan): This may need to be modified for temporal layers.
       const double framerate_pts = 10000000.0 / ts_delta;
-      lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts);
     } else {
-      lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate);
     }
     // Clip buffer level to maximum buffer size for the layer.
     lrc->bits_off_target =
@@ -410,6 +407,7 @@
   rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
+  rc->constrain_gf_key_freq_onepass_vbr = 1;
   rc->ni_av_qi = oxcf->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
@@ -1720,10 +1718,12 @@
   }
 
 #if CONFIG_RATE_CTRL
-  if (cpi->encode_command.use_external_target_frame_bits) {
-    rc->this_frame_target = cpi->encode_command.target_frame_bits;
+  if (cpi->oxcf.use_simple_encode_api) {
+    if (cpi->encode_command.use_external_target_frame_bits) {
+      rc->this_frame_target = cpi->encode_command.target_frame_bits;
+    }
   }
-#endif
+#endif  // CONFIG_RATE_CTRL
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) /
@@ -2009,7 +2009,7 @@
   }
 }
 
-static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const int af_ratio = rc->af_ratio_onepass_vbr;
   int64_t target =
@@ -2024,7 +2024,7 @@
   return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
 }
 
-static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   static const int kf_ratio = 25;
   const RATE_CONTROL *rc = &cpi->rc;
   const int target = rc->avg_frame_bandwidth * kf_ratio;
@@ -2050,22 +2050,9 @@
   }
 }
 
-void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  int target;
-  if (!cpi->refresh_alt_ref_frame &&
-      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0)) {
-    cm->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_video_frame != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
-  } else {
-    cm->frame_type = INTER_FRAME;
-  }
+  VP9_COMMON *const cm = &cpi->common;
   if (rc->frames_till_gf_update_due == 0) {
     double rate_err = 1.0;
     rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -2084,18 +2071,23 @@
           rate_err > 3.5) {
         rc->baseline_gf_interval =
             VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
-      } else if (rc->avg_frame_low_motion < 20) {
+      } else if (rc->avg_frame_low_motion > 0 &&
+                 rc->avg_frame_low_motion < 20) {
         // Decrease gf interval for high motion case.
         rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1);
       }
-      // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
-      // between 0 and 100 (stationary, 100% zero/small motion).
-      rc->gfu_boost =
-          VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
-                          (rc->avg_frame_low_motion + 100));
+      // Adjust boost and af_ratio based on avg_frame_low_motion, which
+      // varies between 0 and 100 (stationary, 100% zero/small motion).
+      if (rc->avg_frame_low_motion > 0)
+        rc->gfu_boost =
+            VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+                            (rc->avg_frame_low_motion + 100));
+      else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0)
+        rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
       rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
     }
-    adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
+    if (rc->constrain_gf_key_freq_onepass_vbr)
+      adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = 0;
@@ -2105,10 +2097,29 @@
       rc->alt_ref_gf_group = 1;
     }
   }
+}
+
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0)) {
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_video_frame != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  vp9_set_gf_update_one_pass_vbr(cpi);
   if (cm->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
   vp9_rc_set_frame_target(cpi, target);
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
     vp9_cyclic_refresh_update_parameters(cpi);
@@ -2526,26 +2537,25 @@
     rc->min_gf_interval = FIXED_GF_INTERVAL;
     rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
   } else {
+    double framerate = cpi->framerate;
     // Set Maximum gf/arf interval
     rc->max_gf_interval = oxcf->max_gf_interval;
     rc->min_gf_interval = oxcf->min_gf_interval;
 #if CONFIG_RATE_CTRL
+    if (oxcf->use_simple_encode_api) {
+      // In this experiment, we avoid framerate being changed dynamically during
+      // encoding.
+      framerate = oxcf->init_framerate;
+    }
+#endif  // CONFIG_RATE_CTRL
     if (rc->min_gf_interval == 0) {
       rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, oxcf->init_framerate);
+          oxcf->width, oxcf->height, framerate);
     }
     if (rc->max_gf_interval == 0) {
-      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-          oxcf->init_framerate, rc->min_gf_interval);
+      rc->max_gf_interval =
+          vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval);
     }
-#else
-    if (rc->min_gf_interval == 0)
-      rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, cpi->framerate);
-    if (rc->max_gf_interval == 0)
-      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-          cpi->framerate, rc->min_gf_interval);
-#endif
 
     // Extended max interval for genuinely static scenes like slide shows.
     rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
@@ -2953,7 +2963,7 @@
         }
       }
     }
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
     vp9_rc_set_frame_target(cpi, target);
   }
   rc->prev_avg_source_sad_lag = avg_source_sad_lag;
@@ -3163,7 +3173,7 @@
           VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
       adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
       rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      target = calc_pframe_target_size_one_pass_vbr(cpi);
+      target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
       vp9_rc_set_frame_target(cpi, target);
       rc->count_last_scene_change = 0;
     } else {
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -27,6 +27,9 @@
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
@@ -204,6 +207,10 @@
   int preserve_arf_as_gld;
   int preserve_next_arf_as_gld;
   int show_arf_as_gld;
+
+  // Flag to constrain golden frame interval on key frame frequency for 1 pass
+  // VBR.
+  int constrain_gf_key_freq_onepass_vbr;
 } RATE_CONTROL;
 
 struct VP9_COMP;
@@ -255,6 +262,9 @@
 void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
 int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
 int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi);
 void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi);
 void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
 
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -197,28 +197,68 @@
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
                                                               128, 144, 144 };
 
+// Configure Vizier RD parameters.
+// Later this function will use passed in command line values.
+void vp9_init_rd_parameters(VP9_COMP *cpi) {
+  RD_CONTROL *const rdc = &cpi->rd_ctrl;
+
+  // When |use_vizier_rc_params| is 1, we expect the rd parameters have been
+  // initialized by the passed-in values.
+  // Be careful that parameters below are only initialized to 1, if we do not
+  // pass values to them. It is desired to take care of each parameter when
+  // using |use_vizier_rc_params|.
+  if (cpi->twopass.use_vizier_rc_params) return;
+
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  rdc->rd_mult_inter_qp_fac = 1.0;
+  rdc->rd_mult_arf_qp_fac = 1.0;
+  rdc->rd_mult_key_qp_fac = 1.0;
+}
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run
+static double def_inter_rd_multiplier(int qindex) {
+  return 4.15 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run
+static double def_arf_rd_multiplier(int qindex) {
+  return 4.25 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run
+static double def_kf_rd_multiplier(int qindex) {
+  return 4.35 + (0.001 * (double)qindex);
+}
+
 int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
-  // largest dc_quant is 21387, therefore rdmult should always fit in int32_t
+  const RD_CONTROL *rdc = &cpi->rd_ctrl;
   const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
-  uint32_t rdmult = q * q;
+  // largest dc_quant is 21387, therefore rdmult should fit in int32_t
+  int rdmult = q * q;
 
-  if (cpi->common.frame_type != KEY_FRAME) {
-    if (qindex < 128)
-      rdmult = rdmult * 4;
-    else if (qindex < 190)
-      rdmult = rdmult * 4 + rdmult / 2;
-    else
-      rdmult = rdmult * 3;
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    double def_rd_q_mult = def_kf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac);
+  } else if (!cpi->rc.is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    double def_rd_q_mult = def_arf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac);
   } else {
-    if (qindex < 64)
-      rdmult = rdmult * 4;
-    else if (qindex <= 128)
-      rdmult = rdmult * 3 + rdmult / 2;
-    else if (qindex < 190)
-      rdmult = rdmult * 4 + rdmult / 2;
-    else
-      rdmult = rdmult * 7 + rdmult / 2;
+    double def_rd_q_mult = def_inter_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac);
   }
+
 #if CONFIG_VP9_HIGHBITDEPTH
   switch (cpi->common.bit_depth) {
     case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -101,6 +101,13 @@
   THR_INTRA,
 } THR_MODES_SUB8X8;
 
+typedef struct {
+  // RD multiplier control factors added for Vizier project.
+  double rd_mult_inter_qp_fac;
+  double rd_mult_arf_qp_fac;
+  double rd_mult_key_qp_fac;
+} RD_CONTROL;
+
 typedef struct RD_OPT {
   // Thresh_mult is used to set a threshold for the rd score. A higher value
   // means that we will accept the best mode so far more often. This number
@@ -143,6 +150,8 @@
 struct TileDataEnc;
 struct VP9_COMP;
 struct macroblock;
+
+void vp9_init_rd_parameters(struct VP9_COMP *cpi);
 
 int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex);
 
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -745,8 +745,8 @@
   MODE_INFO *const mi = xd->mi[0];
   int64_t rd1, rd2, rd;
   int rate;
-  int64_t dist;
-  int64_t sse;
+  int64_t dist = INT64_MAX;
+  int64_t sse = INT64_MAX;
   const int coeff_ctx =
       combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]);
   struct buf_2d *recon = args->this_recon;
@@ -799,6 +799,13 @@
     if (max_txsize_lookup[plane_bsize] == tx_size)
       skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))];
 
+    // This reduces the risk of bad perceptual quality due to bad prediction.
+    // We always force the encoder to perform transform and quantization.
+    if (!args->cpi->sf.allow_skip_txfm_ac_dc &&
+        skip_txfm_flag == SKIP_TXFM_AC_DC) {
+      skip_txfm_flag = SKIP_TXFM_NONE;
+    }
+
     if (skip_txfm_flag == SKIP_TXFM_NONE ||
         (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
       // full forward transform and quantization
@@ -827,17 +834,7 @@
         dist = VPXMAX(0, sse - dc_correct);
       }
     } else {
-      // SKIP_TXFM_AC_DC
-      // skip forward transform. Because this is handled here, the quantization
-      // does not need to do it.
-      x->plane[plane].eobs[block] = 0;
-      sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
-      dist = sse;
-      if (recon) {
-        uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
-        copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
-                           blk_row, blk_col, plane_bsize, tx_bsize);
-      }
+      assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC.");
     }
   }
 
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -345,7 +345,6 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
     sf->adaptive_interp_filter_search = 1;
-    sf->allow_partition_search_skip = 1;
 
     if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
       for (i = 0; i < MAX_MESH_STEP; ++i) {
@@ -931,7 +930,6 @@
   sf->max_delta_qindex = 0;
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
-  sf->allow_partition_search_skip = 0;
   sf->allow_txfm_domain_distortion = 0;
   sf->tx_domain_thresh = 99.0;
   sf->allow_quant_coeff_opt = sf->optimize_coefficients;
@@ -940,6 +938,7 @@
   sf->enable_tpl_model = oxcf->enable_tpl_model;
   sf->prune_ref_frame_for_rect_partitions = 0;
   sf->temporal_filter_search_method = MESH;
+  sf->allow_skip_txfm_ac_dc = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -525,9 +525,6 @@
     int prune_rect_thresh[4];
   } rd_ml_partition;
 
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
-
   // Fast approximation of vp9_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
@@ -612,6 +609,12 @@
   // For real-time mode: force DC only under intra search when content
   // does not have high souce SAD.
   int rt_intra_dc_only_low_content;
+
+  // The encoder has a feature that skips forward transform and quantization
+  // based on a model rd estimation to reduce encoding time.
+  // However, this feature is dangerous since it could lead to bad perceptual
+  // quality. This flag is added to guard the feature.
+  int allow_skip_txfm_ac_dc;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -322,8 +322,8 @@
     const int prev_layer_target_bandwidth =
         oxcf->layer_target_bitrate[st_idx - 1];
     lc->avg_frame_size =
-        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
-              (lc->framerate - prev_layer_framerate));
+        (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+                   (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -956,7 +956,7 @@
   if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
       !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame)
     svc->non_reference_frame = 1;
-  // For non-flexible mode, where update_buffer_slot is used, need to check if
+  // For flexible mode, where update_buffer_slot is used, need to check if
   // all buffer slots are not refreshed.
   if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
     if (svc->update_buffer_slot[svc->spatial_layer_id] != 0)
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -173,6 +173,8 @@
   uint8_t fb_idx_temporal_layer_id[REF_FRAMES];
 
   int spatial_layer_sync[VPX_SS_MAX_LAYERS];
+  // Quantizer for each spatial layer.
+  int base_qindex[VPX_SS_MAX_LAYERS];
   uint8_t set_intra_only_frame;
   uint8_t previous_frame_is_intra_only;
   uint8_t superframe_has_layer_sync;
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -11,6 +11,7 @@
 
 #include <new>
 
+#include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vpx/vp8cx.h"
@@ -24,10 +25,19 @@
                                                 VP9RateControlRTC());
   if (!rc_api) return nullptr;
   rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
-  if (rc_api->cpi_ == nullptr) {
-    return nullptr;
-  }
+  if (!rc_api->cpi_) return nullptr;
+  vp9_zero(*rc_api->cpi_);
+
   rc_api->InitRateControl(cfg);
+  if (cfg.aq_mode) {
+    VP9_COMP *const cpi = rc_api->cpi_;
+    cpi->segmentation_map = static_cast<uint8_t *>(
+        vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
+                   sizeof(*cpi->segmentation_map)));
+    cpi->cyclic_refresh =
+        vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
+    cpi->cyclic_refresh->content_mode = 0;
+  }
   return rc_api;
 }
 
@@ -38,13 +48,18 @@
   cm->profile = PROFILE_0;
   cm->bit_depth = VPX_BITS_8;
   cm->show_frame = 1;
-  oxcf->rc_mode = VPX_CBR;
+  oxcf->profile = cm->profile;
+  oxcf->bit_depth = cm->bit_depth;
+  oxcf->rc_mode = rc_cfg.rc_mode;
   oxcf->pass = 0;
-  oxcf->aq_mode = NO_AQ;
+  oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
   oxcf->content = VP9E_CONTENT_DEFAULT;
   oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  rc->kf_boost = DEFAULT_KF_BOOST;
 
   UpdateRateControl(rc_cfg);
+  vp9_set_mb_mi(cm, cm->width, cm->height);
 
   cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 ||
                    cpi_->svc.number_temporal_layers > 1)
@@ -55,8 +70,8 @@
   rc->rc_2_frame = 0;
   vp9_rc_init_minq_luts();
   vp9_rc_init(oxcf, 0, rc);
+  rc->constrain_gf_key_freq_onepass_vbr = 0;
   cpi_->sf.use_nonrd_pick_mode = 1;
-  cm->current_video_frame = 0;
 }
 
 void VP9RateControlRTC::UpdateRateControl(
@@ -73,6 +88,7 @@
   oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer);
   rc->worst_quality = oxcf->worst_allowed_q;
   rc->best_quality = oxcf->best_allowed_q;
+  oxcf->init_framerate = rc_cfg.framerate;
   oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth;
   oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz;
   oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz;
@@ -85,10 +101,11 @@
       (rc_cfg.ts_number_layers > 1) ? rc_cfg.ts_number_layers : 0);
 
   cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
   cpi_->framerate = rc_cfg.framerate;
   cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
   cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
-
+  vp9_set_mb_mi(cm, cm->width, cm->height);
   for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
     for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
       const int layer =
@@ -138,11 +155,27 @@
   cpi_->sf.use_nonrd_pick_mode = 1;
   if (cpi_->svc.number_spatial_layers == 1 &&
       cpi_->svc.number_temporal_layers == 1) {
-    int target;
-    if (frame_is_intra_only(cm))
-      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_);
-    else
-      target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_);
+    int target = 0;
+    if (cpi_->oxcf.rc_mode == VPX_CBR) {
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_);
+    } else if (cpi_->oxcf.rc_mode == VPX_VBR) {
+      if (cm->frame_type == KEY_FRAME) {
+        cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0;
+        cpi_->rc.frames_to_key = cpi_->oxcf.key_freq;
+      }
+      vp9_set_gf_update_one_pass_vbr(cpi_);
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_);
+    }
     vp9_rc_set_frame_target(cpi_, target);
     vp9_update_buffer_level_preencode(cpi_);
   } else {
@@ -153,6 +186,8 @@
   int bottom_index, top_index;
   cpi_->common.base_qindex =
       vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index);
+
+  if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_);
 }
 
 int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; }
@@ -161,6 +196,14 @@
   struct loopfilter *const lf = &cpi_->common.lf;
   vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q);
   return lf->filter_level;
+}
+
+signed char *VP9RateControlRTC::GetCyclicRefreshMap() const {
+  return cpi_->cyclic_refresh->map;
+}
+
+int *VP9RateControlRTC::GetDeltaQ() const {
+  return cpi_->cyclic_refresh->qindex_delta;
 }
 
 void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -18,27 +18,30 @@
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/vp9_iface_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/vp9_cx_iface.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
 #include "vpx_mem/vpx_mem.h"
 
 namespace libvpx {
 
-struct VP9RateControlRtcConfig {
-  int width;
-  int height;
-  // 0-63
-  int max_quantizer;
-  int min_quantizer;
-  int64_t target_bandwidth;
-  int64_t buf_initial_sz;
-  int64_t buf_optimal_sz;
-  int64_t buf_sz;
-  int undershoot_pct;
-  int overshoot_pct;
-  int max_intra_bitrate_pct;
-  double framerate;
+struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP9RateControlRtcConfig() {
+    vp9_zero(max_quantizers);
+    vp9_zero(min_quantizers);
+    vp9_zero(scaling_factor_den);
+    vp9_zero(scaling_factor_num);
+    vp9_zero(layer_target_bitrate);
+    vp9_zero(ts_rate_decimator);
+    scaling_factor_num[0] = 1;
+    scaling_factor_den[0] = 1;
+    max_quantizers[0] = max_quantizer;
+    min_quantizers[0] = min_quantizer;
+  }
+
   // Number of spatial layers
   int ss_number_layers;
   // Number of temporal layers
@@ -47,8 +50,6 @@
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int layer_target_bitrate[VPX_MAX_LAYERS];
-  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
 };
 
 struct VP9FrameParamsQpRTC {
@@ -58,7 +59,7 @@
 };
 
 // This interface allows using VP9 real-time rate control without initializing
-// the encoder. To use this interface, you need to link with libvp9rc.a.
+// the encoder. To use this interface, you need to link with libvpxrc.a.
 //
 // #include "vp9/ratectrl_rtc.h"
 // VP9RateControlRTC rc_api;
@@ -84,15 +85,23 @@
       const VP9RateControlRtcConfig &cfg);
   ~VP9RateControlRTC() {
     if (cpi_) {
-      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
-        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
-          int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
-          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
-          vpx_free(lc->map);
-          vpx_free(lc->last_coded_q_map);
-          vpx_free(lc->consec_zero_mv);
+      if (cpi_->svc.number_spatial_layers > 1 ||
+          cpi_->svc.number_temporal_layers > 1) {
+        for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+          for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+            int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
+            LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+            vpx_free(lc->map);
+            vpx_free(lc->last_coded_q_map);
+            vpx_free(lc->consec_zero_mv);
+          }
         }
       }
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+        vpx_free(cpi_->segmentation_map);
+        cpi_->segmentation_map = NULL;
+        vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
+      }
       vpx_free(cpi_);
     }
   }
@@ -101,6 +110,8 @@
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
   int GetLoopfilterLevel() const;
+  signed char *GetCyclicRefreshMap() const;
+  int *GetDeltaQ() const;
   void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
   // Feedback to rate control with the size of current encoded frame
   void PostEncodeUpdate(uint64_t encoded_frame_size);
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -793,6 +793,7 @@
   if (enc_pass == VPX_RC_FIRST_PASS) {
     oxcf.lag_in_frames = 0;
   }
+  oxcf.use_simple_encode_api = 1;
   return oxcf;
 }
 
@@ -872,14 +873,14 @@
   const VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
       VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list);
-  VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
-  struct lookahead_ctx *lookahead = cpi->lookahead;
+  impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+  struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead;
   int i;
   int use_highbitdepth = 0;
   const int num_rows_16x16 = get_num_unit_16x16(frame_height_);
   const int num_cols_16x16 = get_num_unit_16x16(frame_width_);
 #if CONFIG_VP9_HIGHBITDEPTH
-  use_highbitdepth = cpi->common.use_highbitdepth;
+  use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
 #endif
   vpx_image_t img;
   vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
@@ -905,30 +906,35 @@
         ENCODE_FRAME_RESULT encode_frame_info;
         vp9_init_encode_frame_result(&encode_frame_info);
         // TODO(angiebird): Call vp9_first_pass directly
-        vp9_get_compressed_data(cpi, &frame_flags, &size, nullptr, &time_stamp,
-                                &time_end, flush, &encode_frame_info);
+        vp9_get_compressed_data(impl_ptr_->cpi, &frame_flags, &size, nullptr,
+                                &time_stamp, &time_end, flush,
+                                &encode_frame_info);
         // vp9_get_compressed_data only generates first pass stats not
         // compresses data
         assert(size == 0);
         // Get vp9 first pass motion vector info.
         std::vector<MotionVectorInfo> mv_info(num_rows_16x16 * num_cols_16x16);
-        update_motion_vector_info(cpi->fp_motion_vector_info, num_rows_16x16,
-                                  num_cols_16x16, mv_info.data(),
-                                  kMotionVectorFullPixelPrecision);
+        update_motion_vector_info(
+            impl_ptr_->cpi->fp_motion_vector_info, num_rows_16x16,
+            num_cols_16x16, mv_info.data(), kMotionVectorFullPixelPrecision);
         fp_motion_vector_info_.push_back(mv_info);
       }
-      impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass));
+      impl_ptr_->first_pass_stats.push_back(
+          vp9_get_frame_stats(&impl_ptr_->cpi->twopass));
     }
   }
-  vp9_end_first_pass(cpi);
   // TODO(angiebird): Store the total_stats apart form first_pass_stats
-  impl_ptr_->first_pass_stats.push_back(vp9_get_total_stats(&cpi->twopass));
-  free_encoder(cpi);
-  rewind(in_file_);
-  vpx_img_free(&img);
+  impl_ptr_->first_pass_stats.push_back(
+      vp9_get_total_stats(&impl_ptr_->cpi->twopass));
+  vp9_end_first_pass(impl_ptr_->cpi);
 
   // Generate key_frame_map based on impl_ptr_->first_pass_stats.
   key_frame_map_ = ComputeKeyFrameMap();
+
+  free_encoder(impl_ptr_->cpi);
+  impl_ptr_->cpi = nullptr;
+  rewind(in_file_);
+  vpx_img_free(&img);
 }
 
 std::vector<std::vector<double>> SimpleEncode::ObserveFirstPassStats() {
@@ -1004,8 +1010,7 @@
 static GOP_COMMAND GetGopCommand(const std::vector<int> &gop_map,
                                  int start_show_index) {
   GOP_COMMAND gop_command;
-  if (gop_map.size() > 0) {
-    assert(static_cast<size_t>(start_show_index) < gop_map.size());
+  if (static_cast<size_t>(start_show_index) < gop_map.size()) {
     assert((gop_map[start_show_index] & kGopMapFlagStart) != 0);
     int end_show_index = start_show_index + 1;
     // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is
@@ -1049,6 +1054,11 @@
   frame_coding_index_ = 0;
   show_frame_count_ = 0;
 
+  assert(impl_ptr_->cpi != nullptr);
+  FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
+  unsigned int screen_area = frame_info.frame_width * frame_info.frame_height;
+  vp9_init_vizier_params(&impl_ptr_->cpi->twopass, screen_area);
+
   UpdateKeyFrameGroup(show_frame_count_);
 
   const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_);
@@ -1084,8 +1094,7 @@
   const VP9_COMP *cpi = impl_ptr_->cpi;
   key_frame_group_index_ = 0;
   key_frame_group_size_ = vp9_get_frames_to_next_key(
-      &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info,
-      key_frame_show_index, cpi->rc.min_gf_interval);
+      &cpi->oxcf, &cpi->twopass, key_frame_show_index, cpi->rc.min_gf_interval);
   assert(key_frame_group_size_ > 0);
   // Init the reference frame info when a new key frame group appears.
   InitRefFrameInfo(&ref_frame_info_);
@@ -1239,7 +1248,7 @@
     start_show_index += gop_command.show_frame_count;
     coding_frame_count += gop_command_coding_frame_count(&gop_command);
   }
-  assert(start_show_index == gop_map.size());
+  assert(static_cast<size_t>(start_show_index) == gop_map.size());
   return coding_frame_count;
 }
 
@@ -1250,6 +1259,7 @@
   }
 
   // These are the default settings for now.
+  TWO_PASS twopass;
   const int multi_layer_arf = 0;
   const int allow_alt_ref = 1;
   vpx_rational_t frame_rate =
@@ -1258,30 +1268,30 @@
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
       VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
   FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
-  FIRST_PASS_INFO first_pass_info;
-  fps_init_first_pass_info(&first_pass_info,
+  fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
                            num_frames_);
-  return vp9_get_coding_frame_num(&oxcf, &frame_info, &first_pass_info,
-                                  multi_layer_arf, allow_alt_ref);
+  unsigned int screen_area = frame_info.frame_width * frame_info.frame_height;
+  vp9_init_vizier_params(&twopass, screen_area);
+  return vp9_get_coding_frame_num(&oxcf, &twopass, &frame_info, multi_layer_arf,
+                                  allow_alt_ref);
 }
 
 std::vector<int> SimpleEncode::ComputeKeyFrameMap() const {
   // The last entry of first_pass_stats is the overall stats.
-  assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1);
+  assert(impl_ptr_->first_pass_stats.size() ==
+         static_cast<size_t>(num_frames_) + 1);
   vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
   const VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
       VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
-  FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
-  FIRST_PASS_INFO first_pass_info;
-  fps_init_first_pass_info(&first_pass_info,
+  TWO_PASS twopass;
+  fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
                            num_frames_);
   std::vector<int> key_frame_map(num_frames_, 0);
-  vp9_get_key_frame_map(&oxcf, &frame_info, &first_pass_info,
-                        GetVectorData(key_frame_map));
+  vp9_get_key_frame_map(&oxcf, &twopass, GetVectorData(key_frame_map));
   return key_frame_map;
 }
 
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -348,6 +348,24 @@
   }
   RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE);
+
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -565,10 +583,6 @@
 
   vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in);
 
-#if CONFIG_FP_MB_STATS
-  oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
-#endif
-
   oxcf->color_space = extra_cfg->color_space;
   oxcf->color_range = extra_cfg->color_range;
   oxcf->render_width = extra_cfg->render_width;
@@ -634,10 +648,135 @@
   }
 
   if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
+  oxcf->use_simple_encode_api = 0;
   // vp9_dump_encoder_config(oxcf, stderr);
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t set_twopass_params_from_config(
+    const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) {
+  if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK;
+  if (cpi == NULL) return VPX_CODEC_ERROR;
+
+  cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params;
+
+  // The values set here are factors that will be applied to default values
+  // to get the final value used in the two pass code. Hence 1.0 will
+  // match the default behaviour when not using passed in values.
+  // We also apply limits here to prevent the user from applying settings
+  // that make no sense.
+  cpi->twopass.active_wq_factor =
+      (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den;
+  if (cpi->twopass.active_wq_factor < 0.25)
+    cpi->twopass.active_wq_factor = 0.25;
+  else if (cpi->twopass.active_wq_factor > 16.0)
+    cpi->twopass.active_wq_factor = 16.0;
+
+  cpi->twopass.err_per_mb =
+      (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den;
+  if (cpi->twopass.err_per_mb < 0.25)
+    cpi->twopass.err_per_mb = 0.25;
+  else if (cpi->twopass.err_per_mb > 4.0)
+    cpi->twopass.err_per_mb = 4.0;
+
+  cpi->twopass.sr_default_decay_limit =
+      (double)cfg->sr_default_decay_limit.num /
+      (double)cfg->sr_default_decay_limit.den;
+  if (cpi->twopass.sr_default_decay_limit < 0.25)
+    cpi->twopass.sr_default_decay_limit = 0.25;
+  // If the default changes this will need to change.
+  else if (cpi->twopass.sr_default_decay_limit > 1.33)
+    cpi->twopass.sr_default_decay_limit = 1.33;
+
+  cpi->twopass.sr_diff_factor =
+      (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den;
+  if (cpi->twopass.sr_diff_factor < 0.25)
+    cpi->twopass.sr_diff_factor = 0.25;
+  else if (cpi->twopass.sr_diff_factor > 4.0)
+    cpi->twopass.sr_diff_factor = 4.0;
+
+  cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num /
+                               (double)cfg->kf_err_per_mb_factor.den;
+  if (cpi->twopass.kf_err_per_mb < 0.25)
+    cpi->twopass.kf_err_per_mb = 0.25;
+  else if (cpi->twopass.kf_err_per_mb > 4.0)
+    cpi->twopass.kf_err_per_mb = 4.0;
+
+  cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num /
+                                    (double)cfg->kf_frame_min_boost_factor.den;
+  if (cpi->twopass.kf_frame_min_boost < 0.25)
+    cpi->twopass.kf_frame_min_boost = 0.25;
+  else if (cpi->twopass.kf_frame_min_boost > 4.0)
+    cpi->twopass.kf_frame_min_boost = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_first =
+      (double)cfg->kf_frame_max_boost_first_factor.num /
+      (double)cfg->kf_frame_max_boost_first_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_first < 0.25)
+    cpi->twopass.kf_frame_max_boost_first = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_first > 4.0)
+    cpi->twopass.kf_frame_max_boost_first = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_subs =
+      (double)cfg->kf_frame_max_boost_subs_factor.num /
+      (double)cfg->kf_frame_max_boost_subs_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_subs < 0.25)
+    cpi->twopass.kf_frame_max_boost_subs = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_subs > 4.0)
+    cpi->twopass.kf_frame_max_boost_subs = 4.0;
+
+  cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num /
+                                    (double)cfg->kf_max_total_boost_factor.den;
+  if (cpi->twopass.kf_max_total_boost < 0.25)
+    cpi->twopass.kf_max_total_boost = 0.25;
+  else if (cpi->twopass.kf_max_total_boost > 4.0)
+    cpi->twopass.kf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num /
+                                    (double)cfg->gf_max_total_boost_factor.den;
+  if (cpi->twopass.gf_max_total_boost < 0.25)
+    cpi->twopass.gf_max_total_boost = 0.25;
+  else if (cpi->twopass.gf_max_total_boost > 4.0)
+    cpi->twopass.gf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num /
+                                    (double)cfg->gf_frame_max_boost_factor.den;
+  if (cpi->twopass.gf_frame_max_boost < 0.25)
+    cpi->twopass.gf_frame_max_boost = 0.25;
+  else if (cpi->twopass.gf_frame_max_boost > 4.0)
+    cpi->twopass.gf_frame_max_boost = 4.0;
+
+  cpi->twopass.zm_factor =
+      (double)cfg->zm_factor.num / (double)cfg->zm_factor.den;
+  if (cpi->twopass.zm_factor < 0.25)
+    cpi->twopass.zm_factor = 0.25;
+  else if (cpi->twopass.zm_factor > 2.0)
+    cpi->twopass.zm_factor = 2.0;
+
+  cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num /
+                                      (double)cfg->rd_mult_inter_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_arf_qp_fac =
+      (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_key_qp_fac =
+      (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0;
+
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t *cfg) {
   vpx_codec_err_t res;
@@ -664,6 +803,7 @@
   if (res == VPX_CODEC_OK) {
     ctx->cfg = *cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
     // On profile change, request a key frame
     force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
@@ -690,6 +830,25 @@
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  int i;
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; i++) {
+    arg[i] = ctx->cpi->svc.base_qindex[i];
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = ctx->cpi->common.lf.filter_level;
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
                                         const struct vp9_extracfg *extra_cfg) {
   const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
@@ -696,6 +855,7 @@
   if (res == VPX_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
   return res;
@@ -886,6 +1046,18 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  if (data) {
+    cpi->compute_frame_low_motion_onepass = 0;
+    cpi->rc.constrain_gf_key_freq_onepass_vbr = 0;
+    cpi->cyclic_refresh->content_mode = 0;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_enable_motion_vector_unit_test(
     vpx_codec_alg_priv_t *ctx, va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -940,6 +1112,7 @@
 #endif
       priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
       if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR;
+      set_twopass_params_from_config(&priv->cfg, priv->cpi);
     }
   }
 
@@ -1744,6 +1917,7 @@
   if (oxcf->pass == 2) {
     const FRAME_INFO *frame_info = &cpi->frame_info;
     vpx_rc_config_t ratectrl_config;
+    vpx_codec_err_t codec_status;
 
     ratectrl_config.frame_width = frame_info->frame_width;
     ratectrl_config.frame_height = frame_info->frame_height;
@@ -1755,7 +1929,10 @@
     ratectrl_config.frame_rate_num = oxcf->g_timebase.den;
     ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
 
-    vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
+    codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      return codec_status;
+    }
   }
   return VPX_CODEC_OK;
 }
@@ -1812,11 +1989,14 @@
   { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync },
   { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv },
   { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
+  { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
   { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
 
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers },
+  { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
   { VP9_GET_REFERENCE, ctrl_get_reference },
   { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id },
   { VP9E_GET_ACTIVEMAP, ctrl_get_active_map },
@@ -1879,14 +2059,30 @@
 
         VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
         { 0 },
-        { 0 },  // ss_target_bitrate
-        1,      // ts_number_layers
-        { 0 },  // ts_target_bitrate
-        { 0 },  // ts_rate_decimator
-        0,      // ts_periodicity
-        { 0 },  // ts_layer_id
-        { 0 },  // layer_taget_bitrate
-        0       // temporal_layering_mode
+        { 0 },     // ss_target_bitrate
+        1,         // ts_number_layers
+        { 0 },     // ts_target_bitrate
+        { 0 },     // ts_rate_decimator
+        0,         // ts_periodicity
+        { 0 },     // ts_layer_id
+        { 0 },     // layer_target_bitrate
+        0,         // temporal_layering_mode
+        0,         // use_vizier_rc_params
+        { 1, 1 },  // active_wq_factor
+        { 1, 1 },  // err_per_mb_factor
+        { 1, 1 },  // sr_default_decay_limit
+        { 1, 1 },  // sr_diff_factor
+        { 1, 1 },  // kf_err_per_mb_factor
+        { 1, 1 },  // kf_frame_min_boost_factor
+        { 1, 1 },  // kf_frame_max_boost_first_factor
+        { 1, 1 },  // kf_frame_max_boost_subs_factor
+        { 1, 1 },  // kf_max_total_boost_factor
+        { 1, 1 },  // gf_max_total_boost_factor
+        { 1, 1 },  // gf_frame_max_boost_factor
+        { 1, 1 },  // zm_factor
+        { 1, 1 },  // rd_mult_inter_qp_fac
+        { 1, 1 },  // rd_mult_arf_qp_fac
+        { 1, 1 },  // rd_mult_key_qp_fac
     } },
 };
 
@@ -2105,11 +2301,6 @@
   DUMP_STRUCT_VALUE(fp, oxcf, target_level);
 
   // TODO(angiebird): dump two_pass_stats_in
-
-#if CONFIG_FP_MB_STATS
-  // TODO(angiebird): dump firstpass_mb_stats_in
-#endif
-
   DUMP_STRUCT_VALUE(fp, oxcf, tuning);
   DUMP_STRUCT_VALUE(fp, oxcf, content);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -2123,6 +2314,8 @@
 
   DUMP_STRUCT_VALUE(fp, oxcf, row_mt);
   DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test);
+  DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv);
+  DUMP_STRUCT_VALUE(fp, oxcf, use_simple_encode_api);
 }
 
 FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) {
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -283,7 +283,7 @@
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-/*!\brief Decoder algorithm interface interface
+/*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
  */
--- /dev/null
+++ b/vpx/internal/vpx_ratectrl_rtc.h
@@ -1,0 +1,62 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_RATECTRL_RTC_H_
+
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx {
+struct VpxRateControlRtcConfig {
+ public:
+  VpxRateControlRtcConfig() {
+    width = 1280;
+    height = 720;
+    max_quantizer = 63;
+    min_quantizer = 2;
+    target_bandwidth = 1000;
+    buf_initial_sz = 600;
+    buf_optimal_sz = 600;
+    buf_sz = 1000;
+    undershoot_pct = overshoot_pct = 50;
+    max_intra_bitrate_pct = 50;
+    max_inter_bitrate_pct = 0;
+    framerate = 30.0;
+    ts_number_layers = 1;
+    rc_mode = VPX_CBR;
+    aq_mode = 0;
+    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+    ts_rate_decimator[0] = 1;
+  }
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  // Number of temporal layers
+  int ts_number_layers;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // vbr, cbr
+  enum vpx_rc_mode rc_mode;
+  int aq_mode;
+};
+}  // namespace libvpx
+#endif
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -22,8 +23,10 @@
                                      unsigned char *img_data) {
   unsigned int h, w, s, xcs, ycs, bps;
   unsigned int stride_in_bytes;
-  int align;
+  unsigned int align;
 
+  if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
+
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
 
@@ -88,8 +91,6 @@
     if (!img) goto fail;
 
     img->self_allocd = 1;
-  } else {
-    memset(img, 0, sizeof(vpx_image_t));
   }
 
   img->img_data = img_data;
@@ -152,9 +153,8 @@
 
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
@@ -165,7 +165,7 @@
     } else {
       const int bytes_per_sample =
           (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -712,6 +712,47 @@
    * Supported in codecs: VP9
    */
   VP9E_SET_EXTERNAL_RATE_CONTROL,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will do 3 things, only for 1 pass:
+   *  - Turn off low motion computation
+   *  - Turn off gf update constraint on key frame frequency
+   *  - Turn off content mode for cyclic refresh
+   *
+   * With those, the rate control is expected to work exactly the same as the
+   * interface provided in ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_RTC_EXTERNAL_RATECTRL,
+
+  /*!\brief Codec control function to get loopfilter level in the encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LOOPFILTER_LEVEL,
+
+  /*!\brief Codec control to get last quantizers for all spatial layers.
+   *
+   * Return value uses an array of internal quantizers scale defined by the
+   * codec, for all spatial layers.
+   * The size of the array passed in should be #VPX_SS_MAX_LAYERS.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LAST_QUANTIZER_SVC_LAYERS,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will turn off cyclic refresh for vp8.
+   *
+   * With this, the rate control is expected to work exactly the same as the
+   * interface provided in vp8_ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_RTC_EXTERNAL_RATECTRL,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -767,8 +808,8 @@
   unsigned int rows; /**< Number of rows. */
   unsigned int cols; /**< Number of columns. */
   /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */
-  int delta_q[8];  /**< Quantizer deltas. */
-  int delta_lf[8]; /**< Loop filter deltas. */
+  int delta_q[8];  /**< Quantizer deltas. Valid range: [-63, 63].*/
+  int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/
   /*! skip and ref frame segment is only used in VP9. */
   int skip[8];      /**< Skip this block. */
   int ref_frame[8]; /**< Reference frame for this block. */
@@ -969,6 +1010,9 @@
 #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
 #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
+VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
+#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
+
 VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
 #define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
 
@@ -1037,6 +1081,9 @@
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
 
+VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL
+
 VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST
 
@@ -1067,6 +1114,12 @@
 
 VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int)
 #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER
+
+VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL
+
+VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
 
 VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *)
 #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -33,6 +33,7 @@
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_ratectrl_rtc.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
 API_SRCS-yes += vpx_codec.h
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -58,7 +58,7 @@
  * fields to structures
  */
 #define VPX_ENCODER_ABI_VERSION \
-  (14 + VPX_CODEC_ABI_VERSION + \
+  (15 + VPX_CODEC_ABI_VERSION + \
    VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
@@ -457,7 +457,7 @@
 
   /*!\brief Target data rate
    *
-   * Target bandwidth to use for this stream, in kilobits per second.
+   * Target bitrate to use for this stream, in kilobits per second.
    */
   unsigned int rc_target_bitrate;
 
@@ -498,7 +498,7 @@
    * undershoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *   *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_undershoot_pct;
 
@@ -513,7 +513,7 @@
    * overshoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_overshoot_pct;
 
@@ -693,6 +693,151 @@
    *
    */
   int temporal_layering_mode;
+
+  /*!\brief A flag indicating whether to use external rate control parameters.
+   * Defaults to 0. If set to 1, the following parameters will be used in the
+   * rate control system.
+   */
+  int use_vizier_rc_params;
+
+  /*!\brief Active worst quality factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t active_wq_factor;
+
+  /*!\brief Error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t err_per_mb_factor;
+
+  /*!\brief Second reference default decay limit.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_default_decay_limit;
+
+  /*!\brief Second reference difference factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_diff_factor;
+
+  /*!\brief Keyframe error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_err_per_mb_factor;
+
+  /*!\brief Keyframe minimum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_min_boost_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe
+   * in a chunk.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_first_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_subs_factor;
+
+  /*!\brief Keyframe maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_frame_max_boost_factor;
+
+  /*!\brief Zero motion power factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t zm_factor;
+
+  /*!\brief Rate-distortion multiplier for inter frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_inter_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for alt-ref frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_arf_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for key frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the passed-in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_key_qp_fac;
 } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
 
 /*!\brief  vp9 svc extra configure parameters
--- a/vpx/vpx_ext_ratectrl.h
+++ b/vpx/vpx_ext_ratectrl.h
@@ -38,9 +38,15 @@
  *
  * The encoder will receive the decision from the external rate control model
  * through get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ *
+ * If max_frame_size = 0, the encoding ignores max frame size limit.
+ * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
+ * If the encoded frame size is larger than max_frame_size, the frame is
+ * recoded to meet the size limit, following VP9's recoding principles.
  */
 typedef struct vpx_rc_encodeframe_decision {
-  int q_index; /**< Quantizer step index [0..255]*/
+  int q_index;        /**< Quantizer step index [0..255]*/
+  int max_frame_size; /**< Maximum size allowed for the encoded frame */
 } vpx_rc_encodeframe_decision_t;
 
 /*!\brief Information for the frame to be encoded.
@@ -82,6 +88,7 @@
   int64_t sse;         /**< sum of squared error of the reconstructed frame */
   int64_t bit_count;   /**< number of bits spent on coding the frame*/
   int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/
+  int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/
 } vpx_rc_encodeframe_result_t;
 
 /*!\brief Status returned by rate control callback functions.
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -171,7 +171,8 @@
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in]    img       Image descriptor
  * \param[in]    x         leftmost column
@@ -179,7 +180,7 @@
  * \param[in]    w         width
  * \param[in]    h         height
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h);
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -22,8 +22,7 @@
 uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
   const uint8x16_t b = load_unaligned_u8q(a, a_stride);
   const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-  const uint32x2_t d = horizontal_add_uint16x8(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+  return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4;
 }
 
 uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
@@ -30,7 +29,6 @@
   int i;
   uint8x8_t b, c;
   uint16x8_t sum;
-  uint32x2_t d;
   b = vld1_u8(a);
   a += a_stride;
   c = vld1_u8(a);
@@ -43,9 +41,7 @@
     sum = vaddw_u8(sum, d);
   }
 
-  d = horizontal_add_uint16x8(sum);
-
-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
 }
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
@@ -139,8 +135,7 @@
     ref += 16;
   }
 
-  return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)),
-                       0);
+  return (int16_t)horizontal_add_uint16x8(vec_sum);
 }
 
 // ref, src = [0, 510] - max diff = 16-bits
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -15,19 +15,10 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
-static INLINE tran_low_t get_lane(const int32x2_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  return vget_lane_s32(a, 0);
-#else
-  return vget_lane_s16(vreinterpret_s16_s32(a), 0);
-#endif  // CONFIG_VP9_HIGHBITDETPH
-}
-
 void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x4_t a0, a1, a2, a3;
   int16x8_t b0, b1;
   int16x8_t c;
-  int32x2_t d;
 
   a0 = vld1_s16(input);
   input += stride;
@@ -42,9 +33,7 @@
 
   c = vaddq_s16(b0, b1);
 
-  d = horizontal_add_int16x8(c);
-
-  output[0] = get_lane(vshl_n_s32(d, 1));
+  output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1);
   output[1] = 0;
 }
 
@@ -57,7 +46,7 @@
     sum = vaddq_s16(sum, input_00);
   }
 
-  output[0] = get_lane(horizontal_add_int16x8(sum));
+  output[0] = (tran_low_t)horizontal_add_int16x8(sum);
   output[1] = 0;
 }
 
@@ -66,7 +55,7 @@
   int r;
   int16x8_t left = vld1q_s16(input);
   int16x8_t right = vld1q_s16(input + 8);
-  int32x2_t sum;
+  int32_t sum;
   input += stride;
 
   for (r = 1; r < 16; ++r) {
@@ -77,9 +66,9 @@
     right = vaddq_s16(right, b);
   }
 
-  sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
+  sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right);
 
-  output[0] = get_lane(vshr_n_s32(sum, 1));
+  output[0] = (tran_low_t)(sum >> 1);
   output[1] = 0;
 }
 
@@ -90,7 +79,7 @@
   int16x8_t a1 = vld1q_s16(input + 8);
   int16x8_t a2 = vld1q_s16(input + 16);
   int16x8_t a3 = vld1q_s16(input + 24);
-  int32x2_t sum;
+  int32_t sum;
   input += stride;
 
   for (r = 1; r < 32; ++r) {
@@ -105,9 +94,10 @@
     a3 = vaddq_s16(a3, b3);
   }
 
-  sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
-  sum = vadd_s32(sum, horizontal_add_int16x8(a2));
-  sum = vadd_s32(sum, horizontal_add_int16x8(a3));
-  output[0] = get_lane(vshr_n_s32(sum, 3));
+  sum = horizontal_add_int16x8(a0);
+  sum += horizontal_add_int16x8(a1);
+  sum += horizontal_add_int16x8(a2);
+  sum += horizontal_add_int16x8(a3);
+  output[0] = (tran_low_t)(sum >> 3);
   output[1] = 0;
 }
--- a/vpx_dsp/arm/highbd_loopfilter_neon.c
+++ b/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -661,6 +661,17 @@
   vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
 }
 
+// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 static void lpf_horizontal_16_kernel(uint16_t *s, int p,
                                      const uint16x8_t blimit_vec,
                                      const uint16x8_t limit_vec,
@@ -722,6 +733,10 @@
     store_4x8(s - 2, p, op1, op0, oq0, oq1);
   }
 }
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 
 void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
                                        const uint8_t *blimit,
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -975,6 +975,17 @@
 FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
 #undef FUN_LPF_16_KERNEL
 
+// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
@@ -1090,3 +1101,7 @@
               vget_high_u8(oq0), vget_high_u8(oq1));
   }
 }
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -19,6 +19,24 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Support for these xN intrinsics is lacking in older versions of GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 8 || defined(__arm__)
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+#endif
+
+#if __GNUC__ < 9 || defined(__arm__)
+static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif
+#endif
+
 static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
                                           const int16_t c2, const int16_t c3) {
   return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
@@ -95,7 +113,8 @@
 }
 
 // Load 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
+                                          ptrdiff_t stride) {
   uint32_t a;
   uint32x2_t a_u32 = vdup_n_u32(0);
   if (stride == 4) return vld1_u8(buf);
@@ -108,7 +127,7 @@
 }
 
 // Store 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
+static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
                                       const uint8x8_t a) {
   const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
   if (stride == 4) {
@@ -121,7 +140,8 @@
 }
 
 // Load 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
+                                            ptrdiff_t stride) {
   uint32_t a;
   uint32x4_t a_u32 = vdupq_n_u32(0);
   if (stride == 4) return vld1q_u8(buf);
@@ -141,7 +161,7 @@
 }
 
 // Store 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
+static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
   if (stride == 4) {
@@ -158,7 +178,7 @@
 }
 
 // Load 2 sets of 4 bytes when alignment is guaranteed.
-static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
   uint32x2_t a = vdup_n_u32(0);
 
   assert(!((intptr_t)buf % sizeof(uint32_t)));
@@ -171,7 +191,7 @@
 }
 
 // Store 2 sets of 4 bytes when alignment is guaranteed.
-static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
   uint32x2_t a_u32 = vreinterpret_u32_u8(a);
 
   assert(!((intptr_t)buf % sizeof(uint32_t)));
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -34,7 +34,9 @@
                             uint32_t *const res) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+#if !defined(__aarch64__)
   uint16x4_t a[2];
+#endif
   uint32x4_t r;
 
   assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
@@ -51,9 +53,14 @@
     abs[1] = vabal_u8(abs[1], s, ref23);
   }
 
+#if defined(__aarch64__)
+  abs[0] = vpaddq_u16(abs[0], abs[1]);
+  r = vpaddlq_u16(abs[0]);
+#else
   a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
   a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
   r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
+#endif
   vst1q_u32(res, r);
 }
 
@@ -74,6 +81,12 @@
 // Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
 static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  const uint32x4_t r = vpaddlq_u16(b0);
+#else
   const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -81,12 +94,23 @@
   const uint16x4_t b0 = vpadd_u16(a0, a1);
   const uint16x4_t b1 = vpadd_u16(a2, a3);
   const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
   vst1q_u32(res, r);
 }
 
+#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
+
 // Can handle 1024 pixels' sad sum (such as 32x32)
 static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint32x4_t b0 = vpaddlq_u16(a0);
+  const uint32x4_t b1 = vpaddlq_u16(a1);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
   const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -96,15 +120,26 @@
   const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
   const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
 static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
   const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t b0 = vpaddq_u32(a0, a1);
+  const uint32x4_t b1 = vpaddq_u32(a2, a3);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
   const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
   const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
   const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
@@ -112,11 +147,13 @@
   const uint32x2_t c0 = vpadd_u32(b0, b1);
   const uint32x2_t c1 = vpadd_u32(b2, b3);
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 4096 pixels' sad sum (such as 64x64)
 static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -129,6 +166,23 @@
   const uint32x4_t b1 = vaddq_u32(a2, a3);
   const uint32x4_t b2 = vaddq_u32(a4, a5);
   const uint32x4_t b3 = vaddq_u32(a6, a7);
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  const uint32x4_t r = vpaddq_u32(c0, c1);
+  vst1q_u32(res, r);
+#else
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t a4 = vpaddlq_u16(sum[4]);
+  const uint32x4_t a5 = vpaddlq_u16(sum[5]);
+  const uint32x4_t a6 = vpaddlq_u16(sum[6]);
+  const uint32x4_t a7 = vpaddlq_u16(sum[7]);
+  const uint32x4_t b0 = vaddq_u32(a0, a1);
+  const uint32x4_t b1 = vaddq_u32(a2, a3);
+  const uint32x4_t b2 = vaddq_u32(a4, a5);
+  const uint32x4_t b3 = vaddq_u32(a6, a7);
   const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
   const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
   const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
@@ -136,8 +190,11 @@
   const uint32x2_t d0 = vpadd_u32(c0, c1);
   const uint32x2_t d1 = vpadd_u32(c2, c3);
   vst1q_u32(res, vcombine_u32(d0, d1));
+#endif
 }
 
+#endif
+
 static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t *res, const int height) {
@@ -180,7 +237,42 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
+    (__ARM_FEATURE_DOTPROD == 1)
+
 static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
+                              uint32x4_t *const sum) {
+  const uint8x16_t r = vld1q_u8(ref_ptr);
+  const uint8x16_t diff = vabdq_u8(src_ptr, r);
+  *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1));
+}
+
+static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
+                             uint32_t *res, const int height) {
+  int i;
+  uint32x4_t r0, r1;
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  for (i = 0; i < height; ++i) {
+    const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride);
+    sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]);
+    sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]);
+    sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]);
+    sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]);
+  }
+
+  r0 = vpaddq_u32(sum[0], sum[1]);
+  r1 = vpaddq_u32(sum[2], sum[3]);
+  vst1q_u32(res, vpaddq_u32(r0, r1));
+}
+
+#else
+
+static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
                               uint16x8_t *const sum) {
   const uint8x16_t r = vld1q_u8(ref_ptr);
   *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
@@ -190,7 +282,7 @@
 static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4], int ref_stride,
                              uint32_t *res, const int height) {
-  int i, j;
+  int i;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -199,15 +291,22 @@
   for (i = 0; i < height; ++i) {
     const uint8x16_t s = vld1q_u8(src_ptr);
     src_ptr += src_stride;
-    for (j = 0; j < 4; ++j) {
-      sad16_neon(ref_loop[j], s, &sum[j]);
-      ref_loop[j] += ref_stride;
-    }
+    /* Manual unrolling here stops the compiler from getting confused. */
+    sad16_neon(ref_loop[0], s, &sum[0]);
+    ref_loop[0] += ref_stride;
+    sad16_neon(ref_loop[1], s, &sum[1]);
+    ref_loop[1] += ref_stride;
+    sad16_neon(ref_loop[2], s, &sum[2]);
+    ref_loop[2] += ref_stride;
+    sad16_neon(ref_loop[3], s, &sum[3]);
+    ref_loop[3] += ref_stride;
   }
 
   sad_512_pel_final_neon(sum, res);
 }
 
+#endif
+
 void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
@@ -228,8 +327,69 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
+    (__ARM_FEATURE_DOTPROD == 1)
+
 static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4], int ref_stride,
+                             uint32_t *res, const int height) {
+  int i;
+  uint32x4_t r0, r1;
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  for (i = 0; i < height; ++i) {
+    uint8x16_t s;
+
+    s = vld1q_u8(src_ptr + 0 * 16);
+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
+
+    s = vld1q_u8(src_ptr + 1 * 16);
+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
+
+    src_ptr += src_stride;
+    ref_loop[0] += ref_stride;
+    ref_loop[1] += ref_stride;
+    ref_loop[2] += ref_stride;
+    ref_loop[3] += ref_stride;
+  }
+
+  r0 = vpaddq_u32(sum[0], sum[1]);
+  r1 = vpaddq_u32(sum[2], sum[3]);
+  vst1q_u32(res, vpaddq_u32(r0, r1));
+}
+
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t *res) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t *res) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+}
+
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t *res) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64);
+}
+
+#else
+
+static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
                              const int height, uint16x8_t *const sum) {
   int i;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -284,14 +444,124 @@
   sad_2048_pel_final_neon(sum, res);
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
+    (__ARM_FEATURE_DOTPROD == 1)
+
 void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   int i;
+  uint32x4_t r0, r1;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  for (i = 0; i < 32; ++i) {
+    uint8x16_t s;
+
+    s = vld1q_u8(src_ptr + 0 * 16);
+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
+
+    s = vld1q_u8(src_ptr + 1 * 16);
+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
+
+    s = vld1q_u8(src_ptr + 2 * 16);
+    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
+
+    s = vld1q_u8(src_ptr + 3 * 16);
+    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
+
+    src_ptr += src_stride;
+    ref_loop[0] += ref_stride;
+    ref_loop[1] += ref_stride;
+    ref_loop[2] += ref_stride;
+    ref_loop[3] += ref_stride;
+  }
+
+  r0 = vpaddq_u32(sum[0], sum[1]);
+  r1 = vpaddq_u32(sum[2], sum[3]);
+  vst1q_u32(res, vpaddq_u32(r0, r1));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint32x4_t r0, r1, r2, r3;
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
+  uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  for (i = 0; i < 64; ++i) {
+    uint8x16_t s;
+
+    s = vld1q_u8(src_ptr + 0 * 16);
+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
+
+    s = vld1q_u8(src_ptr + 1 * 16);
+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
+
+    s = vld1q_u8(src_ptr + 2 * 16);
+    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
+    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
+    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
+
+    s = vld1q_u8(src_ptr + 3 * 16);
+    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
+    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
+    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
+    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
+
+    src_ptr += src_stride;
+    ref_loop[0] += ref_stride;
+    ref_loop[1] += ref_stride;
+    ref_loop[2] += ref_stride;
+    ref_loop[3] += ref_stride;
+  }
+
+  r0 = vpaddq_u32(sum[0], sum[1]);
+  r1 = vpaddq_u32(sum[2], sum[3]);
+  r2 = vpaddq_u32(sum[4], sum[5]);
+  r3 = vpaddq_u32(sum[6], sum[7]);
+  r0 = vpaddq_u32(r0, r1);
+  r1 = vpaddq_u32(r2, r3);
+  vst1q_u32(res, vpaddq_u32(r0, r1));
+}
+
+#else
+
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
@@ -378,3 +648,5 @@
 
   sad_4096_pel_final_neon(sum, res);
 }
+
+#endif
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -23,7 +23,7 @@
   const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
   uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
   abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+  return horizontal_add_uint16x8(abs);
 }
 
 uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
@@ -35,7 +35,7 @@
   const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
   uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
   abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+  return horizontal_add_uint16x8(abs);
 }
 
 uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
@@ -51,7 +51,7 @@
     abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
   }
 
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+  return horizontal_add_uint16x8(abs);
 }
 
 uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
@@ -71,7 +71,7 @@
     abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
   }
 
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+  return horizontal_add_uint16x8(abs);
 }
 
 static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
@@ -114,7 +114,7 @@
   uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
                                const uint8_t *ref_ptr, int ref_stride) {       \
     const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
+    return horizontal_add_uint16x8(abs);                                       \
   }                                                                            \
                                                                                \
   uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
@@ -122,7 +122,7 @@
                                    const uint8_t *second_pred) {               \
     const uint16x8_t abs =                                                     \
         sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
+    return horizontal_add_uint16x8(abs);                                       \
   }
 
 sad8xN(4);
@@ -172,7 +172,7 @@
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint16x8_t abs =                                                    \
         sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+    return horizontal_add_uint16x8(abs);                                      \
   }                                                                           \
                                                                               \
   uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
@@ -180,7 +180,7 @@
                                     const uint8_t *second_pred) {             \
     const uint16x8_t abs =                                                    \
         sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+    return horizontal_add_uint16x8(abs);                                      \
   }
 
 sad16xN(8);
@@ -240,7 +240,7 @@
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint16x8_t abs =                                                    \
         sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+    return horizontal_add_uint16x8(abs);                                      \
   }                                                                           \
                                                                               \
   uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
@@ -248,7 +248,7 @@
                                     const uint8_t *second_pred) {             \
     const uint16x8_t abs =                                                    \
         sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+    return horizontal_add_uint16x8(abs);                                      \
   }
 
 sad32xN(16);
@@ -338,7 +338,7 @@
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint32x4_t abs =                                                    \
         sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
+    return horizontal_add_uint32x4(abs);                                      \
   }                                                                           \
                                                                               \
   uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
@@ -346,7 +346,7 @@
                                     const uint8_t *second_pred) {             \
     const uint32x4_t abs =                                                    \
         sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
+    return horizontal_add_uint32x4(abs);                                      \
   }
 
 sad64xN(32);
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -16,23 +16,65 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) {
+static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
+#if defined(__aarch64__)
+  return vaddlvq_s16(a);
+#else
   const int32x4_t b = vpaddlq_s16(a);
   const int64x2_t c = vpaddlq_s32(b);
-  return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
-                  vreinterpret_s32_s64(vget_high_s64(c)));
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+  return vget_lane_s32(d, 0);
+#endif
 }
 
-static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
+static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(a);
+#else
   const uint32x4_t b = vpaddlq_u16(a);
   const uint64x2_t c = vpaddlq_u32(b);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                  vreinterpret_u32_u64(vget_high_u64(c)));
+  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                vreinterpret_u32_u64(vget_high_u64(c)));
+  return vget_lane_u32(d, 0);
+#endif
 }
 
-static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_s32(a);
+#else
+  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_u32(a);
+#else
+  return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_s32(a);
+#else
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
   const uint64x2_t b = vpaddlq_u32(a);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                  vreinterpret_u32_u64(vget_high_u64(b)));
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
 }
 #endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -19,6 +19,100 @@
 #include "vpx_dsp/arm/sum_neon.h"
 #include "vpx_ports/mem.h"
 
+#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+
+// Process a block of width 4 four rows at a time.
+static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride, int h,
+                               uint32_t *sse, int *sum) {
+  int i;
+  uint32x4_t sum_a = vdupq_n_u32(0);
+  uint32x4_t sum_b = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  for (i = 0; i < h; i += 4) {
+    const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride);
+    const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride);
+
+    const uint8x16_t abs_diff = vabdq_u8(a, b);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
+    sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
+
+    src_ptr += 4 * src_stride;
+    ref_ptr += 4 * ref_stride;
+  }
+
+  *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of any size where the width is divisible by 16.
+static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride, int w,
+                              int h, uint32_t *sse, int *sum) {
+  int i, j;
+  uint32x4_t sum_a = vdupq_n_u32(0);
+  uint32x4_t sum_b = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 16) {
+      const uint8x16_t a = vld1q_u8(src_ptr + j);
+      const uint8x16_t b = vld1q_u8(ref_ptr + j);
+
+      const uint8x16_t abs_diff = vabdq_u8(a, b);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
+      sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 8 two rows at a time.
+static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride, int h,
+                               uint32_t *sse, int *sum) {
+  int i = 0;
+  uint32x2_t sum_a = vdup_n_u32(0);
+  uint32x2_t sum_b = vdup_n_u32(0);
+  uint32x2_t sse_lo_u32 = vdup_n_u32(0);
+  uint32x2_t sse_hi_u32 = vdup_n_u32(0);
+
+  do {
+    const uint8x8_t a_0 = vld1_u8(src_ptr);
+    const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride);
+    const uint8x8_t b_0 = vld1_u8(ref_ptr);
+    const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride);
+
+    const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0);
+    const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1);
+    sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0);
+    sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1);
+
+    sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1));
+    sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1));
+    sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1));
+    sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1));
+
+    src_ptr += src_stride + src_stride;
+    ref_ptr += ref_stride + ref_stride;
+    i += 2;
+  } while (i < h);
+
+  *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b)));
+  *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
+}
+
+#else
+
 // The variance helper functions use int16_t for sum. 8 values are accumulated
 // and then added (at which point they expand up to int32_t). To avoid overflow,
 // there can be no more than 32767 / 255 ~= 128 values accumulated in each
@@ -66,10 +160,9 @@
     ref_ptr += 4 * ref_stride;
   }
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_uint32x4(
+      vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
 }
 
 // Process a block of any size where the width is divisible by 16.
@@ -115,10 +208,9 @@
     ref_ptr += ref_stride;
   }
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_uint32x4(
+      vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
 }
 
 // Process a block of width 8 two rows at a time.
@@ -157,12 +249,13 @@
     i += 2;
   } while (i < h);
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_uint32x4(
+      vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
 }
 
+#endif
+
 void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride,
                         unsigned int *sse, int *sum) {
@@ -264,117 +357,165 @@
   return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
 }
 
+#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+
 unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
                                const unsigned char *ref_ptr, int ref_stride,
                                unsigned int *sse) {
   int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  int64x1_t d0s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
+  uint8x16_t a[2], b[2], abs_diff[2];
+  uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
 
-  q7s32 = vdupq_n_s32(0);
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-    q0u8 = vld1q_u8(src_ptr);
+  for (i = 0; i < 8; i++) {
+    a[0] = vld1q_u8(src_ptr);
     src_ptr += src_stride;
-    q1u8 = vld1q_u8(src_ptr);
+    a[1] = vld1q_u8(src_ptr);
     src_ptr += src_stride;
-    q2u8 = vld1q_u8(ref_ptr);
+    b[0] = vld1q_u8(ref_ptr);
     ref_ptr += ref_stride;
-    q3u8 = vld1q_u8(ref_ptr);
+    b[1] = vld1q_u8(ref_ptr);
     ref_ptr += ref_stride;
 
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+    abs_diff[0] = vabdq_u8(a[0], b[0]);
+    abs_diff[1] = vabdq_u8(a[1], b[1]);
 
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+    sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]);
+    sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]);
+  }
 
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+  *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
+  return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
+}
 
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
+                                   const unsigned char *ref_ptr,
+                                   int ref_stride) {
+  uint8x8_t a[4], b[4], abs_diff[4];
+  uint32x2_t sse = vdup_n_u32(0);
+
+  a[0] = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  b[0] = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  a[1] = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  b[1] = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  a[2] = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  b[2] = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  a[3] = vld1_u8(src_ptr);
+  b[3] = vld1_u8(ref_ptr);
+
+  abs_diff[0] = vabd_u8(a[0], b[0]);
+  abs_diff[1] = vabd_u8(a[1], b[1]);
+  abs_diff[2] = vabd_u8(a[2], b[2]);
+  abs_diff[3] = vabd_u8(a[3], b[3]);
+
+  sse = vdot_u32(sse, abs_diff[0], abs_diff[0]);
+  sse = vdot_u32(sse, abs_diff[1], abs_diff[1]);
+  sse = vdot_u32(sse, abs_diff[2], abs_diff[2]);
+  sse = vdot_u32(sse, abs_diff[3], abs_diff[3]);
 
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
+  return vget_lane_u32(sse, 0);
+}
 
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q10s32 = vaddq_s32(q7s32, q9s32);
+#else
 
-  q1s64 = vpaddlq_s32(q10s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
+                               const unsigned char *ref_ptr, int ref_stride,
+                               unsigned int *sse) {
+  int i;
+  uint8x16_t a[2], b[2];
+  int16x4_t diff_lo[4], diff_hi[4];
+  uint16x8_t diff[4];
+  int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                           vdupq_n_s32(0) };
 
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+  for (i = 0; i < 8; i++) {
+    a[0] = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    a[1] = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    b[0] = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    b[1] = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0]));
+    diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0]));
+    diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1]));
+    diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1]));
+
+    diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+    diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+    sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]);
+    sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]);
+
+    diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
+    diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
+    sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]);
+    sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]);
+
+    diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+    diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+    sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]);
+    sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]);
+
+    diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
+    diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
+    sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]);
+    sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]);
+  }
+
+  sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]);
+  sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]);
+  sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]);
+
+  *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
+  return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
 }
 
 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
                                    const unsigned char *ref_ptr,
                                    int ref_stride) {
-  int16x4_t d22s16, d24s16, d26s16, d28s16;
-  int64x1_t d0s64;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
+  uint8x8_t a[4], b[4];
+  int16x4_t diff_lo[4];
+  uint16x8_t diff[4];
+  int32x4_t sse;
 
-  d0u8 = vld1_u8(src_ptr);
+  a[0] = vld1_u8(src_ptr);
   src_ptr += src_stride;
-  d4u8 = vld1_u8(ref_ptr);
+  b[0] = vld1_u8(ref_ptr);
   ref_ptr += ref_stride;
-  d1u8 = vld1_u8(src_ptr);
+  a[1] = vld1_u8(src_ptr);
   src_ptr += src_stride;
-  d5u8 = vld1_u8(ref_ptr);
+  b[1] = vld1_u8(ref_ptr);
   ref_ptr += ref_stride;
-  d2u8 = vld1_u8(src_ptr);
+  a[2] = vld1_u8(src_ptr);
   src_ptr += src_stride;
-  d6u8 = vld1_u8(ref_ptr);
+  b[2] = vld1_u8(ref_ptr);
   ref_ptr += ref_stride;
-  d3u8 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d7u8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
+  a[3] = vld1_u8(src_ptr);
+  b[3] = vld1_u8(ref_ptr);
 
-  q11u16 = vsubl_u8(d0u8, d4u8);
-  q12u16 = vsubl_u8(d1u8, d5u8);
-  q13u16 = vsubl_u8(d2u8, d6u8);
-  q14u16 = vsubl_u8(d3u8, d7u8);
+  diff[0] = vsubl_u8(a[0], b[0]);
+  diff[1] = vsubl_u8(a[1], b[1]);
+  diff[2] = vsubl_u8(a[2], b[2]);
+  diff[3] = vsubl_u8(a[3], b[3]);
 
-  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+  diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0]));
+  diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1]));
+  diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2]));
+  diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3]));
 
-  q7s32 = vmull_s16(d22s16, d22s16);
-  q8s32 = vmull_s16(d24s16, d24s16);
-  q9s32 = vmull_s16(d26s16, d26s16);
-  q10s32 = vmull_s16(d28s16, d28s16);
+  sse = vmull_s16(diff_lo[0], diff_lo[0]);
+  sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]);
+  sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]);
+  sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]);
 
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q9s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q9s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+  return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
 }
+
+#endif
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_ports/mem.h"
@@ -30,6 +31,741 @@
 // instructions. This optimization is much faster in speed unit test, but slowed
 // down the whole decoder by 5%.
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
+    (__ARM_FEATURE_DOTPROD == 1)
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
+                                        int8x8_t *a2, int8x8_t *a3,
+                                        int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
+  *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
+                                        int8x8_t *a2, int8x8_t *a3,
+                                        int8x16_t *b0, int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
+  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      uint8x8_t d01, d23;
+
+      s0 = vld1q_u8(src);
+      src += src_stride;
+      s1 = vld1q_u8(src);
+      src += src_stride;
+      s2 = vld1q_u8(src);
+      src += src_stride;
+      s3 = vld1q_u8(src);
+      src += src_stride;
+
+      t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
+      t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
+      t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
+      t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
+        d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
+        d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
+        d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      uint8x8_t d01, d23, dd01, dd23;
+      dd01 = vdup_n_u8(0);
+      dd23 = vdup_n_u8(0);
+
+      s0 = vld1q_u8(src);
+      src += src_stride;
+      s1 = vld1q_u8(src);
+      src += src_stride;
+      s2 = vld1q_u8(src);
+      src += src_stride;
+      s3 = vld1q_u8(src);
+      src += src_stride;
+
+      t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
+      t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
+      t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
+      t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
+        d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
+        d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
+        d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+
+        dd0 = vld1_u8(d + 0 * dst_stride);
+        dd1 = vld1_u8(d + 1 * dst_stride);
+        dd2 = vld1_u8(d + 2 * dst_stride);
+        dd3 = vld1_u8(d + 3 * dst_stride);
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    src += 4 * src_stride;
+    t4 = vld1_u8(src);
+    src += src_stride;
+    t5 = vld1_u8(src);
+    src += src_stride;
+    t6 = vld1_u8(src);
+    src += src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      s += 4 * src_stride;
+      t4 = vld1_u8(s);
+      s += src_stride;
+      t5 = vld1_u8(s);
+      s += src_stride;
+      t6 = vld1_u8(s);
+      s += src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                     correction, filters);
+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                     correction, filters);
+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                     correction, filters);
+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                     correction, filters);
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    src += 4 * src_stride;
+    t4 = vld1_u8(src);
+    src += src_stride;
+    t5 = vld1_u8(src);
+    src += src_stride;
+    t6 = vld1_u8(src);
+    src += src_stride;
+
+    /* Subtract 128: maps samples to [-128, 127] for signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+
+      /* Form s4567/s5678/s6789 by table lookup over {s3456, s78910}. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      s += 4 * src_stride;
+      t4 = vld1_u8(s);
+      s += src_stride;
+      t5 = vld1_u8(s);
+      s += src_stride;
+      t6 = vld1_u8(s);
+      s += src_stride;
+
+      /* Subtract 128: maps samples to [-128, 127] for signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Form the 4567/5678/6789 blocks from 3456 and 78910 (lo, then hi). */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                     correction, filters);
+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                     correction, filters);
+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                     correction, filters);
+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                     correction, filters);
+
+        dd0 = vld1_u8(d + 0 * dst_stride);
+        dd1 = vld1_u8(d + 1 * dst_stride);
+        dd2 = vld1_u8(d + 2 * dst_stride);
+        dd3 = vld1_u8(d + 3 * dst_stride);
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+#else
+
 static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                 const uint8x8_t s0, const uint8x8_t s1,
                                 const uint8x8_t s2, const uint8x8_t s3,
@@ -145,7 +881,7 @@
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
     const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
     const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
@@ -296,7 +1032,7 @@
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
@@ -402,7 +1138,7 @@
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
     const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
     const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
@@ -586,7 +1322,7 @@
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
@@ -679,7 +1415,7 @@
       s5 = s9;
       s6 = s10;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
     const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
@@ -759,11 +1495,11 @@
         s5 = s9;
         s6 = s10;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
@@ -860,7 +1596,7 @@
       s5 = s9;
       s6 = s10;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
     const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
@@ -950,10 +1686,12 @@
         s5 = s9;
         s6 = s10;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
+
+#endif
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -72,6 +72,107 @@
   *s7 = vld1q_u8(s);
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
+    (__ARM_FEATURE_DOTPROD == 1)
+
+static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
+                                                const int8x16_t samples_hi,
+                                                const int32x4_t correction,
+                                                const int8x8_t filters) {
+  /* Inputs must already be offset to signed range and permuted by caller. */
+  int32x4_t sum;
+
+  /* Start from 'correction'; filter bytes 0-3 then 4-7 are accumulated. */
+  sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  /* Returns raw 32-bit sums; narrowing and packing are left to the caller. */
+  return sum;
+}
+
+static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
+                                        const int8x8_t filters,
+                                        const int32x4_t correction,
+                                        const uint8x16_t range_limit,
+                                        const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Subtract 'range_limit' and reinterpret as signed for the dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Start from 'correction'; filter bytes 0-3 then 4-7 are accumulated. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  /* Returns raw 32-bit sums; narrowing and packing are left to the caller. */
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
+                                                const int8x16_t samples0_hi,
+                                                const int8x16_t samples1_lo,
+                                                const int8x16_t samples1_hi,
+                                                const int32x4_t correction,
+                                                const int8x8_t filters) {
+  /* Inputs must already be offset to signed range and permuted by caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Each sum starts from 'correction', supplied by the caller. */
+  /* Outputs 0-3: samples0_lo with filter bytes 0-3, samples0_hi with 4-7. */
+  sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  /* Outputs 4-7: samples1_lo with filter bytes 0-3, samples1_hi with 4-7. */
+  sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  /* Saturating narrow to 16 bits, then rounding shift by 7 to unsigned 8. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
+                                        const int8x8_t filters,
+                                        const int32x4_t correction,
+                                        const uint8x16_t range_limit,
+                                        const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Subtract 'range_limit' and reinterpret as signed for the dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Each sum starts from 'correction', supplied by the caller. */
+  /* Outputs 0-3 use permuted_samples[0] and permuted_samples[1]. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  /* Outputs 4-7 use permuted_samples[1] and permuted_samples[2]. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  /* Saturating narrow to 16 bits, then rounding shift by 7 to unsigned 8. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, 7);
+}
+
+#endif
+
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
--- a/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -43,7 +43,7 @@
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // avg8
     uint8x8_t s0, s1, d0, d1;
     uint8x16_t s01, d01;
@@ -64,7 +64,7 @@
       vst1_u8(dst, vget_high_u8(d01));
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // avg16
     uint8x16_t s0, s1, d0, d1;
     do {
@@ -83,7 +83,7 @@
       vst1q_u8(dst, d1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // avg32
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
@@ -110,7 +110,7 @@
       vst1q_u8(dst + 16, d3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // avg64
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
--- a/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -33,7 +33,7 @@
       src += src_stride;
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
     uint8x8_t s0, s1;
     do {
@@ -47,7 +47,7 @@
       vst1_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
     uint8x16_t s0, s1;
     do {
@@ -61,7 +61,7 @@
       vst1q_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // copy32
     uint8x16_t s0, s1, s2, s3;
     do {
@@ -79,7 +79,7 @@
       vst1q_u8(dst + 16, s3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // copy64
     uint8x16_t s0, s1, s2, s3;
     do {
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -223,6 +223,8 @@
     }
   }
 
+  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
+
   return flags & mask;
 }
 
@@ -240,7 +242,7 @@
 // x86_readtsc directly, but prevent the CPU's out-of-order execution from
 // affecting the measurement (by having earlier/later instructions be evaluated
 // in the time interval). See the white paper, "How to Benchmark Code
-// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
 // Gabriele Paoloni for more information.
 //
 // If you are timing a large function (CPU time > a couple of seconds), use
@@ -306,7 +308,13 @@
 
 static INLINE unsigned int x86_tsc_start(void) {
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return x86_readtsc();
 }
 
@@ -313,7 +321,13 @@
 static INLINE unsigned int x86_tsc_end(void) {
   uint32_t v = x86_readtscp();
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return v;
 }
 
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -64,6 +64,10 @@
 
     if (!ybf->buffer_alloc) {
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      if (!ybf->buffer_alloc) {
+        ybf->buffer_alloc_sz = 0;
+        return -1;
+      }
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
       // This memset is needed for fixing the issue of using uninitialized
@@ -75,7 +79,7 @@
       ybf->buffer_alloc_sz = frame_size;
     }
 
-    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
+    if (ybf->buffer_alloc_sz < frame_size) return -1;
 
     /* Only support allocating buffers that have a border that's a multiple
      * of 32. The border restriction is required to get 16-byte alignment of
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -114,10 +114,6 @@
     ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
 static const arg_def_t fpf_name =
     ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-#if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name =
-    ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
-#endif
 static const arg_def_t limit =
     ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
 static const arg_def_t skip =
@@ -287,6 +283,64 @@
   &buf_sz,           &buf_initial_sz,     &buf_optimal_sz, NULL
 };
 
+#if CONFIG_VP9_ENCODER
+static const arg_def_t use_vizier_rc_params =
+    ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params");
+static const arg_def_t active_wq_factor =
+    ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor");
+static const arg_def_t err_per_mb_factor =
+    ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor");
+static const arg_def_t sr_default_decay_limit = ARG_DEF(
+    NULL, "sr-default-decay-limit", 1, "Second reference default decay limit");
+static const arg_def_t sr_diff_factor =
+    ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor");
+static const arg_def_t kf_err_per_mb_factor = ARG_DEF(
+    NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor");
+static const arg_def_t kf_frame_min_boost_factor =
+    ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost");
+static const arg_def_t kf_frame_max_boost_first_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1,
+            "Max keyframe boost adjustment factor for first frame");
+static const arg_def_t kf_frame_max_boost_subs_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1,
+            "Max boost adjustment factor for subsequent KFs");
+static const arg_def_t kf_max_total_boost_factor = ARG_DEF(
+    NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor");
+static const arg_def_t gf_max_total_boost_factor =
+    ARG_DEF(NULL, "gf-max-total-boost-factor", 1,
+            "Golden frame max total boost factor");
+static const arg_def_t gf_frame_max_boost_factor =
+    ARG_DEF(NULL, "gf-frame-max-boost-factor", 1,
+            "Golden frame max per frame boost factor");
+static const arg_def_t zm_factor =
+    ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor");
+static const arg_def_t rd_mult_inter_qp_fac =
+    ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1,
+            "RD multiplier adjustment for inter frames");
+static const arg_def_t rd_mult_arf_qp_fac =
+    ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1,
+            "RD multiplier adjustment for alt-ref frames");
+static const arg_def_t rd_mult_key_qp_fac = ARG_DEF(
+    NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames");
+static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params,
+                                             &active_wq_factor,
+                                             &err_per_mb_factor,
+                                             &sr_default_decay_limit,
+                                             &sr_diff_factor,
+                                             &kf_err_per_mb_factor,
+                                             &kf_frame_min_boost_factor,
+                                             &kf_frame_max_boost_first_factor,
+                                             &kf_frame_max_boost_subs_factor,
+                                             &kf_max_total_boost_factor,
+                                             &gf_max_total_boost_factor,
+                                             &gf_frame_max_boost_factor,
+                                             &zm_factor,
+                                             &rd_mult_inter_qp_fac,
+                                             &rd_mult_arf_qp_fac,
+                                             &rd_mult_key_qp_fac,
+                                             NULL };
+#endif
+
 static const arg_def_t bias_pct =
     ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
 static const arg_def_t minsection_pct =
@@ -573,6 +627,8 @@
 #if CONFIG_VP9_ENCODER
   fprintf(fout, "\nVP9 Specific Options:\n");
   arg_show_usage(fout, vp9_args);
+  fprintf(fout, "\nVizier Rate Control Options:\n");
+  arg_show_usage(fout, vizier_rc_args);
 #endif
   fprintf(fout,
           "\nStream timebase (--timebase):\n"
@@ -614,9 +670,6 @@
   struct vpx_codec_enc_cfg cfg;
   const char *out_fn;
   const char *stats_fn;
-#if CONFIG_FP_MB_STATS
-  const char *fpmb_stats_fn;
-#endif
   stereo_format_t stereo_fmt;
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
@@ -644,9 +697,6 @@
   uint64_t cx_time;
   size_t nbytes;
   stats_io_t stats;
-#if CONFIG_FP_MB_STATS
-  stats_io_t fpmb_stats;
-#endif
   struct vpx_image *img;
   vpx_codec_ctx_t decoder;
   int mismatch_seen;
@@ -883,10 +933,6 @@
       config->out_fn = arg.val;
     } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-#if CONFIG_FP_MB_STATS
-    } else if (arg_match(&arg, &fpmbf_name, argi)) {
-      config->fpmb_stats_fn = arg.val;
-#endif
     } else if (arg_match(&arg, &use_webm, argi)) {
 #if CONFIG_WEBM_IO
       config->write_webm = 1;
@@ -983,6 +1029,40 @@
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9_ENCODER
+    } else if (arg_match(&arg, &use_vizier_rc_params, argi)) {
+      config->cfg.use_vizier_rc_params = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &active_wq_factor, argi)) {
+      config->cfg.active_wq_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &err_per_mb_factor, argi)) {
+      config->cfg.err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_default_decay_limit, argi)) {
+      config->cfg.sr_default_decay_limit = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_diff_factor, argi)) {
+      config->cfg.sr_diff_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) {
+      config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) {
+      config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) {
+      config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) {
+      config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) {
+      config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) {
+      config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) {
+      config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &zm_factor, argi)) {
+      config->cfg.zm_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) {
+      config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) {
+      config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) {
+      config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg);
+#endif
 #if CONFIG_VP9_HIGHBITDEPTH
     } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
       if (strcmp(global->codec->name, "vp9") == 0) {
@@ -1075,17 +1155,6 @@
         fatal("Stream %d: duplicate stats file (from stream %d)",
               streami->index, stream->index);
     }
-
-#if CONFIG_FP_MB_STATS
-    /* Check for two streams sharing a mb stats file. */
-    if (streami != stream) {
-      const char *a = stream->config.fpmb_stats_fn;
-      const char *b = streami->config.fpmb_stats_fn;
-      if (a && b && !strcmp(a, b))
-        fatal("Stream %d: duplicate mb stats file (from stream %d)",
-              streami->index, stream->index);
-    }
-#endif
   }
 }
 
@@ -1177,6 +1246,10 @@
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
+  // Temporary use for debug
+  SHOW(use_vizier_rc_params);
+  SHOW(active_wq_factor.num);
+  SHOW(active_wq_factor.den);
 }
 
 static void open_output_file(struct stream_state *stream,
@@ -1240,26 +1313,11 @@
       fatal("Failed to open statistics store");
   }
 
-#if CONFIG_FP_MB_STATS
-  if (stream->config.fpmb_stats_fn) {
-    if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
-                         pass))
-      fatal("Failed to open mb statistics store");
-  } else {
-    if (!stats_open_mem(&stream->fpmb_stats, pass))
-      fatal("Failed to open mb statistics store");
-  }
-#endif
-
   stream->config.cfg.g_pass = global->passes == 2
                                   ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS
                                   : VPX_RC_ONE_PASS;
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
-#if CONFIG_FP_MB_STATS
-    stream->config.cfg.rc_firstpass_mb_stats_in =
-        stats_get(&stream->fpmb_stats);
-#endif
   }
 
   stream->cx_time = 0;
@@ -1471,13 +1529,6 @@
                     pkt->data.twopass_stats.sz);
         stream->nbytes += pkt->data.raw.sz;
         break;
-#if CONFIG_FP_MB_STATS
-      case VPX_CODEC_FPMB_STATS_PKT:
-        stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
-                    pkt->data.firstpass_mb_stats.sz);
-        stream->nbytes += pkt->data.raw.sz;
-        break;
-#endif
       case VPX_CODEC_PSNR_PKT:
 
         if (global->show_psnr) {
@@ -1636,6 +1687,7 @@
   int res = 0;
 
   memset(&input, 0, sizeof(input));
+  memset(&raw, 0, sizeof(raw));
   exec_name = argv_[0];
 
   /* Setup default input stream settings */
@@ -1781,14 +1833,10 @@
       FOREACH_STREAM(show_stream_config(stream, &global, &input));
 
     if (pass == (global.pass ? global.pass - 1 : 0)) {
-      if (input.file_type == FILE_TYPE_Y4M)
-        /*The Y4M reader does its own allocation.
-          Just initialize this here to avoid problems if we never read any
-           frames.*/
-        memset(&raw, 0, sizeof(raw));
-      else
+      // The Y4M reader does its own allocation.
+      if (input.file_type != FILE_TYPE_Y4M) {
         vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32);
-
+      }
       FOREACH_STREAM(stream->rate_hist = init_rate_histogram(
                          &stream->config.cfg, &global.framerate));
     }
@@ -1973,10 +2021,6 @@
     FOREACH_STREAM(close_output_file(stream, global.codec->fourcc));
 
     FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1));
-
-#if CONFIG_FP_MB_STATS
-    FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1));
-#endif
 
     if (global.pass) break;
   }
--- a/y4minput.c
+++ b/y4minput.c
@@ -10,6 +10,7 @@
  *  Based on code from the OggTheora software codec source code,
  *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */
+#include <assert.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
@@ -52,15 +53,8 @@
 }
 
 static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
-  int got_w;
-  int got_h;
-  int got_fps;
-  int got_interlace;
-  int got_par;
-  int got_chroma;
   char *p;
   char *q;
-  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
   for (p = _tags;; p = q) {
     /*Skip any leading spaces.*/
     while (*p == ' ') p++;
@@ -73,12 +67,10 @@
     switch (p[0]) {
       case 'W': {
         if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
-        got_w = 1;
         break;
       }
       case 'H': {
         if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
-        got_h = 1;
         break;
       }
       case 'F': {
@@ -85,12 +77,10 @@
         if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
           return -1;
         }
-        got_fps = 1;
         break;
       }
       case 'I': {
         _y4m->interlace = p[1];
-        got_interlace = 1;
         break;
       }
       case 'A': {
@@ -97,7 +87,6 @@
         if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
           return -1;
         }
-        got_par = 1;
         break;
       }
       case 'C': {
@@ -104,21 +93,93 @@
         if (q - p > 16) return -1;
         memcpy(_y4m->chroma_type, p + 1, q - p - 1);
         _y4m->chroma_type[q - p - 1] = '\0';
-        got_chroma = 1;
         break;
       }
         /*Ignore unknown tags.*/
     }
   }
-  if (!got_w || !got_h || !got_fps) return -1;
-  if (!got_interlace) _y4m->interlace = '?';
-  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
-  /*Chroma-type is not specified in older files, e.g., those generated by
-     mplayer.*/
-  if (!got_chroma) strcpy(_y4m->chroma_type, "420");
   return 0;
 }
 
+// Reads one tag from |file| into |buf| (NUL-terminated) and stores the
+// delimiter that ended it in |*end_tag|. Returns 1 on success, 0 otherwise.
+static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) {
+  size_t i;
+  assert(buf_len >= 1);
+  // Skip leading space characters; give up if file_read() reports failure.
+  do {
+    if (!file_read(buf, 1, file)) {
+      return 0;
+    }
+  } while (buf[0] == ' ');
+
+  // If we hit the newline, treat this as the "empty" tag.
+  if (buf[0] == '\n') {
+    buf[0] = '\0';
+    *end_tag = '\n';
+    return 1;
+  }
+
+  // Copy characters until a space or newline, or until |buf| is exhausted.
+  for (i = 1; i < buf_len; ++i) {
+    if (!file_read(buf + i, 1, file)) {
+      return 0;
+    }
+    if (buf[i] == ' ' || buf[i] == '\n') {
+      break;
+    }
+  }
+  if (i == buf_len) {  // No delimiter was found: the tag does not fit.
+    fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n",
+            (unsigned long)i);
+    return 0;
+  }
+  *end_tag = buf[i];  // Save the delimiter before it is overwritten.
+  buf[i] = '\0';
+  return 1;
+}
+
+/* Reads header tags from |file| into |y4m_ctx|. Returns 1 if OK, else 0. */
+static int parse_tags(y4m_input *y4m_ctx, FILE *file) {
+  char tag[256]; /* Scratch space for the tag being parsed. */
+  char end; /* Character denoting the end of the tag, ' ' or '\n'. */
+  /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory
+     fields are marked with -1 and will be checked after the tags are parsed. */
+  y4m_ctx->pic_w = -1;
+  y4m_ctx->pic_h = -1;
+  y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */
+  y4m_ctx->par_n = 0; /* Optional fields get defaults, not markers. */
+  y4m_ctx->par_d = 0;
+  y4m_ctx->interlace = '?';
+  snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420");
+
+  /* Read and parse one tag at a time. */
+  do {
+    if (!copy_tag(tag, sizeof(tag), &end, file)) {
+      return 0;
+    }
+    /* y4m_parse_tags returns 0 on success. */
+    if (y4m_parse_tags(y4m_ctx, tag)) {
+      return 0;
+    }
+  } while (end != '\n'); /* A newline terminates the tag list. */
+
+  /* Check the mandatory fields: each must have replaced its -1 marker. */
+  if (y4m_ctx->pic_w == -1) {
+    fprintf(stderr, "Width field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->pic_h == -1) {
+    fprintf(stderr, "Height field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->fps_n == -1) {
+    fprintf(stderr, "FPS field missing\n");
+    return 0;
+  }
+  return 1;
+}
+
 /*All anti-aliasing filters in the following conversion functions are based on
    one of two window functions:
   The 6-tap Lanczos window (for down-sampling and shifts):
@@ -225,26 +286,6 @@
   }
 }
 
-/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/
-static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
-                                         unsigned char *_aux) {
-  int c_w;
-  int c_h;
-  int c_sz;
-  int pli;
-  /*Skip past the luma data.*/
-  _dst += _y4m->pic_w * _y4m->pic_h;
-  /*Compute the size of each chroma plane.*/
-  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
-  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
-  c_sz = c_w * c_h;
-  for (pli = 1; pli < 3; pli++) {
-    y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h);
-    _dst += c_sz;
-    _aux += c_sz;
-  }
-}
-
 /*This format is only used for interlaced content, but is included for
    completeness.
 
@@ -785,277 +826,273 @@
   (void)_aux;
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420) {
-  char buffer[80] = { 0 };
-  int ret;
-  int i;
-  /*Read until newline, or 80 cols, whichever happens first.*/
-  for (i = 0; i < 79; i++) {
-    if (_nskip > 0) {
-      buffer[i] = *_skip++;
-      _nskip--;
-    } else {
-      if (!file_read(buffer + i, 1, _fin)) return -1;
-    }
-    if (buffer[i] == '\n') break;
+static const char TAG[] = "YUV4MPEG2";
+
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420) {
+  // File must start with |TAG|.
+  char tag_buffer[9];  // 9 == strlen(TAG)
+  // Read as much as possible from |skip_buffer|, which were characters
+  // that were previously read from the file to do input-type detection.
+  assert(num_skip >= 0 && num_skip <= 8);
+  if (num_skip > 0) {
+    memcpy(tag_buffer, skip_buffer, num_skip);
   }
-  /*We skipped too much header data.*/
-  if (_nskip > 0) return -1;
-  if (i == 79) {
-    fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n");
+  // Start reading from the file now that the |skip_buffer| is depleted.
+  if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) {
     return -1;
   }
-  buffer[i] = '\0';
-  if (memcmp(buffer, "YUV4MPEG", 8)) {
-    fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+  if (memcmp(TAG, tag_buffer, 9) != 0) {
+    fprintf(stderr, "Error parsing header: must start with %s\n", TAG);
     return -1;
   }
-  if (buffer[8] != '2') {
-    fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+  // Next character must be a space.
+  if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') {
+    fprintf(stderr, "Error parsing header: space must follow %s\n", TAG);
+    return -1;
   }
-  ret = y4m_parse_tags(_y4m, buffer + 5);
-  if (ret < 0) {
-    fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
-    return ret;
+  if (!parse_tags(y4m_ctx, file)) {
+    fprintf(stderr, "Error parsing %s header.\n", TAG);
+    // Bail out: the header fields are unusable for the size math below.
+    return -1;
   }
-  if (_y4m->interlace == '?') {
+  if (y4m_ctx->interlace == '?') {
     fprintf(stderr,
             "Warning: Input video interlacing format unknown; "
             "assuming progressive scan.\n");
-  } else if (_y4m->interlace != 'p') {
+  } else if (y4m_ctx->interlace != 'p') {
     fprintf(stderr,
             "Input video is interlaced; "
             "Only progressive scan handled.\n");
     return -1;
   }
-  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
-  _y4m->bps = 12;
-  _y4m->bit_depth = 8;
-  if (strcmp(_y4m->chroma_type, "420") == 0 ||
-      strcmp(_y4m->chroma_type, "420jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        _y4m->pic_w * _y4m->pic_h +
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+  y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420;
+  y4m_ctx->bps = 12;
+  y4m_ctx->bit_depth = 8;
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  if (strcmp(y4m_ctx->chroma_type, "420") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        y4m_ctx->pic_w * y4m_ctx->pic_h +
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 10;
-    _y4m->bps = 15;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->bps = 15;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+  } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 12;
-    _y4m->bps = 18;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->bps = 18;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    y4m_ctx->aux_buf_sz =
+        3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422jpeg_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_422_420jpeg;
+      y4m_ctx->aux_buf_read_sz =
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_422_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
-      _y4m->bps = 16;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz =
-          _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422;
+      y4m_ctx->bps = 16;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz =
+          y4m_ctx->pic_w * y4m_ctx->pic_h +
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 20;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 20;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 24;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 24;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "411") == 0) {
-    _y4m->src_c_dec_h = 4;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) {
+    y4m_ctx->src_c_dec_h = 4;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
-    _y4m->aux_buf_sz =
-        _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_411_420jpeg;
+    y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz =
+        y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_411_420jpeg;
     fprintf(stderr, "Unsupported conversion from yuv 411\n");
     return -1;
-  } else if (strcmp(_y4m->chroma_type, "444") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
+  } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
+      y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
-      _y4m->bps = 24;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444;
+      y4m_ctx->bps = 24;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 30;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 30;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 36;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 36;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
-    _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
-    _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*No extra space required, but we need to clear the chroma planes.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_mono_420jpeg;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_mono_420jpeg;
   } else {
-    fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type);
     return -1;
   }
   /*The size of the final frame buffers is always computed from the
      destination chroma decimation type.*/
-  _y4m->dst_buf_sz =
-      _y4m->pic_w * _y4m->pic_h +
-      2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
-          ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
-  if (_y4m->bit_depth == 8)
-    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  y4m_ctx->dst_buf_sz =
+      y4m_ctx->pic_w * y4m_ctx->pic_h +
+      2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) *
+          ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v);
+  if (y4m_ctx->bit_depth == 8)
+    y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
   else
-    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+    y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
 
-  if (_y4m->aux_buf_sz > 0)
-    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+  if (y4m_ctx->aux_buf_sz > 0)
+    y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
   return 0;
 }
 
--- a/y4minput.h
+++ b/y4minput.h
@@ -56,8 +56,16 @@
   unsigned int bit_depth;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420);
+/**
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. The |skip_buffer| indicates bytes that were previously read
+ * from |file|, to do input-type detection; this buffer will be read before
+ * the |file| is read. It is of size |num_skip|, which *must* be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);