shithub: libvpx

Download patch

ref: ad55b1d270db717a1f5c1c4966e7aecf9a563e5f
parent: 7b9c86167eb7161ab98cab66fab6e63a2c76c29e
parent: 1364cb58b4372c0f6f377c938f1eca789ffd120c
author: Johann Koenig <johannkoenig@google.com>
date: Thu Sep 29 19:16:44 EDT 2016

Merge changes Ia3e9122f,Id33eb6c8,I956bd8ce

* changes:
  Remove vp8_clear_system_state
  vpx_dsp: clean up rtcd
  vp8: clean up rtcd

--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -83,8 +83,6 @@
 }
 #endif
 
-void vp8_clear_system_state_c(){};
-
 void vp8_machine_specific_config(VP8_COMMON *ctx) {
 #if CONFIG_MULTITHREAD
   ctx->processor_core_count = get_cpu_count();
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -12,6 +12,7 @@
 #include "vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "vpx_dsp/postproc.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_scale_rtcd.h"
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
@@ -321,7 +322,7 @@
     }
   }
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid &&
       oci->current_video_frame >= 2 &&
@@ -363,7 +364,7 @@
         oci->postproc_state.last_noise != noise_level) {
       double sigma;
       struct postproc_state *ppstate = &oci->postproc_state;
-      vp8_clear_system_state();
+      vpx_clear_system_state();
       sigma = noise_level + .5 + .6 * q / 63.0;
       ppstate->clamp =
           vpx_setup_noise(sigma, ppstate->generated_noise, oci->Width + 256);
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -19,13 +19,6 @@
 forward_decls qw/vp8_common_forward_decls/;
 
 #
-# system state
-#
-add_proto qw/void vp8_clear_system_state/, "";
-specialize qw/vp8_clear_system_state mmx/;
-$vp8_clear_system_state_mmx=vpx_reset_mmx_state;
-
-#
 # Dequant
 #
 add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
@@ -33,15 +26,12 @@
 
 add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
 specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
-$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
 specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/;
-$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
 specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/;
-$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 #
 # Loopfilter
@@ -48,19 +38,15 @@
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/;
-$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/;
-$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/;
-$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
 specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/;
-$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
@@ -101,23 +87,18 @@
 #idct16
 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
 specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/;
-$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
 
 #iwalsh1
 add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
 specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
-$vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
-# no asm yet
 
 #iwalsh16
 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
 specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/;
-$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
 
 #idct1_scalar_add
 add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
 specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
-$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
 
 #
 # RECON
@@ -124,15 +105,12 @@
 #
 add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/;
-$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
 
 add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
-$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
 
 add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/;
-$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
 #
 # Postproc
@@ -140,13 +118,10 @@
 if (vpx_config("CONFIG_POSTPROC") eq "yes") {
 
     add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-    # no asm yet
 
     add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-    # no asm yet
 
     add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-    # no asm yet
 
     add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
     specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
@@ -155,7 +130,6 @@
     specialize qw/vp8_filter_by_weight8x8 sse2 msa/;
 
     add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
-    # no asm yet
 }
 
 #
@@ -163,19 +137,15 @@
 #
 add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/;
-$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
 
 add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/;
-$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
 
 add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
-$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
-$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
 
 add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/;
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -29,6 +29,7 @@
 #include "./vpx_scale_rtcd.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/systemdependent.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
@@ -352,7 +353,7 @@
     goto decode_exit;
   }
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (cm->show_frame) {
     cm->current_video_frame++;
@@ -383,7 +384,7 @@
 
 decode_exit:
   pbi->common.error.setjmp = 0;
-  vp8_clear_system_state();
+  vpx_clear_system_state();
   return retcode;
 }
 int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
@@ -416,7 +417,7 @@
   }
 
 #endif /*!CONFIG_POSTPROC*/
-  vp8_clear_system_state();
+  vpx_clear_system_state();
   return ret;
 }
 
@@ -447,7 +448,7 @@
     if (setjmp(fb->pbi[0]->common.error.jmp)) {
       vp8_remove_decoder_instances(fb);
       memset(fb->pbi, 0, sizeof(fb->pbi) / sizeof(fb->pbi[0]));
-      vp8_clear_system_state();
+      vpx_clear_system_state();
       return VPX_CODEC_ERROR;
     }
 
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -19,6 +19,7 @@
 #include <limits.h>
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
 #include "bitstream.h"
 
 #include "defaultcoefcounts.h"
@@ -843,7 +844,7 @@
   int new_intra, new_last, new_garf, oldtotal, newtotal;
   int ref_frame_cost[MAX_REF_FRAMES];
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (cpi->common.frame_type != KEY_FRAME) {
     if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter))) new_intra = 1;
@@ -908,7 +909,7 @@
 #endif
   int savings = 0;
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   do {
     int j = 0;
@@ -1295,7 +1296,7 @@
 
 #endif
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
   pack_coef_probs(cpi);
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -12,6 +12,7 @@
 #include "encodemv.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/systemdependent.h"
+#include "vpx_ports/system_state.h"
 
 #include <math.h>
 
@@ -126,7 +127,7 @@
   unsigned int cost0 = 0;
   unsigned int cost1 = 0;
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   i = 1;
 
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -26,6 +26,7 @@
 #include "vpx_scale/vpx_scale.h"
 #include "encodemb.h"
 #include "vp8/common/extend.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "rdopt.h"
@@ -499,7 +500,7 @@
 
   zero_ref_mv.as_int = 0;
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   x->src = *cpi->Source;
   xd->pre = *lst_yv12;
@@ -741,10 +742,10 @@
     /* extend the recon for intra prediction */
     vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8,
                       xd->dst.v_buffer + 8);
-    vp8_clear_system_state();
+    vpx_clear_system_state();
   }
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
   {
     double weight = 0.0;
 
@@ -1655,7 +1656,7 @@
   cpi->twopass.gf_group_bits = 0;
   cpi->twopass.gf_decay_rate = 0;
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   start_pos = cpi->twopass.stats_in;
 
@@ -2268,7 +2269,7 @@
     return;
   }
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (EOF == input_stats(cpi, &this_frame)) return;
 
@@ -2543,7 +2544,7 @@
 
   memset(&next_frame, 0, sizeof(next_frame));
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
   start_position = cpi->twopass.stats_in;
 
   cpi->common.frame_type = KEY_FRAME;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -33,6 +33,7 @@
 #include "vp8/common/reconintra.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_timer.h"
 #if ARCH_ARM
 #include "vpx_ports/arm.h"
@@ -2296,7 +2297,7 @@
     recon += recon_stride;
   }
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
   return total_sse;
 }
 
@@ -2691,7 +2692,7 @@
   if (cpi->Speed > 11) return 0;
 
   /* Clear down mmx registers */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0)) {
     double change = 1.0 *
@@ -3129,7 +3130,7 @@
   } else {
     struct vpx_usec_timer timer;
 
-    vp8_clear_system_state();
+    vpx_clear_system_state();
 
     vpx_usec_timer_start(&timer);
     if (cpi->sf.auto_filter == 0) {
@@ -3217,7 +3218,7 @@
   int drop_mark25 = drop_mark / 8;
 
   /* Clear down mmx registers to allow floating point in what follows */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (cpi->force_next_frame_intra) {
     cm->frame_type = KEY_FRAME; /* delayed intra frame */
@@ -3576,7 +3577,7 @@
    * There is some odd behavior for one pass here that needs attention.
    */
   if ((cpi->pass == 2) || (cpi->ni_frames > 150)) {
-    vp8_clear_system_state();
+    vpx_clear_system_state();
 
     Q = cpi->active_worst_quality;
 
@@ -3802,7 +3803,7 @@
 #endif
 
   do {
-    vp8_clear_system_state();
+    vpx_clear_system_state();
 
     vp8_set_quantizer(cpi, Q);
 
@@ -3935,7 +3936,7 @@
     cpi->projected_frame_size =
         (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
 #endif
-    vp8_clear_system_state();
+    vpx_clear_system_state();
 
     /* Test to see if the stats generated for this frame indicate that
      * we should have coded a key frame (assuming that we didn't)!
@@ -3979,7 +3980,7 @@
 #endif
     }
 
-    vp8_clear_system_state();
+    vpx_clear_system_state();
 
     if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
@@ -4549,7 +4550,7 @@
     {
         FILE *f = fopen("tmp.stt", "a");
 
-        vp8_clear_system_state();
+        vpx_clear_system_state();
 
         if (cpi->twopass.total_left_stats.coded_error != 0.0)
             fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
@@ -4779,7 +4780,7 @@
 
   if (setjmp(cpi->common.error.jmp)) {
     cpi->common.error.setjmp = 0;
-    vp8_clear_system_state();
+    vpx_clear_system_state();
     return VPX_CODEC_CORRUPT_FRAME;
   }
 
@@ -4986,7 +4987,7 @@
   *size = 0;
 
   /* Clear down mmx registers */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   cm->frame_type = INTER_FRAME;
   cm->frame_flags = *frame_flags;
@@ -5139,7 +5140,7 @@
 
           vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer,
                       cm->filter_level * 10 / 6, 1, 0);
-          vp8_clear_system_state();
+          vpx_clear_system_state();
 
           ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer,
                                 pp->y_stride, y_width, y_height);
@@ -5249,7 +5250,7 @@
     }
 
 #endif
-    vp8_clear_system_state();
+    vpx_clear_system_state();
     return ret;
   }
 }
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -22,6 +22,7 @@
 #include "vp8/common/systemdependent.h"
 #include "encodemv.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
 
 #define MIN_BPB_FACTOR 0.01
 #define MAX_BPB_FACTOR 50
@@ -296,7 +297,7 @@
   uint64_t target;
 
   /* Clear down mmx registers to allow floating point in what follows */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (cpi->oxcf.fixed_q >= 0) {
     int Q = cpi->oxcf.key_q;
@@ -1019,7 +1020,7 @@
   int projected_size_based_on_q = 0;
 
   /* Clear down mmx registers to allow floating point in what follows */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   if (cpi->common.frame_type == KEY_FRAME) {
     rate_correction_factor = cpi->key_frame_rate_correction_factor;
@@ -1302,7 +1303,7 @@
 
 void vp8_adjust_key_frame_context(VP8_COMP *cpi) {
   /* Clear down mmx registers to allow floating point in what follows */
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   /* Do we have any key frame overspend to recover? */
   /* Two-pass overspend handled elsewhere. */
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -30,6 +30,7 @@
 #include "encodemb.h"
 #include "vp8/encoder/quantize.h"
 #include "vpx_dsp/variance.h"
+#include "vpx_ports/system_state.h"
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
@@ -163,7 +164,7 @@
   double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
   double rdconst = 2.80;
 
-  vp8_clear_system_state();
+  vpx_clear_system_state();
 
   /* Further tests required to see if optimum is different
    * for key frames, golden frames and arf frames.
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -24,6 +24,7 @@
 #include "decoder/onyxd_int.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
 #if CONFIG_ERROR_CONCEALMENT
 #include "decoder/error_concealment.h"
 #endif
@@ -365,7 +366,7 @@
            * reallocation is attempted on resync. */
           ctx->si.w = 0;
           ctx->si.h = 0;
-          vp8_clear_system_state();
+          vpx_clear_system_state();
           /* same return value as used in vp8dx_receive_compressed_data */
           return -1;
         }
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -11,12 +11,6 @@
 }
 forward_decls qw/vpx_dsp_forward_decls/;
 
-# optimizations which depend on multiple features
-$avx2_ssse3 = '';
-if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
-  $avx2_ssse3 = 'avx2';
-}
-
 # functions that are 64 bit only.
 $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
@@ -437,13 +431,13 @@
 specialize qw/vpx_convolve_avg neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;