shithub: libvpx


ref: b92cc27b76ee5158515beb2e2656a0117d1c2dda
parent: c040f96e4b4cf42b48696f1460780bcbed445bb9
author: Parag Salasakar <img.mips1@gmail.com>
date: Fri Jun 26 08:00:24 EDT 2015

mips msa vp9 temporal filter optimization

average improvement ~4x-5x over the C version of vp9_temporal_filter_apply

Change-Id: Iad9c0a296dbc2ea96d000bd009077999ed58a3c5
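
The two MSA kernels in this patch vectorize the per-pixel arithmetic of
vp9_temporal_filter_apply: squared difference, scaled by 3, rounded shift by the
filter strength, clamp, weight inversion, and accumulation. As a reading aid,
here is a paraphrased scalar sketch of that logic (illustrative only, not a
verbatim copy of the C reference; the rounded shift corresponds to SRAR_W4_SW
and the clamp to the compare-and-mask sequence in the vector code):

    #include <stdint.h>

    /* Paraphrased per-pixel sketch of the temporal filter update that the
     * MSA kernels below vectorize (illustrative, not the exact C reference). */
    static void temporal_filter_pixel_sketch(int src, int pred, int strength,
                                             int filter_weight,
                                             unsigned int *accumulator,
                                             uint16_t *count) {
      const int diff = src - pred;
      int mod = diff * diff * 3;
      /* Rounded arithmetic shift by 'strength' (SRAR_W4_SW in the vector code). */
      mod = (mod + (strength > 0 ? 1 << (strength - 1) : 0)) >> strength;
      if (mod > 16) mod = 16;  /* large differences get the smallest weight */
      mod = 16 - mod;          /* similar pixels get the largest weight */
      mod *= filter_weight;
      *count += mod;                /* running per-pixel weight */
      *accumulator += mod * pred;   /* weighted sum of the predictor pixel */
    }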

--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1077,7 +1077,7 @@
 specialize qw/vp9_full_range_search/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2/;
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
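Adding msa to the specialize list makes the rtcd generator declare the new
vp9_temporal_filter_apply_msa variant and select it when the build enables MSA.
Roughly, the effect on the generated vp9_rtcd.h is the following (an
illustrative sketch assuming a static MSA-enabled MIPS build, not the literal
generated header):

    /* Sketch of the generated dispatch (assumption, not the real generated file). */
    void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
                                     uint8_t *frame2, unsigned int block_width,
                                     unsigned int block_height, int strength,
                                     int filter_weight, unsigned int *accumulator,
                                     uint16_t *count);
    void vp9_temporal_filter_apply_msa(uint8_t *frame1, unsigned int stride,
                                       uint8_t *frame2, unsigned int block_width,
                                       unsigned int block_height, int strength,
                                       int filter_weight, unsigned int *accumulator,
                                       uint16_t *count);
    #define vp9_temporal_filter_apply vp9_temporal_filter_apply_msa
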
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -1,0 +1,289 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frm2_ptr,
+                                            int32_t filt_sth,
+                                            int32_t filt_wgt,
+                                            uint32_t *acc,
+                                            uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3;
+  v16i8 frm2, frm1 = { 0 };
+  v16i8 frm4, frm3 = { 0 };
+  v16u8 frm_r, frm_l;
+  v8i16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 2; row--;) {
+    LD4(frm1_ptr, stride, f0, f1, f2, f3);
+    frm1_ptr += (4 * stride);
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 32;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    INSERT_D2_SB(f0, f1, frm1);
+    INSERT_D2_SB(f2, f3, frm3);
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth,
+                                             int32_t filt_wgt,
+                                             uint32_t *acc,
+                                             uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+
+void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  if ((8 == blk_w) && (8 == blk_h)) {
+    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+                                    strength, filt_wgt, accu, cnt);
+  } else if ((16 == blk_w) && (16 == blk_h)) {
+    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                     strength, filt_wgt, accu, cnt);
+  } else {
+    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                strength, filt_wgt, accu, cnt);
+  }
+}
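
The wrapper above routes 8x8 blocks to the first kernel, 16x16 blocks to the
second, and everything else to the C implementation. A minimal caller-side
sketch of the contract (illustrative strength/filter_weight values; in the
encoder, vp9_temporal_filter.c zeroes accumulator/count once per block and then
accumulates one call per frame in the filter window):

    #include <stdint.h>
    #include <string.h>
    #include "./vp9_rtcd.h"

    /* Usage sketch: filter one 16x16 block against a contiguous predictor. */
    static void filter_one_16x16_block(uint8_t *frame, uint32_t frame_stride,
                                       uint8_t *pred /* 16x16, contiguous */) {
      uint32_t accumulator[16 * 16];
      uint16_t count[16 * 16];

      memset(accumulator, 0, sizeof(accumulator));
      memset(count, 0, sizeof(count));

      /* One contributing frame; repeat for every frame in the window before
       * normalizing accumulator by count to produce the filtered block. */
      vp9_temporal_filter_apply_msa(frame, frame_stride, pred, 16, 16,
                                    6 /* strength */, 2 /* filter_weight */,
                                    accumulator, count);
    }
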
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -160,5 +160,6 @@
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
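
Since this wires a platform-specific implementation into an existing RTCD hook,
the natural sanity check is bit-exactness against the C reference. A minimal
sketch of such a check (illustrative only, not part of this patch or of the
project's test suite):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "./vp9_rtcd.h"

    /* Compare the MSA path with the C reference on one random 16x16 block. */
    int main(void) {
      uint8_t frame[16 * 32], pred[16 * 16];
      uint32_t acc_c[256] = { 0 }, acc_msa[256] = { 0 };
      uint16_t cnt_c[256] = { 0 }, cnt_msa[256] = { 0 };
      unsigned int i;

      for (i = 0; i < sizeof(frame); ++i) frame[i] = rand() & 0xff;
      for (i = 0; i < sizeof(pred); ++i) pred[i] = rand() & 0xff;

      vp9_temporal_filter_apply_c(frame, 32, pred, 16, 16, 6, 2, acc_c, cnt_c);
      vp9_temporal_filter_apply_msa(frame, 32, pred, 16, 16, 6, 2,
                                    acc_msa, cnt_msa);

      if (memcmp(acc_c, acc_msa, sizeof(acc_c)) != 0 ||
          memcmp(cnt_c, cnt_msa, sizeof(cnt_c)) != 0) {
        printf("MSA output differs from C\n");
        return 1;
      }
      printf("MSA output matches C\n");
      return 0;
    }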