1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ 12 #define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ 13 14 #include "vpx_dsp/mips/macros_msa.h" 15 #include "vpx_dsp/vpx_filter.h" 16 17 extern const uint8_t mc_filt_mask_arr[16 * 3]; 18 19 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ 20 filt3) \ 21 ({ \ 22 v8i16 tmp_dpadd_0, tmp_dpadd_1; \ 23 \ 24 tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ 25 tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ 26 tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ 27 tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ 28 tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ 29 \ 30 tmp_dpadd_0; \ 31 }) 32 33 #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ 34 filt_h1, filt_h2, filt_h3) \ 35 ({ \ 36 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 37 v8i16 hz_out_m; \ 38 \ 39 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ 40 vec3_m); \ 41 hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ 42 filt_h1, filt_h2, filt_h3); \ 43 \ 44 hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ 45 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 46 \ 47 hz_out_m; \ 48 }) 49 50 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 51 mask2, mask3, filt0, filt1, filt2, filt3, \ 52 out0, out1) \ 53 { \ 54 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 55 v8i16 res0_m, res1_m, res2_m, res3_m; \ 56 \ 57 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 58 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ 59 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 60 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ 61 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 62 DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ 63 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ 64 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ 65 ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ 66 } 67 68 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 69 mask2, mask3, filt0, filt1, filt2, filt3, \ 70 out0, out1, out2, out3) \ 71 { \ 72 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 73 v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ 74 \ 75 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 76 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 77 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 78 res0_m, res1_m, res2_m, res3_m); \ 79 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ 80 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ 81 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ 82 res4_m, res5_m, res6_m, res7_m); \ 83 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ 84 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ 85 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ 86 res0_m, res1_m, res2_m, res3_m); \ 87 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ 88 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ 89 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ 90 res4_m, res5_m, res6_m, res7_m); \ 91 ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ 92 res7_m, out0, out1, out2, out3); \ 93 } 94 95 #define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ 96 { \ 97 v16u8 tmp_m; \ 98 \ 99 tmp_m = PCKEV_XORI128_UB(in1, in0); \ 100 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ 101 ST_UB(tmp_m, (pdst)); \ 102 } 103 104 #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ 105 { \ 106 v16u8 tmp_m; \ 107 \ 108 tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 109 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ 110 ST_UB(tmp_m, (pdst)); \ 111 } 112 113 #define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ 114 { \ 115 v16u8 tmp0_m, tmp1_m; \ 116 uint8_t *pdst_m = (uint8_t *)(pdst); \ 117 \ 118 PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 119 AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ 120 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ 121 } 122 #endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ 123