/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ |
D | sse_common.h |
     89  … __m128 *dst6, __m128 *dst7, __m128 *dst8, int stride, int extra_stride, int r) {  in WriteCol1() argument
    101  _mm_store_ss(*dst, *dst7);  in WriteCol1()
    108  __m128 *dst6, __m128 *dst7, __m128 *dst8, int stride, int r) {  in WriteCol2() argument
    126  _mm_store_ss(*dst, *dst7);  in WriteCol2()
    127  *dst7 = _mm_shuffle_ps(*dst7, *dst7, _MM_SHUFFLE(0, 3, 2, 1));  in WriteCol2()
    128  _mm_store_ss(*dst, *dst7);  in WriteCol2()
    133  __m128 *dst6, __m128 *dst7, __m128 *dst8, int stride, int r) {  in WriteCol2Opt() argument
    151  _mm_store_ss(*dst, *dst7);  in WriteCol2Opt()
    152  *dst7 = _mm_shuffle_ps(*dst7, *dst7, _MM_SHUFFLE(0, 3, 2, 1));  in WriteCol2Opt()
    153  _mm_store_ss(*dst + 1, *dst7);  in WriteCol2Opt()
    [all …]
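All of the WriteColN helpers above share one store idiom: write lane 0 of the
vector with _mm_store_ss, then rotate the register one lane to the right with
_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)) so the next element lands in
lane 0 for the following store. A minimal sketch of that idiom (ours, not the
library's code; the helper name is hypothetical):

    #include <xmmintrin.h>

    /* Store the first n lanes of v to dst, one float at a time. */
    static void store_n_lanes(float *dst, __m128 v, int n) {
        for (int i = 0; i < n; ++i) {
            _mm_store_ss(dst + i, v);                          /* write lane 0 */
            v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); /* rotate lanes */
        }
    }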
|
D | MatMul_Sse.c |
    118  …__m128 dst5 = _mm_setzero_ps(), dst6 = _mm_setzero_ps(), dst7 = _mm_setzero_ps(), dst8 = _mm_setze…  in MatmulFloatSse64Opt() local
    131  dst7 = _mm_add_ps(dst7, tmp3), dst8 = _mm_add_ps(dst8, tmp4);  in MatmulFloatSse64Opt()
    136  DoBiasBlock8(bias_d, &dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8);  in MatmulFloatSse64Opt()
    140  ActBlock8(&dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8, act_type);  in MatmulFloatSse64Opt()
    150  _mm_storeu_ps(dst, dst7), _mm_storeu_ps(dst + 4, dst8);  in MatmulFloatSse64Opt()
    155  _mm_storeu_ps(c + 24, dst7), _mm_storeu_ps(c + 28, dst8);  in MatmulFloatSse64Opt()
    161  WriteCol1(&dst, &dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8, stride, 1, r);  in MatmulFloatSse64Opt()
    165  WriteCol2Opt(&dst, &dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8, stride, r);  in MatmulFloatSse64Opt()
    174  WriteCol3(&dst, &dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8, stride, 3, r);  in MatmulFloatSse64Opt()
    178  WriteCol4(&dst, &dst1, &dst2, &dst3, &dst4, &dst5, &dst6, &dst7, &dst8, stride, 4, r);  in MatmulFloatSse64Opt()
    [all …]
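The shape of MatmulFloatSse64Opt is visible even in this excerpt: eight XMM
accumulators (dst1–dst8) hold a 4x8 output tile, the inner loop folds products
into them, then bias (DoBiasBlock8) and activation (ActBlock8) are applied
before the full-width _mm_storeu_ps stores or the WriteColN tail paths. A
hedged sketch of the accumulate step — SSE has no FMA, so multiply and add
are separate intrinsics; the names and loop shape here are ours, not the
kernel's:

    #include <xmmintrin.h>

    /* One k-step of a 4x8 tile: broadcast one lhs value per row, multiply
     * by two rhs vectors (8 columns), add into the eight accumulators. */
    static void mac_4x8(__m128 acc[8], const float *a, const float *b) {
        for (int i = 0; i < 4; ++i) {
            __m128 ai = _mm_set_ps1(a[i]);
            acc[2 * i + 0] = _mm_add_ps(acc[2 * i + 0], _mm_mul_ps(ai, _mm_loadu_ps(b)));
            acc[2 * i + 1] = _mm_add_ps(acc[2 * i + 1], _mm_mul_ps(ai, _mm_loadu_ps(b + 4)));
        }
    }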
|
D | TiledC4MatMulFp32.c |
     66  __m128 dst7 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src3, 0)));  in TiledC4MatmulFp32() local
     69  TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], MS_F32X4_GETI(src1, j),  in TiledC4MatmulFp32()
    106  … TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[0], MS_F32X4_GETI(src1, 0),  in TiledC4MatmulFp32()
    109  … TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[1], MS_F32X4_GETI(src1, 1),  in TiledC4MatmulFp32()
    112  … TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[2], MS_F32X4_GETI(src1, 2),  in TiledC4MatmulFp32()
    119  … TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[3], MS_F32X4_GETI(src1, 3),  in TiledC4MatmulFp32()
    146  … TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], MS_F32X4_GETI(src1, j),  in TiledC4MatmulFp32()
    156  _mm_storeu_ps(dst + 24, dst7);  in TiledC4MatmulFp32()
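The repeated TiledC4MatmulFp32_Transfer calls follow the usual C4-tiling move:
broadcast one scalar lane out of a source row (MS_F32X4_GETI) and
multiply-accumulate it against a weight column into four accumulators. The
call arguments beyond src1 are truncated in this listing, so the exact lane
wiring below is an assumption; a hedged sketch of a plausible helper shape:

    #include <xmmintrin.h>

    /* Assumed shape: fold weight * broadcast(lane) into dst5..dst8. */
    static void transfer(__m128 *d5, __m128 *d6, __m128 *d7, __m128 *d8,
                         __m128 weight, float s1, float s2, float s3, float s4) {
        *d5 = _mm_add_ps(*d5, _mm_mul_ps(weight, _mm_set_ps1(s1)));
        *d6 = _mm_add_ps(*d6, _mm_mul_ps(weight, _mm_set_ps1(s2)));
        *d7 = _mm_add_ps(*d7, _mm_mul_ps(weight, _mm_set_ps1(s3)));
        *d8 = _mm_add_ps(*d8, _mm_mul_ps(weight, _mm_set_ps1(s4)));
    }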
|
/third_party/ffmpeg/libavcodec/mips/ |
D | hevc_mc_uniw_msa.c |
    138  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;  in hevc_uniwgt_copy_6w_msa() local
    159  ILVRL_B2_SH(zero, src3, dst6, dst7);  in hevc_uniwgt_copy_6w_msa()
    162  SLLI_4V(dst4, dst5, dst6, dst7, 6);  in hevc_uniwgt_copy_6w_msa()
    167  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,  in hevc_uniwgt_copy_6w_msa()
    169  dst7);  in hevc_uniwgt_copy_6w_msa()
    171  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);  in hevc_uniwgt_copy_6w_msa()
    200  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;  in hevc_uniwgt_copy_8w_msa() local
    263  ILVRL_B2_SH(zero, src3, dst6, dst7);  in hevc_uniwgt_copy_8w_msa()
    265  SLLI_4V(dst4, dst5, dst6, dst7, 6);  in hevc_uniwgt_copy_8w_msa()
    269  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,  in hevc_uniwgt_copy_8w_msa()
    [all …]
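The uniwgt_copy kernels all run the same per-pixel pipeline: interleave bytes
with zero to widen to 16 bits (ILVRL_B2_SH), scale to HEVC's 14-bit
intermediate (SLLI_4V ... 6), apply the uni-prediction weight with rounding,
offset, and clip (HEVC_UNIW_RND_CLIP4_MAX_SATU_H), then pack back to bytes
(PCKEV_B2_UB). A scalar model of that math — our sketch, not ffmpeg's code:

    #include <stdint.h>

    /* shift is the weighted-prediction denominator shift for 8-bit content. */
    static uint8_t uniwgt_copy_pixel(uint8_t src, int weight, int offset, int shift)
    {
        int32_t v = (int32_t)src << 6;                   /* SLLI_4V(..., 6) */
        v = ((v * weight + (1 << (shift - 1))) >> shift) + offset;
        return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); /* clip + saturate */
    }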
|
D | vc1dsp_msa.c |
    145  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_vc1_inv_trans_4x8_msa() local
    204  LD_SW8(dest, linesize, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_vc1_inv_trans_4x8_msa()
    206  zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,  in ff_vc1_inv_trans_4x8_msa()
    207  dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_vc1_inv_trans_4x8_msa()
    210  ILVR_H4_SW(zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,  in ff_vc1_inv_trans_4x8_msa()
    211  dst4, dst5, dst6, dst7);  in ff_vc1_inv_trans_4x8_msa()
    222  ADD4(in_r4, dst4, in_r5, dst5, in_r6, dst6, in_r7, dst7,  in ff_vc1_inv_trans_4x8_msa()
    235  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_vc1_inv_trans_8x4_msa() local
    283  dst4, dst5, dst6, dst7);  in ff_vc1_inv_trans_8x4_msa()
    306  ADD4(in4, dst4, in5, dst5, in6, dst6, in7, dst7, in4, in5, in6, in7);  in ff_vc1_inv_trans_8x4_msa()
|
D | hevc_idct_msa.c |
    773  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_addblk_16x16_msa() local
    778  LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);  in hevc_addblk_16x16_msa()
    788  UNPCK_UB_SH(dst7, dst_r3, dst_l3);  in hevc_addblk_16x16_msa()
    800  LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);  in hevc_addblk_16x16_msa()
    818  UNPCK_UB_SH(dst7, dst_r3, dst_l3);  in hevc_addblk_16x16_msa()
    840  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_addblk_32x32_msa() local
    847  LD_UB2(temp_dst, 16, dst6, dst7);  in hevc_addblk_32x32_msa()
    857  UNPCK_UB_SH(dst7, dst_r3, dst_l3);  in hevc_addblk_32x32_msa()
    871  LD_UB2(temp_dst, 16, dst6, dst7);  in hevc_addblk_32x32_msa()
    890  UNPCK_UB_SH(dst7, dst_r3, dst_l3);  in hevc_addblk_32x32_msa()
    [all …]
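The addblk kernels here — like the add-block tails of the inverse transforms
in vc1dsp_msa.c above and h264idct_msa.c and vp9_idct_msa.c below — all reduce
to the same per-pixel step: widen the predicted byte, add the 16-bit residual,
clip back to [0, 255], repack. A hedged scalar equivalent:

    #include <stdint.h>

    static uint8_t addblk_pixel(uint8_t pred, int16_t residual)
    {
        int v = (int)pred + residual;   /* UNPCK_UB_SH widen + add */
        if (v < 0)   v = 0;             /* clip low                */
        if (v > 255) v = 255;           /* clip high (saturate)    */
        return (uint8_t)v;              /* PCKEV_B* repack         */
    }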
|
D | h264idct_msa.c |
    122  v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in avc_idct8_addblk_msa() local
    227  LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in avc_idct8_addblk_msa()
    230  ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,  in avc_idct8_addblk_msa()
    246  v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in avc_idct8_dc_addblk_msa() local
    256  LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in avc_idct8_dc_addblk_msa()
    259  ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,  in avc_idct8_dc_addblk_msa()
|
D | hevc_mc_bi_msa.c |
    143  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_bi_copy_6w_msa() local
    159  ILVRL_B2_SH(zero, src3, dst6, dst7);  in hevc_bi_copy_6w_msa()
    161  SLLI_4V(dst4, dst5, dst6, dst7, 6);  in hevc_bi_copy_6w_msa()
    164  HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,  in hevc_bi_copy_6w_msa()
    165  7, dst4, dst5, dst6, dst7);  in hevc_bi_copy_6w_msa()
    167  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);  in hevc_bi_copy_6w_msa()
    194  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_bi_copy_8w_msa() local
    251  ILVRL_B2_SH(zero, src3, dst6, dst7);  in hevc_bi_copy_8w_msa()
    256  SLLI_4V(dst4, dst5, dst6, dst7, 6);  in hevc_bi_copy_8w_msa()
    260  dst7, 7, dst4, dst5, dst6, dst7);  in hevc_bi_copy_8w_msa()
    [all …]
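HEVC_BI_RND_CLIP4_MAX_SATU with the shift of 7 seen in these calls is the
bi-prediction combine for 8-bit content: sum the two 14-bit intermediates
(one per reference list), add the rounding constant, shift, clip. In scalar
form (our sketch):

    #include <stdint.h>

    /* in0: intermediate from list 0, in1: from list 1 (both already << 6). */
    static uint8_t hevc_bi_pixel(int16_t in0, int16_t in1)
    {
        int v = (in0 + in1 + 64) >> 7;   /* rnd = 1 << (shift - 1), shift = 7 */
        return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
    }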
|
D | hevc_mc_biw_msa.c |
    444  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;  in hevc_biwgt_copy_24w_msa() local
    468  ILVRL_B2_SH(zero, src4, dst6, dst7);  in hevc_biwgt_copy_24w_msa()
    472  SLLI_4V(dst4, dst5, dst6, dst7, 6);  in hevc_biwgt_copy_24w_msa()
    477  HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,  in hevc_biwgt_copy_24w_msa()
    479  dst6, dst7);  in hevc_biwgt_copy_24w_msa()
    484  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);  in hevc_biwgt_copy_24w_msa()
   2098  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;  in hevc_hv_biwgt_8t_8multx2mult_msa() local
   2194  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,  in hevc_hv_biwgt_8t_8multx2mult_msa()
   2197  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);  in hevc_hv_biwgt_8t_8multx2mult_msa()
   2212  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);  in hevc_hv_biwgt_8t_8multx2mult_msa()
    [all …]
|
D | hevcdsp_msa.c |
    905  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_hz_8t_64w_msa() local
    982  dst7 = const_vec;  in hevc_hz_8t_64w_msa()
    984  dst7, dst7, dst7, dst7);  in hevc_hz_8t_64w_msa()
    985  ST_SH(dst7, dst + 56);  in hevc_hz_8t_64w_msa()
   1485  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_hv_8t_8multx1mult_msa() local
   1561  dst7 = const_vec;  in hevc_hv_8t_8multx1mult_msa()
   1563  dst7, dst7, dst7, dst7);  in hevc_hv_8t_8multx1mult_msa()
   1568  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);  in hevc_hv_8t_8multx1mult_msa()
   1586  dst6 = dst7;  in hevc_hv_8t_8multx1mult_msa()
   1617  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_hv_8t_12w_msa() local
    [all …]
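These kernels are 8-tap FIR filters: dst7 is seeded with the bias constant
const_vec, and the truncated lines around it are the DPADD multiply-accumulate
of four sample/filter vector pairs. The core dot product, modeled in scalar C
(a sketch, not ffmpeg's macro):

    #include <stdint.h>

    /* HEVC 8-tap FIR at one output position, intermediate precision kept. */
    static int16_t filt_8tap(const int16_t s[8], const int8_t f[8])
    {
        int32_t sum = 0;
        for (int i = 0; i < 8; ++i)
            sum += s[i] * f[i];   /* the DPADD pairs, unrolled */
        return (int16_t)sum;
    }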
|
D | hpeldsp_msa.c |
    504  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in common_vt_bil_and_aver_dst_16w_msa() local
    517  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in common_vt_bil_and_aver_dst_16w_msa()
    520  AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,  in common_vt_bil_and_aver_dst_16w_msa()
   1017  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in common_hv_bil_and_aver_dst_16w_msa() local
   1058  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in common_hv_bil_and_aver_dst_16w_msa()
   1073  PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);  in common_hv_bil_and_aver_dst_16w_msa()
   1303  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in avg_width16_msa() local
   1308  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in avg_width16_msa()
   1312  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,  in avg_width16_msa()
   1313  dst4, dst5, dst6, dst7);  in avg_width16_msa()
    [all …]
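AVER_UB4_UB is a per-byte rounding average, so avg_width16_msa — which also
appears nearly verbatim in vp9_mc_msa.c, h264qpel_msa.c, and qpeldsp_msa.c
below — is just this scalar loop, vectorized 16 bytes at a time:

    #include <stddef.h>
    #include <stdint.h>

    /* Rounding average of src into dst over a 16-pixel-wide block. */
    static void avg_width16(uint8_t *dst, ptrdiff_t dst_stride,
                            const uint8_t *src, ptrdiff_t src_stride, int height)
    {
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < 16; ++x)
                dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
            dst += dst_stride;
            src += src_stride;
        }
    }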
|
D | h264dsp_msa.c |
   2342  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_weight_h264_pixels16_8_msa() local
   2389  dst5, dst6, dst7);  in ff_weight_h264_pixels16_8_msa()
   2390  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);  in ff_weight_h264_pixels16_8_msa()
   2428  dst5, dst6, dst7);  in ff_weight_h264_pixels16_8_msa()
   2429  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);  in ff_weight_h264_pixels16_8_msa()
   2466  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_biweight_h264_pixels16_8_msa() local
   2485  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_biweight_h264_pixels16_8_msa()
   2487  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_biweight_h264_pixels16_8_msa()
   2492  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,  in ff_biweight_h264_pixels16_8_msa()
   2494  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,  in ff_biweight_h264_pixels16_8_msa()
    [all …]
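ff_weight/ff_biweight implement H.264 explicit weighted prediction; the
XORI_B8_128_UB flips bytes to signed so the interleaved multiply-accumulate
can run in signed 16-bit. The per-pixel bi-weighted formula, sketched in
scalar form for 8-bit content with the two per-list offsets already merged:

    #include <stdint.h>

    static uint8_t h264_biweight(uint8_t p0, uint8_t p1,
                                 int w0, int w1, int offset, int log2_denom)
    {
        int v = ((p0 * w0 + p1 * w1 + (1 << log2_denom)) >> (log2_denom + 1))
                + offset;   /* offset = (o0 + o1 + 1) >> 1 in the spec */
        return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
    }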
|
D | vp9_mc_msa.c |
   3421  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_avg_bilin_32v_msa() local
   3439  LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);  in ff_avg_bilin_32v_msa()
   3486  PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);  in ff_avg_bilin_32v_msa()
   3502  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_avg_bilin_64v_msa() local
   3522  LD_UB2(dst + 48, dst_stride, dst6, dst7);  in ff_avg_bilin_64v_msa()
   3571  PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);  in ff_avg_bilin_64v_msa()
   4132  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in avg_width16_msa() local
   4138  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in avg_width16_msa()
   4142  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,  in avg_width16_msa()
   4143  dst4, dst5, dst6, dst7);  in avg_width16_msa()
    [all …]
|
D | hevc_mc_uni_msa.c |
   1488  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;  in hevc_hv_uni_8t_8multx2mult_msa() local
   1563  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,  in hevc_hv_uni_8t_8multx2mult_msa()
   1566  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);  in hevc_hv_uni_8t_8multx2mult_msa()
   1579  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);  in hevc_hv_uni_8t_8multx2mult_msa()
   1599  dst5 = dst7;  in hevc_hv_uni_8t_8multx2mult_msa()
   1636  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;  in hevc_hv_uni_8t_12w_msa() local
   1709  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,  in hevc_hv_uni_8t_12w_msa()
   1712  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);  in hevc_hv_uni_8t_12w_msa()
   1725  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);  in hevc_hv_uni_8t_12w_msa()
   1745  dst5 = dst7;  in hevc_hv_uni_8t_12w_msa()
    [all …]
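In these separable HV kernels (and the biweight variants in hevc_mc_biw_msa.c
above), dst0..dst8 are the horizontally filtered rows feeding the vertical
8-tap stage; the `dst5 = dst7;` assignments at the bottom of the loop slide
that row window down by two instead of refiltering. Sketched with a plain
array, under the assumption of a 9-row window producing two output rows per
iteration:

    #include <stdint.h>
    #include <string.h>

    /* Slide a 9-entry row window down by two rows (rows 2..8 -> 0..6);
     * the caller then horizontally filters only the two new rows 7..8. */
    static void slide_rows(int16_t *rows[9])
    {
        memmove(&rows[0], &rows[2], 7 * sizeof(rows[0]));
    }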
|
D | h264dsp_mmi.c |
    727  MMI_LDC1(%[ftmp4], %[dst7], 0x00)  in ff_h264_idct8_dc_add_8_mmi()
    751  MMI_SDC1(%[ftmp4], %[dst7], 0x00)  in ff_h264_idct8_dc_add_8_mmi()
    762  [dst6]"r"(dst+6*stride), [dst7]"r"(dst+7*stride),  in ff_h264_idct8_dc_add_8_mmi()
|
D | vp9_idct_msa.c |
    713  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in vp9_iadst8x8_colcol_addblk_msa() local
    763  dst7 = LD_UB(dst + 7 * dst_stride);  in vp9_iadst8x8_colcol_addblk_msa()
    771  res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);  in vp9_iadst8x8_colcol_addblk_msa()
   1310  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in vp9_iadst16_1d_columns_addblk_msa() local
   1421  dst7 = LD_UB(dst + 11 * dst_stride);  in vp9_iadst16_1d_columns_addblk_msa()
   1422  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);  in vp9_iadst16_1d_columns_addblk_msa()
|
D | h264qpel_msa.c |
    676  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_avg_h264_qpel16_mc00_msa() local
    680  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_avg_h264_qpel16_mc00_msa()
    684  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,  in ff_avg_h264_qpel16_mc00_msa()
    685  dst6, dst7);  in ff_avg_h264_qpel16_mc00_msa()
    686  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);  in ff_avg_h264_qpel16_mc00_msa()
    690  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in ff_avg_h264_qpel16_mc00_msa()
    694  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,  in ff_avg_h264_qpel16_mc00_msa()
    695  dst6, dst7);  in ff_avg_h264_qpel16_mc00_msa()
    696  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);  in ff_avg_h264_qpel16_mc00_msa()
   1607  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in ff_put_h264_qpel16_mc21_msa() local
    [all …]
|
D | hevc_lpf_sao_msa.c |
    466  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in hevc_loopfilter_luma_ver_msa() local
    865  ILVRL_B2_UB(dst3, dst2, dst6, dst7);  in hevc_loopfilter_luma_ver_msa()
    867  ILVRL_H2_UB(dst7, dst6, dst2, dst3);  in hevc_loopfilter_luma_ver_msa()
|
D | qpeldsp_msa.c |
   5838  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;  in avg_width16_msa() local
   5843  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);  in avg_width16_msa()
   5847  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,  in avg_width16_msa()
   5848  dst4, dst5, dst6, dst7);  in avg_width16_msa()
   5849  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);  in avg_width16_msa()
|
/third_party/ffmpeg/libavcodec/x86/ |
D | cavsidct.asm |
    104  SUMSUB_BA w, 7, 6  ; m7 = dst0, m6 = dst7
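SUMSUB_BA is the standard sum/difference butterfly macro from ffmpeg's
x86util.asm: after it runs, one register holds the sum of the pair and the
other the difference, which is exactly the last stage of the 8-point IDCT
(dst0 = e + o, dst7 = e - o). A scalar sketch of the butterfly; the operand
order of the difference in the real macro may differ:

    #include <stdint.h>

    /* Word-sized butterfly: a takes the sum, b the difference. */
    static void sumsub_w(int16_t *a, int16_t *b)
    {
        int16_t sum  = (int16_t)(*a + *b);
        int16_t diff = (int16_t)(*a - *b);
        *a = sum;    /* -> dst0 in the cavsidct comment */
        *b = diff;   /* -> dst7 */
    }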
|
/third_party/ffmpeg/libavcodec/aarch64/ |
D | vp9mc_16bpp_neon.S |
    123  .macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, …
    138  smlal \dst7\().4s, v23.4h, v0.h[\offset]
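Each smlal in the extmlal macro is a widening multiply-accumulate: four
16-bit samples times one 16-bit filter tap, added into four 32-bit
accumulator lanes. The scalar meaning of one such instruction (our sketch):

    #include <stdint.h>

    /* smlal acc.4s, src.4h, tap : acc[i] += (int32)src[i] * tap */
    static void smlal_4h(int32_t acc[4], const int16_t src[4], int16_t tap)
    {
        for (int i = 0; i < 4; ++i)
            acc[i] += (int32_t)src[i] * tap;
    }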
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/ |
D | matmul_fp32.c |
    304  __m128 dst7 = _mm_movelh_ps(src56H, src78H);  in RowMajor2Col12Major() local
    328  _mm_storeu_ps(dst_c + 28, dst7);  in RowMajor2Col12Major()
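_mm_movelh_ps here is the splicing half of the classic SSE 4x4 transpose that
RowMajor2Col12Major uses to repack rows into columns: unpack interleaves two
row pairs, then movelh/movehl join the halves into transposed rows. A hedged,
self-contained sketch of that idiom (equivalent to _MM_TRANSPOSE4_PS):

    #include <xmmintrin.h>

    static void transpose4x4(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3)
    {
        __m128 t0 = _mm_unpacklo_ps(*r0, *r1);   /* a0 b0 a1 b1 */
        __m128 t1 = _mm_unpacklo_ps(*r2, *r3);   /* c0 d0 c1 d1 */
        __m128 t2 = _mm_unpackhi_ps(*r0, *r1);   /* a2 b2 a3 b3 */
        __m128 t3 = _mm_unpackhi_ps(*r2, *r3);   /* c2 d2 c3 d3 */
        *r0 = _mm_movelh_ps(t0, t1);             /* a0 b0 c0 d0 */
        *r1 = _mm_movehl_ps(t1, t0);             /* a1 b1 c1 d1 */
        *r2 = _mm_movelh_ps(t2, t3);             /* a2 b2 c2 d2 */
        *r3 = _mm_movehl_ps(t3, t2);             /* a3 b3 c3 d3 */
    }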
|