/external/libvpx/libvpx/vpx_dsp/x86/
D | fwd_txfm_sse2.c
    48   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));   in vpx_fdct8x8_1_sse2()
    49   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));   in vpx_fdct8x8_1_sse2()
    50   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));   in vpx_fdct8x8_1_sse2()
    51   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));   in vpx_fdct8x8_1_sse2()
    57   in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));   in vpx_fdct8x8_1_sse2()
    58   in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));   in vpx_fdct8x8_1_sse2()
    59   in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));   in vpx_fdct8x8_1_sse2()
    60   in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));   in vpx_fdct8x8_1_sse2()
    95   in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));   in vpx_fdct16x16_1_sse2()
    96   in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));   in vpx_fdct16x16_1_sse2()
    [all …]

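For orientation: the vpx_fdct8x8_1 matches are the row loads of a DC-only forward transform (the `_1` variants compute just the DC term). A minimal sketch of that load-and-reduce shape follows; names and scaling are illustrative, not libvpx's code. Each row must start 16-byte aligned, which is why the kernels use _mm_load_si128 rather than the unaligned variant.

    #include <emmintrin.h>
    #include <stdint.h>

    /* Sketch only: sum an 8x8 block of int16 residuals with aligned row
       loads. Assumes inputs are small enough (e.g. 9-bit residuals) that
       the 16-bit partial sums cannot overflow. */
    static int32_t sum_8x8_sse2(const int16_t *input, int stride) {
      __m128i acc = _mm_setzero_si128();
      int r;
      for (r = 0; r < 8; ++r) {
        acc = _mm_add_epi16(acc, _mm_load_si128((const __m128i *)(input + r * stride)));
      }
      /* widen lane pairs to 32 bits, then reduce horizontally */
      __m128i v = _mm_madd_epi16(acc, _mm_set1_epi16(1));
      v = _mm_add_epi32(v, _mm_srli_si128(v, 8));
      v = _mm_add_epi32(v, _mm_srli_si128(v, 4));
      return _mm_cvtsi128_si32(v);
    }
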
D | highbd_idct8x8_add_sse4.c
    100   io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse4_1()
    101   io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse4_1()
    102   io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse4_1()
    103   io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse4_1()
    104   io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse4_1()
    105   io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse4_1()
    106   io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse4_1()
    107   io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse4_1()
    116   io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse4_1()
    117   io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse4_1()
    [all …]

D | highbd_idct8x8_add_sse2.c
    98    io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse2()
    99    io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse2()
    100   io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse2()
    101   io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse2()
    102   io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse2()
    103   io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse2()
    104   io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse2()
    105   io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse2()
    114   io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));   in vpx_highbd_idct8x8_64_add_sse2()
    115   io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));   in vpx_highbd_idct8x8_64_add_sse2()
    [all …]

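The io[] indexing in both idct variants follows from the coefficient width: high-bitdepth coefficients are 32-bit, so one 8-wide row spans two __m128i registers. A sketch of that split-row load (illustrative helper, not the library's):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Rows 0-3 land in io[0..3] (columns 0-3) and io[4..7] (columns 4-7);
       rows 4-7 land in io[8..11] and io[12..15], matching the io[] indices
       in the matches above. */
    static void load_8x8_highbd(const int32_t *input, __m128i io[16]) {
      int r;
      for (r = 0; r < 4; ++r) {
        io[r]      = _mm_load_si128((const __m128i *)(input + r * 8 + 0));
        io[r + 4]  = _mm_load_si128((const __m128i *)(input + r * 8 + 4));
        io[r + 8]  = _mm_load_si128((const __m128i *)(input + (r + 4) * 8 + 0));
        io[r + 12] = _mm_load_si128((const __m128i *)(input + (r + 4) * 8 + 4));
      }
    }
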
/external/libhevc/common/x86/
D | ihevc_itrans_recon_32x32_ssse3_intr.c
    260   m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    262   m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    264   m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    266   m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    268   m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    270   m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    272   m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    274   m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    277   m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    279   m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_32x32_ssse3()
    [all …]

D | ihevc_itrans_recon_16x16_ssse3_intr.c
    210   m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    212   m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    214   m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    216   m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    218   m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    220   m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    222   m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    224   m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);   in ihevc_itrans_recon_16x16_ssse3()
    249   m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75   in ihevc_itrans_recon_16x16_ssse3()
    295   m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18   in ihevc_itrans_recon_16x16_ssse3()
    [all …]

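Both HEVC inverse-transform kernels load repeatedly through the same pointer, so the pointer increments evidently sit on lines outside the matches. A sketch of the gather loop under that assumption (the step size is a hypothetical stand-in):

    #include <tmmintrin.h>
    #include <stdint.h>

    /* Eight aligned loads from a temporary coefficient buffer, advancing
       the source pointer between loads. Illustrative only. */
    static void load_tmp_rows(const int16_t *pi2_tmp_src, int32_t src_strd,
                              __m128i regs[8]) {
      int i;
      for (i = 0; i < 8; ++i) {
        regs[i] = _mm_load_si128((const __m128i *)pi2_tmp_src);
        pi2_tmp_src += src_strd; /* hypothetical step between loads */
      }
    }
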
/external/libvpx/libvpx/vp8/encoder/x86/
D | vp8_quantize_sse2.c
    41   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));   in vp8_regular_quantize_b_sse2()
    42   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));   in vp8_regular_quantize_b_sse2()
    43   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));   in vp8_regular_quantize_b_sse2()
    44   __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));   in vp8_regular_quantize_b_sse2()
    46   __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));   in vp8_regular_quantize_b_sse2()
    47   __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));   in vp8_regular_quantize_b_sse2()
    48   __m128i round0 = _mm_load_si128((__m128i *)(b->round));   in vp8_regular_quantize_b_sse2()
    49   __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));   in vp8_regular_quantize_b_sse2()
    50   __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));   in vp8_regular_quantize_b_sse2()
    51   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));   in vp8_regular_quantize_b_sse2()
    [all …]

D | quantize_sse4.c
    37   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));   in vp8_regular_quantize_b_sse4_1()
    38   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));   in vp8_regular_quantize_b_sse4_1()
    39   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));   in vp8_regular_quantize_b_sse4_1()
    40   __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));   in vp8_regular_quantize_b_sse4_1()
    42   __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));   in vp8_regular_quantize_b_sse4_1()
    43   __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));   in vp8_regular_quantize_b_sse4_1()
    44   __m128i round0 = _mm_load_si128((__m128i *)(b->round));   in vp8_regular_quantize_b_sse4_1()
    45   __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));   in vp8_regular_quantize_b_sse4_1()
    46   __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));   in vp8_regular_quantize_b_sse4_1()
    47   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));   in vp8_regular_quantize_b_sse4_1()
    [all …]

D | vp8_quantize_ssse3.c
    44   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));   in vp8_fast_quantize_b_ssse3()
    45   __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));   in vp8_fast_quantize_b_ssse3()
    46   __m128i round0 = _mm_load_si128((__m128i *)(b->round));   in vp8_fast_quantize_b_ssse3()
    47   __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));   in vp8_fast_quantize_b_ssse3()
    48   __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));   in vp8_fast_quantize_b_ssse3()
    49   __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));   in vp8_fast_quantize_b_ssse3()
    50   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));   in vp8_fast_quantize_b_ssse3()
    51   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));   in vp8_fast_quantize_b_ssse3()
    58   __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);   in vp8_fast_quantize_b_ssse3()

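These loads feed VP8's quantizers, which compute roughly y = ((|z| + round) * quant) >> 16 with the sign of z restored. A sketch of that arithmetic for one 8-lane vector, illustrative of the technique only; the real kernels also handle the zero-bin, zig-zag ordering, and eob tracking:

    #include <emmintrin.h>

    static __m128i quantize_8(__m128i z, __m128i round, __m128i quant) {
      const __m128i sign = _mm_srai_epi16(z, 15);            /* 0 or -1 per lane */
      const __m128i abs_z = _mm_sub_epi16(_mm_xor_si128(z, sign), sign);
      /* high 16 bits of the unsigned product == arithmetic >> 16 */
      const __m128i y = _mm_mulhi_epu16(_mm_adds_epu16(abs_z, round), quant);
      return _mm_sub_epi16(_mm_xor_si128(y, sign), sign);    /* restore sign */
    }
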
/external/XNNPACK/src/qs8-gemm/gen/
D | 1x4c2-xw-minmax-xop.c
    58    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    62    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    66    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    70    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    83    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    90    const __m128i vxb1 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    97    const __m128i vxb2 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    106   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    107   const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    120   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop()
    [all …]

D | 1x4c2-xw-minmax-sse41.c
    53    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    57    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    61    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    78    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    85    const __m128i vxb1 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    92    const __m128i vxb2 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    101   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    102   const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    115   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41()
    [all …]

D | 1x4c2-xw-minmax-sse2.c
    53    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    57    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    61    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    78    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    85    const __m128i vxb1 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    92    const __m128i vxb2 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    101   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    102   const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    131   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2()
    [all …]

D | 1x4c2-xw-minmax-ssse3.c
    53    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    57    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    61    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    78    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    85    const __m128i vxb1 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    92    const __m128i vxb2 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    101   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    102   const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    131   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3()
    [all …]

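In the "xw" (extended-weights) kernels the weights are stored pre-sign-extended to int16, which is why plain aligned loads of w appear with no unpack step. A sketch of one c2 inner step under that assumption — not XNNPACK's exact kernel:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Broadcast input-channel pair 0 across the four 32-bit lanes and
       multiply-accumulate against 8 pre-extended int16 weights. */
    static __m128i gemm_c2_step0(__m128i vacc, __m128i vxa, const int16_t *w) {
      const __m128i vxb0 = _mm_load_si128((const __m128i *)w);
      const __m128i va0 = _mm_shuffle_epi32(vxa, _MM_SHUFFLE(0, 0, 0, 0));
      return _mm_add_epi32(vacc, _mm_madd_epi16(va0, vxb0)); /* 2 products/lane */
    }
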
D | 1x4c8-xw-minmax-xop.c
    61    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    64    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    67    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    70    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    83    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    84    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    97    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    101   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    102   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    106   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop()
    [all …]

D | 1x4c8-xw-minmax-sse41.c
    56    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    59    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    62    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    78    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    79    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    92    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    96    …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    97    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    101   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41()
    [all …]

D | 1x4c8-xw-minmax-ssse3.c
    56    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    59    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    62    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    78    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    79    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    108   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    112   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    113   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    117   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3()
    [all …]

D | 1x4c8-xw-minmax-sse2.c
    56    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    59    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    62    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    65    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    78    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    79    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    108   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    112   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    113   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    117   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2()
    [all …]

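The remainder_mask / remainder_threshold / shift params loaded in every variant serve the rounding arithmetic shift applied after the Q31 multiply. A sketch of the usual scheme — hedged, not a verified copy of XNNPACK's code:

    #include <emmintrin.h>

    /* Round-to-nearest arithmetic right shift: subtracting the all-ones
       comparison mask adds 1 exactly when the kept remainder exceeds the
       threshold. */
    static __m128i rounding_shift(__m128i vq31prod, __m128i vremainder_mask,
                                  __m128i vremainder_threshold, __m128i vshift) {
      const __m128i vrem =
          _mm_add_epi32(_mm_and_si128(vq31prod, vremainder_mask),
                        _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod));
      return _mm_sub_epi32(_mm_sra_epi32(vq31prod, vshift),
                           _mm_cmpgt_epi32(vrem, vremainder_threshold));
    }
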
D | 1x8c8-minmax-avx2.c
    65    const __m128i vb01 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    69    const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    73    const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    77    const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    94    …const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->ss…   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    95    …const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2…   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    108   …const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params…   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    112   …const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) p…   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    113   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    117   …const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) par…   in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    [all …]

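The AVX2 variant loads each 16-byte params block once and duplicates it into both 128-bit lanes so a single constant serves all eight 32-bit accumulators. A one-line sketch of that idiom (the wrapper name is hypothetical):

    #include <immintrin.h>

    static __m256i broadcast_param(const void *param_block) {
      return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i *)param_block));
    }
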
D | 1x4c8-minmax-xop-ld128.c
    61    const __m128i vb01 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    68    const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    85    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    86    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    99    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    103   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    104   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    108   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    111   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()
    112   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128()

D | 1x4c8-minmax-sse41-ld128.c
    56    const __m128i vb01 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    63    const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    80    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    81    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    94    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    98    …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    99    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    103   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    106   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()
    107   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);   in xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128()

D | 2x4c8-xw-minmax-xop.c
    74    const __m128i vxb0 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    78    const __m128i vxb1 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    82    const __m128i vxb2 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    86    const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    103   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    104   const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    123   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    129   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    130   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    136   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    [all …]

/external/XNNPACK/src/qu8-vadd/
D | minmax-sse2.c
    23   …const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_produ…   in xnn_qu8_vadd_minmax_ukernel__sse2()
    24   const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.a_multiplier_lo);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    25   const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.a_multiplier_hi);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    26   const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.b_multiplier_lo);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    27   const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.b_multiplier_hi);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    28   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    29   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) &params->sse2.remainder_thresh…   in xnn_qu8_vadd_minmax_ukernel__sse2()
    68   const __m128i vy_zero_point = _mm_load_si128((const __m128i*) params->sse2.y_zero_point);   in xnn_qu8_vadd_minmax_ukernel__sse2()
    71   vy = _mm_max_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_min));   in xnn_qu8_vadd_minmax_ukernel__sse2()
    72   vy = _mm_min_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_max));   in xnn_qu8_vadd_minmax_ukernel__sse2()
    [all …]

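The _lo/_hi multiplier halves exist because SSE2 has no widening 16x32 multiply: the 32-bit product of 16-bit lanes is assembled from mullo/mulhi pieces. A simplified sketch of that technique — illustrative, not a verified copy of the kernel:

    #include <emmintrin.h>

    /* Low halves of a*mul_lo, with the mulhi carry and the a*mul_hi term
       folded into the high halves. */
    static void widening_mul(__m128i va, __m128i vmul_lo, __m128i vmul_hi,
                             __m128i *vprod_lo, __m128i *vprod_hi) {
      *vprod_lo = _mm_mullo_epi16(va, vmul_lo);
      *vprod_hi = _mm_add_epi16(_mm_mulhi_epu16(va, vmul_lo),
                                _mm_mullo_epi16(va, vmul_hi));
    }
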
/external/XNNPACK/src/qs8-dwconv/gen/
D | up32x9-minmax-avx512skx-mul32.c
    34    …const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.mu…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    35    …const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.roun…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    36    …const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    37    …const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    38    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    39    …const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    40    …const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.ou…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    41    …const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.ou…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    100   …const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintpt…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    102   …const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintpt…   in xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32()
    [all …]

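Two AVX512 load idioms recur above: a 16-byte params block broadcast across all four 128-bit lanes, and 16 int8 filter taps widened to 32-bit lanes straight out of one aligned load. A sketch of both (helper names are hypothetical):

    #include <immintrin.h>
    #include <stdint.h>

    static __m512i broadcast_param_512(const void *param_block) {
      return _mm512_broadcast_i32x4(_mm_load_si128((const __m128i *)param_block));
    }

    static __m512i load_16_taps(const int8_t *k) {
      return _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i *)k));
    }
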
/external/XNNPACK/src/qs8-igemm/gen/
D | 1x8c8-minmax-avx2.c
    76    const __m128i vb01 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    80    const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    84    const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    88    const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    107   …const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->ss…   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    108   …const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2…   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    121   …const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params…   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    125   …const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) p…   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    126   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    130   …const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) par…   in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    [all …]

D | 1x4c8-minmax-xop-ld128.c
    72    const __m128i vb01 = _mm_load_si128((const __m128i*) w);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    79    const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    98    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    99    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    112   const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    116   …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh…   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    117   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    121   …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    124   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()
    125   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);   in xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128()

/external/libvpx/libvpx/vp9/common/x86/
D | vp9_highbd_iht8x8_add_sse4.c
    177   io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));   in vp9_highbd_iht8x8_64_add_sse4_1()
    178   io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));   in vp9_highbd_iht8x8_64_add_sse4_1()
    179   io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));   in vp9_highbd_iht8x8_64_add_sse4_1()
    180   io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));   in vp9_highbd_iht8x8_64_add_sse4_1()
    181   io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));   in vp9_highbd_iht8x8_64_add_sse4_1()
    182   io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));   in vp9_highbd_iht8x8_64_add_sse4_1()
    183   io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));   in vp9_highbd_iht8x8_64_add_sse4_1()
    184   io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));   in vp9_highbd_iht8x8_64_add_sse4_1()
    185   io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));   in vp9_highbd_iht8x8_64_add_sse4_1()
    186   io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));   in vp9_highbd_iht8x8_64_add_sse4_1()
    [all …]