Searched refs: _mm_shuffle_epi32 (Results 1 – 25 of 321) sorted by relevance


/external/flac/src/libFLAC/
lpc_intrin_sse2.c
68 … q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
69 … q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
70 … q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
71 … q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
72 … q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
73 … q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
74 … q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
75 … q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
76 … q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
77 … q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()
[all …]
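Every FLAC match above uses the same SSE2 idiom: move one quantized LPC coefficient into lane 0 with _mm_cvtsi32_si128, then replicate lane 0 across all four 32-bit lanes with _mm_shuffle_epi32 and the selector _MM_SHUFFLE(0,0,0,0), SSE2's way of broadcasting a scalar (equivalent to _mm_set1_epi32). A minimal standalone sketch of the idiom; broadcast_coeff is a hypothetical name, not a libFLAC function:

```c
#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

/* Broadcast the low 16 bits of a quantized coefficient to all four
 * 32-bit lanes, as the q0..q9 setup lines above do. */
static inline __m128i broadcast_coeff(int32_t coeff)
{
    __m128i q = _mm_cvtsi32_si128(0xffff & coeff); /* lane 0 = coeff, others 0 */
    /* _MM_SHUFFLE(0,0,0,0) makes every destination lane copy source lane 0. */
    return _mm_shuffle_epi32(q, _MM_SHUFFLE(0, 0, 0, 0));
}
```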
lpc_intrin_sse41.c
77 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
78 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
79 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
80 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
81 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
82 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
89 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
95 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
102 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
109 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41()
[all …]
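The wide SSE4.1 variant uses the shuffle for lane placement rather than broadcast: _MM_SHUFFLE(3,1,2,0) moves a pair of 32-bit coefficients from lanes 0-1 into lanes 0 and 2 ("0 q[1] 0 q[0]" in the comments), which is where _mm_mul_epi32 (SSE4.1, signed 32x32-to-64-bit multiply) reads its operands; the later _MM_SHUFFLE(2,0,3,1) shuffles do the mirror rearrangement on the data register. A sketch of the coefficient placement, assuming the pair arrives via a 64-bit load; spread_pair_for_mul is a hypothetical name:

```c
#include <smmintrin.h> /* SSE4.1: _mm_mul_epi32 */
#include <stdint.h>

/* Place q[0] and q[1] in lanes 0 and 2 (zeros in lanes 1 and 3) so that
 * _mm_mul_epi32 can form two 64-bit products in one instruction.
 * Lane comments run low to high. */
static inline __m128i spread_pair_for_mul(const int32_t q[2])
{
    __m128i x = _mm_loadl_epi64((const __m128i *)q);      /* q[0] q[1] 0 0 */
    return _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 1, 2, 0)); /* q[0] 0 q[1] 0 */
}
```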
/external/XNNPACK/src/qs8-igemm/gen/
4x4c2-minmax-sse2-ld128.c
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128()
[all …]
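The XNNPACK matches in this and the following groups are all instances of one inner-loop step of the 4x4c2 quantized GEMM/IGEMM microkernels: vxa holds eight 16-bit activations (four 2-element pairs), _mm_shuffle_epi32 with _MM_SHUFFLE(k,k,k,k) broadcasts pair k to every 32-bit lane, and _mm_madd_epi16 multiplies it against a weight register holding one pair per output column, accumulating four int32 partial dot products. A hedged sketch of one such step (register names mirror the snippets; this is an illustration, not the generated XNNPACK source):

```c
#include <emmintrin.h> /* SSE2 */

/* One k-step of the 4x4c2 pattern above: broadcast input pair 0 of vxa,
 * multiply-add against the four weight pairs in vxb, and accumulate the
 * four int32 dot products into vacc. The steps for pairs 1..3 are the
 * same with _MM_SHUFFLE(1,1,1,1) .. _MM_SHUFFLE(3,3,3,3) and vxb1..vxb3. */
static inline __m128i gemm_step_pair0(__m128i vacc, __m128i vxa, __m128i vxb)
{
    const __m128i va = _mm_shuffle_epi32(vxa, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm_add_epi32(vacc, _mm_madd_epi16(va, vxb));
}
```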
4x4c2-minmax-ssse3-ld64.c
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64()
[all …]
4x4c2-minmax-ssse3-ld128.c
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128()
[all …]
4x4c2-minmax-sse2-ld64.c
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64()
[all …]
4x4c2-minmax-sse41-ld128.c
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
[all …]
4x4c2-minmax-sse41-ld64.c
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
[all …]
4x4c2-minmax-xop-ld128.c
111 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
113 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
115 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
117 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
120 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
122 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
124 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
126 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
133 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
135 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
[all …]
4x4c2-minmax-xop-ld64.c
109 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
111 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
113 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
115 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
120 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
122 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
124 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
126 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
131 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
133 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
[all …]
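The XOP kernels drop the separate _mm_add_epi32: each truncated match above is the argument tail of a three-operand call (shuffled activations, weights, accumulator), consistent with AMD XOP's fused multiply-accumulate intrinsic _mm_maddd_epi16. A sketch under that assumption; illustration only:

```c
#if defined(__XOP__)
#include <x86intrin.h> /* XOP: _mm_maddd_epi16 (GCC/Clang with -mxop) */

/* Same step as gemm_step_pair0 above, but XOP's _mm_maddd_epi16 takes
 * the accumulator as a third operand, fusing the madd and the add. */
static inline __m128i gemm_step_pair0_xop(__m128i vacc, __m128i vxa, __m128i vxb)
{
    return _mm_maddd_epi16(_mm_shuffle_epi32(vxa, _MM_SHUFFLE(0, 0, 0, 0)),
                           vxb, vacc);
}
#endif
```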
/external/XNNPACK/src/qu8-gemm/
4x4c2-minmax-sse2.c
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
101 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
103 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
105 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
107 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
113 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
115 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2()
[all …]
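The qu8 kernels reuse the same shuffle + madd step on unsigned inputs; the difference lies upstream, where activation bytes are zero-extended (rather than sign-extended) to the 16-bit lanes the shuffle operates on. A sketch of that widening step, assuming SSE2-style unpacking; the weight-side zero-point handling the qu8 kernels also perform is omitted:

```c
#include <emmintrin.h> /* SSE2 */

/* Zero-extend the low 8 unsigned activation bytes of va to 16-bit lanes,
 * producing a vxa-style register for the shuffle/madd step shown above. */
static inline __m128i widen_u8_lo(__m128i va)
{
    return _mm_unpacklo_epi8(va, _mm_setzero_si128());
}
```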
/external/XNNPACK/src/qs8-gemm/gen/
4x4c2-xw-minmax-sse2.c
86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2()
[all …]
4x4c2-minmax-sse2-ld128.c
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128()
[all …]
4x4c2-minmax-sse2-ld64.c
87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64()
[all …]
4x4c2-minmax-ssse3-ld64.c
87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64()
[all …]
4x4c2-xw-minmax-ssse3.c
86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3()
[all …]
4x4c2-minmax-ssse3-ld128.c
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128()
[all …]
4x4c2-minmax-xop-ld128.c
94 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
96 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
98 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
100 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
116 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
118 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
[all …]
4x4c2-minmax-xop-ld64.c
92 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
94 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
96 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
98 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
114 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
116 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
[all …]
4x4c2-xw-minmax-xop.c
91 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
93 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
95 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
97 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
101 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
103 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
105 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
107 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
111 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
113 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
[all …]
4x4c2-xw-minmax-sse41.c
86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
[all …]
4x4c2-minmax-sse41-ld128.c
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
[all …]
4x4c2-minmax-sse41-ld64.c
87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
[all …]
/external/XNNPACK/src/qu8-igemm/
4x4c2-minmax-sse2.c
104 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
105 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
106 …vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
107 …vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
111 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
112 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
113 …vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
114 …vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
118 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
119 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2()
[all …]
/external/python/cpython3/Modules/_blake2/impl/
blake2s-load-sse41.h
33 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
36 t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
39 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
45 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
51 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
57 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
69 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
75 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
81 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
87 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
[all …]
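The BLAKE2s loader uses _mm_shuffle_epi32 for yet another purpose: fixed permutations of four already-gathered 32-bit message words into the order the next G round consumes, e.g. _MM_SHUFFLE(2,1,0,3) rotates the lanes by one position and _MM_SHUFFLE(0,1,2,3) reverses them. A minimal sketch of the rotation case; rotl_lanes_1 is a hypothetical wrapper name:

```c
#include <emmintrin.h> /* SSE2 */

/* _MM_SHUFFLE(2,1,0,3): destination lanes 3..0 take source lanes 2,1,0,3,
 * so every lane moves up one position and the top lane wraps to the
 * bottom -- the kind of fixed word permutation blake2s-load-sse41.h
 * applies when assembling the message buffer for a round. */
static inline __m128i rotl_lanes_1(__m128i t2)
{
    return _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
}
```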
