/external/flac/src/libFLAC/ |
D | lpc_intrin_sse2.c | 68 … q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 69 … q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 70 … q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 71 … q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 72 … q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 73 … q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 74 … q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 75 … q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 76 … q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() 77 … q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2() [all …]
|
D | lpc_intrin_sse41.c | 77 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 78 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 79 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 80 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 81 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 82 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 89 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 95 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 102 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() 109 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); in FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x4c2-minmax-sse2-ld128.c | 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() 130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128() [all …]
|
D | 4x4c2-minmax-ssse3-ld64.c | 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64() [all …]
|
D | 4x4c2-minmax-ssse3-ld128.c | 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() 130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128() [all …]
|
D | 4x4c2-minmax-sse2-ld64.c | 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64() [all …]
|
D | 4x4c2-minmax-sse41-ld128.c | 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 112 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() 130 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() [all …]
|
D | 4x4c2-minmax-sse41-ld64.c | 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 110 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 126 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() 128 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() [all …]
|
D | 4x4c2-minmax-xop-ld128.c | 111 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 113 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 115 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 117 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 120 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 122 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 124 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 126 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 133 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() 135 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() [all …]
|
D | 4x4c2-minmax-xop-ld64.c | 109 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 111 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 113 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 115 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 120 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 122 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 124 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 126 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 131 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() 133 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() [all …]
|
/external/XNNPACK/src/qu8-gemm/ |
D | 4x4c2-minmax-sse2.c | 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 101 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 103 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 105 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 107 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 113 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() 115 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x4c2-xw-minmax-sse2.c | 86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2() [all …]
|
D | 4x4c2-minmax-sse2-ld128.c | 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() 113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128() [all …]
|
D | 4x4c2-minmax-sse2-ld64.c | 87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64() [all …]
|
D | 4x4c2-minmax-ssse3-ld64.c | 87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64() [all …]
|
D | 4x4c2-xw-minmax-ssse3.c | 86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3() [all …]
|
D | 4x4c2-minmax-ssse3-ld128.c | 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() 113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128() [all …]
|
D | 4x4c2-minmax-xop-ld128.c | 94 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 96 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 98 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 100 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 116 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() 118 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() [all …]
|
D | 4x4c2-minmax-xop-ld64.c | 92 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 94 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 96 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 98 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 114 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() 116 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() [all …]
|
D | 4x4c2-xw-minmax-xop.c | 91 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 93 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 95 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 97 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 101 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 103 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 105 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 107 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 111 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() 113 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() [all …]
|
D | 4x4c2-xw-minmax-sse41.c | 86 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 88 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 90 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 92 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 96 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() 108 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() [all …]
|
D | 4x4c2-minmax-sse41-ld128.c | 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 95 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() 113 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() [all …]
|
D | 4x4c2-minmax-sse41-ld64.c | 87 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 89 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 91 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 93 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 109 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() 111 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() [all …]
|
/external/XNNPACK/src/qu8-igemm/ |
D | 4x4c2-minmax-sse2.c | 104 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 105 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 106 …vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 107 …vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 111 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 112 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 113 …vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 114 …vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 118 …vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() 119 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2() [all …]
|
/external/python/cpython3/Modules/_blake2/impl/ |
D | blake2s-load-sse41.h | 33 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); 36 t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ 39 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); 45 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); 51 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); 57 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); 69 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); 75 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); 81 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); 87 buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); [all …]
|