
Searched refs:vxb1 (Results 1 – 25 of 746) sorted by relevance
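All 25 hits on this page are definitions and uses of vxb1 inside XNNPACK's generated 4x4c2 GEMM/IGEMM microkernels (qs8, qc8 and qu8 variants). In each kernel, vxb1 holds the second 8-element group of filter weights, widened to int16 (with the weight zero point subtracted in the qu8 kernels), and is multiplied against a broadcast 32-bit lane of the activations before being accumulated into int32 accumulators. The following is a minimal illustrative sketch of that pattern for the WAsm SIMD dot16x2 variant; it is not XNNPACK source, and the buffer names a0 and w are placeholders.

#include <stdint.h>
#include <wasm_simd128.h>

// Sketch only: one K-block of the 4x4c2 pattern for a single output row.
static v128_t accumulate_vxb1(const int8_t* a0, const void* w, v128_t vacc0x0123) {
  // Sign-extend 8 int8 activations and the second group of 8 int8 weights to int16.
  const v128_t vxa0 = wasm_i16x8_load8x8(a0);
  const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8);

  // Broadcast 32-bit lane 1 of the activations (the second pair of int16 values)
  // and accumulate the pairwise int16 dot products into four int32 lanes.
  return wasm_i32x4_add(vacc0x0123,
      wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1));
}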


/external/XNNPACK/src/qs8-gemm/gen/
4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c
90 const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() local
93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
147 const v128_t vxb1 = wasm_v128_load(w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() local
151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
90 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
147 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
4x4c2-minmax-fp32-xop-ld128.c
92 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
104 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
106 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
108 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
110 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
165 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
169 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
171 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
173 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
175 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
4x4c2-xw-minmax-fp32-xop.c
100 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() local
103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
161 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() local
165 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
167 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
169 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
171 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
4x4c2-xw-minmax-fp32-sse41.c
95 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() local
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
156 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() local
160 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
162 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
164 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
166 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
82 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
149 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
159 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
4x4c2-minmax-fp32-xop-ld64.c
102 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
105 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
107 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
109 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
111 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
167 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
171 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
173 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
175 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
177 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
4x4c2-xw-minmax-fp32-avx.c
95 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() local
98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
156 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() local
160 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
162 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
164 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
166 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
4x4c2-minmax-fp32-sse41-ld64.c
97 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local
100 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
102 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
104 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
106 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
162 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local
166 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
168 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
170 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
172 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
/external/XNNPACK/src/qs8-igemm/gen/
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
98 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
165 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
175 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
106 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
109 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
111 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
113 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
115 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
163 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
167 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
4x4c2-minmax-fp32-sse41-ld64.c
113 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local
116 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
118 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
120 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
122 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
179 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local
182 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
184 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
186 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
188 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
/external/XNNPACK/src/qc8-igemm/gen/
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
98 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
165 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
175 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
106 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
109 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
111 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
113 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
115 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
163 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
167 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
/external/XNNPACK/src/qc8-gemm/gen/
4x4c2-minmax-fp32-xop-ld64.c
102 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
105 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
107 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
109 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
111 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
167 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
171 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
173 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
175 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
177 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
4x4c2-minmax-fp32-xop-ld128.c
92 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
104 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
106 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
108 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
110 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
165 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
169 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
171 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
173 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
175 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
82 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
149 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
159 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
90 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
147 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
/external/XNNPACK/src/qu8-gemm/gen/
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
83 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
101 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
150 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local
154 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
156 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
158 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
160 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
91 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8((const uint8_t*) w + 8), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
148 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
152 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
154 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
156 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
158 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
4x4c2-minmax-fp32-avx-ld64.c
98 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() local
101 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
103 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
105 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
107 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
163 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() local
167 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
169 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
171 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
173 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
/external/XNNPACK/src/qu8-igemm/gen/
4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
107 … const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8((const uint8_t*) w + 8), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
164 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local
168 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
170 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
172 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
174 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
4x4c2-minmax-fp32-xop-ld64.c
119 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
122 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
124 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
126 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
128 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
185 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local
188 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
190 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
192 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
194 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
4x4c2-minmax-fp32-avx-ld128.c
105 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() local
117 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
119 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
121 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
123 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
179 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() local
182 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
184 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
186 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
188 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
4x4c2-minmax-fp32-xop-ld128.c
110 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
122 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
124 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
126 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
128 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
184 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local
187 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
189 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
191 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
193 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()