/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c | 90 const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() local 93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 147 const v128_t vxb1 = wasm_v128_load(w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() local 151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2() 157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 90 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 147 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
D | 4x4c2-minmax-fp32-xop-ld128.c | 92 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 104 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 106 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 108 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 110 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 165 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 169 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 171 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 173 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 175 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
|
D | 4x4c2-xw-minmax-fp32-xop.c | 100 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() local 103 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 105 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 107 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 109 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 161 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() local 165 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 167 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 169 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop() 171 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop()
|
D | 4x4c2-xw-minmax-fp32-sse41.c | 95 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() local 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 156 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() local 160 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 162 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 164 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41() 166 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c | 82 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 149 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 159 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
|
D | 4x4c2-minmax-fp32-xop-ld64.c | 102 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 105 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 107 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 109 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 111 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 167 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 171 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 173 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 175 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 177 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
|
D | 4x4c2-xw-minmax-fp32-avx.c | 95 const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() local 98 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 156 const __m128i vxb1 = _mm_load_si128((const __m128i*) w); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() local 160 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 162 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 164 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx() 166 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx()
|
D | 4x4c2-minmax-fp32-sse41-ld64.c | 97 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local 100 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 102 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 104 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 106 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 162 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local 166 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 168 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 170 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 172 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c | 98 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 165 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 175 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 106 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 109 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 111 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 113 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 115 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 163 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 167 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
D | 4x4c2-minmax-fp32-sse41-ld64.c | 113 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local 116 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 118 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 120 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 122 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 179 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() local 182 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 184 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 186 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64() 188 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64()
|
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c | 98 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 165 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 175 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 106 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 109 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 111 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 113 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 115 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 163 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 167 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 169 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 171 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 173 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 4x4c2-minmax-fp32-xop-ld64.c | 102 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 105 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 107 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 109 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 111 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 167 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 171 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 173 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 175 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 177 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
|
D | 4x4c2-minmax-fp32-xop-ld128.c | 92 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 104 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 106 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 108 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 110 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 165 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 169 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 171 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 173 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 175 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c | 82 const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 149 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 159 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 90 const v128_t vxb1 = wasm_i16x8_load8x8((const int8_t*) w + 8); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 93 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 147 const v128_t vxb1 = wasm_i16x8_load8x8(w); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 151 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 153 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 155 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 157 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c | 83 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 95 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 97 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 99 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 101 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 150 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() local 154 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 156 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 158 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128() 160 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128()
|
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 91 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8((const uint8_t*) w + 8), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 94 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 96 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 98 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 100 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 148 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 152 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 154 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 156 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 158 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
D | 4x4c2-minmax-fp32-avx-ld64.c | 98 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() local 101 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 103 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 105 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 107 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 163 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() local 167 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 169 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 171 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64() 173 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64()
|
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c | 107 … const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8((const uint8_t*) w + 8), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 110 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 112 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 114 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 116 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 164 const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_load8x8(w), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() local 168 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa0, vxa0, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 170 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa1, vxa1, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 172 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa2, vxa2, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64() 174 wasm_i32x4_dot_i16x8(wasm_v32x4_shuffle(vxa3, vxa3, 1, 1, 1, 1), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64()
|
D | 4x4c2-minmax-fp32-xop-ld64.c | 119 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 122 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 124 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 126 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 128 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 185 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() local 188 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 190 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 192 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64() 194 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64()
|
D | 4x4c2-minmax-fp32-avx-ld128.c | 105 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() local 117 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 119 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 121 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 123 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 179 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() local 182 _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 184 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 186 _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128() 188 _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128()
|
D | 4x4c2-minmax-fp32-xop-ld128.c | 110 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 122 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 124 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 126 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 128 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 184 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() local 187 _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 189 _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 191 _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128() 193 _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123); in xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128()
|