/external/XNNPACK/src/qs8-igemm/ |
D | MRx8c8-avx2.c.in | 161 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); local 165 _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi)); 169 _mm_storel_epi64((__m128i*) c1, vout_hi); 181 *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2); 185 _mm_storeu_si32(c1, vout_hi); 192 vout_hi = _mm_srli_epi64(vout_hi, 32); 196 *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4); 200 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); 207 vout_hi = _mm_srli_epi32(vout_hi, 16); 211 *c3 = (uint8_t) _mm_extract_epi8(vout_hi, 8); [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8c8-xw-minmax-avx2.c | 155 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() local 159 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() 171 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() 177 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() 181 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() 187 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() 191 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
|
D | 2x8c8-minmax-avx2.c | 159 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() local 163 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() 175 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() 181 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() 185 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() 191 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() 195 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
|
D | 3x8c8-minmax-avx2.c | 193 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() local 197 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() 212 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() 220 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() 224 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() 232 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() 236 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
|
D | 3x8c8-xw-minmax-avx2.c | 189 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() local 193 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() 208 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() 216 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() 220 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() 228 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() 232 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
|
D | 1x8c8-xw-minmax-avx2.c | 124 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2() local 141 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2() 149 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
|
D | 1x8c8-minmax-avx2.c | 128 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2() local 145 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2() 153 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
|
/external/XNNPACK/src/qs8-gemm/ |
D | MRx8c8-avx2.c.in | 162 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); 167 _mm_storel_epi64((__m128i*) c1, vout_hi); 171 _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi)); 184 _mm_storeu_si32(c1, vout_hi); 188 *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2); 194 vout_hi = _mm_srli_epi64(vout_hi, 32); 199 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); 203 *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4); 209 vout_hi = _mm_srli_epi32(vout_hi, 16); 214 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c8-minmax-avx2.c | 174 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() local 177 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() 188 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() 195 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() 198 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() 205 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() 208 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
|
D | 3x8c8-minmax-avx2.c | 210 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() local 214 _mm_storel_epi64((__m128i*) c1, vout_hi); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() 227 _mm_storeu_si32(c1, vout_hi); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() 235 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() 239 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() 247 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() 251 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
|
D | 1x8c8-minmax-avx2.c | 141 __m128i vout_hi = _mm256_extracti128_si256(vout, 1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2() local 158 vout_hi = _mm_srli_epi64(vout_hi, 32); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2() 166 vout_hi = _mm_srli_epi32(vout_hi, 16); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
|
/external/XNNPACK/src/qu8-dwconv/ |
D | up8x9-minmax-sse2.c | 213 …const __m128i vout_hi = _mm_sub_epi32(_mm_sra_epi32(vq31prod_hi0123, vshift), _mm_cmpgt_epi32(vrem… in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2() local 216 __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point); in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2() 363 …const __m128i vout_hi = _mm_sub_epi32(_mm_sra_epi32(vq31prod_hi0123, vshift), _mm_cmpgt_epi32(vrem… in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2() local 366 __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point); in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2()
|