/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16-minmax-neon-mlal-lane.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 199 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 221 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 243 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 265 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 299 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 112 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 140 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 168 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 196 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 224 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 252 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 280 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 308 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 344 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16c4-minmax-neondot.c | 79 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 117 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 133 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 167 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 189 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 206 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 223 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 234 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 248 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 229 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 230 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 231 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 232 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 279 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 318 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 357 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 377 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 396 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local 159 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 211 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 263 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 315 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 467 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 468 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 469 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 470 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 517 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 6x16c4-minmax-neondot.c | 91 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local 139 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 163 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 207 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 237 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 262 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 287 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 306 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 326 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 103 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local 161 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 193 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 247 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 285 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 318 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 351 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 378 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 404 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
|
D | 4x16c8-minmax-avx512skx.c | 88 __m512i vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx() local 125 vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx() 138 …512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vac… in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx()
|
D | 4x16c8-minmax-neon-mull-padal.c | 332 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 445 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 464 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 483 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 500 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 511 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 524 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 396 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 509 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 528 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 547 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 564 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 575 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 588 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 571 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 684 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 703 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 722 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 739 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 750 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 763 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16-minmax-neon-mull-addw-dup.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 129 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 157 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 185 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 213 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 241 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 269 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 297 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 325 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 361 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 216 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 238 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 260 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 282 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 316 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16c4-minmax-neondot.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 132 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 148 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 182 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 203 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 222 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 239 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 250 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 264 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 246 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 247 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 248 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 249 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 296 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 335 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 374 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 397 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 416 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|
D | 6x16c4-minmax-neondot.c | 82 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local 158 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 182 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 226 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 255 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 282 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 307 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 326 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 346 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local 176 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 228 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 280 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 332 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 484 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 485 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 486 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 487 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 534 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 8x16c4-minmax-neondot.c | 90 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local 184 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 216 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 270 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 307 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 342 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 375 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 402 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 428 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
|
D | 4x16c8-minmax-avx512skx.c | 83 __m512i vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx() local 140 vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx() 155 …512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vac… in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx()
|
D | 4x16c8-minmax-neon-mull-padal.c | 352 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 465 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 484 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 503 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 520 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 531 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 544 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 416 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 529 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 548 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 567 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 584 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 595 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 608 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 591 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 704 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 723 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 742 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 759 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 770 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 783 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/f32-prelu/gen/ |
D | wasmsimd-minmax-4x16.c | 117 v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16() local 135 vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, wasm_f32x4_mul(vi3xCDEF, vwCDEF)); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16() 155 wasm_v128_store(o3 + 12, vacc3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16()
|
D | wasmsimd-bitselect-4x16.c | 117 v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16() local 135 vacc3xCDEF = wasm_v128_bitselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16() 155 wasm_v128_store(o3 + 12, vacc3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16()
|
D | neon-4x16.c | 111 float32x4_t vacc3xCDEF = vmulq_f32(vi3xCDEF, vwCDEF); in xnn_f32_prelu_ukernel__neon_4x16() local 129 vacc3xCDEF = vbslq_f32(vm3xCDEF, vacc3xCDEF, vi3xCDEF); in xnn_f32_prelu_ukernel__neon_4x16() 146 vst1q_f32(o3, vacc3xCDEF); o3 += 4; in xnn_f32_prelu_ukernel__neon_4x16()
|