/external/XNNPACK/src/qs8-gemm/gen/
D | 4x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane():
     75  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    109  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    131  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
    153  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
    175  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
    198  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
    220  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
    242  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
    264  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
    298  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup():
     75  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    111  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
    139  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
    167  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
    195  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
    223  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
    251  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
    279  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
    307  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
    343  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
    [all …]
|
D | 4x16c4-minmax-neondot.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot():
     78  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    116  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    132  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    166  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    188  const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier);
    205  vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    222  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    234  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    248  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup():
     76  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    221  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    222  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    223  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    224  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    277  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    316  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    355  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    376  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    395  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    [all …]
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup():
     76  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    146  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    198  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    250  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    302  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    459  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    460  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    461  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    462  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    515  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    [all …]
|
D | 6x16c4-minmax-neondot.c | matches in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot():
     90  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    138  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    162  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    206  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    236  const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier);
    261  vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    286  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    306  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    326  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 8x16c4-minmax-neondot.c | matches in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot():
    102  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    160  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    192  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    246  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    284  const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier);
    317  vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    350  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    378  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    404  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c8-minmax-avx512skx.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx():
     87  __m512i vacc3x89AB = vacc0x89AB;  (local)
    119  vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
    138  …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vac…
|
D | 4x16c8-minmax-neon-mull-padal.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal():
    331  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    438  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    463  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    482  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    499  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    511  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    524  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c16-minmax-neon-mlal-padal.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal():
    395  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    502  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    527  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    546  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    563  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    575  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    588  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c8-minmax-neon-mlal-padal.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    570  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    677  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    702  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    721  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    738  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    750  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    763  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
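The mlal-lane matches above all repeat one widening multiply-accumulate step: four int16 weights are scaled by a single broadcast activation lane and the products are widened and added into four int32 accumulator lanes. Below is a minimal sketch of that step, assuming plain NEON (arm_neon.h); mlal_lane_step is an illustrative name, not an XNNPACK symbol.

#include <arm_neon.h>

// One accumulation step from the mlal-lane kernels: multiply the low four
// int16 weights of vxb by lane 0 of the low half of vxa, widening each
// product to int32 and accumulating into vacc.
static int32x4_t mlal_lane_step(int32x4_t vacc, int16x8_t vxb, int16x8_t vxa)
{
  return vmlal_lane_s16(vacc, vget_low_s16(vxb), vget_low_s16(vxa), 0);
}

The generated kernels unroll this step across the c0..c7 weight columns and across lanes 0..3 of both vget_low_s16(vxa3) and vget_high_s16(vxa3), which is why the same line recurs with only the vxb suffix and lane index changing.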
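Every QS8 kernel in this listing ends with the same fixed-point requantization of the int32 accumulators back toward int8: a Q31 saturating rounding doubling multiply (vqrdmulhq), a sign-dependent fixup so the following shift rounds ties away from zero (vsraq_n_s32 with vbicq_s32), a rounding arithmetic right shift (vrshlq_s32), and a saturating narrow with the output zero point added (vqmovn_s32/vqaddq_s16). The sketch below is a self-contained, hedged reconstruction of that sequence for one pair of accumulators; requantize_q31 and its parameter names are illustrative, not library API.

#include <arm_neon.h>
#include <stdint.h>

// Minimal sketch of the requantization tail shared by the matched kernels.
// shift > 0 requests an arithmetic right shift by that many bits.
static int8x8_t requantize_q31(int32x4_t vacc_lo, int32x4_t vacc_hi,
                               int32_t multiplier, int32_t shift,
                               int16_t output_zero_point)
{
  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
  const int32x4_t vright_shift = vdupq_n_s32(-shift);
  const int32x4_t vzero_shift_mask =
      vreinterpretq_s32_u32(vceqq_s32(vright_shift, vdupq_n_s32(0)));
  const int16x8_t voutput_zero_point = vdupq_n_s16(output_zero_point);

  // Q31 fixed-point multiply (vqrdmulhq_s32 in the matched lines).
  vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier);
  vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier);

  // Pre-bias negative values so the rounding shift below rounds ties
  // away from zero (the vsraq_n_s32/vbicq_s32 pair in the matched lines).
  vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
  vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);

  // Rounding arithmetic right shift (vrshlq_s32 with a negative shift).
  vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
  vacc_hi = vrshlq_s32(vacc_hi, vright_shift);

  // Narrow to int16, add the output zero point, then narrow to int8.
  const int16x8_t vacc =
      vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)),
                 voutput_zero_point);
  return vqmovn_s16(vacc);
}

The AArch64-only paths in the listing fuse the first narrowing step with vqmovn_high_s32; the vcombine_s16 form above matches the portable branch shown in the matches.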
/external/XNNPACK/src/qs8-igemm/gen/
D | 4x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup():
     72  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    128  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
    156  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
    184  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
    212  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
    240  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
    268  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
    296  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
    324  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
    360  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
    [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane():
     72  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    126  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    148  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
    170  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
    192  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
    215  … vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
    237  … vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
    259  … vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
    281  … vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
    315  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    [all …]
|
D | 4x16c4-minmax-neondot.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot():
     73  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    131  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    147  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    181  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    202  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    221  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    238  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    250  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    264  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup():
     73  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    238  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    239  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    240  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    241  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    294  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    333  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    372  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    396  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    415  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    [all …]
|
D | 6x16c4-minmax-neondot.c | matches in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot():
     81  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    157  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    181  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    225  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    254  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    281  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    306  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    326  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    346  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup():
     73  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    163  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    215  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    267  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    319  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    476  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    477  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
    478  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
    479  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
    532  vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
    [all …]
|
D | 8x16c4-minmax-neondot.c | matches in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot():
     89  int32x4_t vacc3x89AB = vacc0x89AB;  (local)
    183  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    215  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
    269  vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
    306  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    341  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    374  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    402  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    428  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c8-minmax-avx512skx.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx():
     82  __m512i vacc3x89AB = vacc0x89AB;  (local)
    134  vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
    155  …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vac…
|
D | 4x16c8-minmax-neon-mull-padal.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal():
    351  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    458  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    483  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    502  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    519  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    531  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    544  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c16-minmax-neon-mlal-padal.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal():
    415  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    522  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    547  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    566  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    583  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    595  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    608  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
D | 4x16c8-minmax-neon-mlal-padal.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    590  int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);  (local)
    697  int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );  (local)
    722  vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
    741  vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
    758  vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
    770  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v…
    783  …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x…
|
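The c4 neondot kernels in both directories accumulate with vdotq_lane_s32: each call multiplies sixteen int8 weights against one 4-byte group of activations broadcast from a lane of the activation register, adding four dot products into the four int32 accumulator lanes. A minimal sketch of one k=8 step, assuming the ARMv8.2 dot-product extension is available (e.g. compile with -march=armv8.2-a+dotprod); accumulate_c4 is an illustrative name.

#include <arm_neon.h>

// One k=8 step of the c4 dot-product accumulation from the *neondot matches.
static int32x4_t accumulate_c4(int32x4_t vacc,
                               int8x16_t vb0123, int8x16_t vb4567,
                               int8x8_t va01234567)
{
  // k = 0..3: bytes 0-3 of the activation row, broadcast from lane 0.
  vacc = vdotq_lane_s32(vacc, vb0123, va01234567, 0);
  // k = 4..7: bytes 4-7 of the activation row, broadcast from lane 1.
  vacc = vdotq_lane_s32(vacc, vb4567, va01234567, 1);
  return vacc;
}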
/external/XNNPACK/src/f32-prelu/gen/
D | wasmsimd-minmax-4x16.c | matches in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16():
    115  v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero);  (local)
    134  vacc3x89AB = wasm_f32x4_add(vacc3x89AB, wasm_f32x4_mul(vi3x89AB, vw89AB));
    154  wasm_v128_store(o3 + 8, vacc3x89AB);
|
D | wasmsimd-bitselect-4x16.c | matches in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16():
    115  v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB);  (local)
    134  vacc3x89AB = wasm_v128_bitselect(vacc3x89AB, vi3x89AB, vmask3x89AB);
    154  wasm_v128_store(o3 + 8, vacc3x89AB);
|
D | neon-4x16.c | matches in xnn_f32_prelu_ukernel__neon_4x16():
    109  float32x4_t vacc3x89AB = vmulq_f32(vi3x89AB, vw89AB);  (local)
    128  vacc3x89AB = vbslq_f32(vm3x89AB, vacc3x89AB, vi3x89AB);
    145  vst1q_f32(o3, vacc3x89AB); o3 += 4;
|
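The neon-4x16.c matches show the select idiom for PReLU: multiply every input by the per-channel slope, then keep the product only where the input is negative. A minimal sketch of that pattern for one 4-lane group, assuming plain NEON; prelu_f32 is an illustrative name. The wasmsimd-bitselect variant is the same idiom with wasm_v128_bitselect in place of vbslq_f32.

#include <arm_neon.h>

// PReLU on four floats: vacc = vi * vw where vi < 0, vi otherwise.
// vbslq_f32 picks bits from its second argument where the mask is set.
static float32x4_t prelu_f32(float32x4_t vi, float32x4_t vw)
{
  float32x4_t vacc = vmulq_f32(vi, vw);
  const uint32x4_t vm = vcltq_f32(vi, vmovq_n_f32(0.0f));
  return vbslq_f32(vm, vacc, vi);
}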