Home
last modified time | relevance | path

Searched refs: vacc3x89AB (Results 1 – 25 of 25) sorted by relevance

/external/XNNPACK/src/qs8-gemm/gen/
D4x16-minmax-neon-mlal-lane.c75 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
153 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
175 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
198 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
220 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
242 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
264 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
298 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c75 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
111 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
139 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
167 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
195 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
223 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
251 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
279 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
307 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
343 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16c4-minmax-neondot.c78 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local
116 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
132 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
166 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
188 const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
205 vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
222 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
234 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
248 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
D4x16c2-minmax-neon-mull-padal-dup.c76 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
221 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
222 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
223 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
224 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
277 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
316 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
355 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
376 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
395 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D4x16c2-minmax-neon-mlal-padal-dup.c76 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
146 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
198 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
250 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
302 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
459 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
460 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
461 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
462 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
515 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D6x16c4-minmax-neondot.c90 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local
138 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
162 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
206 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
236 const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
261 vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
286 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
306 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
326 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
D8x16c4-minmax-neondot.c102 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local
160 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
192 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
246 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
284 const int32x4_t vproduct3x89AB = vqrdmulhq_n_s32(vacc3x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
317 vacc3x89AB = vsraq_n_s32(vproduct3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
350 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
378 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
404 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
D4x16c8-minmax-avx512skx.c87 __m512i vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx() local
119 vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx()
138 …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vac… in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx()
D4x16c8-minmax-neon-mull-padal.c331 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
438 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
463 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
482 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
499 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
511 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
524 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
D4x16c16-minmax-neon-mlal-padal.c395 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
502 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
527 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
546 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
563 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
575 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
588 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
D4x16c8-minmax-neon-mlal-padal.c570 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
677 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
702 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
721 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
738 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
750 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
763 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
/external/XNNPACK/src/qs8-igemm/gen/
D4x16-minmax-neon-mull-addw-dup.c72 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
128 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
156 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
184 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
212 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
240 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
268 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
296 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
324 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
360 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16-minmax-neon-mlal-lane.c72 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
170 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
192 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
215 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
237 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
259 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
281 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
315 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16c4-minmax-neondot.c73 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local
131 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
147 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
181 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
202 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
221 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
238 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
250 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
264 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
D4x16c2-minmax-neon-mull-padal-dup.c73 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
238 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
239 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
240 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
241 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
294 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
333 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
372 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
396 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
415 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D6x16c4-minmax-neondot.c81 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local
157 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
181 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
225 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
254 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
281 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
306 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
326 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
346 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
D4x16c2-minmax-neon-mlal-padal-dup.c73 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
163 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
215 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
267 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
319 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
476 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
477 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
478 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
479 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
532 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D8x16c4-minmax-neondot.c89 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local
183 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
215 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
269 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
306 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
341 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
374 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
402 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
428 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
D4x16c8-minmax-avx512skx.c82 __m512i vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx() local
134 vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx()
155 …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vac… in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx()
D4x16c8-minmax-neon-mull-padal.c351 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
458 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
483 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
502 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
519 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
531 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
544 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
D4x16c16-minmax-neon-mlal-padal.c415 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
522 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
547 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
566 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
583 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
595 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
608 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
D4x16c8-minmax-neon-mlal-padal.c590 int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
697 int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
722 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
741 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
758 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
770 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
783 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3x… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
/external/XNNPACK/src/f32-prelu/gen/
Dwasmsimd-minmax-4x16.c115 v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16() local
134 vacc3x89AB = wasm_f32x4_add(vacc3x89AB, wasm_f32x4_mul(vi3x89AB, vw89AB)); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16()
154 wasm_v128_store(o3 + 8, vacc3x89AB); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16()
Dwasmsimd-bitselect-4x16.c115 v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16() local
134 vacc3x89AB = wasm_v128_bitselect(vacc3x89AB, vi3x89AB, vmask3x89AB); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16()
154 wasm_v128_store(o3 + 8, vacc3x89AB); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16()
Dneon-4x16.c109 float32x4_t vacc3x89AB = vmulq_f32(vi3x89AB, vw89AB); in xnn_f32_prelu_ukernel__neon_4x16() local
128 vacc3x89AB = vbslq_f32(vm3x89AB, vacc3x89AB, vi3x89AB); in xnn_f32_prelu_ukernel__neon_4x16()
145 vst1q_f32(o3, vacc3x89AB); o3 += 4; in xnn_f32_prelu_ukernel__neon_4x16()