Home
last modified time | relevance | path

Searched refs:vout2x01234567 (Results 1 – 25 of 28) sorted by relevance

12

/external/XNNPACK/src/qs8-igemm/gen/
D3x8c2-minmax-neon-mull-padal-dup.c238 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup() local
245 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup() local
250 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
253 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
257 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
270 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
273 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
277 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
280 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
284 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
D3x8c8-minmax-neon-mull-padal.c261 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local
268 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local
273 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
276 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
280 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
293 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
296 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
300 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
303 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
307 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
D3x8-minmax-neon-mull-addw-dup.c309 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup() local
316 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup() local
321 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
324 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
328 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
341 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
344 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
348 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
351 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
355 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup()
D3x8-minmax-neon-mlal-lane.c286 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() local
293 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() local
298 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
301 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
305 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
318 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
321 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
325 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
328 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
332 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane()
D3x8c16-minmax-neon-mlal-padal.c285 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
292 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
297 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
300 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
304 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
317 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
320 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
324 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
327 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
331 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
D3x8c8-minmax-neon-mlal-padal.c362 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
369 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
374 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
377 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
381 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
394 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
397 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
401 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
404 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
408 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
D3x8c2-minmax-neon-mlal-padal-dup.c338 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local
345 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local
350 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
353 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
357 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
370 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
373 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
377 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
380 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
384 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
D3x16c2-minmax-neon-mull-padal-dup.c402 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
404 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
407 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
411 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
414 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
418 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
421 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
425 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
D3x16c8-minmax-neon-mull-padal.c461 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
463 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
466 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
470 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
473 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
477 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
480 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
484 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
D3x16-minmax-neon-mull-addw-dup.c533 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
535 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
538 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
542 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
545 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
549 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
552 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
556 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
D3x16-minmax-neon-mlal-lane.c480 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local
482 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
485 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
489 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
492 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
496 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
499 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
503 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
D3x16c16-minmax-neon-mlal-padal.c509 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
511 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
514 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
518 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
521 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
525 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
528 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
532 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
/external/XNNPACK/src/qs8-gemm/gen/
D3x8c2-minmax-neon-mull-padal-dup.c220 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup() local
227 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup() local
233 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
236 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
241 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
256 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
258 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
263 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
265 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
270 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup()
D3x8c8-minmax-neon-mull-padal.c243 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local
250 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local
256 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
259 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
264 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
279 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
281 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
286 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
288 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
293 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
D3x8c16-minmax-neon-mlal-padal.c267 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
274 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
280 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
283 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
288 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
303 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
305 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
310 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
312 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
317 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
D3x8-minmax-neon-mull-addw-dup.c291 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup() local
298 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup() local
304 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
307 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
312 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
327 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
329 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
334 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
336 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
341 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup()
D3x8-minmax-neon-mlal-lane.c269 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() local
276 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() local
282 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
285 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
290 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
305 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
307 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
312 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
314 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
319 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane()
D3x8c8-minmax-neon-mlal-padal.c344 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
351 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
357 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
360 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
365 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
380 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
382 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
387 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
389 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
394 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
D3x8c2-minmax-neon-mlal-padal-dup.c320 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local
327 int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local
333 vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min)); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
336 vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max)); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
341 vst1_s8(c2 + 0, vout2x01234567); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
356 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
358 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
363 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
365 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
370 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
D3x16c2-minmax-neon-mull-padal-dup.c386 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
390 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
392 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
397 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
399 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
404 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
406 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
411 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
D3x16c8-minmax-neon-mull-padal.c445 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local
449 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
451 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
456 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
458 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
463 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
465 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
470 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
D3x16-minmax-neon-mull-addw-dup.c517 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
521 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
523 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
528 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
530 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
535 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
537 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
542 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
D3x16c16-minmax-neon-mlal-padal.c493 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
497 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
499 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
504 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
506 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
511 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
513 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
518 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
D3x16-minmax-neon-mlal-lane.c465 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local
469 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
471 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
476 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
478 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
483 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
485 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
490 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
D3x16c8-minmax-neon-mlal-padal.c634 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
638 vst1_s8(c2, vout2x01234567); c2 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
640 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
645 … vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
647 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
652 … vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
654 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
659 vst1_lane_s8(c2, vout2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()

12