/external/XNNPACK/src/qs8-igemm/gen/
D | 3x8c2-minmax-neon-mull-padal-dup.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup():
    238  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    245  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    250  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    253  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    257  vst1_s8(c2 + 0, vout2x01234567);
    270  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    273  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    277  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    280  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    284  vst1_lane_s8(c2, vout2x01234567, 0);
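All of the 3x8 listings in this section share the same output stage: the row's int16 accumulator is saturating-narrowed to int8 with vqmovn_s16 and clamped to the requantization bounds. A minimal self-contained sketch of that sequence, with names mirroring the generated kernels (the surrounding matrix loop and parameter loads are omitted, and the helper name is ours):

    #include <arm_neon.h>

    // Narrow one row of int16 accumulators to clamped int8 output.
    // voutput_min/voutput_max hold the kernel's activation bounds.
    static int8x8_t quantize_row(int16x8_t vacc2x01234567,
                                 int8x16_t voutput_min,
                                 int8x16_t voutput_max) {
      int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  // saturating 16-bit -> 8-bit narrow
      vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
      vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
      return vout2x01234567;
    }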
|
D | 3x8c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal():
    261  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    268  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    273  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    276  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    280  vst1_s8(c2 + 0, vout2x01234567);
    293  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    296  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    300  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    303  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    307  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8-minmax-neon-mull-addw-dup.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup():
    309  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    316  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    321  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    324  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    328  vst1_s8(c2 + 0, vout2x01234567);
    341  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    344  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    348  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    351  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    355  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8-minmax-neon-mlal-lane.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane():
    286  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    293  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    298  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    301  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    305  vst1_s8(c2 + 0, vout2x01234567);
    318  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    321  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    325  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    328  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    332  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal():
    285  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    292  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    297  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    300  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    304  vst1_s8(c2 + 0, vout2x01234567);
    317  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    320  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    324  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    327  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    331  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    362  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    369  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    374  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    377  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    381  vst1_s8(c2 + 0, vout2x01234567);
    394  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    397  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    401  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    404  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    408  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    338  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    345  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    350  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    353  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    357  vst1_s8(c2 + 0, vout2x01234567);
    370  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    373  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    377  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    380  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    384  vst1_lane_s8(c2, vout2x01234567, 0);
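The trailing matches in each listing above (the vst1_lane_* and vext_s8 pairs) implement the remainder store for fewer than 8 output channels: 4-, 2-, and 1-byte pieces are written through reinterpreted lane stores, and vext_s8 rotates the consumed bytes out. A sketch under the assumption that nc holds the remaining channel count and c2 the row pointer (the helper name is ours):

    #include <arm_neon.h>
    #include <stddef.h>

    // Store the final nc (< 8) bytes of a row without writing past the end.
    // __builtin_assume_aligned(c2, 1) tells the compiler the pointer may be
    // unaligned, matching the generated kernels.
    static void store_tail(int8_t* c2, size_t nc, int8x8_t vout2x01234567) {
      if (nc & 4) {
        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);  // rotate consumed bytes out
      }
      if (nc & 2) {
        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c2, vout2x01234567, 0);
      }
    }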
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup():
    402  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    404  vst1_s8(c2, vout2x01234567); c2 += 8;
    407  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    411  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    414  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    418  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    421  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    425  vst1_lane_s8(c2, vout2x01234567, 0);
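The 3x16 variants differ only in how the 8-byte view is obtained: the clamped result lives in an int8x16_t, whose low half is stored first and replaced by the high half once 8 channels have been written. A sketch of that store path, reusing the store_tail helper sketched above (nc and both helper names are assumptions):

    #include <arm_neon.h>
    #include <stddef.h>

    static void store_tail(int8_t* c2, size_t nc, int8x8_t vout2x01234567);  // see sketch above

    // Store up to 16 output channels of one row.
    static void store_row16(int8_t* c2, size_t nc, int8x16_t vout2x0123456789ABCDEF) {
      if (nc >= 16) {
        vst1q_s8(c2, vout2x0123456789ABCDEF);  // whole row in one store
        return;
      }
      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
      if (nc & 8) {
        vst1_s8(c2, vout2x01234567); c2 += 8;
        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);  // continue with the high half
      }
      store_tail(c2, nc, vout2x01234567);  // same 4/2/1-byte tail as the 3x8 kernels
    }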
|
D | 3x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal():
    461  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    463  vst1_s8(c2, vout2x01234567); c2 += 8;
    466  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    470  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    473  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    477  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    480  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    484  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16-minmax-neon-mull-addw-dup.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup():
    533  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    535  vst1_s8(c2, vout2x01234567); c2 += 8;
    538  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    542  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    545  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    549  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    552  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    556  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16-minmax-neon-mlal-lane.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane():
    480  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    482  vst1_s8(c2, vout2x01234567); c2 += 8;
    485  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    489  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    492  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    496  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    499  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    503  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal():
    509  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    511  vst1_s8(c2, vout2x01234567); c2 += 8;
    514  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    518  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    521  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    525  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    528  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    532  vst1_lane_s8(c2, vout2x01234567, 0);
|
/external/XNNPACK/src/qs8-gemm/gen/
D | 3x8c2-minmax-neon-mull-padal-dup.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup():
    220  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    227  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    233  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    236  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    241  vst1_s8(c2 + 0, vout2x01234567);
    256  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    258  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    263  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    265  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    270  vst1_lane_s8(c2, vout2x01234567, 0);
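For context on where the int16x8_t vacc2x01234567 consumed by vqmovn_s16 comes from: in these QS8 kernels each row is accumulated in int32, requantized with a fixed-point multiply and rounding shift, narrowed, and biased by the output zero point. A simplified sketch (the generated kernels add one more rounding-correction step, and all parameter names here are assumptions):

    #include <arm_neon.h>

    // Requantize two int32x4_t column accumulators of one row down to the
    // int16x8_t that feeds vqmovn_s16 in the listings.
    static int16x8_t requantize_row(int32x4_t vacc2x0123, int32x4_t vacc2x4567,
                                    int32x4_t vmultiplier,        // Q31 fixed-point scale
                                    int32x4_t vright_shift,       // negated shift amounts
                                    int16x8_t voutput_zero_point) {
      vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);  // saturating rounding doubling multiply
      vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
      vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);    // rounding arithmetic right shift
      vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
      const int16x8_t vacc2x01234567 =
          vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
      return vqaddq_s16(vacc2x01234567, voutput_zero_point);  // add output zero point, saturating
    }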
|
D | 3x8c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal():
    243  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    250  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    256  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    259  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    264  vst1_s8(c2 + 0, vout2x01234567);
    279  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    281  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    286  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    288  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    293  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal():
    267  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    274  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    280  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    283  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    288  vst1_s8(c2 + 0, vout2x01234567);
    303  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    305  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    310  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    312  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    317  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8-minmax-neon-mull-addw-dup.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup():
    291  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    298  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    304  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    307  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    312  vst1_s8(c2 + 0, vout2x01234567);
    327  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    329  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    334  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    336  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    341  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8-minmax-neon-mlal-lane.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane():
    269  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    276  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    282  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    285  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    290  vst1_s8(c2 + 0, vout2x01234567);
    305  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    307  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    312  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    314  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    319  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    344  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    351  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    357  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    360  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    365  vst1_s8(c2 + 0, vout2x01234567);
    380  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    382  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    387  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    389  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    394  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    320  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    327  int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);  [local]
    333  vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
    336  vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
    341  vst1_s8(c2 + 0, vout2x01234567);
    356  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    358  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    363  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    365  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    370  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup():
    386  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    390  vst1_s8(c2, vout2x01234567); c2 += 8;
    392  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    397  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    399  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    404  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    406  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    411  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal():
    445  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    449  vst1_s8(c2, vout2x01234567); c2 += 8;
    451  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    456  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    458  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    463  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    465  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    470  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16-minmax-neon-mull-addw-dup.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup():
    517  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    521  vst1_s8(c2, vout2x01234567); c2 += 8;
    523  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    528  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    530  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    535  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    537  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    542  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal():
    493  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    497  vst1_s8(c2, vout2x01234567); c2 += 8;
    499  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    504  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    506  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    511  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    513  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    518  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16-minmax-neon-mlal-lane.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane():
    465  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    469  vst1_s8(c2, vout2x01234567); c2 += 8;
    471  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    476  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    478  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    483  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    485  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    490  vst1_lane_s8(c2, vout2x01234567, 0);
|
D | 3x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
    634  int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);  [local]
    638  vst1_s8(c2, vout2x01234567); c2 += 8;
    640  vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
    645  vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
    647  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
    652  vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
    654  vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
    659  vst1_lane_s8(c2, vout2x01234567, 0);
|