/external/XNNPACK/src/qs8-gemm/gen/
D | 6x8c4-minmax-neondot.c  (all references in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot())
    218  … int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);  [local]
    229  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x012…  [local]
    236  vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
    240  vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
    248  vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
    249  vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
    274  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    275  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    278  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    285  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    [all …]
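The entries above all share the same row-pair requantization tail: two rows of 16-bit accumulators are saturating-narrowed into a single int8x16_t, clamped to the output range, and stored eight bytes per row. A minimal sketch of that pattern follows; vacc4x01234567, vacc5x01234567, c4, c5 and the clamp bounds are placeholder names, not the kernels' actual parameter plumbing.

#include <arm_neon.h>
#include <stdint.h>

/* Minimal sketch of the shared store tail: vacc4x01234567/vacc5x01234567 stand
 * in for two rows of requantized 16-bit accumulators, c4/c5 for the two output
 * row pointers. */
static void store_two_qs8_rows(int8_t* c4, int8_t* c5,
                               int16x8_t vacc4x01234567, int16x8_t vacc5x01234567,
                               int8_t output_min, int8_t output_max)
{
#if defined(__aarch64__)
  /* AArch64: narrow both rows into one register in a single step. */
  int8x16_t vout4x01234567_5x01234567 =
      vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);
#else
  /* AArch32 fallback: narrow each row to 8 lanes, then combine. */
  int8x16_t vout4x01234567_5x01234567 =
      vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x01234567));
#endif
  /* Clamp both rows at once to the requested output range. */
  const int8x16_t voutput_min = vdupq_n_s8(output_min);
  const int8x16_t voutput_max = vdupq_n_s8(output_max);
  vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
  vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
  /* Row 4 occupies the low half of the register, row 5 the high half. */
  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567));
  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567));
}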
D | 8x8c4-minmax-neondot.c  (all references in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot())
    264  … int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);  [local]
    278  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x012…  [local]
    286  vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
    291  vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
    300  vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
    301  vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
    332  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    333  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    338  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    346  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    [all …]
D | 6x16c4-minmax-neondot.c  (all references in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot())
    385  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8…  [local]
    391  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567)); c4 += 8;
    392  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
    395  …vout4x01234567_5x01234567 = vcombine_s8(vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0…
    402  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    403  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    406  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    413  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    414  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    417  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
    [all …]
D | 8x16c4-minmax-neondot.c  (all references in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot())
    479  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8…  [local]
    486  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567)); c4 += 8;
    487  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
    492  …vout4x01234567_5x01234567 = vcombine_s8(vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0…
    500  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    501  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    506  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    514  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    515  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    520  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
    [all …]
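In the 16-column kernels each row already holds a full 16-byte vector (vout4x0123456789ABCDEF, vout5x0123456789ABCDEF); the vcombine_s8 lines above pack the matching 8-byte halves of rows 4 and 5 side by side so one register covers both row stores, first for columns 0..7 and then for columns 8..15 (in the kernels the repacked high halves mainly feed the remainder path). A sketch of that repacking, again with placeholder names:

#include <arm_neon.h>
#include <stdint.h>

/* Sketch of the 16-column repacking: the low halves of two per-row vectors are
 * packed into one register for the first 8 columns, then the high halves for
 * the last 8. Names are placeholders for the kernel-local variables. */
static void store_two_qs8_rows_of_16(int8_t* c4, int8_t* c5,
                                     int8x16_t vout4x0123456789ABCDEF,
                                     int8x16_t vout5x0123456789ABCDEF)
{
  /* Columns 0..7 of row 4 (low half) and row 5 (high half) in one register. */
  int8x16_t vout4x01234567_5x01234567 = vcombine_s8(
      vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8(vout5x0123456789ABCDEF));
  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567));  c4 += 8;
  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
  /* Columns 8..15 of both rows, same layout. */
  vout4x01234567_5x01234567 = vcombine_s8(
      vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0123456789ABCDEF));
  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567));
  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567));
}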
/external/XNNPACK/src/qs8-igemm/gen/
D | 6x8c4-minmax-neondot.c  (all references in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot())
    238  … int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);  [local]
    249  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x012…  [local]
    254  vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
    258  vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
    263  vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
    264  vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
    282  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    283  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    288  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    293  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    [all …]
D | 8x8c4-minmax-neondot.c  (all references in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot())
    288  … int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);  [local]
    302  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x012…  [local]
    309  vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
    314  vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
    321  vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
    322  vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
    344  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    345  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    351  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    358  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    [all …]
D | 6x16c4-minmax-neondot.c  (all references in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot())
    397  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8…  [local]
    399  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
    400  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567)); c4 += 8;
    405  …vout4x01234567_5x01234567 = vcombine_s8(vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0…
    410  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    411  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    416  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    421  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    422  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    427  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
    [all …]
D | 8x16c4-minmax-neondot.c  (all references in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot())
    493  …int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8…  [local]
    498  vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
    499  vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567)); c4 += 8;
    505  …vout4x01234567_5x01234567 = vcombine_s8(vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0…
    512  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5…
    513  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4…
    519  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    526  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5…
    527  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4…
    533  … vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
    [all …]
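The truncated …_lane_u32 / …_lane_u16 lines and the vextq_s8 shifts above are the remainder-column path: when fewer than 8 output columns are left, both rows are written 4, 2 and 1 bytes at a time, and vextq_s8 shifts the bytes already written out of the register so the next columns return to lane 0 (row 4) and lane 8 (row 5). A sketch of that path, assuming the same packed two-row register; the (void*) casts stand in for the __builtin_assume_aligned(c, 1) idiom from the listing, which tells the compiler the row pointers may be unaligned:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of the remainder-column store: nc is the number of output columns
 * still to write (< 8 here). Row 4 is read from lane 0 of the packed register,
 * row 5 from lane 8; after each partial store, vextq_s8 shifts the consumed
 * bytes out so the next unwritten column is back in those lanes. */
static void store_qs8_tail(int8_t* c4, int8_t* c5, size_t nc,
                           int8x16_t vout4x01234567_5x01234567)
{
  if (nc & 4) {
    vst1q_lane_u32((void*) c4, vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4 += 4;
    vst1q_lane_u32((void*) c5, vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5 += 4;
    vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
  }
  if (nc & 2) {
    vst1q_lane_u16((void*) c4, vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4 += 2;
    vst1q_lane_u16((void*) c5, vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5 += 2;
    vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
  }
  if (nc & 1) {
    vst1q_lane_s8(c4, vout4x01234567_5x01234567, 0);
    vst1q_lane_s8(c5, vout4x01234567_5x01234567, 8);
  }
}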
/external/XNNPACK/src/qu8-gemm/
D | 8x8-minmax-neon.c  (all references in xnn_qu8_gemm_minmax_ukernel_8x8__neon())
    518  …uint8x16_t vout4x01234567_5x01234567 = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x0123456…  [local]
    540  …uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x…  [local]
    548  vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
    552  vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
    560  … vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567)); c4 = (uint8_t*) ((uintptr_t) c4 + cn_stride);
    561  …vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 = (uint8_t*) ((uintptr_t) c5 + cn_stride);
    581  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4…
    582  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5…
    587  … vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    595  …_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 0); c4…
    [all …]
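The QU8 kernels follow the same shape with unsigned outputs: vqmovun narrows the signed 16-bit accumulators to unsigned bytes with saturation before the clamp. A minimal sketch with the same placeholder names as before:

#include <arm_neon.h>
#include <stdint.h>

/* Sketch of the unsigned (QU8) variant: the signed 16-bit accumulators are
 * narrowed to unsigned bytes with saturation, then clamped and stored.
 * Names are placeholders for the kernel-local variables. */
static void store_two_qu8_rows(uint8_t* c4, uint8_t* c5,
                               int16x8_t vacc4x01234567, int16x8_t vacc5x01234567,
                               uint8_t output_min, uint8_t output_max)
{
#if defined(__aarch64__)
  /* AArch64: narrow both rows into one register in a single step. */
  uint8x16_t vout4x01234567_5x01234567 =
      vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
#else
  /* AArch32 fallback: narrow each row, then combine. */
  uint8x16_t vout4x01234567_5x01234567 =
      vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
#endif
  vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, vdupq_n_u8(output_min));
  vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, vdupq_n_u8(output_max));
  vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567));
  vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567));
}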
/external/XNNPACK/src/qu8-igemm/
D | 8x8-minmax-neon.c  (all references in xnn_qu8_igemm_minmax_ukernel_8x8__neon())
    563  …uint8x16_t vout4x01234567_5x01234567 = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x0123456…  [local]
    585  …uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x…  [local]
    593  vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
    597  vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
    603  vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 += cn_stride;
    604  vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567)); c4 += cn_stride;
    617  …_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5…
    618  …_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4…
    624  … vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
    631  …_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_u8(vout4x01234567_5x01234567), 4); c5…
    [all …]
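One difference visible in the IGEMM entries: the rows are stored from the highest index down (c5 before c4), and after a full 8-column store each row pointer advances by cn_stride to the next block of output columns. A small sketch of that bookkeeping, with pointer-to-pointer parameters standing in for the kernel-local row pointers:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of the IGEMM store order and pointer advance: the higher row is
 * written first, and both row pointers move ahead by cn_stride for the next
 * column block. Parameter names are placeholders, not the kernels' signatures. */
static void store_and_advance_qu8_rows(uint8_t** c4, uint8_t** c5, size_t cn_stride,
                                       uint8x16_t vout4x01234567_5x01234567)
{
  vst1_u8(*c5, vget_high_u8(vout4x01234567_5x01234567));
  *c5 += cn_stride;
  vst1_u8(*c4, vget_low_u8(vout4x01234567_5x01234567));
  *c4 += cn_stride;
}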