/external/XNNPACK/src/qs8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 88 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 162 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 192 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 222 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 252 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 286 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 316 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 346 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 376 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 422 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 88 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 162 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 192 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 222 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 252 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 284 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 314 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 344 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 374 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 420 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
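Note: the mlal-lane matches above all follow the same widen-then-multiply-accumulate pattern: the int8 inputs are widened to int16 and vmlal_lane_s16 accumulates one activation lane against four weights into the int32 accumulator. Below is a minimal sketch of that pattern for a single accumulator, not the generated kernel itself; the names accumulate_lane_mlal, a5 and b89ABCDEF are illustrative stand-ins.

#include <arm_neon.h>

// Minimal sketch (not the generated XNNPACK kernel): one mlal-lane
// accumulation step for row 5, output columns 8..11.
static inline int32x4_t accumulate_lane_mlal(
    int32x4_t vacc5x89AB,      // running int32 accumulator
    const int8_t* a5,          // 8 int8 activations of row 5
    const int8_t* b89ABCDEF)   // 8 int8 weights for columns 8..15, one k step
{
  // Widen int8 -> int16; vmlal_lane_s16 then widens to int32 as it accumulates.
  const int16x8_t vxa5        = vmovl_s8(vld1_s8(a5));
  const int16x8_t vxb89ABCDEF = vmovl_s8(vld1_s8(b89ABCDEF));
  // vacc[i] += (int32) vxb89ABCDEF[i] * (int32) vxa5[0], for i = 0..3
  return vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEF),
                        vget_low_s16(vxa5), 0);
}

The generated kernels unroll this over eight k positions, stepping the lane index through vget_low_s16(vxa5) lanes 0..3 and then vget_high_s16(vxa5) lanes 0..3, which is the progression visible in the matched lines above.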
D | 6x16c4-minmax-rndnu-neondot.c | 89 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() local 165 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 189 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 233 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 265 vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 290 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 315 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 331 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 351 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot()
|
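The *c4 neondot kernels replace that sequence with SDOT: vdotq_lane_s32 accumulates four int8 products per int32 lane in a single instruction. A minimal sketch follows, assuming a compile target with the Armv8.2 dotprod extension; the names are illustrative, not the kernel's locals.

#include <arm_neon.h>

// Minimal sketch of the SDOT accumulation used by the *c4 neondot kernels.
// Requires the Armv8.2 dotprod extension (e.g. -march=armv8.2-a+dotprod).
static inline int32x4_t accumulate_lane_dot(
    int32x4_t vacc5x89AB,    // row 5, output columns 8..11
    int8x16_t vb0123x89AB,   // weights for k = 0..3, columns 8..11 (4x4 int8)
    int8x8_t  va5x01234567)  // activations for k = 0..7 of row 5
{
  // Lane 0 selects bytes k = 0..3 of va5; each int32 lane j gains the dot
  // product of those 4 bytes with vb0123x89AB[4*j .. 4*j+3].
  return vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
}

Lane 0 of va5x01234567 covers k = 0..3 and lane 1 covers k = 4..7, which matches the vb0123/vb4567 pairs in the listing.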
D | 8x16c4-minmax-rndnu-neondot.c | 97 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() local 191 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 223 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 277 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 317 vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 350 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 383 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 407 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 433 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot()
|
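The vshlq_s32 / vqdmulhq_s32 / vrshlq_s32 matches in the two neondot listings above are the rndnu requantization applied to every accumulator. A minimal sketch of that three-step sequence for one vector; the shift counts and multiplier come from the kernel's quantization params and are assumed to be loaded already.

#include <arm_neon.h>

// Minimal sketch of the rndnu requantization visible in the matches above.
static inline int32x4_t requantize_rndnu(
    int32x4_t vacc,
    int32x4_t vright_pre_shift,   // negative shift counts => right shift
    int32x4_t vmultiplier,
    int32x4_t vright_post_shift)  // rounding right shift
{
  vacc = vshlq_s32(vacc, vright_pre_shift);    // pre-shift
  vacc = vqdmulhq_s32(vacc, vmultiplier);      // saturating doubling high multiply
  vacc = vrshlq_s32(vacc, vright_post_shift);  // rounding post-shift
  return vacc;
}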
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 96 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 142 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 172 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 202 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 232 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 264 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 294 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 324 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 354 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 400 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x16c4-minmax-rndnu-neondot.c | 274 int32x4_t vacc5x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc5x89AB, vnacc5x0123)); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() local 303 vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 328 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 353 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 369 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 389 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot()
|
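In the qu8 neondot kernels the accumulator first appears at the point shown above: the unsigned dot-product sum has a zero-point correction term (accumulated from the same activations) subtracted and is reinterpreted as a signed accumulator for the rest of the pipeline. A minimal sketch, with illustrative parameter names:

#include <arm_neon.h>

// Minimal sketch of the unsigned-to-signed accumulator step in the qu8
// neondot kernels; the exact meaning of the correction term follows the
// kernel's own zero-point handling and is only summarized here.
static inline int32x4_t signed_acc_from_unsigned(
    uint32x4_t vpacc5x89AB,   // positive uint8*uint8 dot-product sums, row 5
    uint32x4_t vnacc5x0123)   // zero-point correction term for row 5
{
  return vreinterpretq_s32_u32(vsubq_u32(vpacc5x89AB, vnacc5x0123));
}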
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 89 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 163 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 193 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 223 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 253 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 285 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 315 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 345 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 375 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 421 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x16c4-minmax-rndnu-neondot.c | 296 int32x4_t vacc5x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc5x89AB, vnacc5x0123)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() local 325 vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 350 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 375 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 391 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot() 411 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-rndnu-neondot.c | 354 int32x4_t vacc5x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc5x89AB, vnacc5x0123)); in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() local 395 vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 428 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 461 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 485 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot() 511 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 95 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 141 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 171 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 201 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 231 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 265 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 295 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 325 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 355 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 401 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 95 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 141 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 171 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 201 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 231 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 263 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 293 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 323 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 353 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 399 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x16c4-minmax-rndnu-neondot.c | 99 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() local 147 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 171 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 215 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 245 vacc5x89AB = vqshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 270 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 295 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 311 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot() 331 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-rndnu-neondot.c | 111 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() local 169 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 201 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 255 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 293 vacc5x89AB = vqshlq_s32(vacc5x89AB, vright_pre_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 326 vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 359 vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 383 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot() 409 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot()
|
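The neondot listings all end at the narrowing step: two requantized int32x4_t halves are saturated down to int16 and the output zero point is added with saturation. The vqmovn_high_s32 form is the AArch64 path and the vcombine_s16 form is the portable one, as in the paired matches above. A minimal sketch with illustrative parameter names:

#include <arm_neon.h>

// Minimal sketch of the int32 -> int16 narrowing with saturating zero-point add.
static inline int16x8_t narrow_add_zero_point(
    int32x4_t vacc5x89AB, int32x4_t vacc5xCDEF, int16x8_t vzero_point)
{
#if defined(__aarch64__)
  const int16x8_t vacc16 = vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF);
#else
  const int16x8_t vacc16 =
      vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5xCDEF));
#endif
  return vqaddq_s16(vacc16, vzero_point);
}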
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 88 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 162 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 192 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 222 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 252 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 286 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 316 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 346 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 376 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 422 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 88 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 162 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 192 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 222 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 252 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 284 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 314 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 344 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 374 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 420 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 89 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 163 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 193 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 223 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 253 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 285 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 315 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 345 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 375 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 421 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 89 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 163 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 193 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 223 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 253 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 287 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 317 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 347 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 377 … vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 423 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16c4-minmax-fp32-neondot.c | 90 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() local 166 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 190 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 234 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 262 float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 316 vacc5x89AB = vcvtnq_s32_f32(vfpacc5x89AB); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 332 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot() 352 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-fp32-neondot.c | 98 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() local 192 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 224 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 278 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 314 float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB); in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 384 vacc5x89AB = vcvtnq_s32_f32(vfpacc5x89AB); in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 408 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot() 434 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot()
|
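The qc8 kernels use fp32 requantization instead of rndnu: the matches show the accumulator converted to float (vcvtq_f32_s32) and later converted back with round-to-nearest (vcvtnq_s32_f32); the per-channel scale multiply in between is not among the matched lines. A minimal sketch, assuming an already-loaded per-channel scale vector and an ARMv8 target for vcvtnq_s32_f32; names other than vfpacc5x89AB are illustrative.

#include <arm_neon.h>

// Minimal sketch of the fp32 requantization bracketing the qc8 matches above.
static inline int32x4_t requantize_fp32(
    int32x4_t vacc5x89AB,
    float32x4_t vscale89AB)  // assumed per-channel scale for columns 8..11
{
  float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB);
  vfpacc5x89AB = vmulq_f32(vfpacc5x89AB, vscale89AB);
  return vcvtnq_s32_f32(vfpacc5x89AB);
}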
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 95 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 141 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 171 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 201 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 231 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 265 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 295 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 325 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 355 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 401 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 95 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 141 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 171 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 201 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 231 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 263 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 293 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 323 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 353 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 399 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 96 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 142 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 172 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 202 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 232 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 266 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 296 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 326 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 356 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 402 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 96 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 142 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 172 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 202 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 232 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 264 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 294 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa5), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 324 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa5), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 354 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa5), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 400 vacc5x89AB = vmlal_lane_s16(vacc5x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa5), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16c4-minmax-fp32-neondot.c | 100 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() local 148 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 172 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 216 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 242 float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 296 vacc5x89AB = vcvtnq_s32_f32(vfpacc5x89AB); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 312 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot() 332 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-fp32-neondot.c | 112 int32x4_t vacc5x89AB = vacc0x89AB; in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() local 170 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 202 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1); in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 256 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0); in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 290 float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB); in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 360 vacc5x89AB = vcvtnq_s32_f32(vfpacc5x89AB); in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 384 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), v… in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot() 410 …const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5x… in xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot()
|