/external/XNNPACK/src/qs8-igemm/gen/

D | 2x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull()
     77  int32x4_t vacc1x10 = vacc0x10;  (local)
    157  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    204  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    261  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 2x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal()
     77  int32x4_t vacc1x10 = vacc0x10;  (local)
    197  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    296  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    343  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    400  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 2x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal()
     77  int32x4_t vacc1x10 = vacc0x10;  (local)
    184  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    236  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    292  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
     81  int32x4_t vacc1x10 = vacc0x10;  (local)
    255  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    392  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    450  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    519  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull()
     81  int32x4_t vacc1x10 = vacc0x10;  (local)
    203  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    261  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    330  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal()
     81  int32x4_t vacc1x10 = vacc0x10;  (local)
    241  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    309  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    377  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull()
     85  int32x4_t vacc1x10 = vacc0x10;  (local)
    249  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    318  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    399  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
     85  int32x4_t vacc1x10 = vacc0x10;  (local)
    313  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    488  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    557  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    638  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal()
     85  int32x4_t vacc1x10 = vacc0x10;  (local)
    298  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    382  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    462  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
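Every kernel listed above touches vacc1x10 (the row-1 accumulator for output channel 10, hex A) in the same three-stage pattern: initialize it as a copy of the bias-loaded row-0 accumulator, fold widened 8-bit products into it with vpadalq_s16, then reduce it at the end. The helper below is a minimal standalone sketch of the middle stage for one channel group; the names (accumulate_c8, va, vb) are hypothetical and do not appear in XNNPACK, which inlines this step across all 16 output channels, and the mlal variants fold a second vmlal_s8 multiply into the product before the single vpadalq_s16.

    #include <arm_neon.h>

    /* Minimal sketch (hypothetical helper, not XNNPACK source): accumulate
       one group of 8 signed 8-bit inputs against 8 filter values into the
       32-bit partial sums for a single output channel. */
    static int32x4_t accumulate_c8(int32x4_t vacc,  /* running int32 partial sums */
                                   int8x8_t va,     /* 8 activation bytes */
                                   int8x8_t vb) {   /* 8 filter bytes */
      const int16x8_t vprod = vmull_s8(vb, va);  /* widening s8*s8 -> s16 multiply */
      return vpadalq_s16(vacc, vprod);           /* pairwise add-accumulate into s32 lanes */
    }

The c8/c16 suffix in the file names gives the channel-group width consumed per step, so the c16 kernels feed two 8-wide multiplies into each vpadalq_s16.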
/external/XNNPACK/src/qs8-gemm/gen/

D | 2x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull()
     76  int32x4_t vacc1x10 = vacc0x10;  (local)
    144  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    188  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    245  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 2x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal()
     76  int32x4_t vacc1x10 = vacc0x10;  (local)
    184  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    283  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    327  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    384  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 2x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal()
     76  int32x4_t vacc1x10 = vacc0x10;  (local)
    171  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    220  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    276  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
     82  int32x4_t vacc1x10 = vacc0x10;  (local)
    240  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    377  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    432  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    501  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull()
     82  int32x4_t vacc1x10 = vacc0x10;  (local)
    188  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    243  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    312  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 3x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal()
     82  int32x4_t vacc1x10 = vacc0x10;  (local)
    226  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    291  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    359  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c8-minmax-rndnu-neon-mull.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull()
     88  int32x4_t vacc1x10 = vacc0x10;  (local)
    232  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    298  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    379  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c8-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
     88  int32x4_t vacc1x10 = vacc0x10;  (local)
    296  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    471  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    537  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    618  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));

D | 4x16c16-minmax-rndnu-neon-mlal.c | all in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal()
     88  int32x4_t vacc1x10 = vacc0x10;  (local)
    281  vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
    362  const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
    442  const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
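The vsum1xAB and vpsum1xA lines in both directories are the reduction tails. vpaddq_s32 is an A64-only pairwise add that collapses the accumulators of two adjacent channels (A = 10 and B = 11) at once; the vget_low_s32/vget_high_s32 form in the same files appears to be the ARMv7-compatible fallback. A sketch of a single-channel reduction under that assumption, with a hypothetical reduce_channel helper not taken from XNNPACK:

    #include <arm_neon.h>

    /* Hypothetical helper mirroring the reduction tail above: collapse the
       four int32 partial sums of one output channel to a single value. */
    static int32_t reduce_channel(int32x4_t vacc) {
    #if defined(__aarch64__)
      /* A64: the listed kernels use vpaddq_s32 to reduce two channels per
         instruction (vsum1xAB); for one vector, vaddvq_s32 sums all lanes. */
      return vaddvq_s32(vacc);
    #else
      /* ARMv7: add the high half onto the low half (the vpsum1xA form),
         then pairwise-add the remaining two lanes. */
      const int32x2_t vsum = vadd_s32(vget_low_s32(vacc), vget_high_s32(vacc));
      return vget_lane_s32(vpadd_s32(vsum, vsum), 0);
    #endif
    }

The reduced per-channel sums then go through the rndnu requantization and min/max clamping implied by the kernel names.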