/external/XNNPACK/src/qs8-gemm/gen/
D | 4x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    173  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    242  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    341  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    363  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    409  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    173  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    242  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    341  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    363  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    409  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    177  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    246  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    345  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    367  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    413  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    166  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    234  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    270  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    316  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    210  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    323  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    429  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    498  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    597  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    619  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    665  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    218  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    331  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    441  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    510  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    609  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    631  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    677  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    210  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    323  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    429  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    498  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    597  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    619  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    665  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
     91  int32x4_t vacc3xAB = vacc0xAB;  (local)
    200  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    312  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    412  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    480  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    515  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    561  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));
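All of the qs8-gemm occurrences above trace one accumulator pair through the same pattern: vacc3xAB starts as a copy of the bias-initialized vacc0xAB, each k-step folds a widened int8 product (vprod3xABc0/vprod3xABc1) into it with vpadalq_s16, and it is finally combined into vacc3x89AB. Below is a minimal sketch of that accumulation step, assuming the usual vmull_s8 widening multiply feeding vpadalq_s16; it is not copied from the XNNPACK sources, and the function and variable names are illustrative.

    #include <arm_neon.h>

    // Illustrative sketch, not the XNNPACK source. One k-step of the
    // accumulation the listing traces: multiply eight int8 inputs against
    // eight int8 weights for one column pair ("AB"), widening to int16x8,
    // then pairwise-add-accumulate adjacent int16 products into the four
    // int32 lanes of the accumulator.
    static int32x4_t accumulate_column_pair(int32x4_t vacc_ab,
                                            int8x8_t va,      // inputs, one row
                                            int8x8_t vb_ab) { // weights, cols A/B
      const int16x8_t vprod_ab = vmull_s8(va, vb_ab);   // int8 x int8 -> int16x8
      return vpadalq_s16(vacc_ab, vprod_ab);            // +pairs -> int32x4
    }

The mull kernels show three such sites per file (c0, c1, and a remainder pass), while the mlal kernels show five, consistent with an extra vmlal_s8 fusing a second multiply into each int16 product before the same vpadalq_s16 step.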
/external/XNNPACK/src/qs8-igemm/gen/
D | 4x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    190  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    259  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    358  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    382  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    428  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    194  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    263  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    362  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    386  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    432  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    190  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    259  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    358  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    382  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    428  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    183  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    251  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    290  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    336  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    235  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    348  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    458  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    527  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    626  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    650  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    696  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    227  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    340  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    446  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    515  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    614  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    638  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    684  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    227  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    340  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    446  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    515  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    614  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    638  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    684  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));

D | 4x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
     88  int32x4_t vacc3xAB = vacc0xAB;  (local)
    217  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    329  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    429  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc0);
    497  vacc3xAB = vpadalq_s16(vacc3xAB, vprod3xABc1);
    535  int32x4_t vacc3x89AB = vpaddq_s32(vacc3x89, vacc3xAB);
    581  const int32x2_t vsum3xAB = vpadd_s32(vget_low_s32(vacc3xAB), vget_high_s32(vacc3xAB));
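The qs8-igemm occurrences end the same way the qs8-gemm ones do: every file contains both a vpaddq_s32 line folding vacc3x89 and vacc3xAB into vacc3x89AB and a vpadd_s32 line producing vsum3xAB, which matches two conditionally compiled variants of the same pairwise reduction. Below is a sketch of that final step, guarded here with the generic __aarch64__ macro rather than XNNPACK's own configuration macros; again the names are illustrative, not the library's.

    #include <arm_neon.h>

    // Illustrative sketch, not the XNNPACK source. Fold two int32x4
    // accumulators (columns 8,9 and A,B) into one int32x4 holding the
    // per-column sums {s8, s9, sA, sB}.
    static int32x4_t reduce_columns_89ab(int32x4_t vacc_89, int32x4_t vacc_ab) {
    #if defined(__aarch64__)
      // AArch64: a single pairwise add across both vectors.
      return vpaddq_s32(vacc_89, vacc_ab);
    #else
      // ARMv7: vpaddq_s32 is unavailable, so reduce each accumulator to an
      // int32x2 with the 64-bit pairwise add and recombine; this path is
      // where the vsum3xAB lines in the listing come from.
      const int32x2_t vsum_89 = vpadd_s32(vget_low_s32(vacc_89), vget_high_s32(vacc_89));
      const int32x2_t vsum_ab = vpadd_s32(vget_low_s32(vacc_ab), vget_high_s32(vacc_ab));
      return vcombine_s32(vsum_89, vsum_ab);
    #endif
    }

Both paths produce the same four sums, so the choice is purely about which pairwise-add instruction the target ISA provides.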