/external/XNNPACK/src/qs8-gemm/gen/
2x16c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  121  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  156  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  209  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  222  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  245  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  123  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  158  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  211  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  224  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  247  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  121  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  156  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  209  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  222  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  245  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  116  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  150  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  167  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  190  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  149  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  210  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  275  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  310  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  363  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  376  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  399  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  145  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  206  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  269  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  304  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  357  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  370  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  393  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  145  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  206  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  269  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  304  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  357  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  370  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  393  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal)
   64  int32x4_t vacc1xCD = vacc0xCD;  (local)
  139  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  199  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  258  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  292  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  308  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  331  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal)
   70  int32x4_t vacc1xCD = vacc0xCD;  (local)
  175  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  261  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  338  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  389  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  408  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  435  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r)
   70  int32x4_t vacc1xCD = vacc0xCD;  (local)
  153  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  205  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  262  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  292  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  319  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r)
   70  int32x4_t vacc1xCD = vacc0xCD;  (local)
  150  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  202  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  259  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  289  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  316  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup)
   70  int32x4_t vacc1xCD = vacc0xCD;  (local)
  150  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  202  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  259  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  289  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  316  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));
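Note on the pattern: every hit above is an instance of the same widening multiply-accumulate idiom. vacc1xCD is the int32x4_t accumulator for output row 1, columns C and D; the kernel multiplies signed 8-bit activations against c4-packed weights (4 K-elements per column) with vmull_s8, then folds the 16-bit products into the accumulator with vpadalq_s16. The dup/ld1r/ld2r suffixes name the activation-broadcast strategy, and the mlal variants chain a vmlal_s8 into the product vector across two K-blocks before the fold, which is why they show roughly twice as many accumulation hits per file. The sketch below is a minimal illustration of one step, not the verbatim kernel; va1c0 and vbCDc0 are hypothetical names for the row-1 activation vector and the column-C/D weight vector at K-block c0.

    #include <arm_neon.h>

    // One accumulation step of the idiom seen in the hits above (sketch).
    // Lanes 0..3 of vbCDc0 hold 4 K-values for column C, lanes 4..7 for column D.
    static inline int32x4_t accumulate_cd(int32x4_t vacc1xCD,
                                          int8x8_t va1c0,    // hypothetical: the row's 4 K-values, duplicated into both halves
                                          int8x8_t vbCDc0) { // hypothetical: c4-packed weights for columns C and D
      // Widening multiply: eight 8-bit x 8-bit -> 16-bit products.
      const int16x8_t vprod1xCDc0 = vmull_s8(vbCDc0, va1c0);
      // Pairwise add adjacent 16-bit products into the four 32-bit lanes:
      // lanes 0..1 accumulate column C, lanes 2..3 accumulate column D.
      return vpadalq_s16(vacc1xCD, vprod1xCDc0);
    }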
/external/XNNPACK/src/qs8-igemm/gen/
2x16c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  134  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  169  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  222  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  237  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  260  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  129  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  163  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  183  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  206  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-dup.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  158  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  219  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  282  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  317  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  370  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  385  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  408  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  162  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  223  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  288  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  323  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  376  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  391  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  414  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mlal-ld2r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  158  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  219  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  282  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  317  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  370  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  385  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  408  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  134  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  169  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  222  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  237  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  260  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  136  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  171  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  224  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  239  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  262  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

2x16c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal)
   65  int32x4_t vacc1xCD = vacc0xCD;  (local)
  152  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  212  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  271  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  305  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  324  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  347  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal)
   69  int32x4_t vacc1xCD = vacc0xCD;  (local)
  190  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  276  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  353  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  404  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  426  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  453  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup)
   69  int32x4_t vacc1xCD = vacc0xCD;  (local)
  165  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  217  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  274  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  306  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  333  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r)
   69  int32x4_t vacc1xCD = vacc0xCD;  (local)
  165  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  217  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  274  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  306  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  333  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r)
   69  int32x4_t vacc1xCD = vacc0xCD;  (local)
  168  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  220  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  277  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  309  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  336  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));

3x16c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r)
   69  int32x4_t vacc1xCD = vacc0xCD;  (local)
  204  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  291  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  376  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  428  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc1);
  485  vacc1xCD = vpadalq_s16(vacc1xCD, vprod1xCDc0);
  517  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
  544  const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));
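The two trailing hits in every file form the reduction epilogue. After the K-loop, the four lanes of vacc1xCD hold two partial sums per output column (lanes 0..1 for C, lanes 2..3 for D), so the kernel folds them pairwise: with vpaddq_s32 where the q-form pairwise add exists (AArch64), and with vpadd_s32 over the two halves plus vcombine_s32 otherwise. A minimal sketch of both paths, using a generic __aarch64__ guard in place of XNNPACK's own architecture macros:

    #include <arm_neon.h>

    // Fold two column-pair accumulators (columns C,D and E,F) into one
    // vector holding a single 32-bit sum per output column (sketch).
    static inline int32x4_t reduce_cdef(int32x4_t vacc1xCD, int32x4_t vacc1xEF) {
    #if defined(__aarch64__)
      // AArch64: the q-form pairwise add folds both vectors at once.
      return vpaddq_s32(vacc1xCD, vacc1xEF);
    #else
      // ARMv7 NEON has no vpaddq_s32: fold each half manually, then recombine.
      const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));
      const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));
      return vcombine_s32(vsum1xCD, vsum1xEF);
    #endif
    }

The resulting vacc1xCDEF then feeds the rndnu requantization and the min/max clamp that give these kernels their name.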