/external/XNNPACK/src/qs8-gemm/gen/
D | 2x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    125  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    160  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    211  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    222  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    246  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    127  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    162  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    213  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    224  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    248  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    125  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    160  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    211  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    222  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    246  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    120  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    154  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    167  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    191  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    156  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    217  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    279  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    314  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    365  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    376  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    400  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    152  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    213  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    273  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    308  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    359  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    370  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    394  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    152  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    213  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    273  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    308  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    359  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    370  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    394  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal():
     65  int32x4_t vacc1xEF = vacc0xEF;  (local)
    146  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    206  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    262  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    296  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    308  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    332  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal():
     71  int32x4_t vacc1xEF = vacc0xEF;  (local)
    185  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    271  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    344  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    395  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    408  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    436  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r():
     71  int32x4_t vacc1xEF = vacc0xEF;  (local)
    159  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    211  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    264  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    292  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    320  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r():
     71  int32x4_t vacc1xEF = vacc0xEF;  (local)
    156  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    208  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    261  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    289  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    317  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup():
     71  int32x4_t vacc1xEF = vacc0xEF;  (local)
    156  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    208  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    261  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    289  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    317  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

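Every reference above traces the same widening multiply-accumulate: vmull_s8 widens eight int8 products to an int16x8_t, and vpadalq_s16 pairwise-adds them into the int32x4_t accumulator (the mlal variants fold a second vmlal_s8 into the int16 product before the same step). A minimal sketch of one such step, with illustrative names (acc_step, va1, vbEFc0) rather than the generated source:

    #include <arm_neon.h>

    /* One accumulation step as traced above: vmull_s8 widens eight int8
     * products to int16x8_t, then vpadalq_s16 pairwise-adds adjacent
     * int16 pairs into the int32x4_t accumulator.  Each call folds two
     * more products into every int32 lane. */
    static inline int32x4_t acc_step(int32x4_t vacc1xEF,
                                     int8x8_t va1, int8x8_t vbEFc0) {
      const int16x8_t vprod1xEFc0 = vmull_s8(vbEFc0, va1);
      return vpadalq_s16(vacc1xEF, vprod1xEFc0);
    }
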
/external/XNNPACK/src/qs8-igemm/gen/
D | 2x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    138  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    173  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    224  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    237  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    261  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    133  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    167  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    183  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    207  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    165  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    226  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    286  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    321  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    372  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    385  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    409  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    169  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    230  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    292  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    327  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    378  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    391  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    415  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    165  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    226  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    286  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    321  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    372  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    385  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    409  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    138  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    173  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    224  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    237  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    261  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    140  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    175  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    226  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    239  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    263  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 2x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal():
     66  int32x4_t vacc1xEF = vacc0xEF;  (local)
    159  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    219  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    275  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    309  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    324  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    348  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal():
     70  int32x4_t vacc1xEF = vacc0xEF;  (local)
    200  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    286  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    359  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    410  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    426  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    454  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup():
     70  int32x4_t vacc1xEF = vacc0xEF;  (local)
    171  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    223  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    276  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    306  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    334  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r():
     70  int32x4_t vacc1xEF = vacc0xEF;  (local)
    171  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    223  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    276  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    306  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    334  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r():
     70  int32x4_t vacc1xEF = vacc0xEF;  (local)
    174  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    226  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    279  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    309  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    337  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

D | 3x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r():
     70  int32x4_t vacc1xEF = vacc0xEF;  (local)
    214  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    301  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    382  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    434  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc1);
    487  vacc1xEF = vpadalq_s16(vacc1xEF, vprod1xEFc0);
    517  int32x4_t vacc1xCDEF = vpaddq_s32(vacc1xCD, vacc1xEF);
    545  const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));

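The tail references in each entry show the two reduction forms for finishing channels C..F: vpaddq_s32 pairwise-adds across both pair accumulators in one step (AArch64 only), while the ARMv7 NEON path, which lacks that intrinsic, pairwise-adds the low and high halves with vpadd_s32 and recombines. A sketch of the equivalent reduction (the helper name is hypothetical; the generated files presumably select between the two forms with an architecture guard):

    #include <arm_neon.h>

    /* Collapse two pair accumulators (two partial sums per output channel)
     * into one int32x4_t holding a single finished sum per channel C..F. */
    static inline int32x4_t reduce_1xCDEF(int32x4_t vacc1xCD, int32x4_t vacc1xEF) {
    #if defined(__aarch64__)
      /* A64: one cross-register pairwise add finishes all four channels. */
      return vpaddq_s32(vacc1xCD, vacc1xEF);
    #else
      /* ARMv7 NEON: pairwise-add each half, then recombine the two halves. */
      const int32x2_t vsum1xCD = vpadd_s32(vget_low_s32(vacc1xCD), vget_high_s32(vacc1xCD));
      const int32x2_t vsum1xEF = vpadd_s32(vget_low_s32(vacc1xEF), vget_high_s32(vacc1xEF));
      return vcombine_s32(vsum1xCD, vsum1xEF);
    #endif
    }

Both branches produce identical lanes [C, D, E, F], which the rndnu requantization and min/max clamping named in these kernels then consume.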