/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mlal-ld1r.c | 168 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 171 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 173 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 306 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 308 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 401 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 402 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld4r.c | 156 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 159 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 161 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 288 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 290 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 383 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 384 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-dup.c | 156 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 159 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 161 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 288 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 290 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 383 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 384 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld2r.c | 160 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 163 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 165 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 294 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 296 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 389 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 390 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 150 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 153 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 155 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 275 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 277 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld4r.c | 195 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 199 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 202 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 375 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 378 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 500 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 501 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 195 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 199 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 202 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 375 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 378 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 500 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 501 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld1r.c | 213 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 217 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 220 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 402 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 405 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 527 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 528 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld2r.c | 201 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 205 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 208 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 384 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 387 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 509 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 510 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r()
|
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 187 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 191 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 194 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 359 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 362 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 136 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 138 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 231 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 232 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 130 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 132 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 225 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 226 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld4r.c | 130 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 132 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 225 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 226 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mlal-ld2r.c | 146 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 149 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 151 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 280 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 282 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 375 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 376 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld1r.c | 154 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 157 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 159 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 292 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 294 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 387 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 388 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-dup.c | 142 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 145 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 147 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 274 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 276 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 369 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 370 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld4r.c | 142 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 145 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 147 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 274 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 276 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 369 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 370 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 137 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 140 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 142 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 262 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 264 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 179 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 183 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 186 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 359 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 362 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 484 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 485 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld4r.c | 179 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 183 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 186 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 359 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 362 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 484 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 485 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld2r.c | 185 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 189 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 192 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 368 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 371 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 493 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 494 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld1r.c | 197 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 201 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 204 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 386 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 389 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 511 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 512 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r()
|
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 172 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 176 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 179 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 344 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 347 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 117 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 119 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 212 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 213 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 123 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 125 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 218 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 219 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|