/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mlal-ld1r.c | 135 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 138 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 140 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 287 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 289 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 376 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 377 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld4r.c | 123 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 126 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 128 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 269 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 271 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 358 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 359 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-dup.c | 123 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 126 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 128 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 269 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 271 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 358 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 359 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld2r.c | 127 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 130 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 132 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 275 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 277 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 364 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 365 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 118 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 121 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 123 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 257 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 259 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld4r.c | 148 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 152 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 155 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 347 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 350 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 466 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 467 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 148 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 152 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 155 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 347 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 350 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 466 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 467 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld1r.c | 166 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 170 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 173 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 374 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 377 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 493 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 494 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld2r.c | 154 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 158 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 161 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 356 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 359 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 475 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 476 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r()
|
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 141 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 145 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1x1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 148 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 332 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 335 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 117 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 119 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 206 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 207 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 111 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 113 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 200 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 201 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld4r.c | 111 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 113 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 200 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 201 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mlal-ld2r.c | 113 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 116 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 118 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 261 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 263 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() 350 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r() local 351 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld1r.c | 121 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 124 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 126 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 273 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 275 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() 362 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r() local 363 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mlal-dup.c | 109 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 112 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 114 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 255 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 257 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() 344 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup() local 345 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup()
|
D | 2x16c2-minmax-rndnu-neon-mlal-ld4r.c | 109 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 112 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 114 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 255 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 257 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() 344 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r() local 345 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 105 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 108 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 110 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 244 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 246 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 132 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 136 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 139 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 331 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 334 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 450 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 451 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld4r.c | 132 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 136 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 139 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 331 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 334 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() 450 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r() local 451 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld2r.c | 138 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 142 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 145 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 340 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 343 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() 459 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r() local 460 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-ld1r.c | 150 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 154 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 358 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 361 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() 477 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r() local 478 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r()
|
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 126 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 130 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1x1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 133 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 317 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 320 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 98 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 100 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 187 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 188 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 104 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 106 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 193 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 194 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|