/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 93 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 263 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 264 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 265 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 296 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 397 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 398 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 399 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 81 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 200 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 201 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 229 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 297 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 298 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | 105 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 326 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 327 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 328 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 329 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 363 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 497 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 498 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 499 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 500 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
|
D | 1x16c2s4-minmax-rndnu-neon-mlal.c | 69 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 137 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() 162 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 197 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal()
|
D | 3x16c2s4-minmax-rndnu-neon-mull.c | 90 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() local 191 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 192 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 193 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull()
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | 101 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() local 235 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 236 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 237 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 238 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull()
|
D | 2x16c2s4-minmax-rndnu-neon-mull.c | 79 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() local 147 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() 148 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull()
|
D | 1x16c2s4-minmax-rndnu-neon-mull.c | 68 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull() local 103 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull()
|
D | 4x16c2-minmax-rndnu-neon-mlal-dup.c | 104 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() local 337 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 338 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 339 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 340 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld2r.c | 112 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() local 345 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 346 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 347 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 348 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld4r.c | 104 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() local 337 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 338 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 339 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 340 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld1r.c | 128 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() local 361 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 362 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 363 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 364 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 92 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 272 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 273 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 274 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 108 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 278 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 279 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 280 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 311 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 412 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 413 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 414 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 94 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 213 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 214 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 242 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 310 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 311 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | 122 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 343 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 344 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 345 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 346 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 380 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 514 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 515 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 516 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 517 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
|
D | 1x16c2s4-minmax-rndnu-neon-mlal.c | 80 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 148 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() 173 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 208 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal()
|
D | 3x16c2s4-minmax-rndnu-neon-mull.c | 105 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() local 206 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 207 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 208 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull()
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | 118 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() local 252 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 253 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 254 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 255 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull()
|
D | 2x16c2s4-minmax-rndnu-neon-mull.c | 92 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() local 160 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() 161 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull()
|
D | 1x16c2s4-minmax-rndnu-neon-mull.c | 79 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull() local 114 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld2r.c | 130 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() local 363 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 364 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 365 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 366 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld1r.c | 146 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() local 379 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 380 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 381 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 382 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld4r.c | 122 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() local 355 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 356 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 357 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 358 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-dup.c | 122 … const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() local 355 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 356 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 357 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 358 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup()
|