/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 85 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 171 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 172 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 173 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 288 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 343 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 344 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 345 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 73 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 136 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 137 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 221 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 261 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 262 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | 97 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 206 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 207 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 208 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 209 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 355 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 425 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 426 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 427 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 428 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
|
D | 1x16c2s4-minmax-rndnu-neon-mlal.c | 61 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 101 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() 154 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 179 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal()
|
D | 3x16c2s4-minmax-rndnu-neon-mull.c | 82 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() local 137 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 138 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 139 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull()
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | 93 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() local 163 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 164 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 165 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 166 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull()
|
D | 2x16c2s4-minmax-rndnu-neon-mull.c | 71 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() local 111 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() 112 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull()
|
D | 1x16c2s4-minmax-rndnu-neon-mull.c | 60 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull() local 85 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull()
|
D | 4x16c2-minmax-rndnu-neon-mlal-dup.c | 96 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() local 215 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 216 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 217 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 218 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld2r.c | 104 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() local 223 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 224 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 225 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 226 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld4r.c | 96 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() local 215 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 216 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 217 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 218 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld1r.c | 120 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() local 239 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 240 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 241 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 242 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 84 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 178 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 179 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 180 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c2s4-minmax-rndnu-neon-mlal.c | 100 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 186 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 187 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 188 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 303 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() local 358 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 359 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal() 360 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal()
|
D | 2x16c2s4-minmax-rndnu-neon-mlal.c | 86 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 149 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 150 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 234 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() local 274 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal() 275 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal()
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | 114 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 223 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 224 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 225 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 226 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 372 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() local 442 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 443 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 444 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal() 445 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal()
|
D | 1x16c2s4-minmax-rndnu-neon-mlal.c | 72 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 112 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() 165 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal() local 190 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal()
|
D | 3x16c2s4-minmax-rndnu-neon-mull.c | 97 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() local 152 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 153 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull() 154 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull()
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | 110 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() local 180 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 181 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 182 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull() 183 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull()
|
D | 2x16c2s4-minmax-rndnu-neon-mull.c | 84 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() local 124 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull() 125 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull()
|
D | 1x16c2s4-minmax-rndnu-neon-mull.c | 71 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8; in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull() local 96 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld2r.c | 122 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() local 241 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 242 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 243 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r() 244 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld1r.c | 138 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() local 257 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 258 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 259 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r() 260 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-ld4r.c | 114 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() local 233 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 234 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 235 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r() 236 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r()
|
D | 4x16c2-minmax-rndnu-neon-mlal-dup.c | 114 … const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() local 233 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 234 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 235 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup() 236 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup()
|