/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-rndnu-neon-mlal.c | 125 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() local 246 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 247 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 248 int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 249 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 250 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 251 vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal()
|
D | 4x16c16-minmax-rndnu-neon-mlal.c | 148 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() local 308 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 309 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 310 int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 311 int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 312 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 313 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 314 vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 315 vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal()
|
D | 2x16c16-minmax-rndnu-neon-mlal.c | 102 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() local 184 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 185 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 186 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 187 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal()
|
D | 1x16c16-minmax-rndnu-neon-mlal.c | 79 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal() local 122 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal() 123 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal()
|
D | 1x16c8-minmax-rndnu-neon-mull.c | 106 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull() local 107 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull()
|
D | 2x16c8-minmax-rndnu-neon-mull.c | 155 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull() local 156 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull() 157 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull()
|
D | 4x16c8-minmax-rndnu-neon-mull.c | 253 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull() local 254 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 255 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 256 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 257 const int16x8_t vprod3x13 = vmull_s8(vb13, va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull()
|
D | 3x16c8-minmax-rndnu-neon-mull.c | 204 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull() local 205 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull() 206 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull() 207 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull()
|
D | 3x16c8-minmax-rndnu-neon-mlal.c | 393 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() local 394 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() 395 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() 396 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
|
D | 2x16c8-minmax-rndnu-neon-mlal.c | 294 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal() local 295 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal() 296 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal()
|
D | 1x16c8-minmax-rndnu-neon-mlal.c | 195 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal() local 196 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal()
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | 492 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() local 493 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 494 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 495 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 496 const int16x8_t vprod3x13 = vmull_s8(vb13, va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-rndnu-neon-mlal.c | 140 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() local 261 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 262 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 263 int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 264 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 265 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal() 266 vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal()
|
D | 4x16c16-minmax-rndnu-neon-mlal.c | 165 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() local 325 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 326 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 327 int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 328 int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 329 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 330 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 331 vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal() 332 vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal()
|
D | 2x16c16-minmax-rndnu-neon-mlal.c | 115 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() local 197 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 198 int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 199 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal() 200 vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal()
|
D | 1x16c16-minmax-rndnu-neon-mlal.c | 90 const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal() local 133 int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal() 134 vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal()
|
D | 2x16c8-minmax-rndnu-neon-mull.c | 168 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull() local 169 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull() 170 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull()
|
D | 4x16c8-minmax-rndnu-neon-mull.c | 270 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull() local 271 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 272 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 273 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull() 274 const int16x8_t vprod3x13 = vmull_s8(vb13, va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull()
|
D | 3x16c8-minmax-rndnu-neon-mull.c | 219 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull() local 220 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull() 221 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull() 222 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull()
|
D | 1x16c8-minmax-rndnu-neon-mull.c | 117 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull() local 118 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull()
|
D | 3x16c8-minmax-rndnu-neon-mlal.c | 408 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() local 409 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() 410 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal() 411 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal()
|
D | 2x16c8-minmax-rndnu-neon-mlal.c | 307 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal() local 308 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal() 309 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal()
|
D | 1x16c8-minmax-rndnu-neon-mlal.c | 206 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal() local 207 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal()
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | 509 const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() local 510 const int16x8_t vprod0x13 = vmull_s8(vb13, va0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 511 const int16x8_t vprod1x13 = vmull_s8(vb13, va1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 512 const int16x8_t vprod2x13 = vmull_s8(vb13, va2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal() 513 const int16x8_t vprod3x13 = vmull_s8(vb13, va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal()
|