/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal():
    125  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    246  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    247  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    248  int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
    249  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    250  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
    251  vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
|
D | 4x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal():
    148  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    308  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    309  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    310  int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
    311  int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3));
    312  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    313  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
    314  vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
    315  vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3));
|
D | 2x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal():
    102  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    184  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    185  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    186  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    187  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
|
D | 1x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal():
    79   const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    122  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    123  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
|
D | 1x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal():
    106  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    107  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
|
D | 2x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal():
    155  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    156  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    157  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
|
D | 4x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal():
    253  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    254  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    255  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    256  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
    257  const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
|
D | 3x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal():
    204  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    205  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    206  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    207  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
|
D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal():
    195  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    196  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
|
D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal():
    294  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    295  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    296  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
|
D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
    393  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    394  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    395  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    396  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
|
D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    492  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    493  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    494  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    495  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
    496  const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
|
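Taken together, the GEMM hits above show two inner-loop shapes for the vb13 weight slice: the c16 kernels multiply the low halves of the 16-byte weight and activation vectors with vmull_s8 and fold in the high halves with vmlal_s8, while the c8 kernels issue a single vmull_s8 per 8-byte slice. The sketch below is not XNNPACK source: the helper name is made up, and the vpadalq_s16 accumulation step is an assumption inferred from the "padal" suffix in the kernel names (the hits only show the multiply part). It illustrates one K=16 step of the c16 pattern for a single output column and a single row of A.

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch of one K=16 step of the c16 kernels listed above.
 * Hypothetical helper, not part of XNNPACK; the vpadalq_s16 step is assumed
 * from the "padal" naming rather than shown in the hits. */
static inline int32x4_t qs8_c16_step(int32x4_t vacc, const int8_t* a, const int8_t* w) {
  const int8x16_t va = vld1q_s8(a);   /* 16 int8 activations for this row  */
  const int8x16_t vb = vld1q_s8(w);   /* 16 int8 weights for this column   */
  /* Low halves multiplied with vmull_s8, high halves folded in with vmlal_s8,
   * yielding eight int16 partial products. */
  int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
  vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));
  /* Pairwise widen-and-add the int16 products into the int32 accumulator. */
  return vpadalq_s16(vacc, vprod);
}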
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal():
    140  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    261  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    262  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    263  int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
    264  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    265  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
    266  vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
|
D | 4x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal():
    165  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    325  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    326  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    327  int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
    328  int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3));
    329  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    330  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
    331  vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
    332  vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3));
|
D | 2x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal():
    115  const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    197  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    198  int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
    199  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
    200  vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
|
D | 1x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal():
    90   const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  [local]
    133  int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
    134  vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
|
D | 2x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal():
    168  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    169  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    170  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
|
D | 4x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal():
    270  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    271  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    272  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    273  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
    274  const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
|
D | 1x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal():
    117  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    118  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
|
D | 3x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal():
    219  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    220  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    221  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    222  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
|
D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal():
    206  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    207  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
|
D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal():
    307  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    308  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    309  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
|
D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    509  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    510  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    511  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    512  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
    513  const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
|
D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal():
    408  const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  [local]
    409  const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
    410  const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
    411  const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
|
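The IGEMM hits use the same inner loop as their GEMM counterparts, only with indirectly addressed activation rows. For the c8 variants in both lists, each 8-byte weight slice is covered by a single vmull_s8 per row. A minimal sketch of that step follows, under the same assumptions as above: the helper name is hypothetical, and the vpadalq_s16 accumulation is inferred from the "padal" kernel suffix rather than shown in these hits.

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch of one K=8 step of the c8 kernels (not XNNPACK source). */
static inline int32x4_t qs8_c8_step(int32x4_t vacc, const int8_t* a, const int8_t* w) {
  const int8x8_t va = vld1_s8(a);     /* 8 int8 activations for this row  */
  const int8x8_t vb = vld1_s8(w);     /* 8 int8 weights for this column   */
  const int16x8_t vprod = vmull_s8(vb, va);   /* eight int16 products      */
  return vpadalq_s16(vacc, vprod);            /* widen-accumulate to int32 */
}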