/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16c8-minmax-neon-mull-padal.c | 358 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 359 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 365 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 366 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 372 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 373 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 379 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 380 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 386 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 387 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() [all …]
|
D | 3x16c8-minmax-neon-mull-padal.c | 289 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 290 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 296 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 297 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 303 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 304 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 310 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 311 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 317 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 318 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() [all …]
|
D | 4x8c8-minmax-neon-mull-padal.c | 230 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 231 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 237 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 238 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 244 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 245 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 251 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 252 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 258 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 259 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() [all …]
|
D | 2x16c8-minmax-neon-mull-padal.c | 220 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 221 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 227 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 228 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 234 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 235 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 241 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 242 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 248 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 249 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() [all …]
|
D | 4x16c16-minmax-neon-mlal-padal.c | 422 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 423 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 429 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 430 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 436 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 437 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 443 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 444 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 450 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 451 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() [all …]
|
D | 3x16c16-minmax-neon-mlal-padal.c | 337 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 338 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 344 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 345 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 351 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 352 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 358 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 359 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 365 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 366 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() [all …]
|
D | 3x8c8-minmax-neon-mull-padal.c | 191 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 192 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 198 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 199 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 205 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 206 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 212 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 213 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 219 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 220 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() [all …]
|
D | 4x8c16-minmax-neon-mlal-padal.c | 262 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 263 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 269 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 270 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 276 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 277 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 283 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 284 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 290 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 291 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() [all …]
|
D | 3x8c16-minmax-neon-mlal-padal.c | 215 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 216 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 222 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 223 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 229 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 230 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 236 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 237 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 243 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 244 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() [all …]
|
D | 2x16c16-minmax-neon-mlal-padal.c | 252 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 253 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 259 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 260 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 266 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 267 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 273 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 274 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 280 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 281 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() [all …]
|
D | 4x16c8-minmax-neon-mlal-padal.c | 597 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 598 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 604 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 605 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 611 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 612 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 618 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 619 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 625 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 626 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16c8-minmax-neon-mull-padal.c | 338 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 339 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 345 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 346 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 352 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 353 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 359 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 360 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 366 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 367 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() [all …]
|
D | 3x16c8-minmax-neon-mull-padal.c | 271 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 272 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 278 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 279 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 285 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 286 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 292 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 293 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 299 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 300 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() [all …]
|
D | 4x8c8-minmax-neon-mull-padal.c | 210 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 211 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 217 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 218 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 224 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 225 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 231 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 232 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 238 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 239 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() [all …]
|
D | 2x16c8-minmax-neon-mull-padal.c | 204 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 205 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 211 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 212 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 218 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 219 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 225 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 226 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 232 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 233 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() [all …]
|
D | 4x16c16-minmax-neon-mlal-padal.c | 402 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 403 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 409 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 410 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 416 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 417 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 423 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 424 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 430 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 431 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() [all …]
|
D | 3x16c16-minmax-neon-mlal-padal.c | 319 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 320 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 326 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 327 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 333 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 334 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 340 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 341 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 347 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 348 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() [all …]
|
D | 3x8c8-minmax-neon-mull-padal.c | 173 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 174 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 180 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 181 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 187 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 188 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 194 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 195 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 201 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 202 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() [all …]
|
D | 2x16c16-minmax-neon-mlal-padal.c | 236 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 237 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 243 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 244 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 250 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 251 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 257 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 258 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 264 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 265 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() [all …]
|
D | 4x8c16-minmax-neon-mlal-padal.c | 242 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 243 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 249 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 250 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 256 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 257 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 263 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 264 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 270 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 271 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() [all …]
|
D | 2x8c8-minmax-neon-mull-padal.c | 136 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 137 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 143 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 144 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 150 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 151 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 157 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() 158 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c | 197 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 198 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 204 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 205 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 211 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 212 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 218 const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 219 const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 225 const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 226 const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() [all …]
|
D | 3x16c8-minmax-neon-mlal-padal.c | 460 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 461 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 467 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 468 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 474 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 475 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 481 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 482 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 488 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 489 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() [all …]
|
D | 4x16c8-minmax-neon-mlal-padal.c | 577 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 578 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 584 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 585 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 591 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 592 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 598 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 599 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 605 const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 606 const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() [all …]
|
D | 1x16c8-minmax-neon-mull-padal.c | 137 const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 138 const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 144 const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 145 const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 151 const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 152 const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 158 const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 159 const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|