/external/XNNPACK/src/qs8-gemm/gen/
D | 3x8c8-minmax-neon-mlal-padal.c
    76  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
   149  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   224  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   261  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   305  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()

D | 3x8c8-minmax-neon-mull-padal.c
    76  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local
   123  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
   160  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
   204  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()

D | 3x8c16-minmax-neon-mlal-padal.c
    76  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
   141  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
   184  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
   228  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()

D | 4x8c8-minmax-neon-mull-padal.c
    82  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() local
   147  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
   191  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
   241  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()

D | 4x8c8-minmax-neon-mlal-padal.c
    82  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
   179  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   274  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   318  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   368  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()

D | 4x8c16-minmax-neon-mlal-padal.c
    82  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
   170  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
   223  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
   273  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()

D | 3x16c8-minmax-neon-mull-padal.c
    92  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local
   147  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   248  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   330  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()

D | 3x16c8-minmax-neon-mlal-padal.c
    92  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
   181  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   336  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   437  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   519  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()

D | 3x16c16-minmax-neon-mlal-padal.c
    92  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
   173  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   296  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   378  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()

D | 4x16c8-minmax-neon-mull-padal.c
    98  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
   179  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   303  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   397  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()

D | 4x16c8-minmax-neon-mlal-padal.c
    98  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
   219  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   418  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   542  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   636  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()

D | 4x16c16-minmax-neon-mlal-padal.c
    98  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
   210  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   367  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   461  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()

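Every file above shares the same accumulation idiom: signed 8-bit inputs are widened to 16-bit products with vmull_s8 (the mlal variants fold a second product in with vmlal_s8 first, which is why those files report two vpadalq_s16 call sites), and each 16-bit product vector is then pairwise-added into four 32-bit accumulator lanes with vpadalq_s16, exactly the `vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4)` lines in the cross-references. Below is a minimal self-contained sketch of that idiom, not the XNNPACK kernel code itself; the array names, values, and loop bounds are invented for illustration.

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Invented inputs: two groups of 8 signed bytes per operand. */
  const int8_t a_data[16] = {1, 2, 3, 4, 5, 6, 7, 8, -1, -2, -3, -4, -5, -6, -7, -8};
  const int8_t b_data[16] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};

  int32x4_t vacc = vdupq_n_s32(0);  /* four 32-bit accumulator lanes */

  for (int k = 0; k < 16; k += 8) {
    const int8x8_t va = vld1_s8(a_data + k);
    const int8x8_t vb = vld1_s8(b_data + k);
    /* Widening multiply: 8 int8 pairs -> 8 int16 products. */
    const int16x8_t vprod = vmull_s8(va, vb);
    /* Pairwise add-and-accumulate-long: 8 int16 products -> 4 int32 lanes. */
    vacc = vpadalq_s16(vacc, vprod);
  }

  int32_t lanes[4];
  vst1q_s32(lanes, vacc);
  /* Prints: -6 -14 -22 -30 */
  printf("%d %d %d %d\n", (int) lanes[0], (int) lanes[1], (int) lanes[2], (int) lanes[3]);
  return 0;
}

Because vpadalq_s16 folds pairs, each int32 lane holds the sum of products for two adjacent channels; that is why the kernels keep separate vacc2x4/vacc2x5-style accumulators per output column and reduce them only after the K loop.
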
/external/XNNPACK/src/qs8-igemm/gen/
D | 3x8c8-minmax-neon-mull-padal.c
    75  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local
   138  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
   178  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
   222  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()

D | 3x8c8-minmax-neon-mlal-padal.c
    75  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
   164  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   239  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   279  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
   323  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()

D | 3x8c16-minmax-neon-mlal-padal.c
    75  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
   156  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
   202  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
   246  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()

D | 4x8c8-minmax-neon-mull-padal.c
    79  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() local
   164  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
   211  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
   261  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()

D | 4x8c8-minmax-neon-mlal-padal.c
    79  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
   196  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   291  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   338  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
   388  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()

D | 4x8c16-minmax-neon-mlal-padal.c
    79  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
   187  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
   243  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
   293  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()

D | 3x16c8-minmax-neon-mull-padal.c
    91  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
   162  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
   266  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
   348  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()

D | 3x16c8-minmax-neon-mlal-padal.c
    91  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
   196  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   351  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   455  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   537  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()

D | 3x16c16-minmax-neon-mlal-padal.c
    91  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
   188  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   314  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   396  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()

D | 4x16c8-minmax-neon-mull-padal.c
    95  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
   196  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   323  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   417  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()

D | 4x16c8-minmax-neon-mlal-padal.c
    95  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
   236  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   435  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   562  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   656  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()

D | 4x16c16-minmax-neon-mlal-padal.c
    95  int32x4_t vacc2x4 = vacc0x4;  in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
   227  vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   387  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);  in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   481  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));  in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()

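The tail of each listing is the matching horizontal reduction: once the K loop finishes, each per-column int32x4_t accumulator must collapse to a single 32-bit sum before requantization. On AArch64 the kernels use pairwise vpaddq_s32 (the vsum2x45 lines); on ARMv7, where vpaddq_s32 is unavailable, they fold each vector's halves with vget_low_s32/vget_high_s32 and vadd_s32 (the vpsum2x4 lines). Below is a minimal sketch of both paths, not the XNNPACK code itself; the accumulator names mirror the listing (vacc2x4 is row 2, column 4), the lane values are invented, and the final vpadd_s32 on the ARMv7 path is one reasonable way to finish the fold that the listing only begins.

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Invented partial sums for output columns 4 and 5 of row 2. */
  const int32_t c4[4] = {1, 2, 3, 4};  /* column 4 total: 10 */
  const int32_t c5[4] = {5, 6, 7, 8};  /* column 5 total: 26 */
  const int32x4_t vacc2x4 = vld1q_s32(c4);
  const int32x4_t vacc2x5 = vld1q_s32(c5);

#if defined(__aarch64__)
  /* AArch64: one pairwise add interleaves both columns' partial sums, */
  /* and a second pairwise add finishes both reductions at once. */
  const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
  const int32x2_t vout = vpadd_s32(vget_low_s32(vsum2x45), vget_high_s32(vsum2x45));
#else
  /* ARMv7: no vpaddq_s32, so fold each vector's halves first... */
  const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
  const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
  /* ...then one pairwise add yields both column sums. */
  const int32x2_t vout = vpadd_s32(vpsum2x4, vpsum2x5);
#endif

  /* Prints: col4=10 col5=26 on either path. */
  printf("col4=%d col5=%d\n", (int) vget_lane_s32(vout, 0), (int) vget_lane_s32(vout, 1));
  return 0;
}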