/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x8c8-minmax-neon-mlal-padal.c | 79 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 179 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 245 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 262 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 308 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mull-padal.c | 79 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local 144 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 161 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 207 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c | 79 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local 168 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 185 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 231 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c | 85 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() local 174 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 192 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 244 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c | 85 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local 218 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 301 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 319 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 371 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c | 85 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local 206 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 224 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 276 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 95 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 168 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 249 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 333 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 95 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 211 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 357 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 438 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 522 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 95 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 200 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 297 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 381 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 101 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 206 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 304 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 400 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 101 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 258 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 445 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 543 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 639 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 101 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 246 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 368 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 464 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x8c8-minmax-neon-mull-padal.c | 78 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local 159 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 179 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 225 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c | 78 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 194 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 260 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 280 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 326 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c | 78 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local 183 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 203 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 249 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c | 82 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() local 191 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 212 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 264 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c | 82 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local 235 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 318 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 339 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 391 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c | 82 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local 223 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 244 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 296 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 94 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 183 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 267 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 351 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 94 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 226 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 372 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 456 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 540 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 94 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 215 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 315 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 399 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 98 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 223 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 324 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 420 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 98 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 275 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 462 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 563 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 659 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 98 int32x4_t vacc2x7 = vacc0x7; in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 263 vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 388 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 484 const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|