/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x8c8-minmax-neon-mlal-padal.c | 78 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 169 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 238 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 262 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 307 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mull-padal.c | 78 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local 137 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 161 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() 206 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c | 78 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local 159 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 185 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 230 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c | 84 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() local 165 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 192 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() 243 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c | 84 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local 205 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 292 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 319 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 370 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c | 84 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local 194 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 224 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 275 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 94 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 161 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 249 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 332 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 94 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 201 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 350 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 438 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 521 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 94 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 191 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 297 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 380 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 100 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 197 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 304 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 399 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 100 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 245 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 436 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 543 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 638 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 100 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 234 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 368 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 463 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x8c8-minmax-neon-mull-padal.c | 77 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local 152 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 179 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() 224 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c | 77 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 184 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 253 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 280 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 325 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c | 77 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local 174 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 203 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 248 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c | 81 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() local 182 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 212 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() 263 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c | 81 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local 222 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 309 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 339 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 390 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c | 81 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local 211 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 244 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 295 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 93 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 176 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 267 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 350 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 93 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 216 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 365 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 456 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 539 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 93 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 206 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 315 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 398 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 97 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 214 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 324 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 419 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 97 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 262 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 453 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 563 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 658 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 97 int32x4_t vacc2x6 = vacc0x6; in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 251 vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 388 const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 483 const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|