/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8c8-minmax-neon-mull-padal.c |
     63  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() local
    103  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal()
    125  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal()
    154  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal()
|
D | 2x8c8-minmax-neon-mlal-padal.c |
     63  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    125  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    178  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    200  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    229  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
     63  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
    117  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    141  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    170  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
     69  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    158  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    230  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    257  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    292  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mull-padal.c |
     69  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal() local
    129  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
    156  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
    191  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
     69  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
    149  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    180  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    215  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c |
     75  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal() local
    155  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
    187  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
    228  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c |
     71  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local
    119  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
    185  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
    236  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
     75  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    191  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    282  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    314  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    355  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
     71  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    149  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    258  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    324  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    375  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
     71  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    141  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    217  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    268  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
     75  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
    181  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    219  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    260  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
     64  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    138  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    191  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    216  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    245  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x8c8-minmax-neon-mull-padal.c |
     64  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal() local
    116  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal()
    141  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal()
    170  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
     64  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
    130  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    157  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    186  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mull-padal.c |
     68  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local
    144  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
    174  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
    209  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
     68  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    173  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    245  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    275  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    310  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
     68  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
    164  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    198  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    233  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mull-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal() local
    172  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
    207  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
    248  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local
    132  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
    201  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
    252  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    162  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    271  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    340  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    391  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    208  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    299  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    334  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    375  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
    198  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    239  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    280  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
     72  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    154  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    233  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    284  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c |
     76  int32x4_t vacc1x5 = vacc0x5;  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
    168  vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
    258  const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
    321  const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
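Every kernel in this listing uses vacc1x5 the same way. By these kernels' naming convention it is the int32 accumulator for row 1 of the output tile and output channel 5: it is initialized by copying row 0's bias-loaded accumulator (vacc1x5 = vacc0x5), int16 products from the MULL/MLAL stage are folded in pairwise with vpadalq_s16, and its four int32 lanes are finally reduced horizontally, with vpaddq_s32 where that AArch64-only intrinsic is available, or with vget_low_s32/vget_high_s32 plus vadd_s32 otherwise. Below is a minimal sketch of that widen-accumulate-reduce sequence for a single accumulator; the function and variable names are illustrative and not taken from XNNPACK.

#include <arm_neon.h>
#include <stdint.h>

// Hypothetical helper: dot product of two 8-element int8 vectors, widening
// through int16 into int32, mirroring the pattern in the kernels above.
static int32_t dot8_s8(const int8_t a[8], const int8_t b[8]) {
  int32x4_t vacc = vdupq_n_s32(0);            // 4-lane int32 accumulator (one vaccMxN)
  const int8x8_t va = vld1_s8(a);
  const int8x8_t vb = vld1_s8(b);
  const int16x8_t vprod = vmull_s8(va, vb);   // 8 int8*int8 products -> 8 x int16
  vacc = vpadalq_s16(vacc, vprod);            // pairwise add-accumulate: 8 x int16 -> 4 x int32
  // Horizontal reduction, as in the listed kernels' non-vpaddq_s32 path:
  // fold the high 2 lanes onto the low 2, then pairwise-add the remaining pair.
  const int32x2_t vsum = vadd_s32(vget_low_s32(vacc), vget_high_s32(vacc));
  return vget_lane_s32(vpadd_s32(vsum, vsum), 0);
}

In the full kernels this reduction runs across accumulator pairs (e.g. vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5)) so that one instruction combines two output channels at a time rather than reducing each accumulator in isolation.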