/external/XNNPACK/src/qc8-gemm/gen/ |
D | 2x8c4-minmax-fp32-neonv8-mlal-dup.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() local 109 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 142 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 178 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 197 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 228 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 235 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 247 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld2r.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() local 108 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 141 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 177 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 227 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 234 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 246 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld1r.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() local 112 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 145 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 183 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 202 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 233 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 240 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 252 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld2r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() local 109 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 142 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 178 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 197 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 228 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 235 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 247 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld1r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() local 113 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 146 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 184 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 203 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 234 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 241 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 253 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r()
|
D | 2x8c4-minmax-fp32-neon-mlal-dup.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() local 108 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 141 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 177 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 227 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 234 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 246 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c4-minmax-rndnu-neon-mlal-ld1r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() local 125 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 158 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 215 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 246 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 255 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r() 267 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r()
|
D | 2x8c4-minmax-rndnu-neon-mlal-ld2r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r()
|
D | 2x8c4-minmax-rndnu-neon-mlal-dup.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld2r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-dup.c | 59 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() local 122 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 155 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 191 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 210 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 241 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 250 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 262 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld1r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() local 125 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 158 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 215 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 246 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 255 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 267 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r()
|
D | 2x8c4-minmax-fp32-neon-mlal-dup.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup()
|
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 2x8c4-minmax-fp32-neon-mlal-ld1r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() local 125 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 158 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 215 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 246 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 255 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r() 267 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-dup.c | 59 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() local 122 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 155 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 191 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 210 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 241 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 250 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 262 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld2r.c | 59 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() local 122 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 155 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 191 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 210 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 241 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 250 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 262 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld1r.c | 59 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() local 126 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 159 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 197 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 216 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 247 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 256 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 268 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r()
|
D | 2x8c4-minmax-fp32-neon-mlal-dup.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld2r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() local 121 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 154 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 190 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 209 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 240 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 249 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 261 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8c4-minmax-fp32-neonv8-mlal-dup.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() local 109 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 142 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 178 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 197 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 228 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 235 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup() 247 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup()
|
D | 2x8c4-minmax-rndnu-neon-mlal-dup.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() local 108 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 141 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 177 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 227 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 234 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup() 246 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup()
|
D | 2x8c4-minmax-fp32-neon-mlal-dup.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() local 108 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 141 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 177 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 227 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 234 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup() 246 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld2r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() local 109 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 142 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 178 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 197 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 228 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 235 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r() 247 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neon-mlal-ld2r.c | 57 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() local 108 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 141 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 177 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 196 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 227 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 234 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r() 246 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld2r()
|
D | 2x8c4-minmax-fp32-neonv8-mlal-ld1r.c | 58 int32x4_t vacc1x67 = vacc0x67; in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() local 113 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 146 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 184 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 203 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c1); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 234 vacc1x67 = vpadalq_s16(vacc1x67, vprod1x67c0); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 241 int32x4_t vacc1x4567 = vpaddq_s32(vacc1x45, vacc1x67); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r() 253 const int32x2_t vsum1x67 = vpadd_s32(vget_low_s32(vacc1x67), vget_high_s32(vacc1x67)); in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r()
|