All references to vacc2x23, grouped by file; each file's matches fall in a single ukernel function (in-file line number, then source line).

/external/XNNPACK/src/qs8-gemm/gen/

3x8c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  106  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  152  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  201  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  228  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  249  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  265  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  104  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  132  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  181  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  193  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  209  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  101  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  129  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  178  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  190  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  206  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  101  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  129  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  178  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  190  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  206  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  114  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  161  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  215  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  243  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  292  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  304  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  320  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  120  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  167  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  224  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  252  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  301  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  313  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  329  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  114  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  161  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  215  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  243  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  292  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  304  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  320  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull)
   65  int32x4_t vacc2x23 = vacc0x23;  (local definition)
   95  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  122  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  144  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  160  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  116  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  153  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  208  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  229  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  247  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  116  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  153  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  208  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  229  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  247  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  120  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  157  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  212  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  233  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  251  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  123  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  183  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  243  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  279  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  305  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  323  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  109  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  145  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  172  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  190  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4-minmax-rndnu-neon-mlal-ld2r.c  (in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r)
   71  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  133  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  194  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  260  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  297  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  352  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  373  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  391  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));
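
Every match above is one accumulation idiom. Each int32x4_t accumulator such as vacc2x23 holds partial dot products for one output row (row 2) and two output columns (columns 2 and 3); the "local definition" copies row 0's bias-initialized accumulator, since every output row starts from the same per-channel bias. The vprod2x23c0/vprod2x23c1 values are int16 products of quantized int8 inputs, which vpadalq_s16 pairwise-widens and adds into the int32 lanes. Below is a minimal standalone sketch of that step, not XNNPACK's actual code: qs8_c4_step is a hypothetical helper name, and the real kernels interleave loads and pointer bookkeeping around it.

    #include <arm_neon.h>

    /* Hypothetical helper (not an XNNPACK function): one c4 accumulation
       step.  va2 holds 8 int8 activations of output row 2; vb23 holds 8
       int8 weights covering output columns 2 and 3 (4 per column, matching
       the "c4" kernel layout). */
    static inline int32x4_t qs8_c4_step(int32x4_t vacc2x23,
                                        int8x8_t va2, int8x8_t vb23) {
      const int16x8_t vprod2x23 = vmull_s8(va2, vb23);  /* 8 int16 products */
      return vpadalq_s16(vacc2x23, vprod2x23);  /* fold pairs into 4 int32 lanes */
    }
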
/external/XNNPACK/src/qs8-igemm/gen/

3x8c4-minmax-rndnu-neon-mull-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  119  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  147  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  196  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  210  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  226  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mull-dup.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  116  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  144  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  193  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  207  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  223  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mull-ld2r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  116  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  144  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  193  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  207  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  223  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-dup.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  129  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  176  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  230  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  258  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  307  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  321  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  337  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  135  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  182  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  239  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  267  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  316  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  330  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  346  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4-minmax-rndnu-neon-mlal-ld2r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  129  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  176  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  230  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  258  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  307  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  321  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  337  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  121  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  167  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  216  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  243  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  267  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  283  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

3x8c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull)
   64  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  110  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  137  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  162  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  178  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4s2-minmax-rndnu-neon-mlal.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal)
   68  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  140  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  200  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  260  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  296  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  325  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  343  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4s2-minmax-rndnu-neon-mull.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull)
   68  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  126  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  162  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  192  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  210  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));

4x8c4-minmax-rndnu-neon-mlal-ld1r.c  (in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r)
   68  int32x4_t vacc2x23 = vacc0x23;  (local definition)
  158  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  219  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  289  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  326  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c1);
  381  vacc2x23 = vpadalq_s16(vacc2x23, vprod2x23c0);
  404  int32x4_t vacc2x0123 = vpaddq_s32(vacc2x01, vacc2x23);
  422  const int32x2_t vsum2x23 = vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));
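
The vpaddq_s32 and vpadd_s32 matches are the tail of each kernel, where the two partial sums still occupying each output column are folded into the final per-column totals. vpaddq_s32 is the AArch64 single-instruction form; the vget_low_s32/vget_high_s32 plus vpadd_s32 sequence is the same reduction built from 64-bit pairwise adds for targets without it. A sketch of both paths follows, under the assumption (consistent with the listings) that adjacent lanes of vacc2x01/vacc2x23 belong to the same output column; reduce_row2 is a hypothetical name, not an XNNPACK symbol.

    #include <arm_neon.h>

    #if defined(__aarch64__)
    /* AArch64: one pairwise add across both quad accumulators yields
       {col0, col1, col2, col3} of output row 2. */
    static inline int32x4_t reduce_row2(int32x4_t vacc2x01, int32x4_t vacc2x23) {
      return vpaddq_s32(vacc2x01, vacc2x23);
    }
    #else
    /* AArch32 NEON has no quad-width pairwise add: split each accumulator
       into halves, reduce with the 64-bit vpadd_s32, then recombine. */
    static inline int32x4_t reduce_row2(int32x4_t vacc2x01, int32x4_t vacc2x23) {
      const int32x2_t vsum2x01 =
          vpadd_s32(vget_low_s32(vacc2x01), vget_high_s32(vacc2x01));
      const int32x2_t vsum2x23 =
          vpadd_s32(vget_low_s32(vacc2x23), vget_high_s32(vacc2x23));
      return vcombine_s32(vsum2x01, vsum2x23);
    }
    #endif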