/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x16-minmax-neon-mlal-lane.c | 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local 91 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 105 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 119 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 133 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 148 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 162 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 176 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 190 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 212 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
|
D | 2x16-minmax-neon-mull-addw-dup.c | 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() local 91 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 107 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 123 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 139 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 155 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 171 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 187 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 203 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 225 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() [all …]
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 58 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 156 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 158 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 159 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 188 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 211 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 234 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 249 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 260 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() [all …]
|
D | 3x16-minmax-neon-mull-addw-dup.c | 61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 107 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 129 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 151 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 173 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 195 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 217 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 239 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 261 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 287 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local 107 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 125 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 143 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 161 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 180 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 198 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 216 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 234 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 260 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 123 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 151 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 179 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 207 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 235 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 263 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 291 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 319 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 349 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local 123 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 145 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 167 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 189 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 212 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 234 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 256 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 278 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 308 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 58 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local 124 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 152 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 180 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 208 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 294 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 295 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 296 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 297 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 326 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c4-minmax-neondot.c | 66 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 124 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 140 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 174 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 195 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 214 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 231 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 246 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 260 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local 169 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 170 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 171 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 172 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 234 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 265 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 296 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 319 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 334 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 149 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 189 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 229 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 269 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 357 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 358 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 359 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 360 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 422 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 66 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 182 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 183 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 184 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 185 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 280 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 319 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 358 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 389 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 408 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|
D | 6x16c4-minmax-neondot.c | 74 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local 150 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 174 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 218 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 247 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 274 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 299 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 322 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 342 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x16-minmax-neon-mlal-lane.c | 56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() local 78 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 92 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 106 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 120 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 135 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 149 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 163 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 177 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 199 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
|
D | 2x16-minmax-neon-mull-addw-dup.c | 56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() local 78 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 94 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 110 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 126 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 142 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 158 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 174 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 190 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 212 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() [all …]
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 143 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 144 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 145 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 146 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 175 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 198 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 221 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 233 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 244 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() [all …]
|
D | 3x16-minmax-neon-mull-addw-dup.c | 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 92 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 114 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 136 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 158 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 180 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 202 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 224 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 272 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local 92 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 110 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 128 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 146 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 165 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 183 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 201 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 219 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 245 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local 111 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 139 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 167 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 195 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 281 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 282 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 283 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 284 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 313 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local 106 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 128 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 150 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 172 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 195 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 217 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 239 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 261 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 291 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 106 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 134 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 162 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 190 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 218 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 274 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 302 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 332 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | 63 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local 154 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 155 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 156 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 219 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 250 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 281 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 301 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 316 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() [all …]
|
D | 4x16c4-minmax-neondot.c | 71 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 109 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 125 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 159 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 181 const int32x4_t vproduct1xCDEF = vqrdmulhq_n_s32(vacc1xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 198 vacc1xCDEF = vsraq_n_s32(vproduct1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 215 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 230 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 244 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 63 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 134 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 174 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 214 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 254 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 342 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 343 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 344 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 345 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 407 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 69 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 165 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 166 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 167 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 168 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 263 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 302 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 341 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 369 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 388 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|