Home
last modified time | relevance | path

Searched refs:vacc1xCDEF (Results 1 – 25 of 60) sorted by relevance

123

/external/XNNPACK/src/qs8-igemm/gen/
D2x16-minmax-neon-mlal-lane.c57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local
91 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
105 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
119 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
133 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
148 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
162 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
176 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
190 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
212 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
[all …]
D2x16-minmax-neon-mull-addw-dup.c57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() local
91 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
107 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
123 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
139 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
155 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
171 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
187 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
203 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
225 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
[all …]
D2x16c2-minmax-neon-mull-padal-dup.c58 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local
156 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
158 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
159 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
188 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
211 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
234 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
249 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
260 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
[all …]
D3x16-minmax-neon-mull-addw-dup.c61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
107 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
129 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
151 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
173 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
195 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
217 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
239 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
261 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
287 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
D3x16-minmax-neon-mlal-lane.c61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local
107 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
125 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
143 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
161 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
180 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
198 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
216 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
234 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
260 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
123 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
151 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
179 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
207 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
235 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
263 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
291 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
319 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
349 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16-minmax-neon-mlal-lane.c65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
123 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
145 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
167 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
189 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
212 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
234 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
256 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
278 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
308 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D2x16c2-minmax-neon-mlal-padal-dup.c58 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local
124 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
152 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
180 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
208 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
294 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
295 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
296 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
297 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
326 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
[all …]
D4x16c4-minmax-neondot.c66 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local
124 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
140 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
174 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
195 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
214 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
231 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
246 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
260 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
D3x16c2-minmax-neon-mull-padal-dup.c62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
169 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
170 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
171 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
172 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
234 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
265 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
296 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
319 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
334 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
D3x16c2-minmax-neon-mlal-padal-dup.c62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
149 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
189 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
229 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
269 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
357 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
358 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
359 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
360 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
422 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
D4x16c2-minmax-neon-mull-padal-dup.c66 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
182 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
183 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
184 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
185 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
280 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
319 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
358 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
389 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
408 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D6x16c4-minmax-neondot.c74 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local
150 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
174 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
218 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
247 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
274 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
299 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
322 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
342 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
/external/XNNPACK/src/qs8-gemm/gen/
D2x16-minmax-neon-mlal-lane.c56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() local
78 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
92 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
106 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
120 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
135 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
149 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
163 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
177 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
199 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
[all …]
D2x16-minmax-neon-mull-addw-dup.c56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() local
78 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
94 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
110 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
126 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
142 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
158 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
174 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
190 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
212 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
[all …]
D2x16c2-minmax-neon-mull-padal-dup.c57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local
143 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
144 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
145 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
146 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
175 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
198 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
221 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
233 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
244 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
[all …]
D3x16-minmax-neon-mull-addw-dup.c62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
92 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
114 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
136 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
158 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
180 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
202 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
224 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
272 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
D3x16-minmax-neon-mlal-lane.c62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local
92 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
110 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
128 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
146 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
165 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
183 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
201 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
219 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
245 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
D2x16c2-minmax-neon-mlal-padal-dup.c57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local
111 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
139 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
167 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
195 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
281 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
282 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
283 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
284 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
313 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
[all …]
D4x16-minmax-neon-mlal-lane.c68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
106 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
128 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
150 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
172 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
195 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
217 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
239 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
261 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
291 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
106 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
134 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
162 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
190 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
218 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
274 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
302 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
332 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D3x16c2-minmax-neon-mull-padal-dup.c63 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
154 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
155 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
156 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
219 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
250 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
281 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
301 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
316 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
D4x16c4-minmax-neondot.c71 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local
109 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
125 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
159 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
181 const int32x4_t vproduct1xCDEF = vqrdmulhq_n_s32(vacc1xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
198 vacc1xCDEF = vsraq_n_s32(vproduct1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
215 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
230 …const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
244 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
D3x16c2-minmax-neon-mlal-padal-dup.c63 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
134 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
174 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
214 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
254 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
342 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
343 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
344 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
345 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
407 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
D4x16c2-minmax-neon-mull-padal-dup.c69 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
165 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
166 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
167 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
168 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
263 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
302 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
341 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
369 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
388 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]

123