Home
last modified time | relevance | path

Searched refs: vacc2xCDEF (Results 1 – 25 of 41) sorted by relevance

12

/external/XNNPACK/src/qs8-igemm/gen/
D3x16-minmax-neon-mull-addw-dup.c65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
110 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
132 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
154 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
176 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
198 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
220 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
242 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
264 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
293 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
D3x16-minmax-neon-mlal-lane.c65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local
109 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
127 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
145 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
163 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
182 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
200 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
218 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
236 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
264 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
126 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
154 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
182 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
210 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
238 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
266 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
294 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
322 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
355 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16-minmax-neon-mlal-lane.c69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
125 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
147 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
169 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
191 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
214 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
236 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
258 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
280 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
312 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16c4-minmax-neondot.c70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local
128 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
144 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
178 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
199 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
218 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
235 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
248 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
262 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
D3x16c2-minmax-neon-mull-padal-dup.c66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
201 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
202 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
203 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
204 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
242 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
273 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
304 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
323 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
338 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
D3x16c2-minmax-neon-mlal-padal-dup.c66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
150 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
190 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
230 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
270 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
389 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
390 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
391 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
392 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
430 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
D4x16c2-minmax-neon-mull-padal-dup.c70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
214 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
215 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
216 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
217 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
288 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
327 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
366 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
393 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
412 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D6x16c4-minmax-neondot.c78 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local
154 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
178 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
222 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
251 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
278 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
303 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
324 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
344 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
D4x16c2-minmax-neon-mlal-padal-dup.c70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
175 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
227 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
279 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
331 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
452 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
453 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
454 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
455 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
526 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D8x16c4-minmax-neondot.c86 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local
180 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
212 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
266 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
303 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
338 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
371 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
400 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
426 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
D3x16c8-minmax-avx512skx.c75 __m512i vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx() local
122 vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx()
135 …512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vac… in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx()
D3x16c8-minmax-neon-mull-padal.c283 int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
368 int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF ); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
383 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
398 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
411 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
420 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
430 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
/external/XNNPACK/src/qs8-gemm/gen/
D3x16-minmax-neon-mull-addw-dup.c66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
95 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
117 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
139 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
161 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
183 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
205 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
227 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
278 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
D3x16-minmax-neon-mlal-lane.c66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local
94 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
112 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
130 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
148 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
167 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
185 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
203 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
221 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
249 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mlal-lane.c72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
108 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
130 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
152 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
174 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
197 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
219 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
241 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
263 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
295 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
109 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
137 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
165 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
193 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
221 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
277 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
305 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
338 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D3x16c2-minmax-neon-mull-padal-dup.c67 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
186 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
187 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
188 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
189 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
227 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
258 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
289 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
305 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
320 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
D4x16c4-minmax-neondot.c75 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local
113 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
129 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
163 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
185 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
202 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
219 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
232 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
246 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
D3x16c2-minmax-neon-mlal-padal-dup.c67 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
135 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
175 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
215 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
255 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
374 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
375 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
376 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
377 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
415 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
D4x16c2-minmax-neon-mull-padal-dup.c73 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
197 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
198 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
199 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
200 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
271 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
310 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
349 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
373 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
392 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D4x16c2-minmax-neon-mlal-padal-dup.c73 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
158 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
210 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
262 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
314 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
435 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
436 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
437 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
438 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
509 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D6x16c4-minmax-neondot.c87 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local
135 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
159 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
203 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
233 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
258 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
283 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
304 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
324 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
D8x16c4-minmax-neondot.c99 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local
157 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
189 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
243 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
281 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
314 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
347 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
376 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
402 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
D3x16c8-minmax-avx512skx.c78 __m512i vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx() local
109 vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx()
120 …512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vac… in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx()

12