
Searched refs:vacc2x89AB (Results 1 – 25 of 41) sorted by relevance
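A note on the identifier (inferred from the generated sources below, not from XNNPACK documentation): vacc<M>x<C...> names the int32 accumulator vector for output row M and the hex-numbered output columns C, so vacc2x89AB holds row 2, columns 0x8..0xB of a 16-column tile. The first match in almost every file is its declaration, which copies row 0's bias-initialized accumulator. A minimal sketch of that setup:

#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: every output row starts from the same per-column biases,
 * so rows 1..M-1 are initialized by copying row 0's vectors. */
void init_accumulators(const int32_t bias[16]) {
  const int32x4_t vacc0x89AB = vld1q_s32(bias + 8);  /* row 0, columns 8..11 */
  int32x4_t vacc1x89AB = vacc0x89AB;                 /* row 1 starts identical */
  int32x4_t vacc2x89AB = vacc0x89AB;                 /* row 2: the searched declaration */
  (void) vacc1x89AB;
  (void) vacc2x89AB;
}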


/external/XNNPACK/src/qs8-igemm/gen/
3x16-minmax-neon-mull-addw-dup.c
64 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
109 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
131 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
153 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
175 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
197 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
219 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
241 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
263 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
292 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
3x16-minmax-neon-mlal-lane.c
64 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local
108 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
126 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
144 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
162 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
181 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
199 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
217 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
235 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
263 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
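Every mlal-lane match above follows one step: vmlal_lane_s16 widens four int16 weights times one int16 activation lane and accumulates into the int32 vector. A reduced sketch of a single step (inputs simplified; the kernels widen from int8 just before this):

#include <arm_neon.h>

/* One mlal-lane step: widen int8 inputs to int16, then multiply-accumulate
 * the weights for columns 8..11 (low half of columns 8..F) by lane 0 of
 * row 2's activations into the int32 accumulator. */
int32x4_t mlal_lane_step(int32x4_t vacc2x89AB, int8x8_t va2, int8x8_t vb89ABCDEFc0) {
  const int16x8_t vxa2 = vmovl_s8(va2);
  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
  return vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
}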
4x16-minmax-neon-mull-addw-dup.c
68 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
125 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
153 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
181 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
209 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
237 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
265 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
293 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
321 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
354 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
4x16-minmax-neon-mlal-lane.c
68 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
124 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
146 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
168 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
190 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
213 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
235 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
257 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
279 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
311 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
4x16c4-minmax-neondot.c
69 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local
127 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
143 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
177 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
198 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
217 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
234 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
248 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
262 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
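The neondot matches use the ARMv8.2 dot-product extension: each vdotq_lane_s32 multiplies four consecutive int8 activations (one 32-bit lane of va2x01234567) against four int8 weights per output column and adds the four-element dot products straight into the int32 accumulator. A sketch of one step (compile with -march=armv8.2-a+dotprod):

#include <arm_neon.h>

/* One dotprod step: 4 k-values of weights for each of columns 8..11
 * (vb0123x89AB) dotted with k-values 0..3 of row 2 (lane 0 of va2x01234567). */
int32x4_t dot_step(int32x4_t vacc2x89AB, int8x16_t vb0123x89AB, int8x8_t va2x01234567) {
  return vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
}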
3x16c2-minmax-neon-mull-padal-dup.c
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
193 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
194 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
195 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
196 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
240 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
271 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
302 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
322 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
337 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
3x16c2-minmax-neon-mlal-padal-dup.c
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
140 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
180 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
220 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
260 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
381 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
382 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
383 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
384 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
428 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
4x16c2-minmax-neon-mull-padal-dup.c
69 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
206 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
207 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
208 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
209 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
286 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
325 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
364 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
392 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
411 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
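The mull-padal matches pair two instructions: vmull_s8 produces eight int16 products, and vpadalq_s16 pairwise-adds adjacent products into the four int32 lanes, which is what lets the c2 layouts (two k-values interleaved per column) sum correctly. A sketch of one step (operand setup simplified; the real kernels dup a 16-bit activation pair first):

#include <arm_neon.h>

/* One mull+padal step for columns 8..11: eight int16 products collapse
 * pairwise into the four int32 accumulator lanes. */
int32x4_t mull_padal_step(int32x4_t vacc2x89AB, int8x8_t va2c0, int8x8_t vb89ABc0) {
  const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, va2c0);
  return vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
}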
6x16c4-minmax-neondot.c
77 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local
153 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
177 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
221 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
250 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
277 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
302 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
324 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
344 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
4x16c2-minmax-neon-mlal-padal-dup.c
69 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
162 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
214 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
266 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
318 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
444 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
445 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
446 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
447 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
524 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
8x16c4-minmax-neondot.c
85 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local
179 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
211 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
265 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
302 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
337 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
370 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
400 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
426 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
3x16c8-minmax-avx512skx.c
74 __m512i vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx() local
117 vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx()
135 …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vac… in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx()
3x16c8-minmax-neon-mull-padal.c
282 int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
361 int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
382 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
397 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
410 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
420 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
430 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
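Across these kernels the post-accumulation matches form one requantization tail: a saturating rounding doubling multiply (vqrdmulhq_s32), a correction so the following shift rounds to nearest (vsraq_n_s32 over vbicq_s32), a rounding right shift (vrshlq_s32 with a negative shift count), then a saturating narrow plus the output zero point. A condensed sketch for one row's upper eight columns (parameter plumbing simplified to plain arguments):

#include <arm_neon.h>

/* Requantization tail as it appears in the matches above. */
int16x8_t requantize_row2_hi(int32x4_t vacc2x89AB, int32x4_t vacc2xCDEF,
                             int32x4_t vmultiplier, int32x4_t vright_shift,
                             int16x8_t voutput_zero_point) {
  const int32x4_t vzero_shift_mask =
      vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
  vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
  /* Add the sign of the discarded bits so the arithmetic shift rounds to nearest. */
  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
  vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);  /* shift count is negative */
  vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
  const int16x8_t vacc2x89ABCDEF =
      vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF));
  return vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point);
}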
/external/XNNPACK/src/qs8-gemm/gen/
3x16-minmax-neon-mull-addw-dup.c
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
94 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
116 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
138 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
160 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
182 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
204 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
226 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
248 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
277 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
3x16-minmax-neon-mlal-lane.c
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local
93 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
111 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
129 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
147 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
166 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
184 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
202 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
220 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
248 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
4x16-minmax-neon-mlal-lane.c
71 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
107 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
129 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
151 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
173 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
196 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
218 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
240 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
262 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
294 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
4x16-minmax-neon-mull-addw-dup.c
71 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
108 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
136 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
164 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
192 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
220 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
248 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
276 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
304 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
337 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
3x16c2-minmax-neon-mull-padal-dup.c
66 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local
178 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
179 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
180 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
181 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
225 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
256 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
287 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
304 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
319 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup()
[all …]
4x16c4-minmax-neondot.c
74 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local
112 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
128 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
162 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
184 const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
201 vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
218 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
232 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
246 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
3x16c2-minmax-neon-mlal-padal-dup.c
66 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local
125 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
165 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
205 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
245 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
366 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
367 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
368 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
369 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
413 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()
[all …]
4x16c2-minmax-neon-mull-padal-dup.c
72 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
189 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
190 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
191 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
192 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
269 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
308 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
347 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
372 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
391 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
4x16c2-minmax-neon-mlal-padal-dup.c
72 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
145 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
197 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
249 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
301 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
427 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
428 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
429 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
430 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
507 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
6x16c4-minmax-neondot.c
86 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local
134 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
158 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
202 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
232 const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
257 vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
282 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
304 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
324 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
8x16c4-minmax-neondot.c
98 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local
156 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
188 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
242 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
280 const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
313 vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
346 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
376 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
402 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
3x16c8-minmax-avx512skx.c
77 __m512i vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx() local
104 vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx()
120 …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vac… in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx()
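The avx512skx matches fuse the whole accumulation into one step: _mm512_madd_epi16 multiplies adjacent int16 pairs (the int8 inputs are widened earlier in the kernel) and sums each pair into one of sixteen int32 lanes, which _mm512_add_epi32 folds into the accumulator; the truncated unpacklo/unpackhi lines afterwards interleave-add two accumulators to begin the horizontal reduction. A sketch of the accumulation step (requires AVX512BW):

#include <immintrin.h>

/* One AVX512 step: 32 int16 products reduced pairwise to 16 int32 sums,
 * then added into the running accumulator. */
__m512i madd_step(__m512i vacc2x89AB, __m512i va2, __m512i vb89AB) {
  return _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
}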
