Home
last modified time | relevance | path

Searched refs:vacc3xCDEF (Results 1 – 25 of 25) sorted by relevance

/external/XNNPACK/src/qs8-gemm/gen/
D4x16-minmax-neon-mlal-lane.c76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
199 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
221 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
243 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
265 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
299 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-neon-mull-addw-dup.c76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
112 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
140 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
168 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
196 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
224 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
252 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
280 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
308 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
344 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16c4-minmax-neondot.c79 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local
117 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
133 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
167 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
189 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
206 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
223 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
234 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
248 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
D4x16c2-minmax-neon-mull-padal-dup.c77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
229 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
230 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
231 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
232 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
279 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
318 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
357 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
377 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
396 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D4x16c2-minmax-neon-mlal-padal-dup.c77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
159 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
211 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
263 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
315 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
467 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
468 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
469 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
470 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
517 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D6x16c4-minmax-neondot.c91 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local
139 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
163 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
207 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
237 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
262 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
287 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
306 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
326 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
D8x16c4-minmax-neondot.c103 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local
161 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
193 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
247 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
285 const int32x4_t vproduct3xCDEF = vqrdmulhq_n_s32(vacc3xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
318 vacc3xCDEF = vsraq_n_s32(vproduct3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
351 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
378 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
404 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
D4x16c8-minmax-avx512skx.c88 __m512i vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx() local
125 vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx()
138 …512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vac… in xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx()
D4x16c8-minmax-neon-mull-padal.c332 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
445 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
464 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
483 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
500 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
511 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
524 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
D4x16c16-minmax-neon-mlal-padal.c396 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
509 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
528 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
547 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
564 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
575 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
588 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
D4x16c8-minmax-neon-mlal-padal.c571 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
684 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
703 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
722 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
739 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
750 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
763 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
/external/XNNPACK/src/qs8-igemm/gen/
D4x16-minmax-neon-mull-addw-dup.c73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
129 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
157 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
185 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
213 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
241 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
269 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
297 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
325 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
361 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
D4x16-minmax-neon-mlal-lane.c73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
127 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
149 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
171 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
193 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
216 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
238 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
260 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
282 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
316 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16c4-minmax-neondot.c74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local
132 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
148 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
182 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
203 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
222 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
239 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
250 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
264 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
D4x16c2-minmax-neon-mull-padal-dup.c74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local
246 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
247 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
248 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
249 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
296 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
335 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
374 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
397 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
416 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup()
[all …]
D6x16c4-minmax-neondot.c82 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local
158 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
182 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
226 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
255 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
282 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
307 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
326 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
346 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
D4x16c2-minmax-neon-mlal-padal-dup.c74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local
176 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
228 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
280 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
332 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
484 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
485 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
486 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
487 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
534 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()
[all …]
D8x16c4-minmax-neondot.c90 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local
184 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
216 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
270 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
307 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
342 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
375 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
402 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
428 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
D4x16c8-minmax-avx512skx.c83 __m512i vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx() local
140 vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx()
155 …512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vac… in xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx()
D4x16c8-minmax-neon-mull-padal.c352 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
465 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
484 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
503 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
520 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
531 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
544 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
D4x16c16-minmax-neon-mlal-padal.c416 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
529 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
548 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
567 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
584 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
595 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
608 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
D4x16c8-minmax-neon-mlal-padal.c591 int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
704 int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF ); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
723 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
742 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
759 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
770 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
783 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
/external/XNNPACK/src/f32-prelu/gen/
Dwasmsimd-minmax-4x16.c117 v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16() local
135 vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, wasm_f32x4_mul(vi3xCDEF, vwCDEF)); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16()
155 wasm_v128_store(o3 + 12, vacc3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_minmax_4x16()
Dwasmsimd-bitselect-4x16.c117 v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16() local
135 vacc3xCDEF = wasm_v128_bitselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16()
155 wasm_v128_store(o3 + 12, vacc3xCDEF); in xnn_f32_prelu_ukernel__wasmsimd_bitselect_4x16()
Dneon-4x16.c111 float32x4_t vacc3xCDEF = vmulq_f32(vi3xCDEF, vwCDEF); in xnn_f32_prelu_ukernel__neon_4x16() local
129 vacc3xCDEF = vbslq_f32(vm3xCDEF, vacc3xCDEF, vi3xCDEF); in xnn_f32_prelu_ukernel__neon_4x16()
146 vst1q_f32(o3, vacc3xCDEF); o3 += 4; in xnn_f32_prelu_ukernel__neon_4x16()