/external/XNNPACK/src/qs8-igemm/gen/

3x16-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup)
     64  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    109  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    131  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
    153  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
    175  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
    197  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
    219  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
    241  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
    263  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
    292  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    [all …]

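The repeated vaddw_s16 matches above are the inner step of the "mull-addw-dup" kernels. The sketch below is a minimal illustration of that step, not the generated source; the scalar a2 and the helper name are assumptions, while the vector names mirror the matches.

    #include <arm_neon.h>

    // One k-step for row 2, output channels 8..F: duplicate one int8
    // activation, widen-multiply it against 8 int8 weights, then widen-add
    // the 16-bit products into two int32x4 accumulators.
    static inline void qs8_mull_addw_dup_step(
        int32x4_t* vacc2x89AB, int32x4_t* vacc2xCDEF,
        int8_t a2, int8x8_t vb89ABCDEFc0)
    {
      const int8x8_t va2c0 = vdup_n_s8(a2);                               // "dup"
      const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, va2c0);  // "mull"
      *vacc2x89AB = vaddw_s16(*vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));   // "addw"
      *vacc2xCDEF = vaddw_s16(*vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
    }
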
3x16-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane)
     64  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    108  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    126  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
    144  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
    162  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
    181  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
    199  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
    217  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
    235  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
    263  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    [all …]

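The "mlal-lane" variant seen above sign-extends activations and weights to 16 bits first (the vxa2 and vxb89ABCDEFc* names), then accumulates with vmlal_lane_s16, which multiplies four 16-bit weights by one activation lane. A hedged sketch of a single update, with an assumed helper name:

    #include <arm_neon.h>

    // One k-step: lane 0 of the low half of vxa2 scales four 16-bit weights
    // and accumulates into the int32 accumulator. Steps k = 4..7 switch to
    // vget_high_s16(vxa2) with lanes 0..3, as the matches above show.
    static inline int32x4_t qs8_mlal_lane_step(
        int32x4_t vacc2x89AB, int16x8_t vxb89ABCDEFc0, int16x8_t vxa2)
    {
      return vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0),
                            vget_low_s16(vxa2), 0);
    }
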
4x16-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup)
     68  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    125  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    153  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
    181  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
    209  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
    237  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
    265  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
    293  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
    321  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
    354  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    [all …]

4x16-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane)
     68  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    124  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    146  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
    168  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
    190  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
    213  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
    235  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
    257  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
    279  … vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
    311  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    [all …]

4x16c4-minmax-neondot.c  (all matches in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot)
     69  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    127  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    143  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    177  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    198  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    217  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    234  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    248  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    262  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

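The c4 "neondot" kernels collapse four k-steps per instruction: vdotq_lane_s32 forms a 4-element int8 dot product in each int32 lane. Below is a minimal sketch of the two calls visible above, assuming a target with the ARMv8.2 dot-product extension (for example -march=armv8.2-a+dotprod); the helper name is illustrative.

    #include <arm_neon.h>

    // va2x01234567 packs 8 activations of row 2 as two 4-byte groups in one
    // int8x8_t; each call consumes one group against 16 packed weights.
    static inline int32x4_t qs8_dot_steps(
        int32x4_t vacc2x89AB,
        int8x16_t vb0123x89AB,   // weights for k = 0..3, channels 8..B
        int8x16_t vb4567x89AB,   // weights for k = 4..7, channels 8..B
        int8x8_t  va2x01234567)  // activations k = 0..7 of row 2
    {
      vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
      vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
      return vacc2x89AB;
    }
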
3x16c2-minmax-neon-mull-padal-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
     65  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    193  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    194  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    195  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    196  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    240  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    271  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    302  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    322  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    337  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    [all …]

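The c2 "mull-padal-dup" kernels sit between the two previous schemes: vmull_s8 produces 16-bit products covering two k-steps at once, and vpadalq_s16 pairwise-adds adjacent products while accumulating, so each int32 lane folds in two k-steps. A sketch of one such step, with assumed input names:

    #include <arm_neon.h>

    // va2c0 broadcasts one pair of row-2 int8 activations; the pairwise
    // add-accumulate folds both 16-bit products per output channel.
    static inline int32x4_t qs8_mull_padal_step(
        int32x4_t vacc2x89AB, int8x8_t va2c0, int8x8_t vb89ABc0)
    {
      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, va2c0);  // 8 products
      return vpadalq_s16(vacc2x89AB, vprod2x89ABc0);              // widen + add pairs
    }

The mlal-padal-dup files listed next appear to extend this by folding a second vmlal_s8 into the same 16-bit product before the single vpadalq_s16, halving the pairwise-accumulate traffic.
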
3x16c2-minmax-neon-mlal-padal-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
     65  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    140  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    180  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    220  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    260  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    381  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    382  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    383  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    384  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    428  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    [all …]

4x16c2-minmax-neon-mull-padal-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
     69  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    206  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    207  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    208  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    209  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    286  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    325  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    364  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    392  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    411  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    [all …]

6x16c4-minmax-neondot.c  (all matches in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot)
     77  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    153  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    177  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    221  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    250  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    277  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    302  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    324  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    344  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

4x16c2-minmax-neon-mlal-padal-dup.c  (all matches in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
     69  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    162  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    214  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    266  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    318  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    444  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    445  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    446  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    447  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    524  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    [all …]

8x16c4-minmax-neondot.c  (all matches in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot)
     85  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    179  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    211  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    265  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    302  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    337  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    370  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    400  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    426  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

3x16c8-minmax-avx512skx.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx)
     74  __m512i vacc2x89AB = vacc0x89AB;  (local)
    117  vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
    135  …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vac…

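The AVX512 c8 kernel performs the analogous widening multiply-accumulate with _mm512_madd_epi16: va2 and vb89AB hold sign-extended 16-bit values, and madd multiplies lanes and sums each adjacent pair into a 32-bit result. A minimal sketch, assuming an AVX512BW target (for example -mavx512bw); the helper name is illustrative.

    #include <immintrin.h>

    // One c8 step: multiply 16-bit activation/weight lanes, horizontally add
    // adjacent pairs to 32 bits, then add into the running accumulator.
    static inline __m512i qs8_avx512_madd_step(__m512i vacc2x89AB,
                                               __m512i va2, __m512i vb89AB)
    {
      return _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
    }

The truncated unpacklo/unpackhi match (source line 135) belongs to the reduction that combines per-channel accumulators before the final horizontal sums.
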
3x16c8-minmax-neon-mull-padal.c  (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal)
    282  int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);  (local)
    361  int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );  (local)
    382  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    397  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    410  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    420  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    430  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

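The vqrdmulhq/vsraq/vrshlq/vqaddq matches above (source lines 382 through 430) are the requantization tail shared by these NEON QS8 kernels. The sketch below reconstructs it from the truncated matches; the names vmultiplier, vright_shift, and voutput_zero_point follow the usual XNNPACK parameter convention, and the operand cut off after "v…" in both truncated lines is presumably the output zero point.

    #include <arm_neon.h>

    // Requantize two int32x4 accumulators into one int16x8 vector: Q31
    // fixed-point multiply, a sign correction so the subsequent shift rounds
    // correctly for negative values, a rounding right shift, then a
    // saturating narrow with the output zero point added.
    static inline int16x8_t qs8_requantize(
        int32x4_t vacc2x89AB, int32x4_t vacc2xCDEF,
        int32x4_t vmultiplier, int32x4_t vright_shift,
        int16x8_t voutput_zero_point)
    {
      const int32x4_t vzero_shift_mask =
          vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
      vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
      vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
      vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
      vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
      vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);   // shift amount <= 0
      vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
      // AArch64 builds can narrow with vqmovn_high_s32 (first truncated match);
      // the vcombine form below is the portable path (second truncated match).
      return vqaddq_s16(
          vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)),
          voutput_zero_point);
    }
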
/external/XNNPACK/src/qs8-gemm/gen/

3x16-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup)
     65  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
     94  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    116  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
    138  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
    160  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
    182  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
    204  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
    226  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
    248  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
    277  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    [all …]

3x16-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane)
     65  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
     93  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    111  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
    129  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
    147  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
    166  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
    184  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
    202  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
    220  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
    248  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    [all …]

4x16-minmax-neon-mlal-lane.c  (all matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane)
     71  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    107  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    129  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
    151  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
    173  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
    196  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
    218  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
    240  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
    262  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
    294  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    [all …]

4x16-minmax-neon-mull-addw-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup)
     71  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    108  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    136  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
    164  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
    192  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
    220  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
    248  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
    276  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
    304  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
    337  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
    [all …]

3x16c2-minmax-neon-mull-padal-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
     66  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    178  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    179  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    180  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    181  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    225  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    256  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    287  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    304  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    319  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    [all …]

4x16c4-minmax-neondot.c  (all matches in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot)
     74  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    112  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    128  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    162  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    184  const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier);
    201  vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    218  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    232  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    246  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

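One difference visible in the qs8-gemm neondot matches: the Q31 multiplier is broadcast straight from the params struct by vqrdmulhq_n_s32 into a separate vproduct value, and the rounding-correction term is then built as the snippet at source line 201 shows. Below is a sketch of just that entry step, mirroring the matches rather than asserting the full file; multiplier corresponds to params->neon.multiplier, and the helper name is illustrative.

    #include <arm_neon.h>

    static inline int32x4_t qs8_gemm_requant_entry(
        int32x4_t vacc2x89AB, int32_t multiplier, int32x4_t vzero_shift_mask)
    {
      // Scalar-broadcast Q31 multiply, then add the >> 31 sign correction.
      const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, multiplier);
      return vsraq_n_s32(vproduct2x89AB,
                         vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    }
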
3x16c2-minmax-neon-mlal-padal-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
     66  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    125  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    165  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    205  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    245  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    366  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    367  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    368  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    369  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    413  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    [all …]

4x16c2-minmax-neon-mull-padal-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
     72  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    189  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    190  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    191  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    192  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    269  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    308  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    347  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    372  vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
    391  vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    [all …]

4x16c2-minmax-neon-mlal-padal-dup.c  (all matches in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
     72  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    145  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    197  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    249  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    301  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    427  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    428  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
    429  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
    430  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
    507  vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
    [all …]

6x16c4-minmax-neondot.c  (all matches in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot)
     86  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    134  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    158  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    202  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    232  const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier);
    257  vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    282  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    304  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    324  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

8x16c4-minmax-neondot.c  (all matches in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot)
     98  int32x4_t vacc2x89AB = vacc0x89AB;  (local)
    156  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    188  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
    242  vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
    280  const int32x4_t vproduct2x89AB = vqrdmulhq_n_s32(vacc2x89AB, params->neon.multiplier);
    313  vacc2x89AB = vsraq_n_s32(vproduct2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
    346  vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
    376  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v…
    402  …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2x…

3x16c8-minmax-avx512skx.c  (all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx)
     77  __m512i vacc2x89AB = vacc0x89AB;  (local)
    104  vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
    120  …9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vac…