/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16-minmax-neon-mull-addw-dup.c | 65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 110 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 132 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 154 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 176 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 198 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 220 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 242 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 264 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 293 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local 109 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 127 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 145 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 163 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 182 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 200 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 218 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 236 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 264 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 126 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 154 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 182 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 210 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 238 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 266 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 294 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 322 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 355 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local 125 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 147 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 169 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 191 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 214 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 236 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 258 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 280 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 312 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16c4-minmax-neondot.c | 70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 128 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 144 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 178 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 199 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 218 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 235 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 248 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 262 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local 201 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 202 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 203 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 204 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 242 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 273 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 304 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 323 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 338 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 150 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 190 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 230 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 270 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 389 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 390 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 391 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 392 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 430 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 214 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 215 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 216 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 217 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 288 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 327 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 366 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 393 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 412 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|
D | 6x16c4-minmax-neondot.c | 78 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local 154 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 178 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 222 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 251 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 278 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 303 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 324 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 344 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot()
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 70 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local 175 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 227 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 279 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 331 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 452 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 453 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 454 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 455 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 526 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 8x16c4-minmax-neondot.c | 86 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local 180 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 212 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 266 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 303 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 338 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 371 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 400 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 426 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
|
D | 3x16c8-minmax-avx512skx.c | 75 __m512i vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx() local 122 vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx() 135 …512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vac… in xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx()
|
D | 3x16c8-minmax-neon-mull-padal.c | 283 int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 368 int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF ); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 383 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 398 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 411 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 420 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 430 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16-minmax-neon-mull-addw-dup.c | 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 95 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 117 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 139 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 161 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 183 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 205 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 227 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 278 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local 94 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 112 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 130 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 148 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 167 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 185 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 203 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 221 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 249 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local 108 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 130 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 152 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 174 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 197 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 219 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 241 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 263 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 295 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 109 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 137 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 165 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 193 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 221 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 277 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 305 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 338 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 3x16c2-minmax-neon-mull-padal-dup.c | 67 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() local 186 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 187 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 188 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 189 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 227 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 258 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 289 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 305 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() 320 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup() [all …]
|
D | 4x16c4-minmax-neondot.c | 75 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 113 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 129 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 163 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 185 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 202 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 219 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 232 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 246 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 67 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 135 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 175 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 215 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 255 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 374 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 375 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 376 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 377 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 415 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mull-padal-dup.c | 73 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() local 197 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 198 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 199 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 200 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 271 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 310 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 349 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 373 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() 392 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup() [all …]
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 73 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local 158 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 210 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 262 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 314 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 435 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 436 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 437 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 438 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 509 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 6x16c4-minmax-neondot.c | 87 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local 135 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 159 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 203 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 233 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 258 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 283 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 304 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 324 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 99 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local 157 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 189 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 243 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 281 const int32x4_t vproduct2xCDEF = vqrdmulhq_n_s32(vacc2xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 314 vacc2xCDEF = vsraq_n_s32(vproduct2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 347 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 376 …const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 402 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
|
D | 3x16c8-minmax-avx512skx.c | 78 __m512i vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx() local 109 vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx() 120 …512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vac… in xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx()
|