/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup():
   45  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   60  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
   70  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
   80  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
   90  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  100  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  110  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  120  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  130  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
  145  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  [all …]

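The mull-addw-dup matches above all repeat one accumulation step: a signed 8x8-bit multiply (vmull_s8) against a duplicated activation byte, followed by widen-adds of the 16-bit product halves into int32x4 accumulators such as vacc0x89AB. A minimal sketch of that step, assuming hypothetical pointer arguments and variable names (this is not the generated kernel itself):

    #include <arm_neon.h>

    /* One k-step of the mull-addw-dup idiom: broadcast one activation byte,
     * multiply it by 8 weights, and widen-add the 16-bit products into two
     * int32x4 accumulators (output columns 8..11 and 12..15 here). */
    static inline void acc_mull_addw_dup(const int8_t* a, const int8_t* b,
                                         int32x4_t* vacc89AB, int32x4_t* vaccCDEF) {
      const int8x8_t va0 = vdup_n_s8(a[0]);                    /* duplicated activation */
      const int8x8_t vb89ABCDEF = vld1_s8(b);                  /* 8 weights */
      const int16x8_t vprod = vmull_s8(vb89ABCDEF, va0);       /* 16-bit products */
      *vacc89AB = vaddw_s16(*vacc89AB, vget_low_s16(vprod));   /* widen-add low half */
      *vaccCDEF = vaddw_s16(*vaccCDEF, vget_high_s16(vprod));  /* widen-add high half */
    }
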
D | 1x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane():
   45  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   61  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
   71  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
   81  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
   91  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  102  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  112  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  122  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  132  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
  148  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  [all …]

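The mlal-lane matches use a different inner step: activations and weights are first sign-extended to 16 bits (vmovl_s8), and then vmlal_lane_s16 multiplies eight widened weights by one activation lane and accumulates directly into int32. A minimal sketch under the same assumptions (hypothetical names, one lane only):

    #include <arm_neon.h>

    /* One lane of the mlal-lane idiom: widen int8 inputs to int16, then
     * multiply-accumulate weight columns 8..11 and 12..15 by activation lane 0. */
    static inline void acc_mlal_lane(const int8_t* a, const int8_t* b,
                                     int32x4_t* vacc89AB, int32x4_t* vaccCDEF) {
      const int16x8_t vxa0 = vmovl_s8(vld1_s8(a));  /* widened activations */
      const int16x8_t vxb  = vmovl_s8(vld1_s8(b));  /* widened weights */
      *vacc89AB = vmlal_lane_s16(*vacc89AB, vget_low_s16(vxb),  vget_low_s16(vxa0), 0);
      *vaccCDEF = vmlal_lane_s16(*vaccCDEF, vget_high_s16(vxb), vget_low_s16(vxa0), 0);
    }
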
D | 1x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup():
   46  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   92  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
   93  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
   94  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
   95  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  121  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  136  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  151  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  160  vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
  167  vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
  [all …]

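In the c2 mull-padal-dup matches the accumulation works on pairs of k values: a 2-byte activation group is broadcast across a vector, multiplied by the weights for that pair, and the 16-bit products are pairwise-added into the int32 accumulator with vpadalq_s16. A minimal sketch, again with hypothetical names and loads:

    #include <arm_neon.h>
    #include <string.h>

    /* One 2-element k-group of the c2 mull-padal-dup idiom for columns 8..11. */
    static inline void acc_mull_padal_dup_c2(const int8_t* a, const int8_t* b,
                                             int32x4_t* vacc89AB) {
      int16_t a01;
      memcpy(&a01, a, sizeof(a01));                                  /* activations k=0..1 */
      const int8x8_t va0c0 = vreinterpret_s8_s16(vdup_n_s16(a01));   /* duplicated pair */
      const int8x8_t vb89ABc0 = vld1_s8(b);                          /* weights for that pair */
      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, va0c0);     /* 16-bit products */
      *vacc89AB = vpadalq_s16(*vacc89AB, vprod0x89ABc0);             /* pairwise add into int32 */
    }
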
D | 2x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane():
   51  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   55  int32x4_t vacc1x89AB = vacc0x89AB;
   75  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
   89  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  103  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  117  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  132  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  146  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  160  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  174  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
  [all …]

D | 2x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup():
   51  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   55  int32x4_t vacc1x89AB = vacc0x89AB;
   74  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
   90  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  106  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  122  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  138  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  154  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  170  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  186  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
  [all …]

D | 1x16c4-minmax-neondot.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot():
   48  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   71  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
   75  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
   94  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
  104  const int32x4_t vproduct0x89AB = vqrdmulhq_n_s32(vacc0x89AB, params->neon.multiplier);
  109  vacc0x89AB = vsraq_n_s32(vproduct0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
  114  vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
  120  …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v…
  125  …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0x…

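The c4 neondot matches rely on the ARMv8.2 dot-product extension: vdotq_lane_s32 multiplies four int8 weights by four int8 activations and adds each group's sum into one int32 lane, so no intermediate 16-bit accumulator is needed. A minimal sketch, assuming hypothetical names and a build with the dotprod feature enabled:

    #include <arm_neon.h>

    #if defined(__ARM_FEATURE_DOTPROD)
    /* One k-group of the c4 dot-product idiom: 4 activations against 16 weights. */
    static inline void acc_dot_c4(const int8_t* a, const int8_t* b,
                                  int32x4_t* vacc0123, int32x4_t* vacc4567) {
      const int8x8_t va0x01234567 = vld1_s8(a);        /* 8 activations, two groups of 4 */
      const int8x16_t vb0123x0123 = vld1q_s8(b);       /* k=0..3, output columns 0..3 */
      const int8x16_t vb0123x4567 = vld1q_s8(b + 16);  /* k=0..3, output columns 4..7 */
      *vacc0123 = vdotq_lane_s32(*vacc0123, vb0123x0123, va0x01234567, 0);
      *vacc4567 = vdotq_lane_s32(*vacc4567, vb0123x4567, va0x01234567, 0);
    }
    #endif
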
D | 1x16c2-minmax-neon-mlal-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup():
   46  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   83  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
   99  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  115  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  131  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  180  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  181  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  182  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  183  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  209  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  [all …]

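The mlal-padal variants appear to differ from the mull-padal ones mainly in their main loop, where two adjacent 2-element k-groups are multiplied into the same 16-bit product register (vmull_s8 followed by vmlal_s8) before a single vpadalq_s16. A rough sketch of that fused step under the same assumptions (hypothetical names and loads):

    #include <arm_neon.h>
    #include <string.h>

    /* Two fused 2-element k-groups of the c2 mlal-padal idiom for columns 8..11. */
    static inline void acc_mlal_padal_dup_c2(const int8_t* a, const int8_t* b,
                                             int32x4_t* vacc89AB) {
      int16_t a01, a23;
      memcpy(&a01, a, sizeof(a01));      /* activations k=0..1 */
      memcpy(&a23, a + 2, sizeof(a23));  /* activations k=2..3 */
      const int8x8_t va0c0 = vreinterpret_s8_s16(vdup_n_s16(a01));
      const int8x8_t va0c1 = vreinterpret_s8_s16(vdup_n_s16(a23));
      int16x8_t vprod = vmull_s8(vld1_s8(b), va0c0);     /* first group */
      vprod = vmlal_s8(vprod, vld1_s8(b + 8), va0c1);    /* second group, fused */
      *vacc89AB = vpadalq_s16(*vacc89AB, vprod);         /* one pairwise add for both */
    }
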
D | 3x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup():
   57  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   61  int32x4_t vacc1x89AB = vacc0x89AB;
   65  int32x4_t vacc2x89AB = vacc0x89AB;
   88  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  110  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  132  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  154  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  176  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  198  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  220  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  [all …]

D | 3x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane():
   57  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   61  int32x4_t vacc1x89AB = vacc0x89AB;
   65  int32x4_t vacc2x89AB = vacc0x89AB;
   89  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  107  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  125  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  143  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  162  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  180  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  198  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  [all …]

D | 2x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup():
   52  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   56  int32x4_t vacc1x89AB = vacc0x89AB;
  103  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  104  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  105  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  106  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  165  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  188  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  211  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  228  vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
  [all …]

D | 2x16c2-minmax-neon-mlal-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup():
   52  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   56  int32x4_t vacc1x89AB = vacc0x89AB;
  103  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  131  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  159  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  187  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  241  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  242  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  243  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  244  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  [all …]

D | 4x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane():
   63  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   67  int32x4_t vacc1x89AB = vacc0x89AB;
   71  int32x4_t vacc2x89AB = vacc0x89AB;
   75  int32x4_t vacc3x89AB = vacc0x89AB;
  103  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  125  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  147  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  169  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  192  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  214  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  [all …]

D | 4x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup():
   63  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   67  int32x4_t vacc1x89AB = vacc0x89AB;
   71  int32x4_t vacc2x89AB = vacc0x89AB;
   75  int32x4_t vacc3x89AB = vacc0x89AB;
  102  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  130  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  158  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  186  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  214  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  242  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  [all …]

/external/XNNPACK/src/qs8-igemm/gen/
D | 1x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane():
   48  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   72  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
   82  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
   92  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  102  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  113  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  123  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  133  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  143  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
  159  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  [all …]

D | 1x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup():
   48  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   71  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
   81  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
   91  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  101  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  111  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  121  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  131  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  141  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
  156  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  [all …]

D | 1x16c4-minmax-neondot.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot():
   49  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   80  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
   84  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
  103  vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
  112  vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
  119  vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
  124  vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
  130  …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v…
  135  …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0x…

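The later matches in the entry above (vqrdmulhq_s32, vsraq_n_s32, vrshlq_s32, then the vqmovn narrowing) show the fixed-point requantization tail shared by these kernels: a saturating rounding doubling multiply by the quantization multiplier, a sign-dependent correction of the truncating shift, a rounding right shift, and saturating narrowing. A minimal sketch of that tail for one int32x4 accumulator, with hypothetical function name and parameter handling:

    #include <arm_neon.h>

    /* Requantize one int32x4 accumulator to int16x4 (before the final int8
     * narrowing and output-zero-point add that the matched kernels also do). */
    static inline int16x4_t requantize_q31(int32x4_t vacc, int32x4_t vmultiplier,
                                           int32x4_t vright_shift) {
      /* all-ones lanes where the shift is zero, so the correction below is skipped */
      const int32x4_t vzero_shift_mask =
          vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
      int32x4_t v = vqrdmulhq_s32(vacc, vmultiplier);          /* (acc*mult) >> 31, rounded */
      v = vsraq_n_s32(v, vbicq_s32(v, vzero_shift_mask), 31);  /* nudge negative lanes down by 1 */
      v = vrshlq_s32(v, vright_shift);                         /* rounding shift; negative count shifts right */
      return vqmovn_s32(v);                                    /* saturate to int16 */
    }
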
D | 1x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup():
   49  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
  103  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  104  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  105  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  106  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  132  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  147  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  162  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  174  vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
  181  vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
  [all …]

D | 2x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane():
   52  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   56  int32x4_t vacc1x89AB = vacc0x89AB;
   88  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  102  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  116  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  130  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  145  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  159  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  173  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  187  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
  [all …]

D | 2x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup():
   52  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   56  int32x4_t vacc1x89AB = vacc0x89AB;
   87  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  103  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  119  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  135  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  151  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  167  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  183  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  199  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
  [all …]

D | 1x16c2-minmax-neon-mlal-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup():
   49  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   94  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  110  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  126  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  142  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  191  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  192  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  193  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  194  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  220  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  [all …]

D | 3x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup():
   56  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   60  int32x4_t vacc1x89AB = vacc0x89AB;
   64  int32x4_t vacc2x89AB = vacc0x89AB;
  103  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  125  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  147  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  169  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  191  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  213  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  235  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
  [all …]

D | 3x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane():
   56  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   60  int32x4_t vacc1x89AB = vacc0x89AB;
   64  int32x4_t vacc2x89AB = vacc0x89AB;
  104  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  122  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  140  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  158  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  177  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  195  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  213  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
  [all …]

D | 2x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup():
   53  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   57  int32x4_t vacc1x89AB = vacc0x89AB;
  116  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  117  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  118  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  119  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
  178  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
  201  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
  224  vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
  244  vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
  [all …]

D | 4x16-minmax-neon-mull-addw-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup():
   60  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   64  int32x4_t vacc1x89AB = vacc0x89AB;
   68  int32x4_t vacc2x89AB = vacc0x89AB;
   72  int32x4_t vacc3x89AB = vacc0x89AB;
  119  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
  147  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
  175  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
  203  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  231  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
  259  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
  [all …]

D | 4x16-minmax-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane():
   60  int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
   64  int32x4_t vacc1x89AB = vacc0x89AB;
   68  int32x4_t vacc2x89AB = vacc0x89AB;
   72  int32x4_t vacc3x89AB = vacc0x89AB;
  120  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
  142  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
  164  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
  186  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
  209  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
  231  … vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
  [all …]