
Searched refs:vacc0x89AB (Results 1 – 25 of 81) sorted by relevance


/external/XNNPACK/src/qs8-gemm/gen/
1x16-minmax-neon-mull-addw-dup.c
45 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() local
60 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
70 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
80 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
90 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
100 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
110 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
120 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
130 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
145 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
[all …]
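
The mull-addw-dup hits above all share one accumulation step: a column of int8 weights is multiplied by a broadcast activation byte with vmull_s8, and the int16 products are widened and added into the int32 accumulators with vaddw_s16. A minimal sketch of that step, assuming hypothetical inputs (names mirror the kernel's scheme, but the function itself is illustrative, not XNNPACK code):

    #include <arm_neon.h>

    /*
     * Sketch of the MULL+ADDW step: vb89ABCDEF holds int8 weights for
     * output columns 8..F at one k position; a0 is one activation byte,
     * broadcast with vdup_n_s8 (the "dup" in the kernel name).
     * vmull_s8 cannot overflow (int8 x int8 fits in int16); vaddw_s16
     * then widens each int16x4 half into the int32x4 accumulators.
     */
    static void accumulate_mull_addw(int32x4_t *vacc89AB, int32x4_t *vaccCDEF,
                                     int8x8_t vb89ABCDEF, int8_t a0)
    {
      const int16x8_t vprod = vmull_s8(vb89ABCDEF, vdup_n_s8(a0));
      *vacc89AB = vaddw_s16(*vacc89AB, vget_low_s16(vprod));
      *vaccCDEF = vaddw_s16(*vaccCDEF, vget_high_s16(vprod));
    }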
1x16-minmax-neon-mlal-lane.c
45 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() local
61 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
71 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
81 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
91 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
102 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
112 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
122 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
132 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
148 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
[all …]
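
The mlal-lane variant instead widens both operands to int16 up front (vmovl_s8 in the kernel, producing the vxa0/vxb… vectors seen above) and uses vmlal_lane_s16 to multiply a 4-wide weight slice by one activation lane per step. A hedged sketch of a single step, with names patterned on the kernel's:

    #include <arm_neon.h>

    /*
     * Sketch of one MLAL-lane step: vxb is the widened weight vector
     * for columns 8..F at one k position, vxa0 the widened activation
     * row. Lane 0 of vxa0's low half scales the products here; the
     * unrolled kernel walks lanes 0..3 of the low half, then lanes
     * 0..3 of the high half, exactly as the hits above show.
     */
    static int32x4_t accumulate_mlal_lane(int32x4_t vacc89AB,
                                          int16x8_t vxb, int16x8_t vxa0)
    {
      return vmlal_lane_s16(vacc89AB, vget_low_s16(vxb), vget_low_s16(vxa0), 0);
    }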
1x16c2-minmax-neon-mull-padal-dup.c
46 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local
92 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
93 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
94 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
95 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
121 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
136 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
151 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
160 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
167 vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
[all …]
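
The c2 kernels pack two k positions per step: vmull_s8 produces eight int16 products covering four columns times two k, and vpadalq_s16 adds adjacent pairs into the int32 accumulators, so each lane holds a 2-deep dot product. A sketch of that accumulate, under the same naming assumptions as above:

    #include <arm_neon.h>

    /*
     * Sketch of the c2 PADAL step: vb89ABc01 interleaves weights for
     * columns 8..B at k positions c0,c1; va_c01 is the matching 2-byte
     * activation pair broadcast four times ("dup"). vpadalq_s16 sums
     * each adjacent int16 pair and accumulates it into one int32 lane.
     */
    static int32x4_t accumulate_padal(int32x4_t vacc89AB,
                                      int8x8_t vb89ABc01, int8x8_t va_c01)
    {
      const int16x8_t vprod = vmull_s8(vb89ABc01, va_c01);
      return vpadalq_s16(vacc89AB, vprod);
    }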
2x16-minmax-neon-mlal-lane.c
51 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() local
55 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
75 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
89 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
103 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
117 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
132 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
146 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
160 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
174 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane()
[all …]
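
The multi-row kernels (2x16 and up) load the 16 bias values once and start every row's accumulators from the same registers, which is why `vacc1x89AB = vacc0x89AB` appears near the top of each of these files. A sketch of that initialization, assuming (illustratively) a packed-weights layout where `w` begins with the int32 biases:

    #include <arm_neon.h>
    #include <stdint.h>

    /*
     * Sketch of the shared-bias initialization: the packed weight
     * stream starts with 16 int32 biases; each output row's
     * accumulators begin as a copy of the same bias load.
     * The `w + 8` offset for columns 8..B is an assumed layout.
     */
    static void init_row_accumulators(const int32_t *w,
                                      int32x4_t *vacc0x89AB,
                                      int32x4_t *vacc1x89AB)
    {
      *vacc0x89AB = vld1q_s32(w + 8);  /* bias for columns 8..B */
      *vacc1x89AB = *vacc0x89AB;       /* row 1 reuses row 0's bias */
    }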
2x16-minmax-neon-mull-addw-dup.c
51 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() local
55 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
74 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
90 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
106 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
122 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
138 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
154 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
170 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
186 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup()
[all …]
1x16c4-minmax-neondot.c
48 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() local
71 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
75 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
94 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
104 const int32x4_t vproduct0x89AB = vqrdmulhq_n_s32(vacc0x89AB, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
109 vacc0x89AB = vsraq_n_s32(vproduct0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
114 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
120 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
125 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0x… in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
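
The neondot kernel needs far fewer instructions per result: with the ARMv8.2 dot-product extension (built with, e.g., -march=armv8.2-a+dotprod), one vdotq_lane_s32 multiplies four int8 weights for each of four columns by four activation bytes selected by the lane, accumulating straight into int32; the hits above use lane 0 and lane 1 to cover k=0..3 and k=4..7. A sketch of one step, names again patterned on the kernel's:

    #include <arm_neon.h>

    /*
     * Sketch of the NEONDOT step: vb0123x89AB packs k=0..3 for
     * columns 8..B; va0 holds 8 activation bytes, and lane 0 selects
     * bytes 0..3. Requires __ARM_FEATURE_DOTPROD at compile time.
     */
    static int32x4_t accumulate_dot(int32x4_t vacc89AB,
                                    int8x16_t vb0123x89AB, int8x8_t va0)
    {
      return vdotq_lane_s32(vacc89AB, vb0123x89AB, va0, 0);
    }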
1x16c2-minmax-neon-mlal-padal-dup.c
46 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local
83 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
99 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
115 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
131 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
180 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
181 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
182 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
183 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
209 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
[all …]
3x16-minmax-neon-mull-addw-dup.c
57 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
61 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
88 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
110 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
132 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
154 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
176 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
198 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
220 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
3x16-minmax-neon-mlal-lane.c
57 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local
61 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
65 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
89 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
107 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
125 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
143 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
162 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
180 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
198 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
2x16c2-minmax-neon-mull-padal-dup.c
52 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local
56 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
103 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
104 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
105 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
106 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
165 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
188 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
211 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
228 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
[all …]
2x16c2-minmax-neon-mlal-padal-dup.c
52 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local
56 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
103 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
131 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
159 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
187 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
241 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
242 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
243 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
244 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()
[all …]
4x16-minmax-neon-mlal-lane.c
63 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local
67 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
71 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
75 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
103 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
125 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
147 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
169 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
192 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
214 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
4x16-minmax-neon-mull-addw-dup.c
63 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
67 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
71 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
75 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
102 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
130 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
158 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
186 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
214 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
242 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
/external/XNNPACK/src/qs8-igemm/gen/
1x16-minmax-neon-mlal-lane.c
48 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() local
72 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
82 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
92 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
102 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
113 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
123 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
133 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
143 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
159 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
[all …]
1x16-minmax-neon-mull-addw-dup.c
48 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() local
71 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
81 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
91 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
101 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
111 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
121 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
131 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
141 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
156 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
[all …]
1x16c4-minmax-neondot.c
49 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() local
80 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
84 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
103 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
112 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
119 vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
124 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
130 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
135 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0x… in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
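
Past the accumulate loops, every one of these kernels ends with the same requantization tail, traced by the last few hits of each block above: a saturating fixed-point multiply (vqrdmulhq_s32), a sign-based rounding fixup (vbicq plus vsraq_n_s32 by 31), a rounding right shift (vrshlq_s32 with a negative shift count), then a saturating narrow to int16 with the output zero point added. A sketch of that tail for one row's 16 columns; parameter names are assumptions, and the vcombine form matches the non-A64 path visible above:

    #include <arm_neon.h>

    /*
     * Sketch of the requantization tail: scale by a Q31 multiplier,
     * add the sign bit of the masked remainder so the subsequent
     * rounding shift handles negative values correctly, shift right
     * with rounding, then narrow with saturation and add the output
     * zero point.
     */
    static int16x8_t requantize_89ABCDEF(int32x4_t vacc89AB, int32x4_t vaccCDEF,
                                         int32x4_t vmultiplier,
                                         int32x4_t vzero_shift_mask,
                                         int32x4_t vright_shift,
                                         int16x8_t voutput_zero_point)
    {
      vacc89AB = vqrdmulhq_s32(vacc89AB, vmultiplier);
      vaccCDEF = vqrdmulhq_s32(vaccCDEF, vmultiplier);
      vacc89AB = vsraq_n_s32(vacc89AB, vbicq_s32(vacc89AB, vzero_shift_mask), 31);
      vaccCDEF = vsraq_n_s32(vaccCDEF, vbicq_s32(vaccCDEF, vzero_shift_mask), 31);
      vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
      vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
      return vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)),
                        voutput_zero_point);
    }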
1x16c2-minmax-neon-mull-padal-dup.c
49 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local
103 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
104 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
105 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
106 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
132 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
147 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
162 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
174 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
181 vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
[all …]
2x16-minmax-neon-mlal-lane.c
52 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local
56 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
88 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
102 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
116 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
130 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
145 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
159 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
173 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
187 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
[all …]
2x16-minmax-neon-mull-addw-dup.c
52 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() local
56 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
87 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
103 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
119 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
135 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
151 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
167 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
183 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
199 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup()
[all …]
1x16c2-minmax-neon-mlal-padal-dup.c
49 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local
94 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
110 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
126 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
142 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
191 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
192 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
193 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
194 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
220 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
[all …]
3x16-minmax-neon-mull-addw-dup.c
56 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local
60 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
64 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
103 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
125 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
147 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
169 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
191 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
213 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
235 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup()
[all …]
3x16-minmax-neon-mlal-lane.c
56 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local
60 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
64 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
104 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
122 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
140 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
158 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
177 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
195 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
213 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane()
[all …]
2x16c2-minmax-neon-mull-padal-dup.c
53 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local
57 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
116 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
117 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
118 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
119 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
178 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
201 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
224 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
244 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
[all …]
4x16-minmax-neon-mull-addw-dup.c
60 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local
64 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
68 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
72 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
119 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
147 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
175 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
203 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
231 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
259 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup()
[all …]
4x16-minmax-neon-mlal-lane.c
60 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local
64 int32x4_t vacc1x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
68 int32x4_t vacc2x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
72 int32x4_t vacc3x89AB = vacc0x89AB; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
120 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
142 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
164 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
186 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
209 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
231 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane()
[all …]
