// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

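// SpMM (sparse matrix-dense matrix multiplication) microkernel: multiplies a
// sparse nc-channel weight matrix by a dense input of mc bytes of f32 pixels
// (mc / sizeof(float) pixels), writing a dense output clamped to
// [params->scalar.min, params->scalar.max]. The weight stream interleaves
// per-channel biases with the nonzero weight values, widx_dmap holds the byte
// delta from one nonzero's input row to the next, and nidx_nnzmap holds the
// number of nonzeros for each block of output channels.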
void xnn_f32_spmm_minmax_ukernel_12x4__neonfma(
    size_t mc,
    size_t nc,
    const float*restrict input,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t output_decrement = output_stride * nc - 12 * sizeof(float);
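  // Main loop: process blocks of 12 pixels (MR) while at least 12 remain;
  // mc counts the remaining pixels in bytes.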
  while XNN_LIKELY(mc >= 12 * sizeof(float)) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
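    // Blocked channel loop: NR=4 output channels at a time. Each channel
    // block of the weight stream starts with 4 biases, which seed the
    // accumulators, followed by one group of 4 weights per nonzero.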
    while (n >= 4) {
      uint32_t nnz = *nnzmap++;
      float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n0 = vacc0123n0;
      float32x4_t vacc89ABn0 = vacc0123n0;
      float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n1 = vacc0123n1;
      float32x4_t vacc89ABn1 = vacc0123n1;
      float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n2 = vacc0123n2;
      float32x4_t vacc89ABn2 = vacc0123n2;
      float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n3 = vacc0123n3;
      float32x4_t vacc89ABn3 = vacc0123n3;
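      // Accumulate over the block's nonzeros: each dmap entry advances input
      // to the row of the next nonzero, and the prefetches hide the latency
      // of the dependent input and weight loads.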
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const float32x4_t vi0123 = vld1q_f32(input);
          const float32x4_t vi4567 = vld1q_f32(input + 4);
          const float32x4_t vi89AB = vld1q_f32(input + 8);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          __builtin_prefetch(input + 16);
          const float32x4_t vw = vld1q_f32(w); w += 4;
          __builtin_prefetch(w + 32);
          vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
          vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
          vacc89ABn0 = vfmaq_laneq_f32(vacc89ABn0, vi89AB, vw, 0);
          vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
          vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
          vacc89ABn1 = vfmaq_laneq_f32(vacc89ABn1, vi89AB, vw, 1);
          vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
          vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
          vacc89ABn2 = vfmaq_laneq_f32(vacc89ABn2, vi89AB, vw, 2);
          vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          vacc89ABn3 = vfmaq_laneq_f32(vacc89ABn3, vi89AB, vw, 3);
        } while (--nnz != 0);
      }
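      // Clamp the accumulators to the [min, max] output range.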
      float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
      float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
      float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
      float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
      float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
      float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
      float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
      float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
      float32x4_t vout89ABn2 = vminq_f32(vacc89ABn2, vmax);
      float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
      float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
      float32x4_t vout89ABn3 = vminq_f32(vacc89ABn3, vmax);

      vout0123n0 = vmaxq_f32(vout0123n0, vmin);
      vout4567n0 = vmaxq_f32(vout4567n0, vmin);
      vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
      vout0123n1 = vmaxq_f32(vout0123n1, vmin);
      vout4567n1 = vmaxq_f32(vout4567n1, vmin);
      vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
      vout0123n2 = vmaxq_f32(vout0123n2, vmin);
      vout4567n2 = vmaxq_f32(vout4567n2, vmin);
      vout89ABn2 = vmaxq_f32(vout89ABn2, vmin);
      vout0123n3 = vmaxq_f32(vout0123n3, vmin);
      vout4567n3 = vmaxq_f32(vout4567n3, vmin);
      vout89ABn3 = vmaxq_f32(vout89ABn3, vmin);

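      // Store 12 pixels per channel; consecutive channels are spaced
      // output_stride bytes apart.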
      vst1q_f32(output + 0, vout0123n0);
      vst1q_f32(output + 4, vout4567n0);
      vst1q_f32(output + 8, vout89ABn0);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n1);
      vst1q_f32(output + 4, vout4567n1);
      vst1q_f32(output + 8, vout89ABn1);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n2);
      vst1q_f32(output + 4, vout4567n2);
      vst1q_f32(output + 8, vout89ABn2);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n3);
      vst1q_f32(output + 4, vout4567n3);
      vst1q_f32(output + 8, vout89ABn3);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      n -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(n != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            const float32x4_t vi89AB = vld1q_f32(input + 8);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            __builtin_prefetch(input + 16);
            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
            __builtin_prefetch(w + 32);
            vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);

        vst1q_f32(output + 0, vout0123);
        vst1q_f32(output + 4, vout4567);
        vst1q_f32(output + 8, vout89AB);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 1;
      } while (n != 0);
    }
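    // Rewind output to channel 0 of the next pixel block. This relies on the
    // dmap deltas wrapping input back to the start of the current block, so a
    // plain 12-float advance moves it to the next one.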
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 12;
    mc -= 12 * sizeof(float);
  }
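  // Pixel tail: handle up to 11 leftover pixels with progressively narrower
  // (8-, 4-, 2-, and 1-pixel) variants of the same scheme.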
  if XNN_UNLIKELY(mc != 0) {
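    // Each narrower variant writes fewer pixels per channel, so the output
    // rewind shrinks by the difference (from 12 down to 8 pixels here).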
    output_decrement += 4 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n0 = vacc0123n0;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n1 = vacc0123n1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n2 = vacc0123n2;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n3 = vacc0123n3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
            vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
        float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout4567n0 = vmaxq_f32(vout4567n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout4567n1 = vmaxq_f32(vout4567n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout4567n2 = vmaxq_f32(vout4567n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);
        vout4567n3 = vmaxq_f32(vout4567n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        vst1q_f32(output + 4, vout4567n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        vst1q_f32(output + 4, vout4567n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        vst1q_f32(output + 4, vout4567n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        vst1q_f32(output + 4, vout4567n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              const float32x4_t vi4567 = vld1q_f32(input + 4);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
              vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(output + 0, vout0123);
          vst1q_f32(output + 4, vout4567);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(output + 0, vout0123);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
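    // The 2- and 1-pixel tails switch to 64-bit d-registers (float32x2_t),
    // clamping against the low halves of vmin/vmax.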
    if (mc & (2 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi01 = vld1_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc01n0 = vfma_laneq_f32(vacc01n0, vi01, vw, 0);
            vacc01n1 = vfma_laneq_f32(vacc01n1, vi01, vw, 1);
            vacc01n2 = vfma_laneq_f32(vacc01n2, vi01, vw, 2);
            vacc01n3 = vfma_laneq_f32(vacc01n3, vi01, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01n0 = vmin_f32(vacc01n0, vget_low_f32(vmax));
        float32x2_t vout01n1 = vmin_f32(vacc01n1, vget_low_f32(vmax));
        float32x2_t vout01n2 = vmin_f32(vacc01n2, vget_low_f32(vmax));
        float32x2_t vout01n3 = vmin_f32(vacc01n3, vget_low_f32(vmax));

        vout01n0 = vmax_f32(vout01n0, vget_low_f32(vmin));
        vout01n1 = vmax_f32(vout01n1, vget_low_f32(vmin));
        vout01n2 = vmax_f32(vout01n2, vget_low_f32(vmin));
        vout01n3 = vmax_f32(vout01n3, vget_low_f32(vmin));

        vst1_f32(output + 0, vout01n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi01 = vld1_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, vi01, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(output, vout01);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi0 = vld1_dup_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0n0 = vfma_laneq_f32(vacc0n0, vi0, vw, 0);
            vacc0n1 = vfma_laneq_f32(vacc0n1, vi0, vw, 1);
            vacc0n2 = vfma_laneq_f32(vacc0n2, vi0, vw, 2);
            vacc0n3 = vfma_laneq_f32(vacc0n3, vi0, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0n0 = vmin_f32(vacc0n0, vget_low_f32(vmax));
        float32x2_t vout0n1 = vmin_f32(vacc0n1, vget_low_f32(vmax));
        float32x2_t vout0n2 = vmin_f32(vacc0n2, vget_low_f32(vmax));
        float32x2_t vout0n3 = vmin_f32(vacc0n3, vget_low_f32(vmax));

        vout0n0 = vmax_f32(vout0n0, vget_low_f32(vmin));
        vout0n1 = vmax_f32(vout0n1, vget_low_f32(vmin));
        vout0n2 = vmax_f32(vout0n2, vget_low_f32(vmin));
        vout0n3 = vmax_f32(vout0n3, vget_low_f32(vmin));

        vst1_lane_f32(output + 0, vout0n0, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n1, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n2, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n3, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi0 = vld1_dup_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, vi0, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(output, vout0, 0);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}