• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

// SpMM (sparse weights x dense input) micro-kernel with min/max clamping.
// Processes up to 12 output rows (M) and blocks of 4 output channels (N) per
// pass using AArch64 NEON FMA lane intrinsics.
//
// Arguments:
//   mc           - output row extent in BYTES of float (asserted nonzero and
//                  a multiple of sizeof(float)); advanced 12 elements at a time
//   nc           - number of output channels; asserted nonzero
//   input        - dense input; advanced by byte deltas from widx_dmap
//   weights      - packed stream: per channel group, biases followed by the
//                  non-zero weight values
//   widx_dmap    - byte-offset deltas applied to `input` after each non-zero
//   nidx_nnzmap  - non-zero count per channel (one entry per 4-channel block
//                  in the blocked loop, one per channel in the nr=1 cleanup)
//   output       - output matrix; channels are output_stride bytes apart
//   params       - scalar min/max clamp bounds
void xnn_f32_spmm_minmax_ukernel_12x4__neonfma(
    size_t mc,
    size_t nc,
    const float* restrict input,
    const float* restrict weights,
    const int32_t* restrict widx_dmap,
    const uint32_t* restrict nidx_nnzmap,
    float* restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  // After walking all nc channels, output has advanced nc strides; subtract
  // that (minus the 12 floats just produced) to return to the next row tile.
  size_t output_decrement = output_stride * nc - 12 * sizeof(float);
  // Main loop: full tiles of 12 output rows.
  while XNN_LIKELY(mc >= 12 * sizeof(float)) {
    const float* restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    // Blocked loop: 4 output channels at a time; one shared nnz count and a
    // 4-wide weight vector per non-zero.
    while (n >= 4) {
      uint32_t nnz = *nnzmap++;
      // Initialize each channel's 12 accumulators with its bias.
      float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n0 = vacc0123n0;
      float32x4_t vacc89ABn0 = vacc0123n0;
      float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n1 = vacc0123n1;
      float32x4_t vacc89ABn1 = vacc0123n1;
      float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n2 = vacc0123n2;
      float32x4_t vacc89ABn2 = vacc0123n2;
      float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n3 = vacc0123n3;
      float32x4_t vacc89ABn3 = vacc0123n3;
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const float32x4_t vi0123 = vld1q_f32(input);
          const float32x4_t vi4567 = vld1q_f32(input + 4);
          const float32x4_t vi89AB = vld1q_f32(input + 8);
          input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
          __builtin_prefetch(input + 16);
          const float32x4_t vw = vld1q_f32(w); w += 4;
          __builtin_prefetch(w + 32);
          vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
          vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
          vacc89ABn0 = vfmaq_laneq_f32(vacc89ABn0, vi89AB, vw, 0);
          vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
          vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
          vacc89ABn1 = vfmaq_laneq_f32(vacc89ABn1, vi89AB, vw, 1);
          vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
          vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
          vacc89ABn2 = vfmaq_laneq_f32(vacc89ABn2, vi89AB, vw, 2);
          vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          vacc89ABn3 = vfmaq_laneq_f32(vacc89ABn3, vi89AB, vw, 3);
        } while (--nnz != 0);
      }
      // Clamp to [min, max] and write each channel's 12 outputs.
      float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
      float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
      float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
      float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
      float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
      float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
      float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
      float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
      float32x4_t vout89ABn2 = vminq_f32(vacc89ABn2, vmax);
      float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
      float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
      float32x4_t vout89ABn3 = vminq_f32(vacc89ABn3, vmax);

      vout0123n0 = vmaxq_f32(vout0123n0, vmin);
      vout4567n0 = vmaxq_f32(vout4567n0, vmin);
      vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
      vout0123n1 = vmaxq_f32(vout0123n1, vmin);
      vout4567n1 = vmaxq_f32(vout4567n1, vmin);
      vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
      vout0123n2 = vmaxq_f32(vout0123n2, vmin);
      vout4567n2 = vmaxq_f32(vout4567n2, vmin);
      vout89ABn2 = vmaxq_f32(vout89ABn2, vmin);
      vout0123n3 = vmaxq_f32(vout0123n3, vmin);
      vout4567n3 = vmaxq_f32(vout4567n3, vmin);
      vout89ABn3 = vmaxq_f32(vout89ABn3, vmin);

      vst1q_f32(output + 0, vout0123n0);
      vst1q_f32(output + 4, vout4567n0);
      vst1q_f32(output + 8, vout89ABn0);
      output = (float* restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n1);
      vst1q_f32(output + 4, vout4567n1);
      vst1q_f32(output + 8, vout89ABn1);
      output = (float* restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n2);
      vst1q_f32(output + 4, vout4567n2);
      vst1q_f32(output + 8, vout89ABn2);
      output = (float* restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n3);
      vst1q_f32(output + 4, vout4567n3);
      vst1q_f32(output + 8, vout89ABn3);
      output = (float* restrict) ((uintptr_t) output + output_stride);
      n -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(n != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            const float32x4_t vi89AB = vld1q_f32(input + 8);
            input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
            __builtin_prefetch(input + 16);
            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
            __builtin_prefetch(w + 32);
            vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);

        vst1q_f32(output + 0, vout0123);
        vst1q_f32(output + 4, vout4567);
        vst1q_f32(output + 8, vout89AB);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        n -= 1;
      } while (n != 0);
    }
    output = (float* restrict) ((uintptr_t) output - output_decrement);
    input += 12;
    mc -= 12 * sizeof(float);
  }
  // Remainder rows: handle 8, 4, 2, then 1 leftover row(s) via the same
  // structure at narrower widths. Each width adjusts output_decrement by the
  // difference from the previous width's row count.
  if XNN_UNLIKELY(mc != 0) {
    output_decrement += 4 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float* restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n0 = vacc0123n0;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n1 = vacc0123n1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n2 = vacc0123n2;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n3 = vacc0123n3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
            vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
        float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout4567n0 = vmaxq_f32(vout4567n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout4567n1 = vmaxq_f32(vout4567n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout4567n2 = vmaxq_f32(vout4567n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);
        vout4567n3 = vmaxq_f32(vout4567n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        vst1q_f32(output + 4, vout4567n0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        vst1q_f32(output + 4, vout4567n1);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        vst1q_f32(output + 4, vout4567n2);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        vst1q_f32(output + 4, vout4567n3);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              const float32x4_t vi4567 = vld1q_f32(input + 4);
              input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
              vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(output + 0, vout0123);
          vst1q_f32(output + 4, vout4567);
          output = (float* restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float* restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float* restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(output + 0, vout0123);
          output = (float* restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float* restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
    if (mc & (2 * sizeof(float))) {
      const float* restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi01 = vld1_f32(input);
            input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc01n0 = vfma_laneq_f32(vacc01n0, vi01, vw, 0);
            vacc01n1 = vfma_laneq_f32(vacc01n1, vi01, vw, 1);
            vacc01n2 = vfma_laneq_f32(vacc01n2, vi01, vw, 2);
            vacc01n3 = vfma_laneq_f32(vacc01n3, vi01, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01n0 = vmin_f32(vacc01n0, vget_low_f32(vmax));
        float32x2_t vout01n1 = vmin_f32(vacc01n1, vget_low_f32(vmax));
        float32x2_t vout01n2 = vmin_f32(vacc01n2, vget_low_f32(vmax));
        float32x2_t vout01n3 = vmin_f32(vacc01n3, vget_low_f32(vmax));

        vout01n0 = vmax_f32(vout01n0, vget_low_f32(vmin));
        vout01n1 = vmax_f32(vout01n1, vget_low_f32(vmin));
        vout01n2 = vmax_f32(vout01n2, vget_low_f32(vmin));
        vout01n3 = vmax_f32(vout01n3, vget_low_f32(vmin));

        vst1_f32(output + 0, vout01n0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n1);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n2);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n3);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi01 = vld1_f32(input);
              input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, vi01, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(output, vout01);
          output = (float* restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float* restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float* restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi0 = vld1_dup_f32(input);
            input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0n0 = vfma_laneq_f32(vacc0n0, vi0, vw, 0);
            vacc0n1 = vfma_laneq_f32(vacc0n1, vi0, vw, 1);
            vacc0n2 = vfma_laneq_f32(vacc0n2, vi0, vw, 2);
            vacc0n3 = vfma_laneq_f32(vacc0n3, vi0, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0n0 = vmin_f32(vacc0n0, vget_low_f32(vmax));
        float32x2_t vout0n1 = vmin_f32(vacc0n1, vget_low_f32(vmax));
        float32x2_t vout0n2 = vmin_f32(vacc0n2, vget_low_f32(vmax));
        float32x2_t vout0n3 = vmin_f32(vacc0n3, vget_low_f32(vmax));

        vout0n0 = vmax_f32(vout0n0, vget_low_f32(vmin));
        vout0n1 = vmax_f32(vout0n1, vget_low_f32(vmin));
        vout0n2 = vmax_f32(vout0n2, vget_low_f32(vmin));
        vout0n3 = vmax_f32(vout0n3, vget_low_f32(vmin));

        vst1_lane_f32(output + 0, vout0n0, 0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n1, 0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n2, 0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n3, 0);
        output = (float* restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi0 = vld1_dup_f32(input);
              input = (const float* restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, vi0, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          // Store lane 0, matching every other single-row path above. The
          // previous lane-1 store only worked by accident: all operands on
          // this path are lane-duplicated, so both lanes happen to be equal.
          vst1_lane_f32(output, vout0, 0);
          output = (float* restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float* restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}
480