// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

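// Sparse-matrix times dense-matrix multiplication (SpMM) micro-kernel with
// min/max output clamping, processing 16 dense rows (mr = 16) and blocks of
// 4 output channels (nr = 4) per iteration using NEON fused multiply-add.
// Interface summary, as inferred from the code below:
//   m           - number of dense rows in a and c
//   n           - number of output channels
//   a           - dense input
//   weights     - packed stream of per-channel biases and nonzero weights
//   widx_dmap   - byte increments applied to the input pointer after each
//                 nonzero (precomputed from the nonzeros' column indices)
//   nidx_nnzmap - number of nonzeros per output channel (or channel block)
//   c           - output, with a stride of m floats between channels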
void xnn_f32_spmm_ukernel_16x4__neonfma(
    uint32_t m,
    uint32_t n,
    const float*restrict a,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict c,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(m != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t i = m;
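  // Main loop: process the dense dimension in blocks of 16 rows.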
  while XNN_LIKELY(i >= 16) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
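    // Blocked loop over the output channels, 4 channels per iteration.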
    while (j >= 4) {
      uint32_t nnz = *nnzmap++;
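      // Initialize all accumulators with the per-channel bias values read
      // from the packed weights stream.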
      float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c0 = vacc0123c0;
      float32x4_t vacc89ABc0 = vacc0123c0;
      float32x4_t vaccCDEFc0 = vacc0123c0;
      float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c1 = vacc0123c1;
      float32x4_t vacc89ABc1 = vacc0123c1;
      float32x4_t vaccCDEFc1 = vacc0123c1;
      float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c2 = vacc0123c2;
      float32x4_t vacc89ABc2 = vacc0123c2;
      float32x4_t vaccCDEFc2 = vacc0123c2;
      float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c3 = vacc0123c3;
      float32x4_t vacc89ABc3 = vacc0123c3;
      float32x4_t vaccCDEFc3 = vacc0123c3;
      if XNN_LIKELY(nnz != 0) {
        do {
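          // Load 16 input rows, then advance the input pointer by the
          // precomputed byte offset to the next nonzero's input column.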
          const intptr_t diff = *dmap++;
          const float32x4_t va0123 = vld1q_f32(a);
          const float32x4_t va4567 = vld1q_f32(a + 4);
          const float32x4_t va89AB = vld1q_f32(a + 8);
          const float32x4_t vaCDEF = vld1q_f32(a + 12);
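          // Prefetch past the 16 floats just loaded.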
          __builtin_prefetch(a + 16);
          a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float32x4_t vb = vld1q_f32(w); w += 4;

          vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
          vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
          vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, va89AB, vb, 0);
          vaccCDEFc0 = vfmaq_laneq_f32(vaccCDEFc0, vaCDEF, vb, 0);
          vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
          vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
          vacc89ABc1 = vfmaq_laneq_f32(vacc89ABc1, va89AB, vb, 1);
          vaccCDEFc1 = vfmaq_laneq_f32(vaccCDEFc1, vaCDEF, vb, 1);
          vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
          vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
          vacc89ABc2 = vfmaq_laneq_f32(vacc89ABc2, va89AB, vb, 2);
          vaccCDEFc2 = vfmaq_laneq_f32(vaccCDEFc2, vaCDEF, vb, 2);
          vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          vacc89ABc3 = vfmaq_laneq_f32(vacc89ABc3, va89AB, vb, 3);
          vaccCDEFc3 = vfmaq_laneq_f32(vaccCDEFc3, vaCDEF, vb, 3);
        } while (--nnz != 0);
      }
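      // Clamp the accumulators to [min, max].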
      float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
      float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
      float32x4_t vout89ABc0 = vminq_f32(vacc89ABc0, vmax);
      float32x4_t voutCDEFc0 = vminq_f32(vaccCDEFc0, vmax);
      float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
      float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
      float32x4_t vout89ABc1 = vminq_f32(vacc89ABc1, vmax);
      float32x4_t voutCDEFc1 = vminq_f32(vaccCDEFc1, vmax);
      float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
      float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
      float32x4_t vout89ABc2 = vminq_f32(vacc89ABc2, vmax);
      float32x4_t voutCDEFc2 = vminq_f32(vaccCDEFc2, vmax);
      float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
      float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);
      float32x4_t vout89ABc3 = vminq_f32(vacc89ABc3, vmax);
      float32x4_t voutCDEFc3 = vminq_f32(vaccCDEFc3, vmax);

      vout0123c0 = vmaxq_f32(vout0123c0, vmin);
      vout4567c0 = vmaxq_f32(vout4567c0, vmin);
      vout89ABc0 = vmaxq_f32(vout89ABc0, vmin);
      voutCDEFc0 = vmaxq_f32(voutCDEFc0, vmin);
      vout0123c1 = vmaxq_f32(vout0123c1, vmin);
      vout4567c1 = vmaxq_f32(vout4567c1, vmin);
      vout89ABc1 = vmaxq_f32(vout89ABc1, vmin);
      voutCDEFc1 = vmaxq_f32(voutCDEFc1, vmin);
      vout0123c2 = vmaxq_f32(vout0123c2, vmin);
      vout4567c2 = vmaxq_f32(vout4567c2, vmin);
      vout89ABc2 = vmaxq_f32(vout89ABc2, vmin);
      voutCDEFc2 = vmaxq_f32(voutCDEFc2, vmin);
      vout0123c3 = vmaxq_f32(vout0123c3, vmin);
      vout4567c3 = vmaxq_f32(vout4567c3, vmin);
      vout89ABc3 = vmaxq_f32(vout89ABc3, vmin);
      voutCDEFc3 = vmaxq_f32(voutCDEFc3, vmin);

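      // Store the 16x4 output tile; each channel is m floats apart in c.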
      vst1q_f32(c + 0 * m + 0, vout0123c0);
      vst1q_f32(c + 0 * m + 4, vout4567c0);
      vst1q_f32(c + 0 * m + 8, vout89ABc0);
      vst1q_f32(c + 0 * m + 12, voutCDEFc0);
      vst1q_f32(c + 1 * m + 0, vout0123c1);
      vst1q_f32(c + 1 * m + 4, vout4567c1);
      vst1q_f32(c + 1 * m + 8, vout89ABc1);
      vst1q_f32(c + 1 * m + 12, voutCDEFc1);
      vst1q_f32(c + 2 * m + 0, vout0123c2);
      vst1q_f32(c + 2 * m + 4, vout4567c2);
      vst1q_f32(c + 2 * m + 8, vout89ABc2);
      vst1q_f32(c + 2 * m + 12, voutCDEFc2);
      vst1q_f32(c + 3 * m + 0, vout0123c3);
      vst1q_f32(c + 3 * m + 4, vout4567c3);
      vst1q_f32(c + 3 * m + 8, vout89ABc3);
      vst1q_f32(c + 3 * m + 12, voutCDEFc3);
      c += 4 * m;
      j -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(j != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        float32x4_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            const float32x4_t va89AB = vld1q_f32(a + 8);
            const float32x4_t vaCDEF = vld1q_f32(a + 12);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_dup_f32(w); w += 1;
            vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            vacc89AB = vfmaq_f32(vacc89AB, va89AB, vb);
            vaccCDEF = vfmaq_f32(vaccCDEF, vaCDEF, vb);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
        float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);
        voutCDEF = vmaxq_f32(voutCDEF, vmin);

        vst1q_f32(c + 0, vout0123);
        vst1q_f32(c + 4, vout4567);
        vst1q_f32(c + 8, vout89AB);
        vst1q_f32(c + 12, voutCDEF);
        c += m;
        j -= 1;
      } while (j != 0);
    }
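    // Rewind c to the first channel, then advance to the next 16 rows.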
    c -= m * n;
    c += 16;
    a += 16;
    i -= 16;
  }
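  // Handle the residual rows (m % 16) in progressively smaller blocks.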
  if XNN_UNLIKELY(i != 0) {
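    // 8 residual rows: same structure as the main loop.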
    if (i & 8) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c0 = vacc0123c0;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c1 = vacc0123c1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c2 = vacc0123c2;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c3 = vacc0123c3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
            vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
        float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout4567c0 = vmaxq_f32(vout4567c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout4567c1 = vmaxq_f32(vout4567c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout4567c2 = vmaxq_f32(vout4567c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);
        vout4567c3 = vmaxq_f32(vout4567c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 0 * m + 4, vout4567c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 1 * m + 4, vout4567c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 2 * m + 4, vout4567c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        vst1q_f32(c + 3 * m + 4, vout4567c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              const float32x4_t va4567 = vld1q_f32(a + 4);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
              vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(c + 0, vout0123);
          vst1q_f32(c + 4, vout4567);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 8;
      a += 8;
    }
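    // 4 residual rows.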
    if (i & 4) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(c + 0, vout0123);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 4;
      a += 4;
    }
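    // 2 residual rows, using 64-bit (float32x2_t) vectors.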
    if (i & 2) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va01 = vld1_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc01c0 = vfma_laneq_f32(vacc01c0, va01, vb, 0);
            vacc01c1 = vfma_laneq_f32(vacc01c1, va01, vb, 1);
            vacc01c2 = vfma_laneq_f32(vacc01c2, va01, vb, 2);
            vacc01c3 = vfma_laneq_f32(vacc01c3, va01, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax));
        float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax));
        float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax));
        float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax));

        vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin));
        vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin));
        vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin));
        vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin));

        vst1_f32(c + 0 * m + 0, vout01c0);
        vst1_f32(c + 1 * m + 0, vout01c1);
        vst1_f32(c + 2 * m + 0, vout01c2);
        vst1_f32(c + 3 * m + 0, vout01c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va01 = vld1_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, va01, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(c, vout01);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 2;
      a += 2;
    }
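    // 1 residual row, computed in a 64-bit vector; only lane 0 is stored.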
    if (i & 1) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va0 = vld1_dup_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0c0 = vfma_laneq_f32(vacc0c0, va0, vb, 0);
            vacc0c1 = vfma_laneq_f32(vacc0c1, va0, vb, 1);
            vacc0c2 = vfma_laneq_f32(vacc0c2, va0, vb, 2);
            vacc0c3 = vfma_laneq_f32(vacc0c3, va0, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax));
        float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax));
        float32x2_t vout0c2 = vmin_f32(vacc0c2, vget_low_f32(vmax));
        float32x2_t vout0c3 = vmin_f32(vacc0c3, vget_low_f32(vmax));

        vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin));
        vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin));
        vout0c2 = vmax_f32(vout0c2, vget_low_f32(vmin));
        vout0c3 = vmax_f32(vout0c3, vget_low_f32(vmin));

        vst1_lane_f32(c + 0 * m + 0, vout0c0, 0);
        vst1_lane_f32(c + 1 * m + 0, vout0c1, 0);
        vst1_lane_f32(c + 2 * m + 0, vout0c2, 0);
        vst1_lane_f32(c + 3 * m + 0, vout0c3, 0);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va0 = vld1_dup_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, va0, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(c, vout0, 0);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}