// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

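// SpMM (sparse matrix times dense matrix) microkernel: each pass produces 16
// output elements along M for a block of 4 output channels along N. As the
// pointer arithmetic below suggests: `a` holds the dense input, `widx_dmap`
// holds byte differences between consecutive nonzero input rows,
// `nidx_nnzmap` holds one nonzero count per (block of) output channels, and
// `c` is written with a stride of `m` floats per output channel.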
void xnn_f32_spmm_ukernel_16x4__neonfma(
    uint32_t m,
    uint32_t n,
    const float*restrict a,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict c,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(m != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t i = m;
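  // Main loop: 16 M elements per pass; M remainders of 8/4/2/1 are handled
  // after this loop.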
  while XNN_LIKELY(i >= 16) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
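    // Blocked N path: 4 output channels share one sparsity pattern, so a
    // single nnz count and one index stream drive all 4 channels' accumulators.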
    while (j >= 4) {
      uint32_t nnz = *nnzmap++;
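      // Each channel's 16 accumulators start from its initial value (the
      // bias in the packed weight stream), broadcast across all lanes.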
      float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c0 = vacc0123c0;
      float32x4_t vacc89ABc0 = vacc0123c0;
      float32x4_t vaccCDEFc0 = vacc0123c0;
      float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c1 = vacc0123c1;
      float32x4_t vacc89ABc1 = vacc0123c1;
      float32x4_t vaccCDEFc1 = vacc0123c1;
      float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c2 = vacc0123c2;
      float32x4_t vacc89ABc2 = vacc0123c2;
      float32x4_t vaccCDEFc2 = vacc0123c2;
      float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c3 = vacc0123c3;
      float32x4_t vacc89ABc3 = vacc0123c3;
      float32x4_t vaccCDEFc3 = vacc0123c3;
      if XNN_LIKELY(nnz != 0) {
        do {
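          // `diff` is the byte offset from the current input row to the row
          // of the next nonzero weight; `a` advances by it after the loads.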
          const intptr_t diff = *dmap++;
          const float32x4_t va0123 = vld1q_f32(a);
          const float32x4_t va4567 = vld1q_f32(a + 4);
          const float32x4_t va89AB = vld1q_f32(a + 8);
          const float32x4_t vaCDEF = vld1q_f32(a + 12);
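          // Prefetch past the 16 elements just loaded to hide load latency.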
          __builtin_prefetch(a + 16);
          a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float32x4_t vb = vld1q_f32(w); w += 4;

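          // One weight per channel in the block: lane-indexed FMAs multiply
          // the same 16 inputs by each channel's weight.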
          vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
          vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
          vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, va89AB, vb, 0);
          vaccCDEFc0 = vfmaq_laneq_f32(vaccCDEFc0, vaCDEF, vb, 0);
          vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
          vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
          vacc89ABc1 = vfmaq_laneq_f32(vacc89ABc1, va89AB, vb, 1);
          vaccCDEFc1 = vfmaq_laneq_f32(vaccCDEFc1, vaCDEF, vb, 1);
          vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
          vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
          vacc89ABc2 = vfmaq_laneq_f32(vacc89ABc2, va89AB, vb, 2);
          vaccCDEFc2 = vfmaq_laneq_f32(vaccCDEFc2, vaCDEF, vb, 2);
          vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          vacc89ABc3 = vfmaq_laneq_f32(vacc89ABc3, va89AB, vb, 3);
          vaccCDEFc3 = vfmaq_laneq_f32(vaccCDEFc3, vaCDEF, vb, 3);
        } while (--nnz != 0);
      }
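      // Clamp the accumulated sums to [min, max] from the output params.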
      float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
      float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
      float32x4_t vout89ABc0 = vminq_f32(vacc89ABc0, vmax);
      float32x4_t voutCDEFc0 = vminq_f32(vaccCDEFc0, vmax);
      float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
      float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
      float32x4_t vout89ABc1 = vminq_f32(vacc89ABc1, vmax);
      float32x4_t voutCDEFc1 = vminq_f32(vaccCDEFc1, vmax);
      float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
      float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
      float32x4_t vout89ABc2 = vminq_f32(vacc89ABc2, vmax);
      float32x4_t voutCDEFc2 = vminq_f32(vaccCDEFc2, vmax);
      float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
      float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);
      float32x4_t vout89ABc3 = vminq_f32(vacc89ABc3, vmax);
      float32x4_t voutCDEFc3 = vminq_f32(vaccCDEFc3, vmax);

      vout0123c0 = vmaxq_f32(vout0123c0, vmin);
      vout4567c0 = vmaxq_f32(vout4567c0, vmin);
      vout89ABc0 = vmaxq_f32(vout89ABc0, vmin);
      voutCDEFc0 = vmaxq_f32(voutCDEFc0, vmin);
      vout0123c1 = vmaxq_f32(vout0123c1, vmin);
      vout4567c1 = vmaxq_f32(vout4567c1, vmin);
      vout89ABc1 = vmaxq_f32(vout89ABc1, vmin);
      voutCDEFc1 = vmaxq_f32(voutCDEFc1, vmin);
      vout0123c2 = vmaxq_f32(vout0123c2, vmin);
      vout4567c2 = vmaxq_f32(vout4567c2, vmin);
      vout89ABc2 = vmaxq_f32(vout89ABc2, vmin);
      voutCDEFc2 = vmaxq_f32(voutCDEFc2, vmin);
      vout0123c3 = vmaxq_f32(vout0123c3, vmin);
      vout4567c3 = vmaxq_f32(vout4567c3, vmin);
      vout89ABc3 = vmaxq_f32(vout89ABc3, vmin);
      voutCDEFc3 = vmaxq_f32(voutCDEFc3, vmin);

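      // Store: channel k of the block goes to c + k*m; then advance c past
      // the 4 channels just written.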
      vst1q_f32(c + 0 * m + 0, vout0123c0);
      vst1q_f32(c + 0 * m + 4, vout4567c0);
      vst1q_f32(c + 0 * m + 8, vout89ABc0);
      vst1q_f32(c + 0 * m + 12, voutCDEFc0);
      vst1q_f32(c + 1 * m + 0, vout0123c1);
      vst1q_f32(c + 1 * m + 4, vout4567c1);
      vst1q_f32(c + 1 * m + 8, vout89ABc1);
      vst1q_f32(c + 1 * m + 12, voutCDEFc1);
      vst1q_f32(c + 2 * m + 0, vout0123c2);
      vst1q_f32(c + 2 * m + 4, vout4567c2);
      vst1q_f32(c + 2 * m + 8, vout89ABc2);
      vst1q_f32(c + 2 * m + 12, voutCDEFc2);
      vst1q_f32(c + 3 * m + 0, vout0123c3);
      vst1q_f32(c + 3 * m + 4, vout4567c3);
      vst1q_f32(c + 3 * m + 8, vout89ABc3);
      vst1q_f32(c + 3 * m + 12, voutCDEFc3);
      c += 4 * m;
      j -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(j != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        float32x4_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            const float32x4_t va89AB = vld1q_f32(a + 8);
            const float32x4_t vaCDEF = vld1q_f32(a + 12);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_dup_f32(w); w += 1;
            vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            vacc89AB = vfmaq_f32(vacc89AB, va89AB, vb);
            vaccCDEF = vfmaq_f32(vaccCDEF, vaCDEF, vb);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
        float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);
        voutCDEF = vmaxq_f32(voutCDEF, vmin);

        vst1q_f32(c + 0, vout0123);
        vst1q_f32(c + 4, vout4567);
        vst1q_f32(c + 8, vout89AB);
        vst1q_f32(c + 12, voutCDEF);
        c += m;
        j -= 1;
      } while (j != 0);
    }
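    // All N channels done: rewind c to the row start, then advance both c
    // and a past the 16 M elements just processed.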
    c -= m * n;
    c += 16;
    a += 16;
    i -= 16;
  }
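  // M remainder paths: same structure as above, specialized for 8, 4, 2,
  // and 1 element(s).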
  if XNN_UNLIKELY(i != 0) {
    if (i & 8) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c0 = vacc0123c0;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c1 = vacc0123c1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c2 = vacc0123c2;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c3 = vacc0123c3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
            vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
        float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout4567c0 = vmaxq_f32(vout4567c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout4567c1 = vmaxq_f32(vout4567c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout4567c2 = vmaxq_f32(vout4567c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);
        vout4567c3 = vmaxq_f32(vout4567c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 0 * m + 4, vout4567c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 1 * m + 4, vout4567c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 2 * m + 4, vout4567c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        vst1q_f32(c + 3 * m + 4, vout4567c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              const float32x4_t va4567 = vld1q_f32(a + 4);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
              vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(c + 0, vout0123);
          vst1q_f32(c + 4, vout4567);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 8;
      a += 8;
    }
    if (i & 4) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(c + 0, vout0123);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 4;
      a += 4;
    }
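    // 2-element remainder: switch to 64-bit NEON d-registers (float32x2_t).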
    if (i & 2) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va01 = vld1_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc01c0 = vfma_laneq_f32(vacc01c0, va01, vb, 0);
            vacc01c1 = vfma_laneq_f32(vacc01c1, va01, vb, 1);
            vacc01c2 = vfma_laneq_f32(vacc01c2, va01, vb, 2);
            vacc01c3 = vfma_laneq_f32(vacc01c3, va01, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax));
        float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax));
        float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax));
        float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax));

        vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin));
        vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin));
        vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin));
        vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin));

        vst1_f32(c + 0 * m + 0, vout01c0);
        vst1_f32(c + 1 * m + 0, vout01c1);
        vst1_f32(c + 2 * m + 0, vout01c2);
        vst1_f32(c + 3 * m + 0, vout01c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va01 = vld1_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, va01, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(c, vout01);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 2;
      a += 2;
    }
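    // 1-element remainder: compute in 2-lane vectors, store only lane 0.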
    if (i & 1) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va0 = vld1_dup_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0c0 = vfma_laneq_f32(vacc0c0, va0, vb, 0);
            vacc0c1 = vfma_laneq_f32(vacc0c1, va0, vb, 1);
            vacc0c2 = vfma_laneq_f32(vacc0c2, va0, vb, 2);
            vacc0c3 = vfma_laneq_f32(vacc0c3, va0, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax));
        float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax));
        float32x2_t vout0c2 = vmin_f32(vacc0c2, vget_low_f32(vmax));
        float32x2_t vout0c3 = vmin_f32(vacc0c3, vget_low_f32(vmax));

        vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin));
        vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin));
        vout0c2 = vmax_f32(vout0c2, vget_low_f32(vmin));
        vout0c3 = vmax_f32(vout0c3, vget_low_f32(vmin));

        vst1_lane_f32(c + 0 * m + 0, vout0c0, 0);
        vst1_lane_f32(c + 1 * m + 0, vout0c1, 0);
        vst1_lane_f32(c + 2 * m + 0, vout0c2, 0);
        vst1_lane_f32(c + 3 * m + 0, vout0c3, 0);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va0 = vld1_dup_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, va0, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(c, vout0, 0);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}