// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

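/*
 * Operand layout, as inferred from the loads and stores below:
 *
 *   m           - number of output pixels (MR dimension); handled in blocks
 *                 of 12, with 8/4/2/1 remainder paths.
 *   n           - number of output channels (NR dimension); handled in
 *                 blocks of 4 channels that share one sparsity pattern,
 *                 with a single-channel cleanup loop.
 *   a           - dense input, read one channel of m values at a time;
 *                 advanced only via the byte differences in widx_dmap.
 *   weights     - per channel block: one bias per channel, then the
 *                 non-zero weight values.
 *   widx_dmap   - per non-zero: signed byte difference to add to the input
 *                 pointer to reach the next non-zero's input channel.
 *   nidx_nnzmap - number of non-zeros per channel block (4 channels in the
 *                 blocked loop, 1 channel in the cleanup loop).
 *   c           - output, with a stride of m floats between channels.
 *   params      - scalar min/max clamping bounds.
 */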
void xnn_f32_spmm_ukernel_12x4__neonfma(
    uint32_t m,
    uint32_t n,
    const float*restrict a,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict c,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(m != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t i = m;
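  // Main loop: process output pixels in blocks of MR=12 (three NEON quads
  // per accumulator column).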
  while XNN_LIKELY(i >= 12) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
    while (j >= 4) {
      uint32_t nnz = *nnzmap++;
      float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c0 = vacc0123c0;
      float32x4_t vacc89ABc0 = vacc0123c0;
      float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c1 = vacc0123c1;
      float32x4_t vacc89ABc1 = vacc0123c1;
      float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c2 = vacc0123c2;
      float32x4_t vacc89ABc2 = vacc0123c2;
      float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c3 = vacc0123c3;
      float32x4_t vacc89ABc3 = vacc0123c3;
      if XNN_LIKELY(nnz != 0) {
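        // Walk the non-zeros: load 12 inputs for the current input channel,
        // then advance the input pointer by the signed byte delta from the
        // diff map.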
        do {
          const intptr_t diff = *dmap++;
          const float32x4_t va0123 = vld1q_f32(a);
          const float32x4_t va4567 = vld1q_f32(a + 4);
          const float32x4_t va89AB = vld1q_f32(a + 8);
          a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float32x4_t vb = vld1q_f32(w); w += 4;

          vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
          vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
          vacc89ABc0 = vfmaq_laneq_f32(vacc89ABc0, va89AB, vb, 0);
          vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
          vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
          vacc89ABc1 = vfmaq_laneq_f32(vacc89ABc1, va89AB, vb, 1);
          vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
          vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
          vacc89ABc2 = vfmaq_laneq_f32(vacc89ABc2, va89AB, vb, 2);
          vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          vacc89ABc3 = vfmaq_laneq_f32(vacc89ABc3, va89AB, vb, 3);
        } while (--nnz != 0);
      }
      float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
      float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
      float32x4_t vout89ABc0 = vminq_f32(vacc89ABc0, vmax);
      float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
      float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
      float32x4_t vout89ABc1 = vminq_f32(vacc89ABc1, vmax);
      float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
      float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
      float32x4_t vout89ABc2 = vminq_f32(vacc89ABc2, vmax);
      float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
      float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);
      float32x4_t vout89ABc3 = vminq_f32(vacc89ABc3, vmax);

      vout0123c0 = vmaxq_f32(vout0123c0, vmin);
      vout4567c0 = vmaxq_f32(vout4567c0, vmin);
      vout89ABc0 = vmaxq_f32(vout89ABc0, vmin);
      vout0123c1 = vmaxq_f32(vout0123c1, vmin);
      vout4567c1 = vmaxq_f32(vout4567c1, vmin);
      vout89ABc1 = vmaxq_f32(vout89ABc1, vmin);
      vout0123c2 = vmaxq_f32(vout0123c2, vmin);
      vout4567c2 = vmaxq_f32(vout4567c2, vmin);
      vout89ABc2 = vmaxq_f32(vout89ABc2, vmin);
      vout0123c3 = vmaxq_f32(vout0123c3, vmin);
      vout4567c3 = vmaxq_f32(vout4567c3, vmin);
      vout89ABc3 = vmaxq_f32(vout89ABc3, vmin);

      vst1q_f32(c + 0 * m + 0, vout0123c0);
      vst1q_f32(c + 0 * m + 4, vout4567c0);
      vst1q_f32(c + 0 * m + 8, vout89ABc0);
      vst1q_f32(c + 1 * m + 0, vout0123c1);
      vst1q_f32(c + 1 * m + 4, vout4567c1);
      vst1q_f32(c + 1 * m + 8, vout89ABc1);
      vst1q_f32(c + 2 * m + 0, vout0123c2);
      vst1q_f32(c + 2 * m + 4, vout4567c2);
      vst1q_f32(c + 2 * m + 8, vout89ABc2);
      vst1q_f32(c + 3 * m + 0, vout0123c3);
      vst1q_f32(c + 3 * m + 4, vout4567c3);
      vst1q_f32(c + 3 * m + 8, vout89ABc3);
      c += 4 * m;
      j -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(j != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            const float32x4_t va89AB = vld1q_f32(a + 8);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_dup_f32(w); w += 1;
            vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            vacc89AB = vfmaq_f32(vacc89AB, va89AB, vb);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);

        vst1q_f32(c + 0, vout0123);
        vst1q_f32(c + 4, vout4567);
        vst1q_f32(c + 8, vout89AB);
        c += m;
        j -= 1;
      } while (j != 0);
    }
    c -= m * n;
    c += 12;
    a += 12;
    i -= 12;
  }
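  // Remainder: handle 8, 4, 2, then 1 leftover output pixels with the same
  // blocked scheme, using progressively narrower accumulators.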
  if XNN_UNLIKELY(i != 0) {
    if (i & 8) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c0 = vacc0123c0;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c1 = vacc0123c1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c2 = vacc0123c2;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567c3 = vacc0123c3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc4567c0 = vfmaq_laneq_f32(vacc4567c0, va4567, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc4567c1 = vfmaq_laneq_f32(vacc4567c1, va4567, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc4567c2 = vfmaq_laneq_f32(vacc4567c2, va4567, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
            vacc4567c3 = vfmaq_laneq_f32(vacc4567c3, va4567, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout4567c2 = vminq_f32(vacc4567c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);
        float32x4_t vout4567c3 = vminq_f32(vacc4567c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout4567c0 = vmaxq_f32(vout4567c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout4567c1 = vmaxq_f32(vout4567c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout4567c2 = vmaxq_f32(vout4567c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);
        vout4567c3 = vmaxq_f32(vout4567c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 0 * m + 4, vout4567c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 1 * m + 4, vout4567c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 2 * m + 4, vout4567c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        vst1q_f32(c + 3 * m + 4, vout4567c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              const float32x4_t va4567 = vld1q_f32(a + 4);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
              vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(c + 0, vout0123);
          vst1q_f32(c + 4, vout4567);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 8;
      a += 8;
    }
    if (i & 4) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
            vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
            vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
            vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
        float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
        float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);
        vout0123c2 = vmaxq_f32(vout0123c2, vmin);
        vout0123c3 = vmaxq_f32(vout0123c3, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        vst1q_f32(c + 2 * m + 0, vout0123c2);
        vst1q_f32(c + 3 * m + 0, vout0123c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(c + 0, vout0123);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 4;
      a += 4;
    }
    if (i & 2) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va01 = vld1_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc01c0 = vfma_laneq_f32(vacc01c0, va01, vb, 0);
            vacc01c1 = vfma_laneq_f32(vacc01c1, va01, vb, 1);
            vacc01c2 = vfma_laneq_f32(vacc01c2, va01, vb, 2);
            vacc01c3 = vfma_laneq_f32(vacc01c3, va01, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax));
        float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax));
        float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax));
        float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax));

        vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin));
        vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin));
        vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin));
        vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin));

        vst1_f32(c + 0 * m + 0, vout01c0);
        vst1_f32(c + 1 * m + 0, vout01c1);
        vst1_f32(c + 2 * m + 0, vout01c2);
        vst1_f32(c + 3 * m + 0, vout01c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va01 = vld1_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, va01, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(c, vout01);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 2;
      a += 2;
    }
    if (i & 1) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va0 = vld1_dup_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0c0 = vfma_laneq_f32(vacc0c0, va0, vb, 0);
            vacc0c1 = vfma_laneq_f32(vacc0c1, va0, vb, 1);
            vacc0c2 = vfma_laneq_f32(vacc0c2, va0, vb, 2);
            vacc0c3 = vfma_laneq_f32(vacc0c3, va0, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax));
        float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax));
        float32x2_t vout0c2 = vmin_f32(vacc0c2, vget_low_f32(vmax));
        float32x2_t vout0c3 = vmin_f32(vacc0c3, vget_low_f32(vmax));

        vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin));
        vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin));
        vout0c2 = vmax_f32(vout0c2, vget_low_f32(vmin));
        vout0c3 = vmax_f32(vout0c3, vget_low_f32(vmin));

        vst1_lane_f32(c + 0 * m + 0, vout0c0, 0);
        vst1_lane_f32(c + 1 * m + 0, vout0c1, 0);
        vst1_lane_f32(c + 2 * m + 0, vout0c2, 0);
        vst1_lane_f32(c + 3 * m + 0, vout0c3, 0);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va0 = vld1_dup_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, va0, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(c, vout0, 0);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}
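
/*
 * Illustrative usage sketch (an addition, not part of the generated kernel).
 * It hand-packs the smallest possible encoding: m = 12 pixels and n = 1
 * output channel with a single non-zero weight on input channel 0, so only
 * the MR=12 main loop and its nr=1 cleanup path execute. The layout (bias
 * per channel followed by the non-zero values, one nnzmap entry per channel,
 * dmap holding byte deltas between consecutive non-zeros) is inferred from
 * the kernel above; in real use it is produced by XNNPACK's packing code.
 */
#if 0  /* example only: keep this translation unit kernel-only */
#include <stdio.h>

int main(void) {
  float a[12];  /* dense input: 1 input channel x 12 pixels, channel-major */
  for (int p = 0; p < 12; p++) a[p] = (float) p;

  const float weights[2] = { 0.5f /* bias */, 2.0f /* non-zero weight */ };
  const uint32_t nnzmap[1] = { 1 };  /* one non-zero in the only channel */
  const int32_t dmap[1] = { 0 };     /* single non-zero: no pointer advance */

  union xnn_f32_output_params params;
  params.scalar.min = 0.0f;  /* ReLU6-style clamp */
  params.scalar.max = 6.0f;

  float c[12];  /* output: n channels x m pixels, channel stride m */
  xnn_f32_spmm_ukernel_12x4__neonfma(12, 1, a, weights, dmap, nnzmap, c, &params);

  for (int p = 0; p < 12; p++) {
    printf("c[%d] = %f\n", p, c[p]);  /* expect clamp(0.5 + 2*p, 0, 6) */
  }
  return 0;
}
#endif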