// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2(
19     size_t input_height,
20     size_t input_width,
21     const float* input,
22     const float* weights,
23     const float* zero,
24     float* output,
25     uint32_t padding_top,
26     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28   assert(input_height != 0);
29   assert(input_width != 0);
30   assert(input_width % sizeof(float) == 0);
31   assert(padding_top == 2);
32 
33   const uint32x4_t vmask = vld1q_u32(params->neon.mask);
34   const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
35   const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
36 
37   const float32x4_t vw0123 = vld1q_f32(weights);
38   const float32x4_t vw4567 = vld1q_f32(weights + 4);
39   const float32x4_t vw89AB = vld1q_f32(weights + 8);
40   const float32x4_t vwCDEF = vld1q_f32(weights + 12);
41   const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
42   const float32x4_t vwKLMN = vld1q_f32(weights + 20);
43   const float32x2_t vwOP = vld1_f32(weights + 24);
44 
45   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
46 
47   const float* i0 = zero;
48   const float* i1 = zero;
49   const float* i2 = input;
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
52   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
53   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
54 
55   float* o0 = output;
56   float* o1 = (float*) ((uintptr_t) o0 + input_width);
57   float* o2 = (float*) ((uintptr_t) o1 + input_width);
58 
59   size_t output_height = input_height;
60   do {
61     if XNN_UNPREDICTABLE(output_height < 2) {
62       i3 = zero;
63       o1 = o0;
64     }
65     if XNN_UNPREDICTABLE(output_height < 3) {
66       i4 = zero;
67       o2 = o1;
68     }
69     if XNN_UNPREDICTABLE(output_height < 4) {
70       i5 = zero;
71     }
72     if XNN_UNPREDICTABLE(output_height < 5) {
73       i6 = zero;
74     }
75 
76     float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
77     float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
78     float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
79     float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
80     float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
81     float32x4_t vi5x0123 = vmovq_n_f32(0.0f);
82     float32x4_t vi6x0123 = vmovq_n_f32(0.0f);
83 
84     float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
85     float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
86     float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
87     float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
88     float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
89     float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
90     float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
91 
92     size_t w = input_width;
93     for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
94       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
95       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
96       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
97 
98       const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
99       const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
100       const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
101       const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
102       const float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
103       const float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
104       const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
105 
106       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
107       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
108       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
109 
110       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
111       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
112       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
113 
114       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
115       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
116       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
117 
118       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
119       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
120       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
121 
122       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
123       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
124       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
125 
126       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
127       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
128       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
129       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
130       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
131       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
132       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
133 
134       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
135       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
136       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
137 
138       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
139       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
140       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
141 
142       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
143       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
144       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
145 
146       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
147       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
148       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
149 
150       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
151       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
152       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
153 
154       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
155       vi0x0123 = vi0x4567;
156       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
157       vi1x0123 = vi1x4567;
158       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
159       vi2x0123 = vi2x4567;
160       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
161       vi3x0123 = vi3x4567;
162       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
163       vi4x0123 = vi4x4567;
164       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
165       vi5x0123 = vi5x4567;
166       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
167       vi6x0123 = vi6x4567;
168 
169       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
170       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
171       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
172 
173       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
174       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
175       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
176 
177       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
178       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
179       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
180 
181       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
182       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
183       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
184 
185       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
186       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
187       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
188 
189       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
190       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
191       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
192       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
193       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
194       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
195       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
196 
197       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
198       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
199       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
200 
201       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
202       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
203       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
204 
205       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
206       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
207       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
208 
209       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
210       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
211       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
212 
213       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
214       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
215       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
216 
217       const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
218       vi0x4567 = vi0x89AB;
219       const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
220       vi1x4567 = vi1x89AB;
221       const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
222       vi2x4567 = vi2x89AB;
223       const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
224       vi3x4567 = vi3x89AB;
225       const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
226       vi4x4567 = vi4x89AB;
227       const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
228       vi5x4567 = vi5x89AB;
229       const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
230       vi6x4567 = vi6x89AB;
231 
232       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
233       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
234       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
235 
236       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
237       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
238       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
239 
240       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
241       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
242       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
243 
244       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
245       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
246       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
247 
248       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
249       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
250       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
251 
252       vo0p0 = vaddq_f32(vo0p0, vo0p1);
253       vo1p0 = vaddq_f32(vo1p0, vo1p1);
254       vo2p0 = vaddq_f32(vo2p0, vo2p1);
255 
256       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
257       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
258       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
259 
260       vo0 = vminq_f32(vo0, vmax);
261       vo1 = vminq_f32(vo1, vmax);
262       vo2 = vminq_f32(vo2, vmax);
263 
264       vst1q_f32(o2, vo2); o2 += 4;
265       vst1q_f32(o1, vo1); o1 += 4;
266       vst1q_f32(o0, vo0); o0 += 4;
267     }
268     // Always process the last block of 5..8 pixels.
269     if XNN_LIKELY(w > 4 * sizeof(float)) {
270       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
271       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
272       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
273 
274       float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
275       float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
276       float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
277       float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
278       float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
279       float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
280       float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
281 
282       vi0x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x89AB)));
283       vi1x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x89AB)));
284       vi2x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x89AB)));
285       vi3x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x89AB)));
286       vi4x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x89AB)));
287       vi5x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x89AB)));
288       vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
289 
290       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
291       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
292       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
293 
294       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
295       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
296       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
297 
298       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
299       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
300       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
301 
302       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
303       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
304       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
305 
306       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
307       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
308       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
309 
310       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
311       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
312       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
313       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
314       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
315       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
316       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
317 
318       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
319       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
320       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
321 
322       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
323       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
324       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
325 
326       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
327       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
328       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
329 
330       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
331       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
332       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
333 
334       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
335       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
336       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
337 
338       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
339       vi0x0123 = vi0x4567;
340       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
341       vi1x0123 = vi1x4567;
342       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
343       vi2x0123 = vi2x4567;
344       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
345       vi3x0123 = vi3x4567;
346       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
347       vi4x0123 = vi4x4567;
348       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
349       vi5x0123 = vi5x4567;
350       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
351       vi6x0123 = vi6x4567;
352 
353       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
354       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
355       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
356 
357       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
358       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
359       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
360 
361       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
362       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
363       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
364 
365       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
366       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
367       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
368 
369       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
370       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
371       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
372 
373       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
374       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
375       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
376       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
377       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
378       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
379       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
380 
381       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
382       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
383       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
384 
385       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
386       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
387       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
388 
389       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
390       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
391       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
392 
393       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
394       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
395       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
396 
397       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
398       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
399       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
400 
401       const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
402       vi0x4567 = vi0x89AB;
403       const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
404       vi1x4567 = vi1x89AB;
405       const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
406       vi2x4567 = vi2x89AB;
407       const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
408       vi3x4567 = vi3x89AB;
409       const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
410       vi4x4567 = vi4x89AB;
411       const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
412       vi5x4567 = vi5x89AB;
413       const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
414       vi6x4567 = vi6x89AB;
415 
416       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
417       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
418       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
419 
420       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
421       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
422       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
423 
424       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
425       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
426       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
427 
428       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
429       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
430       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
431 
432       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
433       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
434       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
435 
436       vo0p0 = vaddq_f32(vo0p0, vo0p1);
437       vo1p0 = vaddq_f32(vo1p0, vo1p1);
438       vo2p0 = vaddq_f32(vo2p0, vo2p1);
439 
440       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
441       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
442       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
443 
444       vo0 = vminq_f32(vo0, vmax);
445       vo1 = vminq_f32(vo1, vmax);
446       vo2 = vminq_f32(vo2, vmax);
447 
448       vst1q_f32(o2, vo2); o2 += 4;
449       vst1q_f32(o1, vo1); o1 += 4;
450       vst1q_f32(o0, vo0); o0 += 4;
451 
452       w -= 4 * sizeof(float);
453     }
454     assert(w >= 1 * sizeof(float));
455     assert(w <= 4 * sizeof(float));
456     {
457       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
458       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
459       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
460 
461       vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
462       vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
463       vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
464       vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
465       vi4x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x4567)));
466       vi5x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x4567)));
467       vi6x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x4567)));
468 
469       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
470       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
471       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
472 
473       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
474       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
475       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
476 
477       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
478       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
479       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
480 
481       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
482       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
483       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
484 
485       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
486       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
487       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
488 
489       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
490       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
491       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
492       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
493       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
494       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
495       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
496 
497       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
498       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
499       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
500 
501       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
502       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
503       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
504 
505       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
506       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
507       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
508 
509       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
510       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
511       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
512 
513       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
514       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
515       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
516 
517       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
518       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
519       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
520       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
521       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
522       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
523       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
524 
525       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
526       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
527       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
528 
529       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
530       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
531       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
532 
533       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
534       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
535       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
536 
537       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
538       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
539       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
540 
541       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
542       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
543       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
544 
545       const float32x4_t vzero = vmovq_n_f32(0.0f);
546       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
547       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
548       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
549       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
550       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1);
551       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);
552       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1);
553 
554       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
555       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
556       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
557 
558       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
559       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
560       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
561 
562       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
563       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
564       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
565 
566       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
567       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
568       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
569 
570       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
571       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
572       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
573 
574       const float32x4_t vi0x6789 = vextq_f32(vi0x5678, vzero, 1);
575       const float32x4_t vi1x6789 = vextq_f32(vi1x5678, vzero, 1);
576       const float32x4_t vi2x6789 = vextq_f32(vi2x5678, vzero, 1);
577       const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1);
578       const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1);
579       const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1);
580       const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1);
581 
582       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
583       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
584       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
585 
586       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
587       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
588       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
589 
590       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
591       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
592       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
593 
594       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
595       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
596       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
597 
598       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
599       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
600       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
601 
602       vo0p0 = vaddq_f32(vo0p0, vo0p1);
603       vo1p0 = vaddq_f32(vo1p0, vo1p1);
604       vo2p0 = vaddq_f32(vo2p0, vo2p1);
605 
606       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
607       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
608       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
609 
610       vo0 = vminq_f32(vo0, vmax);
611       vo1 = vminq_f32(vo1, vmax);
612       vo2 = vminq_f32(vo2, vmax);
613 
614       if XNN_LIKELY(w & (4 * sizeof(float))) {
615         vst1q_f32(o2, vo2); o2 += 4;
616         vst1q_f32(o1, vo1); o1 += 4;
617         vst1q_f32(o0, vo0); o0 += 4;
618       } else {
619         float32x2_t vo0_lo = vget_low_f32(vo0);
620         float32x2_t vo1_lo = vget_low_f32(vo1);
621         float32x2_t vo2_lo = vget_low_f32(vo2);
622         if (w & (2 * sizeof(float))) {
623           vst1_f32(o2, vo2_lo); o2 += 2;
624           vst1_f32(o1, vo1_lo); o1 += 2;
625           vst1_f32(o0, vo0_lo); o0 += 2;
626 
627           vo0_lo = vget_high_f32(vo0);
628           vo1_lo = vget_high_f32(vo1);
629           vo2_lo = vget_high_f32(vo2);
630         }
631         if (w & (1 * sizeof(float))) {
632           vst1_lane_f32(o2, vo2_lo, 0); o2 += 1;
633           vst1_lane_f32(o1, vo1_lo, 0); o1 += 1;
634           vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
635         }
636       }
637     }
638 
639     i0 = (const float*) ((uintptr_t) i3 - input_decrement);
640     i1 = (const float*) ((uintptr_t) i4 - input_decrement);
641     i2 = (const float*) ((uintptr_t) i1 + input_width);
642     i3 = (const float*) ((uintptr_t) i2 + input_width);
643     i4 = (const float*) ((uintptr_t) i3 + input_width);
644     i5 = (const float*) ((uintptr_t) i4 + input_width);
645     i6 = (const float*) ((uintptr_t) i5 + input_width);
646 
647     o0 = o2;
648     o1 = (float*) ((uintptr_t) o0 + input_width);
649     o2 = (float*) ((uintptr_t) o1 + input_width);
650 
651     output_height = doz(output_height, 3);
652   } while (output_height != 0);
653 }
654