• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-conv-hwc/3x3s2p1c3-neon-x2.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 
11 #include <assert.h>
12 
13 #include <arm_neon.h>
14 
15 #include <xnnpack/conv.h>
16 #include <xnnpack/math.h>
17 
18 
19 void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2(
20     size_t input_height,
21     size_t input_width,
22     size_t output_y_start,
23     size_t output_y_end,
24     const float* input,
25     const float* zero,
26     const float* weights,
27     float* output,
28     size_t input_padding_top,
29     size_t output_channels,
30     size_t output_height_stride,
31     size_t output_width_stride,
32     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34   assert(input_width != 0);
35   assert(output_y_end > output_y_start);
36   assert(input_padding_top <= 1);
37   assert(output_channels != 0);
38 
39   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40   const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
41   const size_t output_width = (input_width + 1) / 2;
42   const size_t output_channel_decrement = output_width * output_width_stride - 8 * sizeof(float);
43   const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 8) * sizeof(float);
44 
45   // Adjustment for padding processed below
46   const float* i0 = (const float*) ((uintptr_t) input +
47     input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52   float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53   float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54 
55   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56     i0 = zero;
57   }
58 
59   const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
60   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
61 
62   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
63     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
64     const size_t input_y4 = input_y2 + 2;
65     if XNN_UNPREDICTABLE(input_y2 > input_height) {
66       i1 = zero;
67     }
68     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
69       i2 = zero;
70     }
71     if XNN_UNPREDICTABLE(input_y4 > input_height) {
72       i3 = zero;
73     }
74     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
75       i4 = zero;
76     }
77     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
78       o1 = o0;
79     }
80 
81     const float* w = weights;
82     size_t c = output_channels;
83     do {
84       // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
85       float32x4_t vi0x0 = vmovq_n_f32(0.0f);
86       float32x4_t vi1x0 = vmovq_n_f32(0.0f);
87       float32x4_t vi2x0 = vmovq_n_f32(0.0f);
88       float32x4_t vi3x0 = vmovq_n_f32(0.0f);
89       float32x4_t vi4x0 = vmovq_n_f32(0.0f);
90 
91       size_t iw = input_width;
92       for (; iw >= 4; iw -= 4) {
93         float32x4_t vo0x0c0123 = vld1q_f32(w);
94         float32x4_t vo0x0c4567 = vld1q_f32(w + 4);
95         float32x4_t vo1x0c0123 = vo0x0c0123;
96         float32x4_t vo1x0c4567 = vo0x0c4567;
97         float32x4_t vo0x1c0123 = vo0x0c0123;
98         float32x4_t vo0x1c4567 = vo0x0c4567;
99         float32x4_t vo1x1c0123 = vo0x0c0123;
100         float32x4_t vo1x1c4567 = vo0x0c4567;
101 
102         const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
103         const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
104 
105         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
106         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
107         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
108         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
109         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
110         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
111 
112         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
113         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
114         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c0x4567, vget_low_f32(vi0x0), 1);
115         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 1);
116 
117         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 1);
118         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1);
119         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c0x4567, vget_high_f32(vi0x1), 1);
120         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 1);
121 
122         const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
123         const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
124 
125         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
126         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
127         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c0x4567, vget_low_f32(vi1x0), 1);
128         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c0x4567, vget_low_f32(vi3x0), 1);
129 
130         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 1);
131         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 1);
132         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c0x4567, vget_high_f32(vi1x1), 1);
133         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c0x4567, vget_high_f32(vi3x1), 1);
134 
135         const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
136         const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
137 
138         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
139         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
140         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1);
141         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c0x4567, vget_low_f32(vi4x0), 1);
142 
143         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1);
144         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 1);
145         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 1);
146         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c0x4567, vget_high_f32(vi4x1), 1);
147 
148         const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
149         const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
150 
151         // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
152         const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
153         const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
154         const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
155         const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
156         const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;
157 
158         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
159         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
160         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c1x4567, vget_high_f32(vi0x0), 0);
161         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_high_f32(vi2x0), 0);
162 
163         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_low_f32(vi0x2), 0);
164         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_low_f32(vi2x2), 0);
165         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c1x4567, vget_low_f32(vi0x2), 0);
166         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c1x4567, vget_low_f32(vi2x2), 0);
167 
168         const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
169         const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
170 
171         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
172         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
173         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c1x4567, vget_high_f32(vi1x0), 0);
174         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c1x4567, vget_high_f32(vi3x0), 0);
175 
176         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_low_f32(vi1x2), 0);
177         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_low_f32(vi3x2), 0);
178         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c1x4567, vget_low_f32(vi1x2), 0);
179         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c1x4567, vget_low_f32(vi3x2), 0);
180 
181         const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
182         const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
183 
184         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
185         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
186         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0);
187         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c1x4567, vget_high_f32(vi4x0), 0);
188 
189         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_low_f32(vi2x2), 0);
190         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_low_f32(vi4x2), 0);
191         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c1x4567, vget_low_f32(vi2x2), 0);
192         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c1x4567, vget_low_f32(vi4x2), 0);
193 
194         const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
195         const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
196 
197         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
198         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
199         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c2x4567, vget_high_f32(vi0x0), 1);
200         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c2x4567, vget_high_f32(vi2x0), 1);
201 
202         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 1);
203         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 1);
204         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c2x4567, vget_low_f32(vi0x2), 1);
205         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c2x4567, vget_low_f32(vi2x2), 1);
206 
207         const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
208         const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
209 
210         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
211         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
212         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c2x4567, vget_high_f32(vi1x0), 1);
213         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c2x4567, vget_high_f32(vi3x0), 1);
214 
215         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 1);
216         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 1);
217         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c2x4567, vget_low_f32(vi1x2), 1);
218         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c2x4567, vget_low_f32(vi3x2), 1);
219 
220         const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
221         const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
222 
223         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
224         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
225         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c2x4567, vget_high_f32(vi2x0), 1);
226         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c2x4567, vget_high_f32(vi4x0), 1);
227 
228         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 1);
229         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 1);
230         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c2x4567, vget_low_f32(vi2x2), 1);
231         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c2x4567, vget_low_f32(vi4x2), 1);
232 
233         const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
234         const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
235 
236         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
237         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
238         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c0x4567, vget_low_f32(vi0x1), 0);
239         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c0x4567, vget_low_f32(vi2x1), 0);
240 
241         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_high_f32(vi0x2), 0);
242         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_high_f32(vi2x2), 0);
243         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c0x4567, vget_high_f32(vi0x2), 0);
244         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c0x4567, vget_high_f32(vi2x2), 0);
245 
246         const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
247         const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
248 
249         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
250         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
251         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c0x4567, vget_low_f32(vi1x1), 0);
252         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c0x4567, vget_low_f32(vi3x1), 0);
253 
254         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_high_f32(vi1x2), 0);
255         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_high_f32(vi3x2), 0);
256         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c0x4567, vget_high_f32(vi1x2), 0);
257         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c0x4567, vget_high_f32(vi3x2), 0);
258 
259         const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
260         const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
261 
262         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
263         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
264         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0);
265         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c0x4567, vget_low_f32(vi4x1), 0);
266 
267         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_high_f32(vi2x2), 0);
268         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_high_f32(vi4x2), 0);
269         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c0x4567, vget_high_f32(vi2x2), 0);
270         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c0x4567, vget_high_f32(vi4x2), 0);
271 
272         const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
273         const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
274 
275         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
276         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
277         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c1x4567, vget_low_f32(vi0x1), 1);
278         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c1x4567, vget_low_f32(vi2x1), 1);
279 
280         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 1);
281         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 1);
282         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c1x4567, vget_high_f32(vi0x2), 1);
283         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c1x4567, vget_high_f32(vi2x2), 1);
284 
285         const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
286         const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
287 
288         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
289         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
290         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c1x4567, vget_low_f32(vi1x1), 1);
291         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c1x4567, vget_low_f32(vi3x1), 1);
292 
293         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 1);
294         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 1);
295         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c1x4567, vget_high_f32(vi1x2), 1);
296         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c1x4567, vget_high_f32(vi3x2), 1);
297 
298         const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
299         const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
300 
301         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
302         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
303         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c1x4567, vget_low_f32(vi2x1), 1);
304         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c1x4567, vget_low_f32(vi4x1), 1);
305 
306         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 1);
307         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 1);
308         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c1x4567, vget_high_f32(vi2x2), 1);
309         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c1x4567, vget_high_f32(vi4x2), 1);
310 
311         const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
312         const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
313 
314         // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
315         const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
316         const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
317         const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
318         const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
319         const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;
320 
321         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
322         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
323         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c2x4567, vget_high_f32(vi0x1), 0);
324         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c2x4567, vget_high_f32(vi2x1), 0);
325 
326         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_low_f32(vi0x3), 0);
327         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_low_f32(vi2x3), 0);
328         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c2x4567, vget_low_f32(vi0x3), 0);
329         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c2x4567, vget_low_f32(vi2x3), 0);
330 
331         const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
332         const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
333 
334         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
335         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
336         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c2x4567, vget_high_f32(vi1x1), 0);
337         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c2x4567, vget_high_f32(vi3x1), 0);
338 
339         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_low_f32(vi1x3), 0);
340         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_low_f32(vi3x3), 0);
341         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c2x4567, vget_low_f32(vi1x3), 0);
342         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c2x4567, vget_low_f32(vi3x3), 0);
343 
344         const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
345         const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
346 
347         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
348         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
349         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c2x4567, vget_high_f32(vi2x1), 0);
350         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c2x4567, vget_high_f32(vi4x1), 0);
351 
352         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_low_f32(vi2x3), 0);
353         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_low_f32(vi4x3), 0);
354         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c2x4567, vget_low_f32(vi2x3), 0);
355         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c2x4567, vget_low_f32(vi4x3), 0);
356 
357         const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
358         const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
359 
360         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
361         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);
362         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c0x4567, vget_high_f32(vi0x1), 1);
363         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c0x4567, vget_high_f32(vi2x1), 1);
364 
365         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 1);
366         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 1);
367         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c0x4567, vget_low_f32(vi0x3), 1);
368         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c0x4567, vget_low_f32(vi2x3), 1);
369 
370         const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
371         const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
372 
373         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
374         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);
375         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c0x4567, vget_high_f32(vi1x1), 1);
376         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c0x4567, vget_high_f32(vi3x1), 1);
377 
378         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 1);
379         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 1);
380         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c0x4567, vget_low_f32(vi1x3), 1);
381         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c0x4567, vget_low_f32(vi3x3), 1);
382 
383         const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
384         const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
385 
386         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
387         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);
388         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c0x4567, vget_high_f32(vi2x1), 1);
389         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c0x4567, vget_high_f32(vi4x1), 1);
390 
391         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 1);
392         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 1);
393         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c0x4567, vget_low_f32(vi2x3), 1);
394         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c0x4567, vget_low_f32(vi4x3), 1);
395 
396         const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
397         const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
398 
399         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_low_f32(vi0x2), 0);
400         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_low_f32(vi2x2), 0);
401         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c1x4567, vget_low_f32(vi0x2), 0);
402         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c1x4567, vget_low_f32(vi2x2), 0);
403 
404         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_high_f32(vi0x3), 0);
405         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_high_f32(vi2x3), 0);
406         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c1x4567, vget_high_f32(vi0x3), 0);
407         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c1x4567, vget_high_f32(vi2x3), 0);
408 
409         const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
410         const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
411 
412         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_low_f32(vi1x2), 0);
413         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_low_f32(vi3x2), 0);
414         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c1x4567, vget_low_f32(vi1x2), 0);
415         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c1x4567, vget_low_f32(vi3x2), 0);
416 
417         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_high_f32(vi1x3), 0);
418         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_high_f32(vi3x3), 0);
419         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c1x4567, vget_high_f32(vi1x3), 0);
420         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c1x4567, vget_high_f32(vi3x3), 0);
421 
422         const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
423         const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
424 
425         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_low_f32(vi2x2), 0);
426         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_low_f32(vi4x2), 0);
427         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c1x4567, vget_low_f32(vi2x2), 0);
428         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c1x4567, vget_low_f32(vi4x2), 0);
429 
430         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_high_f32(vi2x3), 0);
431         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_high_f32(vi4x3), 0);
432         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c1x4567, vget_high_f32(vi2x3), 0);
433         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c1x4567, vget_high_f32(vi4x3), 0);
434 
435         const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
436         const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
437 
438         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 1);
439         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 1);
440         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c2x4567, vget_low_f32(vi0x2), 1);
441         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c2x4567, vget_low_f32(vi2x2), 1);
442 
443         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 1);
444         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 1);
445         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c2x4567, vget_high_f32(vi0x3), 1);
446         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c2x4567, vget_high_f32(vi2x3), 1);
447 
448         const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
449         const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
450 
451         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 1);
452         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 1);
453         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c2x4567, vget_low_f32(vi1x2), 1);
454         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c2x4567, vget_low_f32(vi3x2), 1);
455 
456         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 1);
457         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 1);
458         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c2x4567, vget_high_f32(vi1x3), 1);
459         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c2x4567, vget_high_f32(vi3x3), 1);
460 
461         const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
462         const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
463 
464         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 1);
465         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 1);
466         vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c2x4567, vget_low_f32(vi2x2), 1);
467         vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c2x4567, vget_low_f32(vi4x2), 1);
468 
469         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 1);
470         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 1);
471         vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c2x4567, vget_high_f32(vi2x3), 1);
472         vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c2x4567, vget_high_f32(vi4x3), 1);
473 
474         vi0x0 = vi0x3;
475         vi1x0 = vi1x3;
476         vi2x0 = vi2x3;
477         vi3x0 = vi3x3;
478         vi4x0 = vi4x3;
479 
480 
481         vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
482         vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);
483         vo0x0c4567 = vmaxq_f32(vo0x0c4567, vmin);
484         vo1x0c4567 = vmaxq_f32(vo1x0c4567, vmin);
485 
486         vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
487         vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);
488         vo0x1c4567 = vmaxq_f32(vo0x1c4567, vmin);
489         vo1x1c4567 = vmaxq_f32(vo1x1c4567, vmin);
490 
491         vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
492         vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);
493         vo0x0c4567 = vminq_f32(vo0x0c4567, vmax);
494         vo1x0c4567 = vminq_f32(vo1x0c4567, vmax);
495 
496         vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
497         vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);
498         vo0x1c4567 = vminq_f32(vo0x1c4567, vmax);
499         vo1x1c4567 = vminq_f32(vo1x1c4567, vmax);
500 
501         if XNN_LIKELY(c >= 8) {
502           vst1q_f32(o1, vo1x0c0123);
503           vst1q_f32(o1 + 4, vo1x0c4567);
504           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
505           vst1q_f32(o0, vo0x0c0123);
506           vst1q_f32(o0 + 4, vo0x0c4567);
507           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
508 
509           vst1q_f32(o1, vo1x1c0123);
510           vst1q_f32(o1 + 4, vo1x1c4567);
511           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
512           vst1q_f32(o0, vo0x1c0123);
513           vst1q_f32(o0 + 4, vo0x1c4567);
514           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
515         } else {
516           float* o0_tmp = o0;
517           float* o1_tmp = o1;
518           if (c & 4) {
519             vst1q_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c0123);
520             vo1x1c0123 = vo1x1c4567;
521             vst1q_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c0123);
522             vo0x1c0123 = vo0x1c4567;
523 
524             vst1q_f32(o1_tmp, vo1x0c0123); o1_tmp += 4;
525             vo1x0c0123 = vo1x0c4567;
526             vst1q_f32(o0_tmp, vo0x0c0123); o0_tmp += 4;
527             vo0x0c0123 = vo0x0c4567;
528           }
529           float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
530           float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
531           float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
532           float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
533           if (c & 2) {
534             vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
535             vo1x1c01 = vget_high_f32(vo1x1c0123);
536             vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
537             vo0x1c01 = vget_high_f32(vo0x1c0123);
538 
539             vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
540             vo1x0c01 = vget_high_f32(vo1x0c0123);
541             vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
542             vo0x0c01 = vget_high_f32(vo0x0c0123);
543           }
544           if (c & 1) {
545             vst1_lane_f32(o1_tmp, vo1x0c01, 0);
546             vst1_lane_f32(o0_tmp, vo0x0c01, 0);
547 
548             vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
549             vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
550           }
551 
552           o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
553           o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
554         }
555       }
556       assert(iw < 4);
557       if XNN_UNLIKELY(iw & 2) {
558         float32x4_t vo0c0123 = vld1q_f32(w);
559         float32x4_t vo0c4567 = vld1q_f32(w + 4);
560         float32x4_t vo1c0123 = vo0c0123;
561         float32x4_t vo1c4567 = vo0c4567;
562 
563         const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
564         const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
565 
566         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
567         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
568         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 1);
569         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 1);
570 
571         const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
572         const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
573 
574         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
575         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
576         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 1);
577         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 1);
578 
579         const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
580         const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
581 
582         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
583         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
584         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1);
585         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 1);
586 
587         const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
588         const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
589 
590         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
591         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
592         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_high_f32(vi0x0), 0);
593         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_high_f32(vi2x0), 0);
594 
595         const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
596         const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
597 
598         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
599         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
600         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_high_f32(vi1x0), 0);
601         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_high_f32(vi3x0), 0);
602 
603         const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
604         const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
605 
606         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
607         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
608         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0);
609         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_high_f32(vi4x0), 0);
610 
611         const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
612         const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
613 
614         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
615         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
616         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 1);
617         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 1);
618 
619         const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
620         const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
621 
622         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
623         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
624         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 1);
625         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 1);
626 
627         const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
628         const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
629 
630         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
631         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
632         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 1);
633         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 1);
634 
635         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
636         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
637         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
638         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
639         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
640         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
641 
642         const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
643         const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
644 
645         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
646         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
647         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_low_f32(vi0x1), 0);
648         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_low_f32(vi2x1), 0);
649 
650         const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
651         const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
652 
653         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
654         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
655         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_low_f32(vi1x1), 0);
656         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_low_f32(vi3x1), 0);
657 
658         const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
659         const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
660 
661         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
662         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
663         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0);
664         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_low_f32(vi4x1), 0);
665 
666         const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
667         const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
668 
669         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
670         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
671         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vget_low_f32(vi0x1), 1);
672         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 1);
673 
674         const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
675         const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
676 
677         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
678         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
679         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vget_low_f32(vi1x1), 1);
680         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vget_low_f32(vi3x1), 1);
681 
682         const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
683         const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
684 
685         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
686         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
687         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 1);
688         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vget_low_f32(vi4x1), 1);
689 
690         const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
691         const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
692 
693         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
694         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
695         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vget_high_f32(vi0x1), 0);
696         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vget_high_f32(vi2x1), 0);
697 
698         const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
699         const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
700 
701         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
702         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
703         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vget_high_f32(vi1x1), 0);
704         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vget_high_f32(vi3x1), 0);
705 
706         const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
707         const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
708 
709         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
710         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
711         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vget_high_f32(vi2x1), 0);
712         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vget_high_f32(vi4x1), 0);
713 
714         const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
715         const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
716 
717         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
718         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);
719         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c0x4567, vget_high_f32(vi0x1), 1);
720         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c0x4567, vget_high_f32(vi2x1), 1);
721 
722         const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
723         const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
724 
725         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
726         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);
727         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c0x4567, vget_high_f32(vi1x1), 1);
728         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c0x4567, vget_high_f32(vi3x1), 1);
729 
730         const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
731         const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
732 
733         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
734         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);
735         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c0x4567, vget_high_f32(vi2x1), 1);
736         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c0x4567, vget_high_f32(vi4x1), 1);
737 
738         // viMx2 = ( iM2c2, iM2c1 )
739         const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
740         const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
741         const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
742         const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
743         const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
744 
745         const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
746         const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
747 
748         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vi0x2, 0);
749         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vi2x2, 0);
750         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c1x4567, vi0x2, 0);
751         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c1x4567, vi2x2, 0);
752 
753         const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
754         const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
755 
756         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vi1x2, 0);
757         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vi3x2, 0);
758         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c1x4567, vi1x2, 0);
759         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c1x4567, vi3x2, 0);
760 
761         const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
762         const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
763 
764         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vi2x2, 0);
765         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vi4x2, 0);
766         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c1x4567, vi2x2, 0);
767         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c1x4567, vi4x2, 0);
768 
769         const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
770         const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
771 
772         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 1);
773         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 1);
774         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c2x4567, vi0x2, 1);
775         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c2x4567, vi2x2, 1);
776 
777         const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
778         const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
779 
780         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 1);
781         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 1);
782         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c2x4567, vi1x2, 1);
783         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c2x4567, vi3x2, 1);
784 
785         const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
786         const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
787 
788         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 1);
789         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 1);
790         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c2x4567, vi2x2, 1);
791         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c2x4567, vi4x2, 1);
792 
793         vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
794         vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
795         vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
796         vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
797         vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
798 
799 
800         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
801         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
802         vo0c4567 = vmaxq_f32(vo0c4567, vmin);
803         vo1c4567 = vmaxq_f32(vo1c4567, vmin);
804 
805         vo0c0123 = vminq_f32(vo0c0123, vmax);
806         vo1c0123 = vminq_f32(vo1c0123, vmax);
807         vo0c4567 = vminq_f32(vo0c4567, vmax);
808         vo1c4567 = vminq_f32(vo1c4567, vmax);
809 
810         if XNN_LIKELY(c >= 8) {
811           vst1q_f32(o1, vo1c0123);
812           vst1q_f32(o1 + 4, vo1c4567);
813           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
814           vst1q_f32(o0, vo0c0123);
815           vst1q_f32(o0 + 4, vo0c4567);
816           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
817         } else {
818           float* o0_tmp = o0;
819           float* o1_tmp = o1;
820           if (c & 4) {
821             vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
822             vo1c0123 = vo1c4567;
823             vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
824             vo0c0123 = vo0c4567;
825           }
826           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
827           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
828           if (c & 2) {
829             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
830             vo1c01 = vget_high_f32(vo1c0123);
831             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
832             vo0c01 = vget_high_f32(vo0c0123);
833           }
834           if (c & 1) {
835             vst1_lane_f32(o1_tmp, vo1c01, 0);
836             vst1_lane_f32(o0_tmp, vo0c01, 0);
837           }
838 
839           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
840           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
841         }
842       }
843       if XNN_UNLIKELY(iw & 1) {
844         float32x4_t vo0c0123 = vld1q_f32(w);
845         float32x4_t vo0c4567 = vld1q_f32(w + 4);
846         float32x4_t vo1c0123 = vo0c0123;
847         float32x4_t vo1c4567 = vo0c4567;
848 
849         const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
850         const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
851 
852         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
853         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
854         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 1);
855         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 1);
856 
857         const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
858         const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
859 
860         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
861         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
862         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 1);
863         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 1);
864 
865         const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
866         const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
867 
868         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
869         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
870         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1);
871         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 1);
872 
873         const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
874         const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
875 
876         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
877         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
878         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_high_f32(vi0x0), 0);
879         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_high_f32(vi2x0), 0);
880 
881         const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
882         const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
883 
884         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
885         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
886         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_high_f32(vi1x0), 0);
887         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_high_f32(vi3x0), 0);
888 
889         const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
890         const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
891 
892         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
893         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
894         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0);
895         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_high_f32(vi4x0), 0);
896 
897         const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
898         const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
899 
900         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
901         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
902         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 1);
903         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 1);
904 
905         const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
906         const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
907 
908         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
909         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
910         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 1);
911         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 1);
912 
913         const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
914         const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
915 
916         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
917         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
918         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 1);
919         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 1);
920 
921         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
922         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 3;
923         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 3;
924         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 3;
925         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 3;
926         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 3;
927 
928         const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
929         const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
930 
931         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
932         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
933         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_low_f32(vi0x1), 0);
934         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_low_f32(vi2x1), 0);
935 
936         const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
937         const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
938 
939         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
940         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
941         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_low_f32(vi1x1), 0);
942         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_low_f32(vi3x1), 0);
943 
944         const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
945         const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
946 
947         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
948         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
949         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0);
950         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_low_f32(vi4x1), 0);
951 
952         const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
953         const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
954 
955         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
956         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
957         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vget_low_f32(vi0x1), 1);
958         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 1);
959 
960         const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
961         const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
962 
963         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
964         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
965         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vget_low_f32(vi1x1), 1);
966         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vget_low_f32(vi3x1), 1);
967 
968         const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
969         const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
970 
971         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
972         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
973         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 1);
974         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vget_low_f32(vi4x1), 1);
975 
976         const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
977         const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
978 
979         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
980         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
981         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vget_high_f32(vi0x1), 0);
982         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vget_high_f32(vi2x1), 0);
983 
984         const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
985         const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
986 
987         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
988         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
989         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vget_high_f32(vi1x1), 0);
990         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vget_high_f32(vi3x1), 0);
991 
992         const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
993         const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
994 
995         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
996         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
997         vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vget_high_f32(vi2x1), 0);
998         vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vget_high_f32(vi4x1), 0);
999 
1000 
1001         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
1002         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
1003         vo0c4567 = vmaxq_f32(vo0c4567, vmin);
1004         vo1c4567 = vmaxq_f32(vo1c4567, vmin);
1005 
1006         vo0c0123 = vminq_f32(vo0c0123, vmax);
1007         vo1c0123 = vminq_f32(vo1c0123, vmax);
1008         vo0c4567 = vminq_f32(vo0c4567, vmax);
1009         vo1c4567 = vminq_f32(vo1c4567, vmax);
1010 
1011         if XNN_LIKELY(c >= 8) {
1012           vst1q_f32(o1, vo1c0123);
1013           vst1q_f32(o1 + 4, vo1c4567);
1014           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
1015           vst1q_f32(o0, vo0c0123);
1016           vst1q_f32(o0 + 4, vo0c4567);
1017           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
1018         } else {
1019           float* o0_tmp = o0;
1020           float* o1_tmp = o1;
1021           if (c & 4) {
1022             vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
1023             vo1c0123 = vo1c4567;
1024             vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
1025             vo0c0123 = vo0c4567;
1026           }
1027           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
1028           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
1029           if (c & 2) {
1030             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
1031             vo1c01 = vget_high_f32(vo1c0123);
1032             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
1033             vo0c01 = vget_high_f32(vo0c0123);
1034           }
1035           if (c & 1) {
1036             vst1_lane_f32(o1_tmp, vo1c01, 0);
1037             vst1_lane_f32(o0_tmp, vo0c01, 0);
1038           }
1039           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
1040           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
1041         }
1042       }
1043       // Move output pointers back to the position of the first pixel in a row,
1044       // and forward to the next block of output channels
1045       o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
1046       o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
1047       // Revert input pointers to the position of the first pixel in a row
1048       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
1049       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
1050       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
1051       i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
1052       i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
1053       // Move to the block of weights for the next 8 output channels
1054       w += 224;
1055       c = doz(c, 8);
1056     } while (c != 0);
1057     // Move output pointers back to the position of the first channel, and forward to the next block of rows
1058     o0 = (float*) ((uintptr_t) o0 + output_height_increment);
1059     o1 = (float*) ((uintptr_t) o1 + output_height_increment);
1060     // Move input pointers forward to the next four rows
1061     i0 = i4;
1062     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1063     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1064     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
1065     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
1066   }
1067 }
1068