1 // Auto-generated file. Do not edit!
2 // Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x2.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10
11 #include <assert.h>
12
13 #include <arm_neon.h>
14
15 #include <xnnpack/conv.h>
16 #include <xnnpack/math.h>
17
18
xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])19 void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2(
20 size_t input_height,
21 size_t input_width,
22 size_t output_y_start,
23 size_t output_y_end,
24 const float* input,
25 const float* zero,
26 const float* weights,
27 float* output,
28 size_t input_padding_top,
29 size_t output_channels,
30 size_t output_height_stride,
31 size_t output_width_stride,
32 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34 assert(input_width != 0);
35 assert(output_y_end > output_y_start);
36 assert(input_padding_top <= 1);
37 assert(output_channels != 0);
38
39 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40 const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
41 const size_t output_width = input_width / 2;
42 const size_t output_channel_decrement = output_width * output_width_stride - 8 * sizeof(float);
43 const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 8) * sizeof(float);
44
45 // Adjustment for padding processed below
46 const float* i0 = (const float*) ((uintptr_t) input +
47 input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50 const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51 const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52 float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53 float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54
55 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56 i0 = zero;
57 }
58
59 const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min);
60 const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max);
61
62 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
63 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
64 const size_t input_y4 = input_y2 + 2;
65 if XNN_UNPREDICTABLE(input_y2 > input_height) {
66 i1 = zero;
67 }
68 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
69 i2 = zero;
70 }
71 if XNN_UNPREDICTABLE(input_y4 > input_height) {
72 i3 = zero;
73 }
74 if XNN_UNPREDICTABLE(input_y4 >= input_height) {
75 i4 = zero;
76 }
77 if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
78 o1 = o0;
79 }
80
81 const float* w = weights;
82 size_t c = output_channels;
83 do {
84 // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
85 float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
86 float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
88 float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
89 float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;
90
91 size_t iw = input_width - 1;
92 for (; iw >= 4; iw -= 4) {
93 float32x4_t vo0x0c0123 = vld1q_f32(w);
94 float32x4_t vo0x0c4567 = vld1q_f32(w + 4);
95 float32x4_t vo1x0c0123 = vo0x0c0123;
96 float32x4_t vo1x0c4567 = vo0x0c4567;
97 float32x4_t vo0x1c0123 = vo0x0c0123;
98 float32x4_t vo0x1c4567 = vo0x0c4567;
99 float32x4_t vo1x1c0123 = vo0x0c0123;
100 float32x4_t vo1x1c4567 = vo0x0c4567;
101
102 const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
103 const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
104
105 // viMx1 = ( iM2c1, iM2c0, iM1c2, iM1c1 )
106 const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
107 const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
108 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
109 const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
110 const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
111
112 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
113 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
114 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
115 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
116
117 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 0);
118 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0);
119 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c0x4567, vget_high_f32(vi0x1), 0);
120 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 0);
121
122 const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
123 const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
124
125 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
126 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
127 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
128 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
129
130 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 0);
131 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 0);
132 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c0x4567, vget_high_f32(vi1x1), 0);
133 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c0x4567, vget_high_f32(vi3x1), 0);
134
135 const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
136 const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
137
138 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
139 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
140 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
141 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
142
143 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0);
144 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 0);
145 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 0);
146 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c0x4567, vget_high_f32(vi4x1), 0);
147
148 const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
149 const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
150
151 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
152 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
153 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
154 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
155
156 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_high_f32(vi0x1), 1);
157 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1);
158 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c1x4567, vget_high_f32(vi0x1), 1);
159 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c1x4567, vget_high_f32(vi2x1), 1);
160
161 const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
162 const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
163
164 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
165 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
166 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
167 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
168
169 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_high_f32(vi1x1), 1);
170 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_high_f32(vi3x1), 1);
171 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c1x4567, vget_high_f32(vi1x1), 1);
172 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c1x4567, vget_high_f32(vi3x1), 1);
173
174 const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
175 const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
176
177 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
178 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
179 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
180 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
181
182 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1);
183 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_high_f32(vi4x1), 1);
184 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c1x4567, vget_high_f32(vi2x1), 1);
185 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c1x4567, vget_high_f32(vi4x1), 1);
186
187 const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
188 const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
189
190 // viMx2 = ( iM3c2, iM3c1, iM3c0, iM2c2 )
191 const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
192 const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
193 const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
194 const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
195 const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;
196
197 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
198 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
199 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
200 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
201
202 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 0);
203 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 0);
204 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk00c2x4567, vget_low_f32(vi0x2), 0);
205 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c2x4567, vget_low_f32(vi2x2), 0);
206
207 const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
208 const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
209
210 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
211 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
212 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
213 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
214
215 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 0);
216 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 0);
217 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk10c2x4567, vget_low_f32(vi1x2), 0);
218 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk10c2x4567, vget_low_f32(vi3x2), 0);
219
220 const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
221 const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
222
223 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
224 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
225 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
226 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
227
228 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 0);
229 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 0);
230 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c2x4567, vget_low_f32(vi2x2), 0);
231 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk20c2x4567, vget_low_f32(vi4x2), 0);
232
233 const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
234 const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
235
236 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
237 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
238 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
239 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
240
241 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_low_f32(vi0x2), 1);
242 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_low_f32(vi2x2), 1);
243 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c0x4567, vget_low_f32(vi0x2), 1);
244 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c0x4567, vget_low_f32(vi2x2), 1);
245
246 const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
247 const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
248
249 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
250 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
251 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
252 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
253
254 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_low_f32(vi1x2), 1);
255 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_low_f32(vi3x2), 1);
256 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c0x4567, vget_low_f32(vi1x2), 1);
257 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c0x4567, vget_low_f32(vi3x2), 1);
258
259 const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
260 const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
261
262 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
263 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
264 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
265 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
266
267 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_low_f32(vi2x2), 1);
268 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_low_f32(vi4x2), 1);
269 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c0x4567, vget_low_f32(vi2x2), 1);
270 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c0x4567, vget_low_f32(vi4x2), 1);
271
272 const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
273 const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
274
275 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
276 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
277 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c1x4567, vget_low_f32(vi0x1), 0);
278 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c1x4567, vget_low_f32(vi2x1), 0);
279
280 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 0);
281 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 0);
282 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c1x4567, vget_high_f32(vi0x2), 0);
283 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c1x4567, vget_high_f32(vi2x2), 0);
284
285 const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
286 const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
287
288 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
289 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
290 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c1x4567, vget_low_f32(vi1x1), 0);
291 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c1x4567, vget_low_f32(vi3x1), 0);
292
293 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 0);
294 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 0);
295 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c1x4567, vget_high_f32(vi1x2), 0);
296 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c1x4567, vget_high_f32(vi3x2), 0);
297
298 const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
299 const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
300
301 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
302 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
303 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0);
304 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c1x4567, vget_low_f32(vi4x1), 0);
305
306 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 0);
307 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 0);
308 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c1x4567, vget_high_f32(vi2x2), 0);
309 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c1x4567, vget_high_f32(vi4x2), 0);
310
311 const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
312 const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
313
314 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
315 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
316 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk01c2x4567, vget_low_f32(vi0x1), 1);
317 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c2x4567, vget_low_f32(vi2x1), 1);
318
319 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_high_f32(vi0x2), 1);
320 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_high_f32(vi2x2), 1);
321 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk01c2x4567, vget_high_f32(vi0x2), 1);
322 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk01c2x4567, vget_high_f32(vi2x2), 1);
323
324 const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
325 const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
326
327 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
328 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
329 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk11c2x4567, vget_low_f32(vi1x1), 1);
330 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk11c2x4567, vget_low_f32(vi3x1), 1);
331
332 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_high_f32(vi1x2), 1);
333 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_high_f32(vi3x2), 1);
334 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk11c2x4567, vget_high_f32(vi1x2), 1);
335 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk11c2x4567, vget_high_f32(vi3x2), 1);
336
337 const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
338 const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
339
340 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
341 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
342 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1);
343 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk21c2x4567, vget_low_f32(vi4x1), 1);
344
345 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_high_f32(vi2x2), 1);
346 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_high_f32(vi4x2), 1);
347 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk21c2x4567, vget_high_f32(vi2x2), 1);
348 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk21c2x4567, vget_high_f32(vi4x2), 1);
349
350 const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
351 const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
352
353 // viMx3 = ( iM5c0, iM4c2, iM4c1, iM4c0 )
354 const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
355 const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
356 const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
357 const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
358 const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;
359
360 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
361 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
362 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c0x4567, vget_high_f32(vi0x1), 0);
363 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c0x4567, vget_high_f32(vi2x1), 0);
364
365 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 0);
366 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 0);
367 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c0x4567, vget_low_f32(vi0x3), 0);
368 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c0x4567, vget_low_f32(vi2x3), 0);
369
370 const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
371 const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
372
373 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
374 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
375 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c0x4567, vget_high_f32(vi1x1), 0);
376 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c0x4567, vget_high_f32(vi3x1), 0);
377
378 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 0);
379 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 0);
380 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c0x4567, vget_low_f32(vi1x3), 0);
381 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c0x4567, vget_low_f32(vi3x3), 0);
382
383 const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
384 const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
385
386 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
387 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
388 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c0x4567, vget_high_f32(vi2x1), 0);
389 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c0x4567, vget_high_f32(vi4x1), 0);
390
391 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 0);
392 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 0);
393 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c0x4567, vget_low_f32(vi2x3), 0);
394 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c0x4567, vget_low_f32(vi4x3), 0);
395
396 const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
397 const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
398
399 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
400 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
401 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c1x4567, vget_high_f32(vi0x1), 1);
402 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c1x4567, vget_high_f32(vi2x1), 1);
403
404 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_low_f32(vi0x3), 1);
405 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_low_f32(vi2x3), 1);
406 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c1x4567, vget_low_f32(vi0x3), 1);
407 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c1x4567, vget_low_f32(vi2x3), 1);
408
409 const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
410 const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
411
412 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
413 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
414 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c1x4567, vget_high_f32(vi1x1), 1);
415 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c1x4567, vget_high_f32(vi3x1), 1);
416
417 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_low_f32(vi1x3), 1);
418 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_low_f32(vi3x3), 1);
419 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c1x4567, vget_low_f32(vi1x3), 1);
420 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c1x4567, vget_low_f32(vi3x3), 1);
421
422 const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
423 const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
424
425 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
426 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
427 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c1x4567, vget_high_f32(vi2x1), 1);
428 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c1x4567, vget_high_f32(vi4x1), 1);
429
430 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_low_f32(vi2x3), 1);
431 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_low_f32(vi4x3), 1);
432 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c1x4567, vget_low_f32(vi2x3), 1);
433 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c1x4567, vget_low_f32(vi4x3), 1);
434
435 const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
436 const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
437
438 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 0);
439 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 0);
440 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk02c2x4567, vget_low_f32(vi0x2), 0);
441 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk02c2x4567, vget_low_f32(vi2x2), 0);
442
443 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 0);
444 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 0);
445 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk02c2x4567, vget_high_f32(vi0x3), 0);
446 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk02c2x4567, vget_high_f32(vi2x3), 0);
447
448 const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
449 const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
450
451 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 0);
452 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 0);
453 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk12c2x4567, vget_low_f32(vi1x2), 0);
454 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk12c2x4567, vget_low_f32(vi3x2), 0);
455
456 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 0);
457 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 0);
458 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk12c2x4567, vget_high_f32(vi1x3), 0);
459 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk12c2x4567, vget_high_f32(vi3x3), 0);
460
461 const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
462 const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
463
464 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 0);
465 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 0);
466 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk22c2x4567, vget_low_f32(vi2x2), 0);
467 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk22c2x4567, vget_low_f32(vi4x2), 0);
468
469 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 0);
470 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 0);
471 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk22c2x4567, vget_high_f32(vi2x3), 0);
472 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk22c2x4567, vget_high_f32(vi4x3), 0);
473
474 vi0x0 = vi0x3;
475 vi1x0 = vi1x3;
476 vi2x0 = vi2x3;
477 vi3x0 = vi3x3;
478 vi4x0 = vi4x3;
479
480
481 vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
482 vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);
483 vo0x0c4567 = vmaxq_f32(vo0x0c4567, vmin);
484 vo1x0c4567 = vmaxq_f32(vo1x0c4567, vmin);
485
486 vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
487 vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);
488 vo0x1c4567 = vmaxq_f32(vo0x1c4567, vmin);
489 vo1x1c4567 = vmaxq_f32(vo1x1c4567, vmin);
490
491 vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
492 vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);
493 vo0x0c4567 = vminq_f32(vo0x0c4567, vmax);
494 vo1x0c4567 = vminq_f32(vo1x0c4567, vmax);
495
496 vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
497 vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);
498 vo0x1c4567 = vminq_f32(vo0x1c4567, vmax);
499 vo1x1c4567 = vminq_f32(vo1x1c4567, vmax);
500
501 if XNN_LIKELY(c >= 8) {
502 vst1q_f32(o1, vo1x0c0123);
503 vst1q_f32(o1 + 4, vo1x0c4567);
504 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
505 vst1q_f32(o0, vo0x0c0123);
506 vst1q_f32(o0 + 4, vo0x0c4567);
507 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
508
509 vst1q_f32(o1, vo1x1c0123);
510 vst1q_f32(o1 + 4, vo1x1c4567);
511 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
512 vst1q_f32(o0, vo0x1c0123);
513 vst1q_f32(o0 + 4, vo0x1c4567);
514 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
515 } else {
516 float* o0_tmp = o0;
517 float* o1_tmp = o1;
518 if (c & 4) {
519 vst1q_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c0123);
520 vo1x1c0123 = vo1x1c4567;
521 vst1q_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c0123);
522 vo0x1c0123 = vo0x1c4567;
523
524 vst1q_f32(o1_tmp, vo1x0c0123); o1_tmp += 4;
525 vo1x0c0123 = vo1x0c4567;
526 vst1q_f32(o0_tmp, vo0x0c0123); o0_tmp += 4;
527 vo0x0c0123 = vo0x0c4567;
528 }
529 float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
530 float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
531 float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
532 float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
533 if (c & 2) {
534 vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
535 vo1x1c01 = vget_high_f32(vo1x1c0123);
536 vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
537 vo0x1c01 = vget_high_f32(vo0x1c0123);
538
539 vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
540 vo1x0c01 = vget_high_f32(vo1x0c0123);
541 vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
542 vo0x0c01 = vget_high_f32(vo0x0c0123);
543 }
544 if (c & 1) {
545 vst1_lane_f32(o1_tmp, vo1x0c01, 0);
546 vst1_lane_f32(o0_tmp, vo0x0c01, 0);
547
548 vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
549 vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
550 }
551
552 o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
553 o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
554 }
555 }
556 assert(iw < 4);
557 if XNN_LIKELY(iw & 2) {
558 float32x4_t vo0c0123 = vld1q_f32(w);
559 float32x4_t vo0c4567 = vld1q_f32(w + 4);
560 float32x4_t vo1c0123 = vo0c0123;
561 float32x4_t vo1c4567 = vo0c4567;
562
563 const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
564 const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
565
566 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
567 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
568 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
569 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
570
571 const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
572 const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
573
574 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
575 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
576 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
577 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
578
579 const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
580 const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
581
582 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
583 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
584 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
585 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
586
587 const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
588 const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
589
590 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
591 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
592 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
593 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
594
595 const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
596 const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
597
598 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
599 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
600 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
601 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
602
603 const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
604 const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
605
606 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
607 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
608 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
609 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
610
611 const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
612 const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
613
614 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
615 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
616 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
617 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
618
619 const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
620 const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
621
622 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
623 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
624 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
625 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
626
627 const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
628 const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
629
630 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
631 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
632 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
633 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
634
635 const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
636 const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
637
638 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
639 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
640 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
641 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
642
643 const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
644 const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
645
646 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
647 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
648 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
649 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
650
651 const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
652 const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
653
654 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
655 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
656 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
657 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
658
659 const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
660 const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
661
662 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
663 const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
664 const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
665 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
666 const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
667 const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
668
669 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
670 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
671 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vget_low_f32(vi0x1), 0);
672 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 0);
673
674 const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
675 const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
676
677 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
678 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
679 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vget_low_f32(vi1x1), 0);
680 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vget_low_f32(vi3x1), 0);
681
682 const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
683 const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
684
685 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
686 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
687 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0);
688 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vget_low_f32(vi4x1), 0);
689
690 const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
691 const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
692
693 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
694 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
695 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vget_low_f32(vi0x1), 1);
696 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vget_low_f32(vi2x1), 1);
697
698 const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
699 const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
700
701 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
702 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
703 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vget_low_f32(vi1x1), 1);
704 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vget_low_f32(vi3x1), 1);
705
706 const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
707 const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
708
709 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
710 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
711 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1);
712 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vget_low_f32(vi4x1), 1);
713
714 const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
715 const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
716
717 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
718 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
719 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c0x4567, vget_high_f32(vi0x1), 0);
720 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c0x4567, vget_high_f32(vi2x1), 0);
721
722 const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
723 const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
724
725 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
726 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
727 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c0x4567, vget_high_f32(vi1x1), 0);
728 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c0x4567, vget_high_f32(vi3x1), 0);
729
730 const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
731 const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
732
733 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
734 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
735 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c0x4567, vget_high_f32(vi2x1), 0);
736 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c0x4567, vget_high_f32(vi4x1), 0);
737
738 const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
739 const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
740
741 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
742 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
743 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c1x4567, vget_high_f32(vi0x1), 1);
744 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c1x4567, vget_high_f32(vi2x1), 1);
745
746 const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
747 const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
748
749 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
750 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
751 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c1x4567, vget_high_f32(vi1x1), 1);
752 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c1x4567, vget_high_f32(vi3x1), 1);
753
754 const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
755 const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
756
757 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
758 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
759 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c1x4567, vget_high_f32(vi2x1), 1);
760 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c1x4567, vget_high_f32(vi4x1), 1);
761
762 const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
763 const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
764
765 // viMx2 = ( iM2c2, iM2c1 )
766 const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
767 const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
768 const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
769 const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
770 const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
771
772 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
773 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);
774 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c2x4567, vi0x2, 0);
775 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c2x4567, vi2x2, 0);
776
777 const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
778 const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
779
780 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
781 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);
782 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c2x4567, vi1x2, 0);
783 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c2x4567, vi3x2, 0);
784
785 const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
786 const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
787
788 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
789 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);
790 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c2x4567, vi2x2, 0);
791 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c2x4567, vi4x2, 0);
792
793 vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
794 vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
795 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
796 vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
797 vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
798
799
800 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
801 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
802 vo0c4567 = vmaxq_f32(vo0c4567, vmin);
803 vo1c4567 = vmaxq_f32(vo1c4567, vmin);
804
805 vo0c0123 = vminq_f32(vo0c0123, vmax);
806 vo1c0123 = vminq_f32(vo1c0123, vmax);
807 vo0c4567 = vminq_f32(vo0c4567, vmax);
808 vo1c4567 = vminq_f32(vo1c4567, vmax);
809
810 if XNN_LIKELY(c >= 8) {
811 vst1q_f32(o1, vo1c0123);
812 vst1q_f32(o1 + 4, vo1c4567);
813 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
814 vst1q_f32(o0, vo0c0123);
815 vst1q_f32(o0 + 4, vo0c4567);
816 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
817 } else {
818 float* o0_tmp = o0;
819 float* o1_tmp = o1;
820 if (c & 4) {
821 vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
822 vo1c0123 = vo1c4567;
823 vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
824 vo0c0123 = vo0c4567;
825 }
826 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
827 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
828 if (c & 2) {
829 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
830 vo1c01 = vget_high_f32(vo1c0123);
831 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
832 vo0c01 = vget_high_f32(vo0c0123);
833 }
834 if (c & 1) {
835 vst1_lane_f32(o1_tmp, vo1c01, 0);
836 vst1_lane_f32(o0_tmp, vo0c01, 0);
837 }
838
839 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
840 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
841 }
842 }
843 if XNN_LIKELY(iw & 1) {
844 float32x4_t vo0c0123 = vld1q_f32(w);
845 float32x4_t vo0c4567 = vld1q_f32(w + 4);
846 float32x4_t vo1c0123 = vo0c0123;
847 float32x4_t vo1c4567 = vo0c4567;
848
849 const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
850 const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
851
852 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
853 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
854 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
855 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
856
857 const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
858 const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
859
860 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
861 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
862 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
863 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
864
865 const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
866 const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
867
868 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
869 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
870 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
871 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
872
873 const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
874 const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
875
876 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
877 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
878 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
879 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
880
881 const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
882 const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
883
884 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
885 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
886 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
887 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
888
889 const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
890 const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
891
892 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
893 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
894 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
895 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
896
897 const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
898 const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
899
900 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
901 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
902 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
903 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
904
905 const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
906 const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
907
908 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
909 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
910 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
911 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
912
913 const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
914 const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
915
916 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
917 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
918 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
919 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
920
921 const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
922 const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
923
924 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
925 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
926 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
927 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
928
929 const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
930 const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
931
932 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
933 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
934 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
935 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
936
937 const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
938 const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
939
940 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
941 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
942 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
943 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
944
945 const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
946 const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
947
948 // viMx1 = ( iM1c2, iM1c1 )
949 const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
950 const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
951 const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
952 const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
953 const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;
954
955 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
956 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);
957 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vi0x1, 0);
958 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vi2x1, 0);
959
960 const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
961 const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
962
963 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
964 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);
965 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vi1x1, 0);
966 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vi3x1, 0);
967
968 const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
969 const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
970
971 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
972 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);
973 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vi2x1, 0);
974 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vi4x1, 0);
975
976 const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
977 const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
978
979 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
980 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);
981 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vi0x1, 1);
982 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vi2x1, 1);
983
984 const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
985 const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
986
987 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
988 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);
989 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vi1x1, 1);
990 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vi3x1, 1);
991
992 const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
993 const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
994
995 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
996 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);
997 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vi2x1, 1);
998 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vi4x1, 1);
999
1000
1001 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
1002 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
1003 vo0c4567 = vmaxq_f32(vo0c4567, vmin);
1004 vo1c4567 = vmaxq_f32(vo1c4567, vmin);
1005
1006 vo0c0123 = vminq_f32(vo0c0123, vmax);
1007 vo1c0123 = vminq_f32(vo1c0123, vmax);
1008 vo0c4567 = vminq_f32(vo0c4567, vmax);
1009 vo1c4567 = vminq_f32(vo1c4567, vmax);
1010
1011 if XNN_LIKELY(c >= 8) {
1012 vst1q_f32(o1, vo1c0123);
1013 vst1q_f32(o1 + 4, vo1c4567);
1014 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
1015 vst1q_f32(o0, vo0c0123);
1016 vst1q_f32(o0 + 4, vo0c4567);
1017 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
1018 } else {
1019 float* o0_tmp = o0;
1020 float* o1_tmp = o1;
1021 if (c & 4) {
1022 vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
1023 vo1c0123 = vo1c4567;
1024 vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
1025 vo0c0123 = vo0c4567;
1026 }
1027 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
1028 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
1029 if (c & 2) {
1030 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
1031 vo1c01 = vget_high_f32(vo1c0123);
1032 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
1033 vo0c01 = vget_high_f32(vo0c0123);
1034 }
1035 if (c & 1) {
1036 vst1_lane_f32(o1_tmp, vo1c01, 0);
1037 vst1_lane_f32(o0_tmp, vo0c01, 0);
1038 }
1039 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
1040 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
1041 }
1042 }
1043 // Move output pointers back to the position of the first pixel in a row,
1044 // and forward to the next block of output channels
1045 o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
1046 o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
1047 // Revert input pointers to the position of the first pixel in a row
1048 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
1049 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
1050 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
1051 i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
1052 i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
1053 // Move to the block of weights for the next 8 output channels
1054 w += 224;
1055 c = doz(c, 8);
1056 } while (c != 0);
1057 // Move output pointers back to the position of the first channel, and forward to the next block of rows
1058 o0 = (float*) ((uintptr_t) o0 + output_height_increment);
1059 o1 = (float*) ((uintptr_t) o1 + output_height_increment);
1060 // Move input pointers forward to the next four rows
1061 i0 = i4;
1062 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1063 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1064 i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
1065 i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
1066 }
1067 }
1068