// Auto-generated file. Do not edit!
//   Template: src/f16-dwconv2d-chw/5x5p2-neonfp16arith.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

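// Computes a 5x5 depthwise convolution over a single channel held in CHW
// layout, with 2 pixels of implicit zero padding on each edge ("5x5p2"),
// producing 1 output row and 4 output pixels per inner iteration ("1x4")
// using NEON fp16 arithmetic.
//
// A minimal invocation sketch (hypothetical; rows, cols, packed_weights,
// zero_buffer, and params are placeholder names, and input_width is passed
// in bytes per the assert below):
//
//   xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4(
//       rows, cols * sizeof(__fp16), input, packed_weights,
//       zero_buffer, output, /*padding_top=*/2, &params);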
void xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4(
    size_t input_height,
    size_t input_width,
    const void* input,
    const void* weights,
    const void* zero,
    void* output,
    uint32_t padding_top,
    const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(__fp16) == 0);
  assert(padding_top == 2);

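  // Load the clamping bounds and the tail mask from params; the mask zeroes
  // vector lanes that fall past the end of a row when the width is not a
  // multiple of 4 pixels.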
  const uint16x4_t vmask = vld1_u16(params->neonfp16arith.mask);
  const float16x4_t vmax = vld1_dup_f16(&params->neonfp16arith.max);
  const float16x4_t vmin = vld1_dup_f16(&params->neonfp16arith.min);

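  // The packed weights start with the bias (lane 0 of vw01234567), followed
  // by the 25 taps of the 5x5 kernel in row-major order across the remaining
  // lanes of vw01234567, vw89ABCDEF, vwGHIJKLMN, and vwOP.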
  const __fp16* w0 = (const __fp16*)weights;
  const float16x8_t vw01234567 = vld1q_f16(w0);
  const float16x8_t vw89ABCDEF = vld1q_f16(w0 + 8);
  const float16x8_t vwGHIJKLMN = vld1q_f16(w0 + 16);
  const float16x4_t vwOP = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 24), vmov_n_u32(0), 0));

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(__fp16));

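  // padding_top == 2 means the two rows above the image are implicit zeros:
  // i0 and i1 start at the zero buffer and the real input starts at i2.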
  const __fp16* i0 = zero;
  const __fp16* i1 = zero;
  const __fp16* i2 = input;
  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_width);

  __fp16* o0 = output;

  size_t output_height = input_height;
  do {
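    // Rows that would fall below the bottom edge of the image read from the
    // zero buffer instead (implicit bottom padding).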
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
    }

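    // Left-padding columns start as zeros; prime the sliding window with the
    // first 4 pixels of each of the 5 input rows feeding this output row.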
    float16x4_t vi0x0123 = vmov_n_f16(0);
    float16x4_t vi1x0123 = vmov_n_f16(0);
    float16x4_t vi2x0123 = vmov_n_f16(0);
    float16x4_t vi3x0123 = vmov_n_f16(0);
    float16x4_t vi4x0123 = vmov_n_f16(0);

    float16x4_t vi0x4567 = vld1_f16(i0); i0 += 4;
    float16x4_t vi1x4567 = vld1_f16(i1); i1 += 4;
    float16x4_t vi2x4567 = vld1_f16(i2); i2 += 4;
    float16x4_t vi3x4567 = vld1_f16(i3); i3 += 4;
    float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4;

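    // Main loop: while more than 8 pixels remain in the row, compute 4
    // output pixels per iteration from the 5-row sliding window.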
    size_t w = input_width;
    for (; w > 8 * sizeof(__fp16); w -= 4 * sizeof(__fp16)) {
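      // Seed the accumulator with the bias.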
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);

      const float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      const float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      const float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      const float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      const float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);

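      // vext re-aligns the window so each kernel column meets the same 4
      // output pixels: x2345 pairs with kernel column 0, x3456 with column 1,
      // x4567 (above) with column 2, x5678 with column 3, x6789 with column 4.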
      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);


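      // Clamp the accumulated values to the [min, max] output range.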
      float16x4_t vo0 = vmax_f16(vo0p0, vmin);

      vo0 = vmin_f16(vo0, vmax);

      vst1_f16(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(__fp16)) {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);

      float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;

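      // Zero the lanes of the look-ahead vectors that lie past the end of
      // the row, so out-of-bounds reads contribute nothing.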
      vi0x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x89AB)));
      vi1x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x89AB)));
      vi2x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x89AB)));
      vi3x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x89AB)));
      vi4x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x89AB)));

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);


      float16x4_t vo0 = vmax_f16(vo0p0, vmin);

      vo0 = vmin_f16(vo0, vmax);

      vst1_f16(o0, vo0); o0 += 4;

      w -= 4 * sizeof(__fp16);
    }
    assert(w >= 1 * sizeof(__fp16));
    assert(w <= 4 * sizeof(__fp16));
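    // Process the final 1..4 pixels; everything to the right of the row is
    // now implicit zero padding, supplied via vzero below.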
    {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);

      vi0x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x4567)));
      vi1x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x4567)));
      vi2x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x4567)));
      vi3x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x4567)));
      vi4x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x4567)));

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);

      const float16x4_t vzero = vmov_n_f16(0);
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vzero, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vzero, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vzero, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vzero, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x5678, vzero, 1);
      const float16x4_t vi1x6789 = vext_f16(vi1x5678, vzero, 1);
      const float16x4_t vi2x6789 = vext_f16(vi2x5678, vzero, 1);
      const float16x4_t vi3x6789 = vext_f16(vi3x5678, vzero, 1);
      const float16x4_t vi4x6789 = vext_f16(vi4x5678, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);


      float16x4_t vo0 = vmax_f16(vo0p0, vmin);

      vo0 = vmin_f16(vo0, vmax);

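      // Store the remaining 1..4 output pixels: a full 4-pixel store when
      // possible, otherwise 2- and/or 1-pixel stores with a rotate between.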
      if XNN_LIKELY(w & (4 * sizeof(__fp16))) {
        vst1_f16(o0, vo0); o0 += 4;
      } else {
        if (w & (2 * sizeof(__fp16))) {
          vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2;

          vo0 = vext_f16(vo0, vo0, 2);
        }
        if (w & (1 * sizeof(__fp16))) {
          vst1_lane_f16(o0, vo0, 0); o0 += 1;
        }
      }
    }

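    // Advance to the next output row: rewind the bytes consumed from the two
    // oldest rows via input_decrement and re-derive the 5 row pointers.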
    i0 = (const __fp16*) ((uintptr_t) i1 - input_decrement);
    i1 = (const __fp16*) ((uintptr_t) i2 - input_decrement);
    i2 = (const __fp16*) ((uintptr_t) i1 + input_width);
    i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
    i4 = (const __fp16*) ((uintptr_t) i3 + input_width);


  } while (--output_height != 0);
}