• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-dwconv/up-psimd.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <psimd.h>
13 
14 #include <xnnpack/dwconv.h>
15 
16 
xnn_f32_dwconv_ukernel_up4x25__psimd(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,const union xnn_f32_output_params params[restrict static1])17 void xnn_f32_dwconv_ukernel_up4x25__psimd(
18     size_t channels,
19     size_t output_width,
20     const float** input,
21     const float* weights,
22     float* output,
23     size_t input_stride,
24     size_t output_increment,
25     const union xnn_f32_output_params params[restrict static 1])
26 {
27   assert(channels != 0);
28   assert(output_width != 0);
29 
30   const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
31   const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
32   do {
33     const float* i0 = input[0];
34     assert(i0 != NULL);
35     const float* i1 = input[1];
36     assert(i1 != NULL);
37     const float* i2 = input[2];
38     assert(i2 != NULL);
39     const float* i3 = input[3];
40     assert(i3 != NULL);
41     const float* i4 = input[4];
42     assert(i4 != NULL);
43     const float* i5 = input[5];
44     assert(i5 != NULL);
45     const float* i6 = input[6];
46     assert(i6 != NULL);
47     const float* i7 = input[7];
48     assert(i7 != NULL);
49     const float* i8 = input[8];
50     assert(i8 != NULL);
51     const float* i9 = input[9];
52     assert(i9 != NULL);
53     const float* i10 = input[10];
54     assert(i10 != NULL);
55     const float* i11 = input[11];
56     assert(i11 != NULL);
57     const float* i12 = input[12];
58     assert(i12 != NULL);
59     const float* i13 = input[13];
60     assert(i13 != NULL);
61     const float* i14 = input[14];
62     assert(i14 != NULL);
63     const float* i15 = input[15];
64     assert(i15 != NULL);
65     const float* i16 = input[16];
66     assert(i16 != NULL);
67     const float* i17 = input[17];
68     assert(i17 != NULL);
69     const float* i18 = input[18];
70     assert(i18 != NULL);
71     const float* i19 = input[19];
72     assert(i19 != NULL);
73     const float* i20 = input[20];
74     assert(i20 != NULL);
75     const float* i21 = input[21];
76     assert(i21 != NULL);
77     const float* i22 = input[22];
78     assert(i22 != NULL);
79     const float* i23 = input[23];
80     assert(i23 != NULL);
81     const float* i24 = input[24];
82     assert(i24 != NULL);
83     input = (const float**) ((uintptr_t) input + input_stride);
84 
85     size_t c = channels;
86     const float* w = weights;
87     for (; c >= 4; c -= 4) {
88       psimd_f32 vacc0123p0 = psimd_load_f32(w);
89 
90 
91       const psimd_f32 vi0x0123 = psimd_load_f32(i0);
92       i0 += 4;
93 
94       const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
95       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
96 
97       const psimd_f32 vi1x0123 = psimd_load_f32(i1);
98       i1 += 4;
99 
100       const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
101       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
102 
103       const psimd_f32 vi2x0123 = psimd_load_f32(i2);
104       i2 += 4;
105 
106       const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
107       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
108 
109       const psimd_f32 vi3x0123 = psimd_load_f32(i3);
110       i3 += 4;
111 
112       const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
113       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
114 
115       const psimd_f32 vi4x0123 = psimd_load_f32(i4);
116       i4 += 4;
117 
118       const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
119       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
120 
121       const psimd_f32 vi5x0123 = psimd_load_f32(i5);
122       i5 += 4;
123 
124       const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
125       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
126 
127       const psimd_f32 vi6x0123 = psimd_load_f32(i6);
128       i6 += 4;
129 
130       const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
131       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
132 
133       const psimd_f32 vi7x0123 = psimd_load_f32(i7);
134       i7 += 4;
135 
136       const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
137       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
138 
139       const psimd_f32 vi8x0123 = psimd_load_f32(i8);
140       i8 += 4;
141 
142       const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
143       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
144 
145       const psimd_f32 vi9x0123 = psimd_load_f32(i9);
146       i9 += 4;
147 
148       const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
149       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
150 
151       const psimd_f32 vi10x0123 = psimd_load_f32(i10);
152       i10 += 4;
153 
154       const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
155       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
156 
157       const psimd_f32 vi11x0123 = psimd_load_f32(i11);
158       i11 += 4;
159 
160       const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
161       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
162 
163       const psimd_f32 vi12x0123 = psimd_load_f32(i12);
164       i12 += 4;
165 
166       const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
167       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
168 
169       const psimd_f32 vi13x0123 = psimd_load_f32(i13);
170       i13 += 4;
171 
172       const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
173       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
174 
175       const psimd_f32 vi14x0123 = psimd_load_f32(i14);
176       i14 += 4;
177 
178       const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
179       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
180 
181       const psimd_f32 vi15x0123 = psimd_load_f32(i15);
182       i15 += 4;
183 
184       const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
185       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
186 
187       const psimd_f32 vi16x0123 = psimd_load_f32(i16);
188       i16 += 4;
189 
190       const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
191       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
192 
193       const psimd_f32 vi17x0123 = psimd_load_f32(i17);
194       i17 += 4;
195 
196       const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
197       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
198 
199       const psimd_f32 vi18x0123 = psimd_load_f32(i18);
200       i18 += 4;
201 
202       const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
203       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
204 
205       const psimd_f32 vi19x0123 = psimd_load_f32(i19);
206       i19 += 4;
207 
208       const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
209       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
210 
211       const psimd_f32 vi20x0123 = psimd_load_f32(i20);
212       i20 += 4;
213 
214       const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
215       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
216 
217       const psimd_f32 vi21x0123 = psimd_load_f32(i21);
218       i21 += 4;
219 
220       const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
221       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
222 
223       const psimd_f32 vi22x0123 = psimd_load_f32(i22);
224       i22 += 4;
225 
226       const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
227       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
228 
229       const psimd_f32 vi23x0123 = psimd_load_f32(i23);
230       i23 += 4;
231 
232       const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
233       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
234 
235       const psimd_f32 vi24x0123 = psimd_load_f32(i24);
236       i24 += 4;
237 
238       const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
239       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
240 
241       w += 104;
242 
243 
244       psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
245       vacc0123 = psimd_min_f32(vacc0123, vmax);
246 
247       psimd_store_f32(output, vacc0123);
248       output += 4;
249     }
250     if XNN_UNLIKELY(c != 0) {
251       psimd_f32 vacc0123p0 = psimd_load_f32(w);
252 
253       const psimd_f32 vi0x0123 = psimd_load_f32(i0);
254       const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
255       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
256 
257       const psimd_f32 vi1x0123 = psimd_load_f32(i1);
258       const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
259       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
260 
261       const psimd_f32 vi2x0123 = psimd_load_f32(i2);
262       const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
263       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
264 
265       const psimd_f32 vi3x0123 = psimd_load_f32(i3);
266       const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
267       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
268 
269       const psimd_f32 vi4x0123 = psimd_load_f32(i4);
270       const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
271       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
272 
273       const psimd_f32 vi5x0123 = psimd_load_f32(i5);
274       const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
275       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
276 
277       const psimd_f32 vi6x0123 = psimd_load_f32(i6);
278       const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
279       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
280 
281       const psimd_f32 vi7x0123 = psimd_load_f32(i7);
282       const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
283       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
284 
285       const psimd_f32 vi8x0123 = psimd_load_f32(i8);
286       const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
287       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
288 
289       const psimd_f32 vi9x0123 = psimd_load_f32(i9);
290       const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
291       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
292 
293       const psimd_f32 vi10x0123 = psimd_load_f32(i10);
294       const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
295       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
296 
297       const psimd_f32 vi11x0123 = psimd_load_f32(i11);
298       const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
299       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
300 
301       const psimd_f32 vi12x0123 = psimd_load_f32(i12);
302       const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
303       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
304 
305       const psimd_f32 vi13x0123 = psimd_load_f32(i13);
306       const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
307       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
308 
309       const psimd_f32 vi14x0123 = psimd_load_f32(i14);
310       const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
311       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
312 
313       const psimd_f32 vi15x0123 = psimd_load_f32(i15);
314       const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
315       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
316 
317       const psimd_f32 vi16x0123 = psimd_load_f32(i16);
318       const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
319       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
320 
321       const psimd_f32 vi17x0123 = psimd_load_f32(i17);
322       const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
323       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
324 
325       const psimd_f32 vi18x0123 = psimd_load_f32(i18);
326       const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
327       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
328 
329       const psimd_f32 vi19x0123 = psimd_load_f32(i19);
330       const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
331       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
332 
333       const psimd_f32 vi20x0123 = psimd_load_f32(i20);
334       const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
335       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
336 
337       const psimd_f32 vi21x0123 = psimd_load_f32(i21);
338       const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
339       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
340 
341       const psimd_f32 vi22x0123 = psimd_load_f32(i22);
342       const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
343       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
344 
345       const psimd_f32 vi23x0123 = psimd_load_f32(i23);
346       const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
347       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
348 
349       const psimd_f32 vi24x0123 = psimd_load_f32(i24);
350       const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
351       vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
352 
353 
354       psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
355       vacc0123 = psimd_min_f32(vacc0123, vmax);
356 
357       if (c & 2) {
358         psimd_store2_f32(output, vacc0123);
359         vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
360         output += 2;
361       }
362       if (c & 1) {
363         psimd_store1_f32(output, vacc0123);
364         output += 1;
365       }
366     }
367 
368     output = (float*) ((uintptr_t) output + output_increment);
369   } while (--output_width != 0);
370 }
371