// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdint.h>  // uintptr_t (may be redundant via <xnnpack/dwconv.h>)

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>

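// Depthwise convolution microkernel: 25 kernel taps (e.g. a 5x5 window),
// up to 8 channels per inner-loop iteration ("up8x25"), SSE intrinsics,
// two partial accumulators per register group ("acc2").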
void xnn_f32_dwconv_ukernel_up8x25__sse_acc2(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(channels != 0);
  assert(output_width != 0);

  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);
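  // One iteration per output pixel: `input` supplies 25 row pointers, one
  // per kernel tap, pre-arranged by the caller.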
  do {
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    assert(i1 != NULL);
    const float* i2 = input[2];
    assert(i2 != NULL);
    const float* i3 = input[3];
    assert(i3 != NULL);
    const float* i4 = input[4];
    assert(i4 != NULL);
    const float* i5 = input[5];
    assert(i5 != NULL);
    const float* i6 = input[6];
    assert(i6 != NULL);
    const float* i7 = input[7];
    assert(i7 != NULL);
    const float* i8 = input[8];
    assert(i8 != NULL);
    const float* i9 = input[9];
    assert(i9 != NULL);
    const float* i10 = input[10];
    assert(i10 != NULL);
    const float* i11 = input[11];
    assert(i11 != NULL);
    const float* i12 = input[12];
    assert(i12 != NULL);
    const float* i13 = input[13];
    assert(i13 != NULL);
    const float* i14 = input[14];
    assert(i14 != NULL);
    const float* i15 = input[15];
    assert(i15 != NULL);
    const float* i16 = input[16];
    assert(i16 != NULL);
    const float* i17 = input[17];
    assert(i17 != NULL);
    const float* i18 = input[18];
    assert(i18 != NULL);
    const float* i19 = input[19];
    assert(i19 != NULL);
    const float* i20 = input[20];
    assert(i20 != NULL);
    const float* i21 = input[21];
    assert(i21 != NULL);
    const float* i22 = input[22];
    assert(i22 != NULL);
    const float* i23 = input[23];
    assert(i23 != NULL);
    const float* i24 = input[24];
    assert(i24 != NULL);
    input = (const float**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const float* w = weights;
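    // Packed weight layout per 8-channel tile: 8 biases, then 8 weights for
    // each of the 25 taps, 208 floats in total (see `w += 208` below).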
    for (; c >= 8; c -= 8) {
      __m128 vacc0123p0 = _mm_load_ps(w);
      __m128 vacc4567p0 = _mm_load_ps(w + 4);

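      // "acc2": even-numbered taps accumulate into p0, odd-numbered taps
      // into p1, halving the addps dependency chain; p0 and p1 are summed
      // once after the last tap.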
      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
      i0 += 8;

      const __m128 vk0x0123 = _mm_load_ps(w + 8);
      const __m128 vk0x4567 = _mm_load_ps(w + 12);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));

      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
      i1 += 8;

      const __m128 vk1x0123 = _mm_load_ps(w + 16);
      const __m128 vk1x4567 = _mm_load_ps(w + 20);
      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
      __m128 vacc4567p1 = _mm_mul_ps(vi1x4567, vk1x4567);

      const __m128 vi2x0123 = _mm_loadu_ps(i2);
      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
      i2 += 8;

      const __m128 vk2x0123 = _mm_load_ps(w + 24);
      const __m128 vk2x4567 = _mm_load_ps(w + 28);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));

      const __m128 vi3x0123 = _mm_loadu_ps(i3);
      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
      i3 += 8;

      const __m128 vk3x0123 = _mm_load_ps(w + 32);
      const __m128 vk3x4567 = _mm_load_ps(w + 36);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi3x4567, vk3x4567));

      const __m128 vi4x0123 = _mm_loadu_ps(i4);
      const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
      i4 += 8;

      const __m128 vk4x0123 = _mm_load_ps(w + 40);
      const __m128 vk4x4567 = _mm_load_ps(w + 44);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));

      const __m128 vi5x0123 = _mm_loadu_ps(i5);
      const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
      i5 += 8;

      const __m128 vk5x0123 = _mm_load_ps(w + 48);
      const __m128 vk5x4567 = _mm_load_ps(w + 52);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi5x4567, vk5x4567));

      const __m128 vi6x0123 = _mm_loadu_ps(i6);
      const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
      i6 += 8;

      const __m128 vk6x0123 = _mm_load_ps(w + 56);
      const __m128 vk6x4567 = _mm_load_ps(w + 60);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));

      const __m128 vi7x0123 = _mm_loadu_ps(i7);
      const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
      i7 += 8;

      const __m128 vk7x0123 = _mm_load_ps(w + 64);
      const __m128 vk7x4567 = _mm_load_ps(w + 68);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi7x4567, vk7x4567));

      const __m128 vi8x0123 = _mm_loadu_ps(i8);
      const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
      i8 += 8;

      const __m128 vk8x0123 = _mm_load_ps(w + 72);
      const __m128 vk8x4567 = _mm_load_ps(w + 76);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));

      const __m128 vi9x0123 = _mm_loadu_ps(i9);
      const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
      i9 += 8;

      const __m128 vk9x0123 = _mm_load_ps(w + 80);
      const __m128 vk9x4567 = _mm_load_ps(w + 84);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi9x4567, vk9x4567));

      const __m128 vi10x0123 = _mm_loadu_ps(i10);
      const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
      i10 += 8;

      const __m128 vk10x0123 = _mm_load_ps(w + 88);
      const __m128 vk10x4567 = _mm_load_ps(w + 92);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));

      const __m128 vi11x0123 = _mm_loadu_ps(i11);
      const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
      i11 += 8;

      const __m128 vk11x0123 = _mm_load_ps(w + 96);
      const __m128 vk11x4567 = _mm_load_ps(w + 100);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi11x4567, vk11x4567));

      const __m128 vi12x0123 = _mm_loadu_ps(i12);
      const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
      i12 += 8;

      const __m128 vk12x0123 = _mm_load_ps(w + 104);
      const __m128 vk12x4567 = _mm_load_ps(w + 108);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));

      const __m128 vi13x0123 = _mm_loadu_ps(i13);
      const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
      i13 += 8;

      const __m128 vk13x0123 = _mm_load_ps(w + 112);
      const __m128 vk13x4567 = _mm_load_ps(w + 116);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi13x4567, vk13x4567));

      const __m128 vi14x0123 = _mm_loadu_ps(i14);
      const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
      i14 += 8;

      const __m128 vk14x0123 = _mm_load_ps(w + 120);
      const __m128 vk14x4567 = _mm_load_ps(w + 124);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));

      const __m128 vi15x0123 = _mm_loadu_ps(i15);
      const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
      i15 += 8;

      const __m128 vk15x0123 = _mm_load_ps(w + 128);
      const __m128 vk15x4567 = _mm_load_ps(w + 132);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi15x4567, vk15x4567));

      const __m128 vi16x0123 = _mm_loadu_ps(i16);
      const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
      i16 += 8;

      const __m128 vk16x0123 = _mm_load_ps(w + 136);
      const __m128 vk16x4567 = _mm_load_ps(w + 140);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));

      const __m128 vi17x0123 = _mm_loadu_ps(i17);
      const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
      i17 += 8;

      const __m128 vk17x0123 = _mm_load_ps(w + 144);
      const __m128 vk17x4567 = _mm_load_ps(w + 148);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi17x4567, vk17x4567));

      const __m128 vi18x0123 = _mm_loadu_ps(i18);
      const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
      i18 += 8;

      const __m128 vk18x0123 = _mm_load_ps(w + 152);
      const __m128 vk18x4567 = _mm_load_ps(w + 156);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));

      const __m128 vi19x0123 = _mm_loadu_ps(i19);
      const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
      i19 += 8;

      const __m128 vk19x0123 = _mm_load_ps(w + 160);
      const __m128 vk19x4567 = _mm_load_ps(w + 164);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi19x4567, vk19x4567));

      const __m128 vi20x0123 = _mm_loadu_ps(i20);
      const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
      i20 += 8;

      const __m128 vk20x0123 = _mm_load_ps(w + 168);
      const __m128 vk20x4567 = _mm_load_ps(w + 172);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));

      const __m128 vi21x0123 = _mm_loadu_ps(i21);
      const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
      i21 += 8;

      const __m128 vk21x0123 = _mm_load_ps(w + 176);
      const __m128 vk21x4567 = _mm_load_ps(w + 180);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi21x4567, vk21x4567));

      const __m128 vi22x0123 = _mm_loadu_ps(i22);
      const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
      i22 += 8;

      const __m128 vk22x0123 = _mm_load_ps(w + 184);
      const __m128 vk22x4567 = _mm_load_ps(w + 188);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));

      const __m128 vi23x0123 = _mm_loadu_ps(i23);
      const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
      i23 += 8;

      const __m128 vk23x0123 = _mm_load_ps(w + 192);
      const __m128 vk23x4567 = _mm_load_ps(w + 196);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi23x4567, vk23x4567));

      const __m128 vi24x0123 = _mm_loadu_ps(i24);
      const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
      i24 += 8;

      const __m128 vk24x0123 = _mm_load_ps(w + 200);
      const __m128 vk24x4567 = _mm_load_ps(w + 204);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));

      w += 208;

      // Add up all accumulators to vacc01234567p0
      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
      vacc4567p0 = _mm_add_ps(vacc4567p0, vacc4567p1);

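      // Clamp to the [min, max] range from `params` (fused activation).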
      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
      vacc0123 = _mm_min_ps(vacc0123, vmax);
      vacc4567 = _mm_min_ps(vacc4567, vmax);

      _mm_storeu_ps(output, vacc0123);
      _mm_storeu_ps(output + 4, vacc4567);
      output += 8;
    }
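    // Remainder of 4-7 channels: same 25-tap pattern, 4 lanes at a time.
    // The weight offsets (w + 8, w + 16, ...) still follow the 8-wide block
    // layout, so the final tile is packed with 8-lane strides even when
    // fewer than 8 channels remain.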
    for (; c >= 4; c -= 4) {
      __m128 vacc0123p0 = _mm_load_ps(w);

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      i0 += 4;

      const __m128 vk0x0123 = _mm_load_ps(w + 8);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));

      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      i1 += 4;

      const __m128 vk1x0123 = _mm_load_ps(w + 16);
      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);

      const __m128 vi2x0123 = _mm_loadu_ps(i2);
      i2 += 4;

      const __m128 vk2x0123 = _mm_load_ps(w + 24);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));

      const __m128 vi3x0123 = _mm_loadu_ps(i3);
      i3 += 4;

      const __m128 vk3x0123 = _mm_load_ps(w + 32);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));

      const __m128 vi4x0123 = _mm_loadu_ps(i4);
      i4 += 4;

      const __m128 vk4x0123 = _mm_load_ps(w + 40);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));

      const __m128 vi5x0123 = _mm_loadu_ps(i5);
      i5 += 4;

      const __m128 vk5x0123 = _mm_load_ps(w + 48);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));

      const __m128 vi6x0123 = _mm_loadu_ps(i6);
      i6 += 4;

      const __m128 vk6x0123 = _mm_load_ps(w + 56);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));

      const __m128 vi7x0123 = _mm_loadu_ps(i7);
      i7 += 4;

      const __m128 vk7x0123 = _mm_load_ps(w + 64);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));

      const __m128 vi8x0123 = _mm_loadu_ps(i8);
      i8 += 4;

      const __m128 vk8x0123 = _mm_load_ps(w + 72);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));

      const __m128 vi9x0123 = _mm_loadu_ps(i9);
      i9 += 4;

      const __m128 vk9x0123 = _mm_load_ps(w + 80);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));

      const __m128 vi10x0123 = _mm_loadu_ps(i10);
      i10 += 4;

      const __m128 vk10x0123 = _mm_load_ps(w + 88);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));

      const __m128 vi11x0123 = _mm_loadu_ps(i11);
      i11 += 4;

      const __m128 vk11x0123 = _mm_load_ps(w + 96);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));

      const __m128 vi12x0123 = _mm_loadu_ps(i12);
      i12 += 4;

      const __m128 vk12x0123 = _mm_load_ps(w + 104);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));

      const __m128 vi13x0123 = _mm_loadu_ps(i13);
      i13 += 4;

      const __m128 vk13x0123 = _mm_load_ps(w + 112);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));

      const __m128 vi14x0123 = _mm_loadu_ps(i14);
      i14 += 4;

      const __m128 vk14x0123 = _mm_load_ps(w + 120);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));

      const __m128 vi15x0123 = _mm_loadu_ps(i15);
      i15 += 4;

      const __m128 vk15x0123 = _mm_load_ps(w + 128);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));

      const __m128 vi16x0123 = _mm_loadu_ps(i16);
      i16 += 4;

      const __m128 vk16x0123 = _mm_load_ps(w + 136);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));

      const __m128 vi17x0123 = _mm_loadu_ps(i17);
      i17 += 4;

      const __m128 vk17x0123 = _mm_load_ps(w + 144);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));

      const __m128 vi18x0123 = _mm_loadu_ps(i18);
      i18 += 4;

      const __m128 vk18x0123 = _mm_load_ps(w + 152);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));

      const __m128 vi19x0123 = _mm_loadu_ps(i19);
      i19 += 4;

      const __m128 vk19x0123 = _mm_load_ps(w + 160);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));

      const __m128 vi20x0123 = _mm_loadu_ps(i20);
      i20 += 4;

      const __m128 vk20x0123 = _mm_load_ps(w + 168);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));

      const __m128 vi21x0123 = _mm_loadu_ps(i21);
      i21 += 4;

      const __m128 vk21x0123 = _mm_load_ps(w + 176);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));

      const __m128 vi22x0123 = _mm_loadu_ps(i22);
      i22 += 4;

      const __m128 vk22x0123 = _mm_load_ps(w + 184);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));

      const __m128 vi23x0123 = _mm_loadu_ps(i23);
      i23 += 4;

      const __m128 vk23x0123 = _mm_load_ps(w + 192);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));

      const __m128 vi24x0123 = _mm_loadu_ps(i24);
      i24 += 4;

      const __m128 vk24x0123 = _mm_load_ps(w + 200);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));

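      // Advance only 4 floats, so that a subsequent sub-4 remainder reads
      // lanes 4-7 of this same padded 8-wide weight block.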
      w += 4;

      // Add up all accumulators to vacc0123p0
      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);

      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
      vacc0123 = _mm_min_ps(vacc0123, vmax);

      _mm_storeu_ps(output, vacc0123);
      output += 4;
    }
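    // Final 1-3 channels: compute a full 4-lane result, then store only the
    // valid lanes.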
    if XNN_UNLIKELY(c != 0) {
      __m128 vacc0123p0 = _mm_load_ps(w);

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      const __m128 vk0x0123 = _mm_load_ps(w + 8);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));

      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      const __m128 vk1x0123 = _mm_load_ps(w + 16);
      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);

      const __m128 vi2x0123 = _mm_loadu_ps(i2);
      const __m128 vk2x0123 = _mm_load_ps(w + 24);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));

      const __m128 vi3x0123 = _mm_loadu_ps(i3);
      const __m128 vk3x0123 = _mm_load_ps(w + 32);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));

      const __m128 vi4x0123 = _mm_loadu_ps(i4);
      const __m128 vk4x0123 = _mm_load_ps(w + 40);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));

      const __m128 vi5x0123 = _mm_loadu_ps(i5);
      const __m128 vk5x0123 = _mm_load_ps(w + 48);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));

      const __m128 vi6x0123 = _mm_loadu_ps(i6);
      const __m128 vk6x0123 = _mm_load_ps(w + 56);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));

      const __m128 vi7x0123 = _mm_loadu_ps(i7);
      const __m128 vk7x0123 = _mm_load_ps(w + 64);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));

      const __m128 vi8x0123 = _mm_loadu_ps(i8);
      const __m128 vk8x0123 = _mm_load_ps(w + 72);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));

      const __m128 vi9x0123 = _mm_loadu_ps(i9);
      const __m128 vk9x0123 = _mm_load_ps(w + 80);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));

      const __m128 vi10x0123 = _mm_loadu_ps(i10);
      const __m128 vk10x0123 = _mm_load_ps(w + 88);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));

      const __m128 vi11x0123 = _mm_loadu_ps(i11);
      const __m128 vk11x0123 = _mm_load_ps(w + 96);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));

      const __m128 vi12x0123 = _mm_loadu_ps(i12);
      const __m128 vk12x0123 = _mm_load_ps(w + 104);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));

      const __m128 vi13x0123 = _mm_loadu_ps(i13);
      const __m128 vk13x0123 = _mm_load_ps(w + 112);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));

      const __m128 vi14x0123 = _mm_loadu_ps(i14);
      const __m128 vk14x0123 = _mm_load_ps(w + 120);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));

      const __m128 vi15x0123 = _mm_loadu_ps(i15);
      const __m128 vk15x0123 = _mm_load_ps(w + 128);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));

      const __m128 vi16x0123 = _mm_loadu_ps(i16);
      const __m128 vk16x0123 = _mm_load_ps(w + 136);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));

      const __m128 vi17x0123 = _mm_loadu_ps(i17);
      const __m128 vk17x0123 = _mm_load_ps(w + 144);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));

      const __m128 vi18x0123 = _mm_loadu_ps(i18);
      const __m128 vk18x0123 = _mm_load_ps(w + 152);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));

      const __m128 vi19x0123 = _mm_loadu_ps(i19);
      const __m128 vk19x0123 = _mm_load_ps(w + 160);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));

      const __m128 vi20x0123 = _mm_loadu_ps(i20);
      const __m128 vk20x0123 = _mm_load_ps(w + 168);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));

      const __m128 vi21x0123 = _mm_loadu_ps(i21);
      const __m128 vk21x0123 = _mm_load_ps(w + 176);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));

      const __m128 vi22x0123 = _mm_loadu_ps(i22);
      const __m128 vk22x0123 = _mm_load_ps(w + 184);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));

      const __m128 vi23x0123 = _mm_loadu_ps(i23);
      const __m128 vk23x0123 = _mm_load_ps(w + 192);
      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));

      const __m128 vi24x0123 = _mm_loadu_ps(i24);
      const __m128 vk24x0123 = _mm_load_ps(w + 200);
      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));

      // Add up all accumulators to vacc0123p0
      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);

      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
      vacc0123 = _mm_min_ps(vacc0123, vmax);

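      // Partial store: write the low two lanes if c >= 2, then move the high
      // lanes down so a final scalar store can handle an odd channel.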
      if (c & 2) {
        _mm_storel_pi((__m64*) output, vacc0123);
        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
        output += 2;
      }
      if (c & 1) {
        _mm_store_ss(output, vacc0123);
        output += 1;
      }
    }

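    // `output_increment` is in bytes; it covers any gap after the `channels`
    // floats just written for this pixel.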
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}