// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

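  // Clamping parameters: vmask zeroes the lanes that run past the end of the
  // row in the partial blocks below, and vmin/vmax clamp the outputs.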
  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

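  // weights[0] is the bias; weights[1..25] are the 5x5 kernel taps in
  // row-major order, each broadcast to all four SSE lanes.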
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

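  // Over one output row, every row pointer advances by one padded row
  // (input_width rounded up to a multiple of 4 floats); input_decrement
  // rewinds it back to the start of its row.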
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

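  // padding_top == 2: the two rows above the image are read from the shared
  // zero buffer, and i2 points at the first real input row.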
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t output_height = input_height;
  do {
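    // Rows that fall below the bottom edge of the image are read from the
    // zero buffer instead.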
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
    }

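    // vi*x3012 carries the previous block's last two columns (lanes 0 and 3);
    // seeding it with zeros supplies the two columns of implicit left padding.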
    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();

    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;

    size_t w = input_width;
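    // Main loop: produce 4 output pixels per iteration while more than 8
    // input pixels remain, so the next 4-pixel block can be loaded unmasked.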
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo0p3 = _mm_mul_ps(vi3x4567, vk32);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x3456, vk11));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi2x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x2345, vk00));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi1x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi0x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x5678, vk33));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x6789, vk24));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi3x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));

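      // 4-way accumulation (the "acc4" in the kernel name): partial sums are
      // kept in four registers to shorten the FP dependency chain, then
      // reduced pairwise.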
      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo0p2 = _mm_add_ps(vo0p2, vo0p3);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);

      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo0p3 = _mm_mul_ps(vi3x4567, vk32);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x3456, vk11));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi2x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x2345, vk00));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi1x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi0x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x5678, vk33));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x6789, vk24));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi3x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo0p2 = _mm_add_ps(vo0p2, vo0p3);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);

      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
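    // Final block of 1..4 pixels: the row data already resident in vi*x4567
    // is masked so columns past the end of the row contribute zero, and zeros
    // stand in for the right-hand neighbors.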
    {
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo0p3 = _mm_mul_ps(vi3x4567, vk32);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x3456, vk11));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi2x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x2345, vk00));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi1x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi0x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x5678, vk33));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x6789, vk24));
      vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi3x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo0p2 = _mm_add_ps(vo0p2, vo0p3);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);

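      // Store the 4, 2, and/or 1 remaining outputs of this row.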
      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

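    // Slide the window down one row: rewind i1/i2 (each advanced by one
    // padded row above) to become the new i0/i1, and rederive i2..i4 from
    // them.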
    i0 = (const float*) ((uintptr_t) i1 - input_decrement);
    i1 = (const float*) ((uintptr_t) i2 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);


  } while (--output_height != 0);
}
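
// Hedged usage sketch (not part of the generated kernel; buffer names are
// hypothetical). The width argument is in bytes, `zero_buffer` must point to
// a zero-filled buffer of at least round_up_po2(width * sizeof(float),
// 4 * sizeof(float)) bytes, and `channel_weights` holds 26 floats (bias
// followed by the 5x5 taps):
//
//   xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4(
//       height, width * sizeof(float),
//       channel_input, channel_weights, zero_buffer, channel_output,
//       2 /* padding_top */, &chw_params);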