// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

  // weights[0] is the bias; weights[1..25] are the 5x5 kernel taps, each broadcast to all lanes.
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

  // The 2 rows of implicit top padding are implemented by pointing the first two input rows at the zero buffer.
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);

  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
    }

    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();
    __m128 vi5x3012 = _mm_setzero_ps();

    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;
    __m128 vi5x4567 = _mm_loadu_ps(i5);
    i5 += 4;

    size_t w = input_width;
    // Main loop: compute 4 output pixels per iteration for 2 output rows,
    // splitting each row's dot product across 3 accumulators (acc3).
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo1p2 = _mm_mul_ps(vi3x4567, vk22);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x4567, vk42));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      i5 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x3456, vk01));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x3456, vk31));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x2345, vk00));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x2345, vk00));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x2345, vk10));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi2x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi5x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk13));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk13));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x5678, vk23));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi3x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x6789, vk04));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x6789, vk24));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x6789, vk24));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x6789, vk34));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);
      vo1p0 = _mm_add_ps(vo1p0, vo1p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);

      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo1p2 = _mm_mul_ps(vi3x4567, vk22);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x4567, vk42));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;
      const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
      i5 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x3456, vk01));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x3456, vk31));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x2345, vk00));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x2345, vk00));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x2345, vk10));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi2x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi5x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk13));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk13));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x5678, vk23));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi3x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x6789, vk04));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x6789, vk24));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x6789, vk24));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x6789, vk34));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);
      vo1p0 = _mm_add_ps(vo1p0, vo1p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);

      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
    // Process the final 1..4 pixels of the row: mask out lanes past the end of
    // the row and store only the valid outputs.
    {
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);
      vi5x4567 = _mm_and_ps(vi5x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22);
      __m128 vo1p2 = _mm_mul_ps(vi3x4567, vk22);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x4567, vk42));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x3456, vk01));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x3456, vk31));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x2345, vk00));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x2345, vk00));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x2345, vk10));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi2x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi5x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk13));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk13));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x5678, vk23));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi3x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x6789, vk04));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi1x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x6789, vk24));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x6789, vk24));
      vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x6789, vk34));
      vo1p2 = _mm_add_ps(vo1p2, _mm_mul_ps(vi4x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo0p0 = _mm_add_ps(vo0p0, vo0p2);
      vo1p0 = _mm_add_ps(vo1p0, vo1p2);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);

      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

    // Step down by two rows: the new i0/i1 are the old i2/i3 rewound by the
    // bytes over-read in this row pass (input_decrement).
    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i3 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);

    o0 = o1;
    o1 = (float*) ((uintptr_t) o0 + input_width);

    output_height = doz(output_height, 2);
  } while (output_height != 0);
}
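
#if 0  // Illustrative usage sketch, not compiled.
// A minimal, hedged example of how a CHW depthwise ukernel like the one above
// may be driven for one channel. The wrapper below is hypothetical: the packed
// weights are assumed to be 1 bias followed by 25 kernel taps, `zero_row` is
// assumed to be a sufficiently padded row of zeroes, and `params` is assumed
// to already carry the row-tail mask and min/max clamps (its initialization is
// version-specific and therefore omitted here).
static void run_5x5_chw_channel(
    size_t height,
    size_t width,                     // row width in pixels
    const float* channel_input,       // height * width floats
    const float* packed_weights,      // bias + 25 taps
    const float* zero_row,            // padded row of zeroes
    float* channel_output,            // height * width floats
    const union xnn_f32_chw_params* params)
{
  // The ukernel takes the row width in bytes and expects exactly 2 rows of
  // implicit top padding (asserted at the top of the kernel).
  xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3(
      height,
      width * sizeof(float),
      channel_input,
      packed_weights,
      zero_row,
      channel_output,
      /*padding_top=*/2,
      params);
}
#endif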