// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

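  // weights[0] is the bias; weights[1..25] are the 5x5 kernel taps in row-major
  // order (vkRC = tap at kernel row R, column C), each broadcast to all 4 lanes.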
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

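  // Amount to rewind an input row pointer back to the start of its row:
  // the row width in bytes, rounded up to a whole number of 4-float vectors.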
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

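  // Rows i0 and i1 start at the zero vector to implement the two rows of
  // implicit top padding (padding_top == 2); i2..i7 are consecutive input rows.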
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
  float* o3 = (float*) ((uintptr_t) o2 + input_width);

  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

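    // The viNx3012 registers supply the columns to the left of the current
    // 4-pixel block; they start at zero to implement the implicit left padding.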
    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();
    __m128 vi5x3012 = _mm_setzero_ps();
    __m128 vi6x3012 = _mm_setzero_ps();
    __m128 vi7x3012 = _mm_setzero_ps();

    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;
    __m128 vi5x4567 = _mm_loadu_ps(i5);
    i5 += 4;
    __m128 vi6x4567 = _mm_loadu_ps(i6);
    i6 += 4;
    __m128 vi7x4567 = _mm_loadu_ps(i7);
    i7 += 4;

    size_t w = input_width;
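    // Main loop: compute a 4x4 block of outputs (4 rows x 4 pixels) per
    // iteration while more than 8 input pixels remain in the row.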
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      i5 += 4;
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      i6 += 4;
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      i7 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

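      // Combine the two partial accumulators per row, then clamp to [min, max].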
      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;
      const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
      i5 += 4;
      const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
      i6 += 4;
      const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
      i7 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
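    // Process the final block of 1..4 pixels; lanes past the end of the row
    // are masked off with vmask.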
    {
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);
      vi5x4567 = _mm_and_ps(vi5x4567, vmask);
      vi6x4567 = _mm_and_ps(vi6x4567, vmask);
      vi7x4567 = _mm_and_ps(vi7x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o3, vo3);
        o3 += 4;
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o3, vo3);
          o3 += 2;
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
          vo3 = _mm_movehl_ps(vo3, vo3);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o3, vo3);
          o3 += 1;
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

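    // The next block of 4 output rows starts where o3 left off; i4/i5, rewound
    // to the start of their rows, become the new i0/i1.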
    i0 = (const float*) ((uintptr_t) i4 - input_decrement);
    i1 = (const float*) ((uintptr_t) i5 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);
    o3 = (float*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}