// Auto-generated file. Do not edit!
// Template: src/f32-dwconv2d-chw/5x5s2p2-sse.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

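// Depthwise 2D convolution micro-kernel for CHW layout: 5x5 kernel, stride 2,
// padding 2, vectorized with SSE. Each iteration of the outer (do-while) loop
// produces a tile of 3 output rows; the inner loop produces 4 output columns
// per row at a time.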
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

  const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
  const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

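  // The packed weights hold the channel bias followed by the 25 filter taps
  // (row-major 5x5); each scalar is broadcast into all four SSE lanes.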
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

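  // Nine input rows feed the three 5x5 stride-2 output rows; rows above the
  // image (and, later, rows below it) are redirected to the caller-provided
  // zero buffer.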
  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
  const float* i8 = (const float*) ((uintptr_t) i7 + input_width);

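  // With left/right padding of 2 and a subsampling stride of 2, each output row
  // holds ceil(input_width / 2) pixels; output_width is that size in bytes.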
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
  float* o2 = (float*) ((uintptr_t) o1 + output_width);

  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
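    // When fewer than 11 padded input rows remain, redirect the out-of-range
    // input rows to the zero buffer and alias the pointers of output rows that
    // do not exist to the previous row; because rows are stored from o2 down to
    // o0, the valid row's store lands last and wins.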
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i4 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 8) {
      i5 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 9) {
      i6 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 10) {
      i7 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 11) {
      i8 = zero;
    }

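    // The vi*x6024 / vi*x7135 registers carry rotated even/odd columns over from
    // the previous 8-pixel block; they supply the left neighbors (columns x-2
    // and x-1) and start at zero for the implicit left padding of 2.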
    __m128 vi0x6024 = _mm_setzero_ps();
    __m128 vi1x6024 = _mm_setzero_ps();
    __m128 vi2x6024 = _mm_setzero_ps();
    __m128 vi3x6024 = _mm_setzero_ps();
    __m128 vi4x6024 = _mm_setzero_ps();
    __m128 vi5x6024 = _mm_setzero_ps();
    __m128 vi6x6024 = _mm_setzero_ps();
    __m128 vi7x6024 = _mm_setzero_ps();
    __m128 vi8x6024 = _mm_setzero_ps();

    __m128 vi0x7135 = _mm_setzero_ps();
    __m128 vi1x7135 = _mm_setzero_ps();
    __m128 vi2x7135 = _mm_setzero_ps();
    __m128 vi3x7135 = _mm_setzero_ps();
    __m128 vi4x7135 = _mm_setzero_ps();
    __m128 vi5x7135 = _mm_setzero_ps();
    __m128 vi6x7135 = _mm_setzero_ps();
    __m128 vi7x7135 = _mm_setzero_ps();
    __m128 vi8x7135 = _mm_setzero_ps();

    const __m128 vi0x89AB = _mm_loadu_ps(i0);
    const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
    i0 += 8;
    const __m128 vi1x89AB = _mm_loadu_ps(i1);
    const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
    i1 += 8;
    const __m128 vi2x89AB = _mm_loadu_ps(i2);
    const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
    i2 += 8;
    const __m128 vi3x89AB = _mm_loadu_ps(i3);
    const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
    i3 += 8;
    const __m128 vi4x89AB = _mm_loadu_ps(i4);
    const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
    i4 += 8;
    const __m128 vi5x89AB = _mm_loadu_ps(i5);
    const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
    i5 += 8;
    const __m128 vi6x89AB = _mm_loadu_ps(i6);
    const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
    i6 += 8;
    const __m128 vi7x89AB = _mm_loadu_ps(i7);
    const __m128 vi7xCDEF = _mm_loadu_ps(i7 + 4);
    i7 += 8;
    const __m128 vi8x89AB = _mm_loadu_ps(i8);
    const __m128 vi8xCDEF = _mm_loadu_ps(i8 + 4);
    i8 += 8;

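    // De-interleave the 8 loaded pixels of every row into even columns
    // (x8ACE, fed to the center taps vk*2) and odd columns (x9BDF, fed to vk*3).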
    __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi7x8ACE = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi7x9BDF = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi8x8ACE = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi8x9BDF = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(3, 1, 3, 1));

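    // Main loop: each iteration consumes 8 input columns per row and produces
    // 4 output columns for each of the 3 output rows; the final partial block
    // (1-8 remaining input columns) is handled after the loop with masking.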
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x8ACE, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x8ACE, vk42));

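      // Rotate the even columns right by one lane; combined with _mm_move_ss and
      // the carried vi*x6024 register below, this forms the columns two to the
      // left of the output centers (the vk*0 taps).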
      const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xE8AC = _mm_shuffle_ps(vi7x8ACE, vi7x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xE8AC = _mm_shuffle_ps(vi8x8ACE, vi8x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x9BDF, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x9BDF, vk43));

      const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
      vi0x6024 = vi0xE8AC;
      const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
      vi1x6024 = vi1xE8AC;
      const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
      vi2x6024 = vi2xE8AC;
      const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
      vi3x6024 = vi3xE8AC;
      const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
      vi4x6024 = vi4xE8AC;
      const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
      vi5x6024 = vi5xE8AC;
      const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
      vi6x6024 = vi6xE8AC;
      const __m128 vi7x68AC = _mm_move_ss(vi7xE8AC, vi7x6024);
      vi7x6024 = vi7xE8AC;
      const __m128 vi8x68AC = _mm_move_ss(vi8xE8AC, vi8x6024);
      vi8x6024 = vi8xE8AC;

      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x68AC, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x68AC, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x68AC, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x68AC, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x68AC, vk40));

      const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
      const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
      i0 += 8;
      const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
      const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
      i1 += 8;
      const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
      const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
      i2 += 8;
      const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
      const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
      i3 += 8;
      const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
      const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
      i4 += 8;
      const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
      const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
      i5 += 8;
      const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
      const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
      i6 += 8;
      const __m128 vi7xGHIJ = _mm_loadu_ps(i7);
      const __m128 vi7xKLMN = _mm_loadu_ps(i7 + 4);
      i7 += 8;
      const __m128 vi8xGHIJ = _mm_loadu_ps(i8);
      const __m128 vi8xKLMN = _mm_loadu_ps(i8 + 4);
      i8 += 8;

      const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
      vi0x7135 = vi0xF9BD;
      const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
      vi1x7135 = vi1xF9BD;
      const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
      vi2x7135 = vi2xF9BD;
      const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
      vi3x7135 = vi3xF9BD;
      const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
      vi4x7135 = vi4xF9BD;
      const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
      vi5x7135 = vi5xF9BD;
      const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
      vi6x7135 = vi6xF9BD;
      const __m128 vi7x79BD = _mm_move_ss(vi7xF9BD, vi7x7135);
      vi7x7135 = vi7xF9BD;
      const __m128 vi8x79BD = _mm_move_ss(vi8xF9BD, vi8x7135);
      vi8x7135 = vi8xF9BD;

      const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi0x9BDF = vi0xHJLN;
      const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi1x9BDF = vi1xHJLN;
      const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi2x9BDF = vi2xHJLN;
      const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi3x9BDF = vi3xHJLN;
      const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi4x9BDF = vi4xHJLN;
      const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi5x9BDF = vi5xHJLN;
      const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi6x9BDF = vi6xHJLN;
      const __m128 vi7xGIKM = _mm_shuffle_ps(vi7xGHIJ, vi7xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi7xHJLN = _mm_shuffle_ps(vi7xGHIJ, vi7xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi7x9BDF = vi7xHJLN;
      const __m128 vi8xGIKM = _mm_shuffle_ps(vi8xGHIJ, vi8xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi8xHJLN = _mm_shuffle_ps(vi8xGHIJ, vi8xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi8x9BDF = vi8xHJLN;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x79BD, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x79BD, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x79BD, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x79BD, vk41));

      const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
      vi0x8ACE = vi0xGIKM;
      const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
      vi1x8ACE = vi1xGIKM;
      const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
      vi2x8ACE = vi2xGIKM;
      const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
      vi3x8ACE = vi3xGIKM;
      const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
      vi4x8ACE = vi4xGIKM;
      const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
      vi5x8ACE = vi5xGIKM;
      const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
      vi6x8ACE = vi6xGIKM;
      const __m128 vi7xGACE = _mm_move_ss(vi7x8ACE, vi7xGIKM);
      vi7x8ACE = vi7xGIKM;
      const __m128 vi8xGACE = _mm_move_ss(vi8x8ACE, vi8xGIKM);
      vi8x8ACE = vi8xGIKM;

      const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7xACEG = _mm_shuffle_ps(vi7xGACE, vi7xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8xACEG = _mm_shuffle_ps(vi8xGACE, vi8xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4xACEG, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5xACEG, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6xACEG, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7xACEG, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8xACEG, vk44));

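      // Clamp the accumulators to the [min, max] output range before storing.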
      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);

      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Last block has 1-8 pixels to process.
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
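      // Zero out the even/odd lanes that lie beyond the right edge of the row
      // using the precomputed masks, so out-of-range pixels contribute nothing.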
      vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
      vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
      vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
      vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
      vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
      vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
      vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
      vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
      vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
      vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
      vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
      vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
      vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
      vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
      vi7x8ACE = _mm_and_ps(vi7x8ACE, vmask_even);
      vi7x9BDF = _mm_and_ps(vi7x9BDF, vmask_odd);
      vi8x8ACE = _mm_and_ps(vi8x8ACE, vmask_even);
      vi8x9BDF = _mm_and_ps(vi8x9BDF, vmask_odd);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x8ACE, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x8ACE, vk42));

      const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xE8AC = _mm_shuffle_ps(vi7x8ACE, vi7x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xE8AC = _mm_shuffle_ps(vi8x8ACE, vi8x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x9BDF, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x9BDF, vk43));

      const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
      const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
      const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
      const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
      const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
      const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
      const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
      const __m128 vi7x68AC = _mm_move_ss(vi7xE8AC, vi7x6024);
      const __m128 vi8x68AC = _mm_move_ss(vi8xE8AC, vi8x6024);

      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x68AC, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x68AC, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x68AC, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x68AC, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x68AC, vk40));

      const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
      const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
      const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
      const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
      const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
      const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
      const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
      const __m128 vi7x79BD = _mm_move_ss(vi7xF9BD, vi7x7135);
      const __m128 vi8x79BD = _mm_move_ss(vi8xF9BD, vi8x7135);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x79BD, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x79BD, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x79BD, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x79BD, vk41));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
      const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
      const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
      const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
      const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
      const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
      const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
      const __m128 vi7xGACE = _mm_move_ss(vi7x8ACE, vzero);
      const __m128 vi8xGACE = _mm_move_ss(vi8x8ACE, vzero);

      const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7xACEG = _mm_shuffle_ps(vi7xGACE, vi7xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8xACEG = _mm_shuffle_ps(vi8xGACE, vi8xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4xACEG, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5xACEG, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6xACEG, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7xACEG, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8xACEG, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);

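      // Number of output pixels still to store in this row block:
      // ceil(remaining input pixels / 2). Store 4, then 2, then 1 as needed.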
      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w_tmp & 2) {
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
        }
        if (w_tmp & 1) {
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

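    // Advance to the next block of 3 output rows: the new i0 sits 6 input rows
    // (3 output rows x stride 2) below the old one, rebased from i6/i7/i8 and
    // rewound by input_decrement to undo the within-row pointer advance.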
    i0 = (const float*) ((uintptr_t) i6 - input_decrement);
    i1 = (const float*) ((uintptr_t) i7 - input_decrement);
    i2 = (const float*) ((uintptr_t) i8 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);
    i8 = (const float*) ((uintptr_t) i7 + input_width);

    o0 = o2;
    o1 = (float*) ((uintptr_t) o0 + output_width);
    o2 = (float*) ((uintptr_t) o1 + output_width);

    output_height = doz(output_height, 3);
    padded_input_height = doz(padded_input_height, 6);
  } while (output_height != 0);
}