// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5s2p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


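// 5x5 depthwise convolution in CHW layout with stride 2 and padding 2 (SSE).
// Each pass of the main loops produces 3 output rows x 4 output pixels.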
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

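  // Even/odd column masks zero out-of-range pixels in the final partial block;
  // min/max clamp the outputs.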
  const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
  const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

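  // weights[0] is the per-channel bias; weights[1..25] are the 5x5 taps
  // (k00..k44, row-major), each broadcast to all four lanes.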
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

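  // Nine input rows feed three output rows (5-tap height, stride 2).
  // Rows inside the top padding region are redirected to the `zero` row.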
  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
  const float* i8 = (const float*) ((uintptr_t) i7 + input_width);

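  // Output row stride in bytes: ceil(input_width / 2) pixels per output row.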
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
  float* o2 = (float*) ((uintptr_t) o1 + output_width);

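  // Each pass of the outer loop consumes 6 padded input rows and produces up to
  // 3 output rows. Rows past the bottom edge read from the zero row, and their
  // output pointers alias the previous row so the extra stores are harmless
  // (the valid row is stored last).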
  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i4 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 8) {
      i5 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 9) {
      i6 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 10) {
      i7 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 11) {
      i8 = zero;
    }

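    // Columns carried over from the previous 8-pixel block; only lane 0 (the
    // previous block's column 6 or 7) is consumed. Zero-initialized to supply
    // the implicit left padding of two columns.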
    __m128 vi0x6024 = _mm_setzero_ps();
    __m128 vi1x6024 = _mm_setzero_ps();
    __m128 vi2x6024 = _mm_setzero_ps();
    __m128 vi3x6024 = _mm_setzero_ps();
    __m128 vi4x6024 = _mm_setzero_ps();
    __m128 vi5x6024 = _mm_setzero_ps();
    __m128 vi6x6024 = _mm_setzero_ps();
    __m128 vi7x6024 = _mm_setzero_ps();
    __m128 vi8x6024 = _mm_setzero_ps();

    __m128 vi0x7135 = _mm_setzero_ps();
    __m128 vi1x7135 = _mm_setzero_ps();
    __m128 vi2x7135 = _mm_setzero_ps();
    __m128 vi3x7135 = _mm_setzero_ps();
    __m128 vi4x7135 = _mm_setzero_ps();
    __m128 vi5x7135 = _mm_setzero_ps();
    __m128 vi6x7135 = _mm_setzero_ps();
    __m128 vi7x7135 = _mm_setzero_ps();
    __m128 vi8x7135 = _mm_setzero_ps();

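    // Load the first 8 pixels of each row. Columns are labeled from 8 so the
    // two implicit left-padding columns take labels 6 and 7.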
    const __m128 vi0x89AB = _mm_loadu_ps(i0);
    const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
    i0 += 8;
    const __m128 vi1x89AB = _mm_loadu_ps(i1);
    const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
    i1 += 8;
    const __m128 vi2x89AB = _mm_loadu_ps(i2);
    const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
    i2 += 8;
    const __m128 vi3x89AB = _mm_loadu_ps(i3);
    const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
    i3 += 8;
    const __m128 vi4x89AB = _mm_loadu_ps(i4);
    const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
    i4 += 8;
    const __m128 vi5x89AB = _mm_loadu_ps(i5);
    const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
    i5 += 8;
    const __m128 vi6x89AB = _mm_loadu_ps(i6);
    const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
    i6 += 8;
    const __m128 vi7x89AB = _mm_loadu_ps(i7);
    const __m128 vi7xCDEF = _mm_loadu_ps(i7 + 4);
    i7 += 8;
    const __m128 vi8x89AB = _mm_loadu_ps(i8);
    const __m128 vi8xCDEF = _mm_loadu_ps(i8 + 4);
    i8 += 8;

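    // De-interleave even columns (8, A, C, E) and odd columns (9, B, D, F):
    // the even columns are the tap centers (kernel column 2) of the 4 outputs.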
    __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi7x8ACE = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi7x9BDF = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 vi8x8ACE = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 vi8x9BDF = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(3, 1, 3, 1));

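    // Main loop: each iteration consumes 8 input columns and produces 4 output
    // pixels per row; the last 1-8 input columns fall to the masked tail below.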
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x8ACE, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x8ACE, vk42));

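      // Rotate 8ACE and splice in the lane carried in x6024 to form 68AC, the
      // columns two left of each center (kernel column 0); 9BDF and x7135
      // likewise yield 79BD (kernel column 1).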
      const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xE8AC = _mm_shuffle_ps(vi7x8ACE, vi7x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xE8AC = _mm_shuffle_ps(vi8x8ACE, vi8x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x9BDF, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x9BDF, vk43));

      const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
      vi0x6024 = vi0xE8AC;
      const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
      vi1x6024 = vi1xE8AC;
      const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
      vi2x6024 = vi2xE8AC;
      const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
      vi3x6024 = vi3xE8AC;
      const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
      vi4x6024 = vi4xE8AC;
      const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
      vi5x6024 = vi5xE8AC;
      const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
      vi6x6024 = vi6xE8AC;
      const __m128 vi7x68AC = _mm_move_ss(vi7xE8AC, vi7x6024);
      vi7x6024 = vi7xE8AC;
      const __m128 vi8x68AC = _mm_move_ss(vi8xE8AC, vi8x6024);
      vi8x6024 = vi8xE8AC;

      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x68AC, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x68AC, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x68AC, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x68AC, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x68AC, vk40));

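      // Load the next 8 columns (labeled G-N); their even/odd halves become the
      // next iteration's 8ACE/9BDF, and column G supplies the right neighbor
      // (kernel column 4) for this iteration.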
      const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
      const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
      i0 += 8;
      const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
      const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
      i1 += 8;
      const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
      const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
      i2 += 8;
      const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
      const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
      i3 += 8;
      const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
      const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
      i4 += 8;
      const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
      const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
      i5 += 8;
      const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
      const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
      i6 += 8;
      const __m128 vi7xGHIJ = _mm_loadu_ps(i7);
      const __m128 vi7xKLMN = _mm_loadu_ps(i7 + 4);
      i7 += 8;
      const __m128 vi8xGHIJ = _mm_loadu_ps(i8);
      const __m128 vi8xKLMN = _mm_loadu_ps(i8 + 4);
      i8 += 8;

      const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
      vi0x7135 = vi0xF9BD;
      const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
      vi1x7135 = vi1xF9BD;
      const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
      vi2x7135 = vi2xF9BD;
      const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
      vi3x7135 = vi3xF9BD;
      const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
      vi4x7135 = vi4xF9BD;
      const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
      vi5x7135 = vi5xF9BD;
      const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
      vi6x7135 = vi6xF9BD;
      const __m128 vi7x79BD = _mm_move_ss(vi7xF9BD, vi7x7135);
      vi7x7135 = vi7xF9BD;
      const __m128 vi8x79BD = _mm_move_ss(vi8xF9BD, vi8x7135);
      vi8x7135 = vi8xF9BD;

      const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi0x9BDF = vi0xHJLN;
      const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi1x9BDF = vi1xHJLN;
      const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi2x9BDF = vi2xHJLN;
      const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi3x9BDF = vi3xHJLN;
      const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi4x9BDF = vi4xHJLN;
      const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi5x9BDF = vi5xHJLN;
      const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi6x9BDF = vi6xHJLN;
      const __m128 vi7xGIKM = _mm_shuffle_ps(vi7xGHIJ, vi7xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi7xHJLN = _mm_shuffle_ps(vi7xGHIJ, vi7xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi7x9BDF = vi7xHJLN;
      const __m128 vi8xGIKM = _mm_shuffle_ps(vi8xGHIJ, vi8xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi8xHJLN = _mm_shuffle_ps(vi8xGHIJ, vi8xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
      vi8x9BDF = vi8xHJLN;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x79BD, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x79BD, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x79BD, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x79BD, vk41));

      const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
      vi0x8ACE = vi0xGIKM;
      const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
      vi1x8ACE = vi1xGIKM;
      const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
      vi2x8ACE = vi2xGIKM;
      const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
      vi3x8ACE = vi3xGIKM;
      const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
      vi4x8ACE = vi4xGIKM;
      const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
      vi5x8ACE = vi5xGIKM;
      const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
      vi6x8ACE = vi6xGIKM;
      const __m128 vi7xGACE = _mm_move_ss(vi7x8ACE, vi7xGIKM);
      vi7x8ACE = vi7xGIKM;
      const __m128 vi8xGACE = _mm_move_ss(vi8x8ACE, vi8xGIKM);
      vi8x8ACE = vi8xGIKM;

      const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7xACEG = _mm_shuffle_ps(vi7xGACE, vi7xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8xACEG = _mm_shuffle_ps(vi8xGACE, vi8xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4xACEG, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5xACEG, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6xACEG, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7xACEG, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8xACEG, vk44));


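      // Clamp to [vmin, vmax] and store 4 output pixels to each of the 3 rows.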
      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);

      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Last block has 1-8 pixels to process.
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
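      // Zero out input columns beyond the end of the row so they contribute nothing.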
      vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
      vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
      vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
      vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
      vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
      vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
      vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
      vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
      vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
      vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
      vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
      vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
      vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
      vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
      vi7x8ACE = _mm_and_ps(vi7x8ACE, vmask_even);
      vi7x9BDF = _mm_and_ps(vi7x9BDF, vmask_odd);
      vi8x8ACE = _mm_and_ps(vi8x8ACE, vmask_even);
      vi8x9BDF = _mm_and_ps(vi8x9BDF, vmask_odd);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x8ACE, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x8ACE, vk42));

      const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xE8AC = _mm_shuffle_ps(vi7x8ACE, vi7x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xE8AC = _mm_shuffle_ps(vi8x8ACE, vi8x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x9BDF, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x9BDF, vk43));

      const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
      const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
      const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
      const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
      const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
      const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
      const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
      const __m128 vi7x68AC = _mm_move_ss(vi7xE8AC, vi7x6024);
      const __m128 vi8x68AC = _mm_move_ss(vi8xE8AC, vi8x6024);

      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x68AC, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x68AC, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x68AC, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x68AC, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x68AC, vk40));

      const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
      const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
      const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
      const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
      const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
      const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
      const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
      const __m128 vi7x79BD = _mm_move_ss(vi7xF9BD, vi7x7135);
      const __m128 vi8x79BD = _mm_move_ss(vi8xF9BD, vi8x7135);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x79BD, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x79BD, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7x79BD, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8x79BD, vk41));

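      // There is no next block in the tail, so a zero lane stands in for the
      // column to the right of the block when forming ACEG (kernel column 4).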
      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
      const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
      const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
      const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
      const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
      const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
      const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
      const __m128 vi7xGACE = _mm_move_ss(vi7x8ACE, vzero);
      const __m128 vi8xGACE = _mm_move_ss(vi8x8ACE, vzero);

      const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7xACEG = _mm_shuffle_ps(vi7xGACE, vi7xGACE, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8xACEG = _mm_shuffle_ps(vi8xGACE, vi8xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4xACEG, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5xACEG, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6xACEG, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi7xACEG, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi8xACEG, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);

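      // w_tmp is the number of output pixels still to be written in each row:
      // ceil(remaining input columns / 2). Store 4, 2, and/or 1 at a time.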
      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w_tmp & 2) {
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
        }
        if (w_tmp & 1) {
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

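    // Advance to the next group of 3 output rows: the rows last read as i6-i8
    // become i0-i2 (input_decrement rewinds them to the start of the row).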
    i0 = (const float*) ((uintptr_t) i6 - input_decrement);
    i1 = (const float*) ((uintptr_t) i7 - input_decrement);
    i2 = (const float*) ((uintptr_t) i8 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);
    i8 = (const float*) ((uintptr_t) i7 + input_width);

    o0 = o2;
    o1 = (float*) ((uintptr_t) o0 + output_width);
    o2 = (float*) ((uintptr_t) o1 + output_width);

    output_height = doz(output_height, 3);
    padded_input_height = doz(padded_input_height, 6);
  } while (output_height != 0);
}