// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

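// 5x5 depthwise convolution micro-kernel for the CHW layout (stride 1,
// padding 2, as implied by the 5x5p2 suffix). Each iteration of the outer
// loop produces up to 4 output rows; each iteration of the inner loops
// produces 4 output pixels per row, using 2 partial accumulators per row
// (the 4x4_acc2 suffix).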
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
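  // input_width is given in bytes and must be a whole number of floats;
  // the top padding is fixed at 2 rows for this 5x5 kernel.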
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

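  // Weights are packed as the bias followed by the 25 taps of the 5x5 kernel
  // in row-major order (vkRC = kernel row R, column C), each broadcast to all
  // four lanes.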
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

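  // Rows i0 and i1 read from the zero buffer to provide the 2 rows of implicit
  // top padding; i2 is the first real input row.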
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
  float* o3 = (float*) ((uintptr_t) o2 + input_width);

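  // The outer loop produces up to 4 output rows per iteration; when fewer rows
  // remain, the unused input rows are redirected to the zero buffer and the
  // unused output rows alias the last valid one.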
  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

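    // viNxABCD names an SSE register whose lanes hold pixels A, B, C, D of
    // input row N. The viNx3012 registers hold the previous block rotated
    // right by one lane; they start at zero to provide the 2 columns of
    // implicit left padding.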
    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();
    __m128 vi5x3012 = _mm_setzero_ps();
    __m128 vi6x3012 = _mm_setzero_ps();
    __m128 vi7x3012 = _mm_setzero_ps();

    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;
    __m128 vi5x4567 = _mm_loadu_ps(i5);
    i5 += 4;
    __m128 vi6x4567 = _mm_loadu_ps(i6);
    i6 += 4;
    __m128 vi7x4567 = _mm_loadu_ps(i7);
    i7 += 4;

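    // Main loop: produce 4 output pixels per row per iteration while more than
    // 8 input pixels remain in the row; the 5..8-pixel tail and the final
    // 1..4-pixel tail are handled below.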
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

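      // Rotate each row right by one lane (4567 -> 7456); combined with the
      // saved 3012 block below this yields the 3456 window.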
      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      i5 += 4;
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      i6 += 4;
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      i7 += 4;

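      // Take lane 0 (pixel 3) from the saved previous block to form the window
      // shifted one pixel to the left (3456).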
      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

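      // Form the window shifted two pixels to the left (2345) and save the
      // rotated block for the next iteration.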
      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

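      // Splice in pixel 8 from the next block (8567) and advance the current
      // block to 89AB for the next iteration.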
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

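      // Rotate 8567 left by one lane to form the window shifted one pixel to
      // the right (5678).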
      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

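      // Combine 5678 with the next block to form the window shifted two pixels
      // to the right (6789).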
      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

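      // Merge the two partial accumulators, clamp to [min, max], and store
      // 4 pixels of each of the 4 output rows.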
      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
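      // Same computation as the main loop, except the next block (pixels
      // 8..11) is loaded through vmask so that columns past the end of the
      // row read as zero.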
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;
      const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
      i5 += 4;
      const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
      i6 += 4;
      const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
      i7 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
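    // Final block of 1..4 pixels: mask the current block so pixels past the
    // end of the row contribute zero, and substitute zeros for the block that
    // would lie entirely past the end.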
    {
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);
      vi5x4567 = _mm_and_ps(vi5x4567, vmask);
      vi6x4567 = _mm_and_ps(vi6x4567, vmask);
      vi7x4567 = _mm_and_ps(vi7x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12);
      __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk12);
      __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12);
      __m128 vo3p1 = _mm_mul_ps(vi4x4567, vk12);
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x4567, vk32));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk01));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk21));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x3456, vk41));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x2345, vk10));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x2345, vk30));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x5678, vk03));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x5678, vk23));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34));
      vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34));
      vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34));
      vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));

      vo0p0 = _mm_add_ps(vo0p0, vo0p1);
      vo1p0 = _mm_add_ps(vo1p0, vo1p1);
      vo2p0 = _mm_add_ps(vo2p0, vo2p1);
      vo3p0 = _mm_add_ps(vo3p0, vo3p1);

      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

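      // Store 4, 2, and/or 1 remaining pixels per output row depending on the
      // remaining width.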
      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o3, vo3);
        o3 += 4;
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o3, vo3);
          o3 += 2;
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
          vo3 = _mm_movehl_ps(vo3, vo3);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o3, vo3);
          o3 += 1;
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

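    // Rewind the over-advanced row pointers and move every input row down by
    // 4 rows for the next iteration of the outer loop; the output rows advance
    // likewise.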
    i0 = (const float*) ((uintptr_t) i4 - input_decrement);
    i1 = (const float*) ((uintptr_t) i5 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);
    o3 = (float*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}