1 // Auto-generated file. Do not edit!
2 // Template: src/f32-dwconv/up-avx512.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <immintrin.h>
13
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/intrinsics-polyfill.h>
16
17
xnn_f32_dwconv_ukernel_up16x25__avx512f(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,const union xnn_f32_output_params params[restrict static1])18 void xnn_f32_dwconv_ukernel_up16x25__avx512f(
19 size_t channels,
20 size_t output_width,
21 const float** input,
22 const float* weights,
23 float* output,
24 size_t input_stride,
25 size_t output_increment,
26 const union xnn_f32_output_params params[restrict static 1])
27 {
28 assert(channels != 0);
29 assert(output_width != 0);
30
31 const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
32 const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
33 do {
34 const float* i0 = input[0];
35 assert(i0 != NULL);
36 const float* i1 = input[1];
37 assert(i1 != NULL);
38 const float* i2 = input[2];
39 assert(i2 != NULL);
40 const float* i3 = input[3];
41 assert(i3 != NULL);
42 const float* i4 = input[4];
43 assert(i4 != NULL);
44 const float* i5 = input[5];
45 assert(i5 != NULL);
46 const float* i6 = input[6];
47 assert(i6 != NULL);
48 const float* i7 = input[7];
49 assert(i7 != NULL);
50 const float* i8 = input[8];
51 assert(i8 != NULL);
52 const float* i9 = input[9];
53 assert(i9 != NULL);
54 const float* i10 = input[10];
55 assert(i10 != NULL);
56 const float* i11 = input[11];
57 assert(i11 != NULL);
58 const float* i12 = input[12];
59 assert(i12 != NULL);
60 const float* i13 = input[13];
61 assert(i13 != NULL);
62 const float* i14 = input[14];
63 assert(i14 != NULL);
64 const float* i15 = input[15];
65 assert(i15 != NULL);
66 const float* i16 = input[16];
67 assert(i16 != NULL);
68 const float* i17 = input[17];
69 assert(i17 != NULL);
70 const float* i18 = input[18];
71 assert(i18 != NULL);
72 const float* i19 = input[19];
73 assert(i19 != NULL);
74 const float* i20 = input[20];
75 assert(i20 != NULL);
76 const float* i21 = input[21];
77 assert(i21 != NULL);
78 const float* i22 = input[22];
79 assert(i22 != NULL);
80 const float* i23 = input[23];
81 assert(i23 != NULL);
82 const float* i24 = input[24];
83 assert(i24 != NULL);
84 input = (const float**) ((uintptr_t) input + input_stride);
85
86 size_t c = channels;
87 const float* w = weights;
88 for (; c >= 16; c -= 16) {
89 __m512 vacc0123456789ABCDEFp0 = _mm512_load_ps(w);
90
91
92 const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0);
93 i0 += 16;
94
95 const __m512 vk0x0123456789ABCDEF = _mm512_load_ps(w + 16);
96 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF, vacc0123456789ABCDEFp0);
97
98 const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1);
99 i1 += 16;
100
101 const __m512 vk1x0123456789ABCDEF = _mm512_load_ps(w + 32);
102 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF, vacc0123456789ABCDEFp0);
103
104 const __m512 vi2x0123456789ABCDEF = _mm512_loadu_ps(i2);
105 i2 += 16;
106
107 const __m512 vk2x0123456789ABCDEF = _mm512_load_ps(w + 48);
108 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF, vacc0123456789ABCDEFp0);
109
110 const __m512 vi3x0123456789ABCDEF = _mm512_loadu_ps(i3);
111 i3 += 16;
112
113 const __m512 vk3x0123456789ABCDEF = _mm512_load_ps(w + 64);
114 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF, vacc0123456789ABCDEFp0);
115
116 const __m512 vi4x0123456789ABCDEF = _mm512_loadu_ps(i4);
117 i4 += 16;
118
119 const __m512 vk4x0123456789ABCDEF = _mm512_load_ps(w + 80);
120 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF, vacc0123456789ABCDEFp0);
121
122 const __m512 vi5x0123456789ABCDEF = _mm512_loadu_ps(i5);
123 i5 += 16;
124
125 const __m512 vk5x0123456789ABCDEF = _mm512_load_ps(w + 96);
126 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF, vacc0123456789ABCDEFp0);
127
128 const __m512 vi6x0123456789ABCDEF = _mm512_loadu_ps(i6);
129 i6 += 16;
130
131 const __m512 vk6x0123456789ABCDEF = _mm512_load_ps(w + 112);
132 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF, vacc0123456789ABCDEFp0);
133
134 const __m512 vi7x0123456789ABCDEF = _mm512_loadu_ps(i7);
135 i7 += 16;
136
137 const __m512 vk7x0123456789ABCDEF = _mm512_load_ps(w + 128);
138 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF, vacc0123456789ABCDEFp0);
139
140 const __m512 vi8x0123456789ABCDEF = _mm512_loadu_ps(i8);
141 i8 += 16;
142
143 const __m512 vk8x0123456789ABCDEF = _mm512_load_ps(w + 144);
144 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF, vacc0123456789ABCDEFp0);
145
146 const __m512 vi9x0123456789ABCDEF = _mm512_loadu_ps(i9);
147 i9 += 16;
148
149 const __m512 vk9x0123456789ABCDEF = _mm512_load_ps(w + 160);
150 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF, vacc0123456789ABCDEFp0);
151
152 const __m512 vi10x0123456789ABCDEF = _mm512_loadu_ps(i10);
153 i10 += 16;
154
155 const __m512 vk10x0123456789ABCDEF = _mm512_load_ps(w + 176);
156 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF, vacc0123456789ABCDEFp0);
157
158 const __m512 vi11x0123456789ABCDEF = _mm512_loadu_ps(i11);
159 i11 += 16;
160
161 const __m512 vk11x0123456789ABCDEF = _mm512_load_ps(w + 192);
162 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF, vacc0123456789ABCDEFp0);
163
164 const __m512 vi12x0123456789ABCDEF = _mm512_loadu_ps(i12);
165 i12 += 16;
166
167 const __m512 vk12x0123456789ABCDEF = _mm512_load_ps(w + 208);
168 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF, vacc0123456789ABCDEFp0);
169
170 const __m512 vi13x0123456789ABCDEF = _mm512_loadu_ps(i13);
171 i13 += 16;
172
173 const __m512 vk13x0123456789ABCDEF = _mm512_load_ps(w + 224);
174 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF, vacc0123456789ABCDEFp0);
175
176 const __m512 vi14x0123456789ABCDEF = _mm512_loadu_ps(i14);
177 i14 += 16;
178
179 const __m512 vk14x0123456789ABCDEF = _mm512_load_ps(w + 240);
180 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF, vacc0123456789ABCDEFp0);
181
182 const __m512 vi15x0123456789ABCDEF = _mm512_loadu_ps(i15);
183 i15 += 16;
184
185 const __m512 vk15x0123456789ABCDEF = _mm512_load_ps(w + 256);
186 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF, vacc0123456789ABCDEFp0);
187
188 const __m512 vi16x0123456789ABCDEF = _mm512_loadu_ps(i16);
189 i16 += 16;
190
191 const __m512 vk16x0123456789ABCDEF = _mm512_load_ps(w + 272);
192 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF, vacc0123456789ABCDEFp0);
193
194 const __m512 vi17x0123456789ABCDEF = _mm512_loadu_ps(i17);
195 i17 += 16;
196
197 const __m512 vk17x0123456789ABCDEF = _mm512_load_ps(w + 288);
198 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF, vacc0123456789ABCDEFp0);
199
200 const __m512 vi18x0123456789ABCDEF = _mm512_loadu_ps(i18);
201 i18 += 16;
202
203 const __m512 vk18x0123456789ABCDEF = _mm512_load_ps(w + 304);
204 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF, vacc0123456789ABCDEFp0);
205
206 const __m512 vi19x0123456789ABCDEF = _mm512_loadu_ps(i19);
207 i19 += 16;
208
209 const __m512 vk19x0123456789ABCDEF = _mm512_load_ps(w + 320);
210 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF, vacc0123456789ABCDEFp0);
211
212 const __m512 vi20x0123456789ABCDEF = _mm512_loadu_ps(i20);
213 i20 += 16;
214
215 const __m512 vk20x0123456789ABCDEF = _mm512_load_ps(w + 336);
216 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF, vacc0123456789ABCDEFp0);
217
218 const __m512 vi21x0123456789ABCDEF = _mm512_loadu_ps(i21);
219 i21 += 16;
220
221 const __m512 vk21x0123456789ABCDEF = _mm512_load_ps(w + 352);
222 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF, vacc0123456789ABCDEFp0);
223
224 const __m512 vi22x0123456789ABCDEF = _mm512_loadu_ps(i22);
225 i22 += 16;
226
227 const __m512 vk22x0123456789ABCDEF = _mm512_load_ps(w + 368);
228 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF, vacc0123456789ABCDEFp0);
229
230 const __m512 vi23x0123456789ABCDEF = _mm512_loadu_ps(i23);
231 i23 += 16;
232
233 const __m512 vk23x0123456789ABCDEF = _mm512_load_ps(w + 384);
234 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF, vacc0123456789ABCDEFp0);
235
236 const __m512 vi24x0123456789ABCDEF = _mm512_loadu_ps(i24);
237 i24 += 16;
238
239 const __m512 vk24x0123456789ABCDEF = _mm512_load_ps(w + 400);
240 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF, vacc0123456789ABCDEFp0);
241
242 w += 416;
243
244
245 __m512 vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEFp0, vmin);
246 vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vmax);
247
248 _mm512_storeu_ps(output, vacc0123456789ABCDEF);
249 output += 16;
250 }
251 if XNN_UNLIKELY(c != 0) {
252 assert(c >= 1);
253 assert(c <= 16);
254 // Prepare mask for valid 32-bit elements (depends on nc).
255 const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << c) - UINT32_C(1)));
256
257 __m512 vacc0123456789ABCDEFp0 = _mm512_maskz_loadu_ps(vmask, w);
258
259 const __m512 vi0x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i0);
260 const __m512 vk0x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 16);
261 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF, vacc0123456789ABCDEFp0);
262
263 const __m512 vi1x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i1);
264 const __m512 vk1x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 32);
265 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF, vacc0123456789ABCDEFp0);
266
267 const __m512 vi2x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i2);
268 const __m512 vk2x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 48);
269 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF, vacc0123456789ABCDEFp0);
270
271 const __m512 vi3x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i3);
272 const __m512 vk3x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 64);
273 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF, vacc0123456789ABCDEFp0);
274
275 const __m512 vi4x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i4);
276 const __m512 vk4x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 80);
277 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF, vacc0123456789ABCDEFp0);
278
279 const __m512 vi5x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i5);
280 const __m512 vk5x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 96);
281 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF, vacc0123456789ABCDEFp0);
282
283 const __m512 vi6x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i6);
284 const __m512 vk6x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 112);
285 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF, vacc0123456789ABCDEFp0);
286
287 const __m512 vi7x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i7);
288 const __m512 vk7x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 128);
289 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF, vacc0123456789ABCDEFp0);
290
291 const __m512 vi8x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i8);
292 const __m512 vk8x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 144);
293 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF, vacc0123456789ABCDEFp0);
294
295 const __m512 vi9x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i9);
296 const __m512 vk9x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 160);
297 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF, vacc0123456789ABCDEFp0);
298
299 const __m512 vi10x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i10);
300 const __m512 vk10x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 176);
301 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF, vacc0123456789ABCDEFp0);
302
303 const __m512 vi11x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i11);
304 const __m512 vk11x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 192);
305 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF, vacc0123456789ABCDEFp0);
306
307 const __m512 vi12x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i12);
308 const __m512 vk12x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 208);
309 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF, vacc0123456789ABCDEFp0);
310
311 const __m512 vi13x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i13);
312 const __m512 vk13x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 224);
313 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF, vacc0123456789ABCDEFp0);
314
315 const __m512 vi14x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i14);
316 const __m512 vk14x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 240);
317 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF, vacc0123456789ABCDEFp0);
318
319 const __m512 vi15x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i15);
320 const __m512 vk15x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 256);
321 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF, vacc0123456789ABCDEFp0);
322
323 const __m512 vi16x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i16);
324 const __m512 vk16x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 272);
325 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF, vacc0123456789ABCDEFp0);
326
327 const __m512 vi17x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i17);
328 const __m512 vk17x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 288);
329 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF, vacc0123456789ABCDEFp0);
330
331 const __m512 vi18x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i18);
332 const __m512 vk18x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 304);
333 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF, vacc0123456789ABCDEFp0);
334
335 const __m512 vi19x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i19);
336 const __m512 vk19x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 320);
337 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF, vacc0123456789ABCDEFp0);
338
339 const __m512 vi20x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i20);
340 const __m512 vk20x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 336);
341 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF, vacc0123456789ABCDEFp0);
342
343 const __m512 vi21x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i21);
344 const __m512 vk21x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 352);
345 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF, vacc0123456789ABCDEFp0);
346
347 const __m512 vi22x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i22);
348 const __m512 vk22x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 368);
349 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF, vacc0123456789ABCDEFp0);
350
351 const __m512 vi23x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i23);
352 const __m512 vk23x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 384);
353 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF, vacc0123456789ABCDEFp0);
354
355 const __m512 vi24x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i24);
356 const __m512 vk24x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 400);
357 vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF, vacc0123456789ABCDEFp0);
358
359
360 __m512 vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEFp0, vmin);
361 vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vmax);
362
363 _mm512_mask_storeu_ps(output, vmask, vacc0123456789ABCDEF);
364 output += c;
365 }
366
367 output = (float*) ((uintptr_t) output + output_increment);
368 } while (--output_width != 0);
369 }
370