Lines Matching refs:__m256
34 const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
35 const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
168 __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
171 const __m256 vi0x01234567 = _mm256_loadu_ps(i0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
172 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
175 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
176 const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
180 const __m256 vi1x01234567 = _mm256_loadu_ps(i1); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
181 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
184 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
185 const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
186 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
187 __m256 vacc89ABCDEFp1 = _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
189 const __m256 vi2x01234567 = _mm256_loadu_ps(i2); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
190 const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
193 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
194 const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
198 const __m256 vi3x01234567 = _mm256_loadu_ps(i3); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
199 const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
202 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
203 const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
207 const __m256 vi4x01234567 = _mm256_loadu_ps(i4); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
208 const __m256 vi4x89ABCDEF = _mm256_loadu_ps(i4 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
211 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
212 const __m256 vk4x89ABCDEF = _mm256_load_ps(w + 88); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
216 const __m256 vi5x01234567 = _mm256_loadu_ps(i5); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
217 const __m256 vi5x89ABCDEF = _mm256_loadu_ps(i5 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
220 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
221 const __m256 vk5x89ABCDEF = _mm256_load_ps(w + 104); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
225 const __m256 vi6x01234567 = _mm256_loadu_ps(i6); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
226 const __m256 vi6x89ABCDEF = _mm256_loadu_ps(i6 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
229 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
230 const __m256 vk6x89ABCDEF = _mm256_load_ps(w + 120); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
234 const __m256 vi7x01234567 = _mm256_loadu_ps(i7); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
235 const __m256 vi7x89ABCDEF = _mm256_loadu_ps(i7 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
238 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
239 const __m256 vk7x89ABCDEF = _mm256_load_ps(w + 136); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
243 const __m256 vi8x01234567 = _mm256_loadu_ps(i8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
244 const __m256 vi8x89ABCDEF = _mm256_loadu_ps(i8 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
247 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
248 const __m256 vk8x89ABCDEF = _mm256_load_ps(w + 152); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
252 const __m256 vi9x01234567 = _mm256_loadu_ps(i9); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
253 const __m256 vi9x89ABCDEF = _mm256_loadu_ps(i9 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
256 const __m256 vk9x01234567 = _mm256_load_ps(w + 160); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
257 const __m256 vk9x89ABCDEF = _mm256_load_ps(w + 168); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
261 const __m256 vi10x01234567 = _mm256_loadu_ps(i10); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
262 const __m256 vi10x89ABCDEF = _mm256_loadu_ps(i10 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
265 const __m256 vk10x01234567 = _mm256_load_ps(w + 176); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
266 const __m256 vk10x89ABCDEF = _mm256_load_ps(w + 184); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
270 const __m256 vi11x01234567 = _mm256_loadu_ps(i11); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
271 const __m256 vi11x89ABCDEF = _mm256_loadu_ps(i11 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
274 const __m256 vk11x01234567 = _mm256_load_ps(w + 192); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
275 const __m256 vk11x89ABCDEF = _mm256_load_ps(w + 200); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
279 const __m256 vi12x01234567 = _mm256_loadu_ps(i12); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
280 const __m256 vi12x89ABCDEF = _mm256_loadu_ps(i12 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
283 const __m256 vk12x01234567 = _mm256_load_ps(w + 208); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
284 const __m256 vk12x89ABCDEF = _mm256_load_ps(w + 216); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
288 const __m256 vi13x01234567 = _mm256_loadu_ps(i13); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
289 const __m256 vi13x89ABCDEF = _mm256_loadu_ps(i13 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
292 const __m256 vk13x01234567 = _mm256_load_ps(w + 224); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
293 const __m256 vk13x89ABCDEF = _mm256_load_ps(w + 232); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
297 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
298 const __m256 vi14x89ABCDEF = _mm256_loadu_ps(i14 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
301 const __m256 vk14x01234567 = _mm256_load_ps(w + 240); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
302 const __m256 vk14x89ABCDEF = _mm256_load_ps(w + 248); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
306 const __m256 vi15x01234567 = _mm256_loadu_ps(i15); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
307 const __m256 vi15x89ABCDEF = _mm256_loadu_ps(i15 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
310 const __m256 vk15x01234567 = _mm256_load_ps(w + 256); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
311 const __m256 vk15x89ABCDEF = _mm256_load_ps(w + 264); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
315 const __m256 vi16x01234567 = _mm256_loadu_ps(i16); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
316 const __m256 vi16x89ABCDEF = _mm256_loadu_ps(i16 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
319 const __m256 vk16x01234567 = _mm256_load_ps(w + 272); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
320 const __m256 vk16x89ABCDEF = _mm256_load_ps(w + 280); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
324 const __m256 vi17x01234567 = _mm256_loadu_ps(i17); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
325 const __m256 vi17x89ABCDEF = _mm256_loadu_ps(i17 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
328 const __m256 vk17x01234567 = _mm256_load_ps(w + 288); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
329 const __m256 vk17x89ABCDEF = _mm256_load_ps(w + 296); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
333 const __m256 vi18x01234567 = _mm256_loadu_ps(i18); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
334 const __m256 vi18x89ABCDEF = _mm256_loadu_ps(i18 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
337 const __m256 vk18x01234567 = _mm256_load_ps(w + 304); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
338 const __m256 vk18x89ABCDEF = _mm256_load_ps(w + 312); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
342 const __m256 vi19x01234567 = _mm256_loadu_ps(i19); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
343 const __m256 vi19x89ABCDEF = _mm256_loadu_ps(i19 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
346 const __m256 vk19x01234567 = _mm256_load_ps(w + 320); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
347 const __m256 vk19x89ABCDEF = _mm256_load_ps(w + 328); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
351 const __m256 vi20x01234567 = _mm256_loadu_ps(i20); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
352 const __m256 vi20x89ABCDEF = _mm256_loadu_ps(i20 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
355 const __m256 vk20x01234567 = _mm256_load_ps(w + 336); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
356 const __m256 vk20x89ABCDEF = _mm256_load_ps(w + 344); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
360 const __m256 vi21x01234567 = _mm256_loadu_ps(i21); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
361 const __m256 vi21x89ABCDEF = _mm256_loadu_ps(i21 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
364 const __m256 vk21x01234567 = _mm256_load_ps(w + 352); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
365 const __m256 vk21x89ABCDEF = _mm256_load_ps(w + 360); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
369 const __m256 vi22x01234567 = _mm256_loadu_ps(i22); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
370 const __m256 vi22x89ABCDEF = _mm256_loadu_ps(i22 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
373 const __m256 vk22x01234567 = _mm256_load_ps(w + 368); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
374 const __m256 vk22x89ABCDEF = _mm256_load_ps(w + 376); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
378 const __m256 vi23x01234567 = _mm256_loadu_ps(i23); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
379 const __m256 vi23x89ABCDEF = _mm256_loadu_ps(i23 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
382 const __m256 vk23x01234567 = _mm256_load_ps(w + 384); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
383 const __m256 vk23x89ABCDEF = _mm256_load_ps(w + 392); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
387 const __m256 vi24x01234567 = _mm256_loadu_ps(i24); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
388 const __m256 vi24x89ABCDEF = _mm256_loadu_ps(i24 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
391 const __m256 vk24x01234567 = _mm256_load_ps(w + 400); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
392 const __m256 vk24x89ABCDEF = _mm256_load_ps(w + 408); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
402 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
403 __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
412 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
414 const __m256 vi0x01234567 = _mm256_loadu_ps(i0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
417 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
420 const __m256 vi1x01234567 = _mm256_loadu_ps(i1); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
423 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
424 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
426 const __m256 vi2x01234567 = _mm256_loadu_ps(i2); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
429 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
432 const __m256 vi3x01234567 = _mm256_loadu_ps(i3); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
435 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
438 const __m256 vi4x01234567 = _mm256_loadu_ps(i4); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
441 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
444 const __m256 vi5x01234567 = _mm256_loadu_ps(i5); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
447 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
450 const __m256 vi6x01234567 = _mm256_loadu_ps(i6); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
453 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
456 const __m256 vi7x01234567 = _mm256_loadu_ps(i7); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
459 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
462 const __m256 vi8x01234567 = _mm256_loadu_ps(i8); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
465 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
468 const __m256 vi9x01234567 = _mm256_loadu_ps(i9); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
471 const __m256 vk9x01234567 = _mm256_load_ps(w + 160); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
474 const __m256 vi10x01234567 = _mm256_loadu_ps(i10); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
477 const __m256 vk10x01234567 = _mm256_load_ps(w + 176); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
480 const __m256 vi11x01234567 = _mm256_loadu_ps(i11); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
483 const __m256 vk11x01234567 = _mm256_load_ps(w + 192); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
486 const __m256 vi12x01234567 = _mm256_loadu_ps(i12); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
489 const __m256 vk12x01234567 = _mm256_load_ps(w + 208); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
492 const __m256 vi13x01234567 = _mm256_loadu_ps(i13); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
495 const __m256 vk13x01234567 = _mm256_load_ps(w + 224); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
498 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
501 const __m256 vk14x01234567 = _mm256_load_ps(w + 240); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
504 const __m256 vi15x01234567 = _mm256_loadu_ps(i15); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
507 const __m256 vk15x01234567 = _mm256_load_ps(w + 256); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
510 const __m256 vi16x01234567 = _mm256_loadu_ps(i16); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
513 const __m256 vk16x01234567 = _mm256_load_ps(w + 272); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
516 const __m256 vi17x01234567 = _mm256_loadu_ps(i17); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
519 const __m256 vk17x01234567 = _mm256_load_ps(w + 288); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
522 const __m256 vi18x01234567 = _mm256_loadu_ps(i18); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
525 const __m256 vk18x01234567 = _mm256_load_ps(w + 304); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
528 const __m256 vi19x01234567 = _mm256_loadu_ps(i19); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
531 const __m256 vk19x01234567 = _mm256_load_ps(w + 320); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
534 const __m256 vi20x01234567 = _mm256_loadu_ps(i20); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
537 const __m256 vk20x01234567 = _mm256_load_ps(w + 336); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
540 const __m256 vi21x01234567 = _mm256_loadu_ps(i21); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
543 const __m256 vk21x01234567 = _mm256_load_ps(w + 352); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
546 const __m256 vi22x01234567 = _mm256_loadu_ps(i22); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
549 const __m256 vk22x01234567 = _mm256_load_ps(w + 368); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
552 const __m256 vi23x01234567 = _mm256_loadu_ps(i23); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
555 const __m256 vk23x01234567 = _mm256_load_ps(w + 384); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
558 const __m256 vi24x01234567 = _mm256_loadu_ps(i24); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
561 const __m256 vk24x01234567 = _mm256_load_ps(w + 400); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
569 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
580 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
582 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
583 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
586 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
587 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
588 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
590 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
591 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
594 const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
595 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
598 const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
599 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
602 const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
603 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
606 const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
607 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
610 const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
611 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
614 const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
615 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
618 const __m256 vi9x01234567 = _mm256_maskload_ps(i9, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
619 const __m256 vk9x01234567 = _mm256_load_ps(w + 160); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
622 const __m256 vi10x01234567 = _mm256_maskload_ps(i10, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
623 const __m256 vk10x01234567 = _mm256_load_ps(w + 176); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
626 const __m256 vi11x01234567 = _mm256_maskload_ps(i11, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
627 const __m256 vk11x01234567 = _mm256_load_ps(w + 192); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
630 const __m256 vi12x01234567 = _mm256_maskload_ps(i12, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
631 const __m256 vk12x01234567 = _mm256_load_ps(w + 208); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
634 const __m256 vi13x01234567 = _mm256_maskload_ps(i13, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
635 const __m256 vk13x01234567 = _mm256_load_ps(w + 224); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
638 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
639 const __m256 vk14x01234567 = _mm256_load_ps(w + 240); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
642 const __m256 vi15x01234567 = _mm256_maskload_ps(i15, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
643 const __m256 vk15x01234567 = _mm256_load_ps(w + 256); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
646 const __m256 vi16x01234567 = _mm256_maskload_ps(i16, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
647 const __m256 vk16x01234567 = _mm256_load_ps(w + 272); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
650 const __m256 vi17x01234567 = _mm256_maskload_ps(i17, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
651 const __m256 vk17x01234567 = _mm256_load_ps(w + 288); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
654 const __m256 vi18x01234567 = _mm256_maskload_ps(i18, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
655 const __m256 vk18x01234567 = _mm256_load_ps(w + 304); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
658 const __m256 vi19x01234567 = _mm256_maskload_ps(i19, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
659 const __m256 vk19x01234567 = _mm256_load_ps(w + 320); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
662 const __m256 vi20x01234567 = _mm256_maskload_ps(i20, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
663 const __m256 vk20x01234567 = _mm256_load_ps(w + 336); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
666 const __m256 vi21x01234567 = _mm256_maskload_ps(i21, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
667 const __m256 vk21x01234567 = _mm256_load_ps(w + 352); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
670 const __m256 vi22x01234567 = _mm256_maskload_ps(i22, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
671 const __m256 vk22x01234567 = _mm256_load_ps(w + 368); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
674 const __m256 vi23x01234567 = _mm256_maskload_ps(i23, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
675 const __m256 vk23x01234567 = _mm256_load_ps(w + 384); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
678 const __m256 vi24x01234567 = _mm256_maskload_ps(i24, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
679 const __m256 vk24x01234567 = _mm256_load_ps(w + 400); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
685 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()