Lines Matching refs:__m256
34 const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
35 const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
88 __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
91 const __m256 vi0x01234567 = _mm256_loadu_ps(i0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
92 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
95 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
96 const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
100 const __m256 vi1x01234567 = _mm256_loadu_ps(i1); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
101 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
104 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
105 const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
106 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
107 __m256 vacc89ABCDEFp1 = _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
109 const __m256 vi2x01234567 = _mm256_loadu_ps(i2); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
110 const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
113 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
114 const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
118 const __m256 vi3x01234567 = _mm256_loadu_ps(i3); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
119 const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
122 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
123 const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
127 const __m256 vi4x01234567 = _mm256_loadu_ps(i4); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
128 const __m256 vi4x89ABCDEF = _mm256_loadu_ps(i4 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
131 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
132 const __m256 vk4x89ABCDEF = _mm256_load_ps(w + 88); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
136 const __m256 vi5x01234567 = _mm256_loadu_ps(i5); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
137 const __m256 vi5x89ABCDEF = _mm256_loadu_ps(i5 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
140 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
141 const __m256 vk5x89ABCDEF = _mm256_load_ps(w + 104); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
145 const __m256 vi6x01234567 = _mm256_loadu_ps(i6); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
146 const __m256 vi6x89ABCDEF = _mm256_loadu_ps(i6 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
149 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
150 const __m256 vk6x89ABCDEF = _mm256_load_ps(w + 120); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
154 const __m256 vi7x01234567 = _mm256_loadu_ps(i7); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
155 const __m256 vi7x89ABCDEF = _mm256_loadu_ps(i7 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
158 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
159 const __m256 vk7x89ABCDEF = _mm256_load_ps(w + 136); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
163 const __m256 vi8x01234567 = _mm256_loadu_ps(i8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
164 const __m256 vi8x89ABCDEF = _mm256_loadu_ps(i8 + 8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
167 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
168 const __m256 vk8x89ABCDEF = _mm256_load_ps(w + 152); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
178 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
179 __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
188 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
190 const __m256 vi0x01234567 = _mm256_loadu_ps(i0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
193 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
196 const __m256 vi1x01234567 = _mm256_loadu_ps(i1); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
199 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
200 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
202 const __m256 vi2x01234567 = _mm256_loadu_ps(i2); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
205 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
208 const __m256 vi3x01234567 = _mm256_loadu_ps(i3); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
211 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
214 const __m256 vi4x01234567 = _mm256_loadu_ps(i4); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
217 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
220 const __m256 vi5x01234567 = _mm256_loadu_ps(i5); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
223 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
226 const __m256 vi6x01234567 = _mm256_loadu_ps(i6); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
229 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
232 const __m256 vi7x01234567 = _mm256_loadu_ps(i7); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
235 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
238 const __m256 vi8x01234567 = _mm256_loadu_ps(i8); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
241 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
249 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
260 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
262 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
263 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
266 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
267 const __m256 vk1x01234567 = _mm256_load_ps(w + 32); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
268 __m256 vacc01234567p1 = _mm256_mul_ps(vi1x01234567, vk1x01234567); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
270 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
271 const __m256 vk2x01234567 = _mm256_load_ps(w + 48); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
274 const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
275 const __m256 vk3x01234567 = _mm256_load_ps(w + 64); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
278 const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
279 const __m256 vk4x01234567 = _mm256_load_ps(w + 80); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
282 const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
283 const __m256 vk5x01234567 = _mm256_load_ps(w + 96); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
286 const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
287 const __m256 vk6x01234567 = _mm256_load_ps(w + 112); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
290 const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
291 const __m256 vk7x01234567 = _mm256_load_ps(w + 128); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
294 const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
295 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
301 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()