// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6 #include <assert.h>
7
8 #include <immintrin.h>
9
10 #include <xnnpack/common.h>
11 #include <xnnpack/gavgpool.h>
12 #include <xnnpack/intrinsics-polyfill.h>
13 #include <xnnpack/math.h>
14 #include <xnnpack/maxpool.h>
15 #include <xnnpack/prelu.h>
16 #include <xnnpack/vbinary.h>
17 #include <xnnpack/vcvt.h>
18 #include <xnnpack/vunary.h>
19
20
xnn_f16_f32_vcvt_ukernel__f16c_x16(size_t n,const void * input,float * output,const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])21 void xnn_f16_f32_vcvt_ukernel__f16c_x16(
22 size_t n,
23 const void* input,
24 float* output,
25 const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
26 {
27 assert(n != 0);
28 assert(n % sizeof(uint16_t) == 0);
29 assert(input != NULL);
30 assert(output != NULL);
31
32 const uint16_t* i = (const uint16_t*) input;
33 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
34 const __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
35 const __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
36 i += 16;
37
38 _mm256_storeu_ps(output, vacc0);
39 _mm256_storeu_ps(output + 8, vacc1);
40 output += 16;
41 }
42 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
43 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
44 i += 8;
45
46 _mm256_storeu_ps(output, vacc);
47 output += 8;
48 }
49 if XNN_UNLIKELY(n != 0) {
50 assert(n >= 1 * sizeof(uint16_t));
51 assert(n <= 7 * sizeof(uint16_t));
52 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
53
54 __m128 vacc_lo = _mm256_castps256_ps128(vacc);
55 if (n & (4 * sizeof(uint16_t))) {
56 _mm_storeu_ps(output, vacc_lo);
57 vacc_lo = _mm256_extractf128_ps(vacc, 1);
58 output += 4;
59 }
60 if (n & (2 * sizeof(uint16_t))) {
61 _mm_storel_pi((__m64*) output, vacc_lo);
62 vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
63 output += 2;
64 }
65 if (n & (1 * sizeof(uint16_t))) {
66 _mm_store_ss(output, vacc_lo);
67 }
68 }
69 }
70
xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(size_t rows,size_t channels,const void * input,size_t input_stride,const void * zero,void * buffer,void * output,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])71 void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
72 size_t rows,
73 size_t channels,
74 const void* input,
75 size_t input_stride,
76 const void* zero,
77 void* buffer,
78 void* output,
79 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
80 {
81 assert(rows > 7);
82 assert(channels != 0);
83
84 const uint16_t* i0 = input;
85 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
86 const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
87 const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
88 const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
89 const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
90 const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
91 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
92
93 uint16_t* b = buffer;
94 size_t c = channels;
95 for (; c != 0; c = doz(c, 8)) {
96 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
97 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
98
99 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
100 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
101
102 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
103 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
104 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
105 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
106 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
107 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
108 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
109 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
110 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
111
112 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
113 }
114
115 for (rows -= 7; rows > 7; rows -= 7) {
116 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
117 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
118 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
119 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
120 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
121 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
122 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
123
124 uint16_t* b = buffer;
125 size_t c = channels;
126 for (; c != 0; c = doz(c, 8)) {
127 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
128
129 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
130
131 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
132 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
133 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
134 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
135 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
136 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
137 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
138 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
139 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
140 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
141 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
142 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
143 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
144
145 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
146 }
147 }
148
149 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
150 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
151 if XNN_UNPREDICTABLE(rows < 2) {
152 i1 = (const uint16_t*) zero;
153 }
154 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
155 if XNN_UNPREDICTABLE(rows <= 2) {
156 i2 = (const uint16_t*) zero;
157 }
158 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
159 if XNN_UNPREDICTABLE(rows < 4) {
160 i3 = (const uint16_t*) zero;
161 }
162 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
163 if XNN_UNPREDICTABLE(rows <= 4) {
164 i4 = (const uint16_t*) zero;
165 }
166 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
167 if XNN_UNPREDICTABLE(rows < 6) {
168 i5 = (const uint16_t*) zero;
169 }
170 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
171 if XNN_UNPREDICTABLE(rows <= 6) {
172 i6 = (const uint16_t*) zero;
173 }
174
175 const __m256 vscale = _mm256_load_ps(params->avx.scale);
176 const __m256 vmin = _mm256_load_ps(params->avx.min);
177 const __m256 vmax = _mm256_load_ps(params->avx.max);
178 for (; channels >= 8; channels -= 8) {
179 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
180
181 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
182
183 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
184 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
185 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
186 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
187 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
188 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
189 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
190 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
191 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
192 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
193 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
194 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
195 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
196
197 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
198
199 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
200
201 vout01234567 = _mm256_min_ps(vout01234567, vmax);
202
203 _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
204 output = (uint16_t*) output + 8;
205 }
206 if XNN_UNLIKELY(channels != 0) {
207 {
208 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
209
210 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
211 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
212 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
213 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
214 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
215 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
216 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
217 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
218 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
219 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
220 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
221 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
222 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
223 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
224
225 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
226 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
227 vout01234567 = _mm256_min_ps(vout01234567, vmax);
228
229 __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
230 if (channels & 4) {
231 _mm_storel_epi64((__m128i*) output, vh01234567);
232 output = (uint16_t*) output + 4;
233 vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
234 }
235 if (channels & 2) {
236 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
237 output = (uint16_t*) output + 2;
238 vh01234567 = _mm_srli_epi64(vh01234567, 32);
239 }
240 if (channels & 1) {
241 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
242 }
243 }
244 }
245 }
246
xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(size_t rows,size_t channels,const void * input,size_t input_stride,const void * zero,void * output,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])247 void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
248 size_t rows,
249 size_t channels,
250 const void* input,
251 size_t input_stride,
252 const void* zero,
253 void* output,
254 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
255 {
256 assert(rows != 0);
257 assert(rows <= 7);
258 assert(channels != 0);
259
260 const uint16_t* i0 = input;
261 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
262 if XNN_UNPREDICTABLE(rows < 2) {
263 i1 = (const uint16_t*) zero;
264 }
265 const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
266 if XNN_UNPREDICTABLE(rows <= 2) {
267 i2 = (const uint16_t*) zero;
268 }
269 const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
270 if XNN_UNPREDICTABLE(rows < 4) {
271 i3 = (const uint16_t*) zero;
272 }
273 const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
274 if XNN_UNPREDICTABLE(rows <= 4) {
275 i4 = (const uint16_t*) zero;
276 }
277 const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
278 if XNN_UNPREDICTABLE(rows < 6) {
279 i5 = (const uint16_t*) zero;
280 }
281 const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
282 if XNN_UNPREDICTABLE(rows <= 6) {
283 i6 = (const uint16_t*) zero;
284 }
285
286 const __m256 vscale = _mm256_load_ps(params->avx.scale);
287 const __m256 vmin = _mm256_load_ps(params->avx.min);
288 const __m256 vmax = _mm256_load_ps(params->avx.max);
289 for (; channels >= 8; channels -= 8) {
290 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
291 i0 += 8;
292 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
293 i1 += 8;
294
295 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
296 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
297 i2 += 8;
298
299 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
300 i3 += 8;
301 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
302 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
303 i4 += 8;
304 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
305 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
306 i5 += 8;
307 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
308 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
309 i6 += 8;
310 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
311 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
312
313 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
314
315 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
316
317 vout01234567 = _mm256_min_ps(vout01234567, vmax);
318
319 _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
320 output = (uint16_t*) output + 8;
321 }
322 if XNN_UNLIKELY(channels != 0) {
323 {
324 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
325 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
326
327 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
328 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
329
330 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
331 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
332 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
333 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
334 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
335 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
336 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
337 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
338 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
339
340 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
341 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
342 vout01234567 = _mm256_min_ps(vout01234567, vmax);
343
344 __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
345 if (channels & 4) {
346 _mm_storel_epi64((__m128i*) output, vh01234567);
347 output = (uint16_t*) output + 4;
348 vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
349 }
350 if (channels & 2) {
351 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
352 output = (uint16_t*) output + 2;
353 vh01234567 = _mm_srli_epi64(vh01234567, 32);
354 }
355 if (channels & 1) {
356 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
357 }
358 }
359 }
360 }
361
xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])362 void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
363 size_t output_pixels,
364 size_t kernel_elements,
365 size_t channels,
366 const void** input,
367 size_t input_offset,
368 void* output,
369 size_t input_increment,
370 size_t output_increment,
371 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
372 {
373 assert(output_pixels != 0);
374 assert(kernel_elements != 0);
375 assert(channels != 0);
376
377 const __m256 voutput_min = _mm256_load_ps(params->avx.min);
378 const __m256 voutput_max = _mm256_load_ps(params->avx.max);
379 do {
380 uint16_t* o = output;
381 {
382 const uint16_t* i0 = *input++;
383 const uint16_t* i1 = *input++;
384 const uint16_t* i2 = *input++;
385 const uint16_t* i3 = *input++;
386 const uint16_t* i4 = *input++;
387 const uint16_t* i5 = *input++;
388 const uint16_t* i6 = *input++;
389 const uint16_t* i7 = *input++;
390 const uint16_t* i8 = *input++;
391 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
392 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
393 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
394 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
395 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
396 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
397 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
398 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
399 i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
400 if (kernel_elements < 2) {
401 i1 = i0;
402 }
403 if (kernel_elements <= 2) {
404 i2 = i0;
405 }
406 if (kernel_elements < 4) {
407 i3 = i0;
408 }
409 if (kernel_elements <= 4) {
410 i4 = i0;
411 }
412 if (kernel_elements < 6) {
413 i5 = i0;
414 }
415 if (kernel_elements <= 6) {
416 i6 = i0;
417 }
418 if (kernel_elements < 8) {
419 i7 = i0;
420 }
421 if (kernel_elements <= 8) {
422 i8 = i0;
423 }
424
425 size_t c = channels;
426 for (; c >= 8; c -= 8) {
427 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
428 i0 += 8;
429 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
430 i1 += 8;
431 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
432 i2 += 8;
433 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
434 i3 += 8;
435 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
436 i4 += 8;
437 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
438 i5 += 8;
439 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
440 i6 += 8;
441 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
442 i7 += 8;
443 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
444 i8 += 8;
445
446 const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
447 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
448 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
449 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
450
451 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
452 const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
453 const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
454 const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
455
456 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
457 o += 8;
458 }
459 if (c != 0) {
460 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
461 i0 += 8;
462 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
463 i1 += 8;
464 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
465 i2 += 8;
466 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
467 i3 += 8;
468 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
469 i4 += 8;
470 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
471 i5 += 8;
472 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
473 i6 += 8;
474 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
475 i7 += 8;
476 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
477 i8 += 8;
478
479 const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
480 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
481 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
482 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
483
484 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
485 const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
486 const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
487 __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
488
489 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
490 if (c & 4) {
491 _mm_storel_epi64((__m128i*) o, vh);
492 vh = _mm_unpackhi_epi64(vh, vh);
493 o += 4;
494 }
495 if (c & 2) {
496 *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
497 vh = _mm_srli_epi64(vh, 32);
498 o += 2;
499 }
500 if (c & 1) {
501 *o = _mm_extract_epi16(vh, 0);
502 o += 1;
503 }
504 }
505 }
506
507 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
508 const uint16_t* i0 = *input++;
509 const uint16_t* i1 = *input++;
510 const uint16_t* i2 = *input++;
511 const uint16_t* i3 = *input++;
512 const uint16_t* i4 = *input++;
513 const uint16_t* i5 = *input++;
514 const uint16_t* i6 = *input++;
515 const uint16_t* i7 = *input++;
516 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
517 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
518 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
519 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
520 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
521 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
522 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
523 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
524 if (k < 2) {
525 i1 = i0;
526 }
527 if (k <= 2) {
528 i2 = i0;
529 }
530 if (k < 4) {
531 i3 = i0;
532 }
533 if (k <= 4) {
534 i4 = i0;
535 }
536 if (k < 6) {
537 i5 = i0;
538 }
539 if (k <= 6) {
540 i6 = i0;
541 }
542 if (k < 8) {
543 i7 = i0;
544 }
545
546 o = output;
547 size_t c = channels;
548 for (; c >= 8; c -= 8) {
549 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
550 i0 += 8;
551 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
552 i1 += 8;
553 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
554 i2 += 8;
555 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
556 i3 += 8;
557 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
558 i4 += 8;
559 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
560 i5 += 8;
561 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
562 i6 += 8;
563 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
564 i7 += 8;
565 const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
566
567 const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
568 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
569 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
570 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
571
572 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
573 const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
574 const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
575 const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
576
577 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
578 o += 8;
579 }
580 if (c != 0) {
581 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
582 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
583 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
584 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
585 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
586 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
587 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
588 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
589 const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
590
591 const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
592 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
593 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
594 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
595
596 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
597 const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
598 const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
599 __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
600
601 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
602 if (c & 4) {
603 _mm_storel_epi64((__m128i*) o, vh);
604 vh = _mm_unpackhi_epi64(vh, vh);
605 o += 4;
606 }
607 if (c & 2) {
608 *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
609 vh = _mm_srli_epi64(vh, 32);
610 o += 2;
611 }
612 if (c & 1) {
613 *o = _mm_extract_epi16(vh, 0);
614 o += 1;
615 }
616 }
617 }
618 input = (const void**) ((uintptr_t) input + input_increment);
619 output = (uint16_t*) ((uintptr_t) o + output_increment);
620 } while (--output_pixels != 0);
621 }
622
xnn_f16_prelu_ukernel__f16c_2x16(size_t rows,size_t channels,const void * restrict input,size_t input_stride,const void * restrict weights,void * restrict output,size_t output_stride)623 void xnn_f16_prelu_ukernel__f16c_2x16(
624 size_t rows,
625 size_t channels,
626 const void* restrict input,
627 size_t input_stride,
628 const void* restrict weights,
629 void* restrict output,
630 size_t output_stride) XNN_OOB_READS
631 {
632 assert(rows != 0);
633 assert(channels != 0);
634 assert(channels % sizeof(uint16_t) == 0);
635
636 const uint16_t* i0 = (const uint16_t*) input;
637 uint16_t* o0 = (uint16_t*) output;
638 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
639 uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
640
641 const size_t input_increment = input_stride * 2 - channels;
642 const size_t output_increment = output_stride * 2 - channels;
643
644 do {
645 if XNN_UNPREDICTABLE(rows < 2) {
646 i1 = i0;
647 o1 = o0;
648 }
649
650 const uint16_t* w = (const uint16_t*) weights;
651 size_t c = channels;
652 for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) {
653 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
654 const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
655 w += 16;
656
657 const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
658 const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
659 i0 += 16;
660 const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
661 const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
662 i1 += 16;
663
664 __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567);
665 __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF);
666 __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567);
667 __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF);
668
669 vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567);
670 vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF);
671 vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567);
672 vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF);
673
674 _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
675 _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_NO_EXC));
676 _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
677 o0 += 16;
678 _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
679 _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_NO_EXC));
680 _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
681 o1 += 16;
682 }
683 for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
684 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
685 w += 8;
686
687 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
688 i0 += 8;
689 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
690 i1 += 8;
691
692 __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
693 __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
694
695 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
696 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
697
698 _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
699 o0 += 8;
700 _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
701 o1 += 8;
702 }
703 if XNN_UNLIKELY(c != 0) {
704 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
705
706 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
707 i0 = (const uint16_t*) ((uintptr_t) i0 + c);
708 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
709 i1 = (const uint16_t*) ((uintptr_t) i1 + c);
710
711 __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
712 __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
713
714 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
715 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
716
717 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
718 __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
719 if (c & (4 * sizeof(uint16_t))) {
720 _mm_storel_epi64((__m128i*) o0, vh0x01234567);
721 _mm_storel_epi64((__m128i*) o1, vh1x01234567);
722
723 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
724 vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
725
726 o0 += 4;
727 o1 += 4;
728 }
729 if (c & (2 * sizeof(uint16_t))) {
730 *((uint32_t*) o0) = (uint32_t) _mm_cvtsi128_si32(vh0x01234567);
731 *((uint32_t*) o1) = (uint32_t) _mm_cvtsi128_si32(vh1x01234567);
732
733 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
734 vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
735
736 o0 += 2;
737 o1 += 2;
738 }
739 if (c & (1 * sizeof(uint16_t))) {
740 *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
741 *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
742
743 o0 += 1;
744 o1 += 1;
745 }
746 }
747 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
748 o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
749 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
750 o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
751 rows = doz(rows, 2);
752 } while (rows != 0);
753 }
754
xnn_f16_vadd_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])755 void xnn_f16_vadd_minmax_ukernel__f16c_x16(
756 size_t n,
757 const void* restrict a_ptr,
758 const void* restrict b_ptr,
759 void* restrict y_ptr,
760 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
761 {
762 assert(n != 0);
763 assert(n % sizeof(uint16_t) == 0);
764 assert(a_ptr != NULL);
765 assert(b_ptr != NULL);
766 assert(y_ptr != NULL);
767
768 const uint16_t* a = (const uint16_t*) a_ptr;
769 const uint16_t* b = (const uint16_t*) b_ptr;
770 uint16_t* y = (uint16_t*) y_ptr;
771
772 const __m256 vy_min = _mm256_load_ps(params->avx.min);
773 const __m256 vy_max = _mm256_load_ps(params->avx.max);
774
775 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
776 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
777 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
778 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
779 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
780 a += 16;
781 b += 16;
782
783 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
784 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
785
786
787 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
788 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
789
790 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
791 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
792
793 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
794 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
795 y += 16;
796 }
797 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
798 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
799 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
800 a += 8;
801 b += 8;
802
803 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
804
805 vy = _mm256_max_ps(vy, vy_min);
806 vy = _mm256_min_ps(vy, vy_max);
807
808 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
809 y += 8;
810 }
811 if XNN_UNLIKELY(n != 0) {
812 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
813 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
814
815 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
816
817 vy = _mm256_max_ps(vy, vy_min);
818 vy = _mm256_min_ps(vy, vy_max);
819
820 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
821 if (n & (4 * sizeof(uint16_t))) {
822 _mm_storel_epi64((__m128i*) y, vh);
823 vh = _mm_unpackhi_epi64(vh, vh);
824 y += 4;
825 }
826 if (n & (2 * sizeof(uint16_t))) {
827 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
828 vh = _mm_srli_epi64(vh, 32);
829 y += 2;
830 }
831 if (n & (1 * sizeof(uint16_t))) {
832 *y = (uint16_t) _mm_extract_epi16(vh, 0);
833 }
834 }
835 }
836
xnn_f16_vaddc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])837 void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
838 size_t n,
839 const void* restrict a_ptr,
840 const void* restrict b_ptr,
841 void* restrict y_ptr,
842 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
843 {
844 assert(n != 0);
845 assert(n % sizeof(uint16_t) == 0);
846 assert(a_ptr != NULL);
847 assert(b_ptr != NULL);
848 assert(y_ptr != NULL);
849
850 const uint16_t* a = (const uint16_t*) a_ptr;
851 const uint16_t* b = (const uint16_t*) b_ptr;
852 uint16_t* y = (uint16_t*) y_ptr;
853
854 const __m256 vy_min = _mm256_load_ps(params->avx.min);
855 const __m256 vy_max = _mm256_load_ps(params->avx.max);
856
857 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
858 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
859 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
860 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
861 a += 16;
862
863 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
864 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
865
866
867 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
868 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
869
870 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
871 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
872
873 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
874 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
875 y += 16;
876 }
877 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
878 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
879 a += 8;
880
881 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
882
883 vy = _mm256_max_ps(vy, vy_min);
884 vy = _mm256_min_ps(vy, vy_max);
885
886 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
887 y += 8;
888 }
889 if XNN_UNLIKELY(n != 0) {
890 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
891
892 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
893
894 vy = _mm256_max_ps(vy, vy_min);
895 vy = _mm256_min_ps(vy, vy_max);
896
897 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
898 if (n & (4 * sizeof(uint16_t))) {
899 _mm_storel_epi64((__m128i*) y, vh);
900 vh = _mm_unpackhi_epi64(vh, vh);
901 y += 4;
902 }
903 if (n & (2 * sizeof(uint16_t))) {
904 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
905 vh = _mm_srli_epi64(vh, 32);
906 y += 2;
907 }
908 if (n & (1 * sizeof(uint16_t))) {
909 *y = (uint16_t) _mm_extract_epi16(vh, 0);
910 }
911 }
912 }
913
xnn_f16_vmul_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])914 void xnn_f16_vmul_minmax_ukernel__f16c_x16(
915 size_t n,
916 const void* restrict a_ptr,
917 const void* restrict b_ptr,
918 void* restrict y_ptr,
919 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
920 {
921 assert(n != 0);
922 assert(n % sizeof(uint16_t) == 0);
923 assert(a_ptr != NULL);
924 assert(b_ptr != NULL);
925 assert(y_ptr != NULL);
926
927 const uint16_t* a = (const uint16_t*) a_ptr;
928 const uint16_t* b = (const uint16_t*) b_ptr;
929 uint16_t* y = (uint16_t*) y_ptr;
930
931 const __m256 vy_min = _mm256_load_ps(params->avx.min);
932 const __m256 vy_max = _mm256_load_ps(params->avx.max);
933
934 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
935 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
936 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
937 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
938 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
939 a += 16;
940 b += 16;
941
942 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
943 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
944
945
946 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
947 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
948
949 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
950 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
951
952 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
953 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
954 y += 16;
955 }
956 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
957 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
958 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
959 a += 8;
960 b += 8;
961
962 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
963
964 vy = _mm256_max_ps(vy, vy_min);
965 vy = _mm256_min_ps(vy, vy_max);
966
967 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
968 y += 8;
969 }
970 if XNN_UNLIKELY(n != 0) {
971 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
972 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
973
974 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
975
976 vy = _mm256_max_ps(vy, vy_min);
977 vy = _mm256_min_ps(vy, vy_max);
978
979 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
980 if (n & (4 * sizeof(uint16_t))) {
981 _mm_storel_epi64((__m128i*) y, vh);
982 vh = _mm_unpackhi_epi64(vh, vh);
983 y += 4;
984 }
985 if (n & (2 * sizeof(uint16_t))) {
986 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
987 vh = _mm_srli_epi64(vh, 32);
988 y += 2;
989 }
990 if (n & (1 * sizeof(uint16_t))) {
991 *y = (uint16_t) _mm_extract_epi16(vh, 0);
992 }
993 }
994 }
995
xnn_f16_vmulc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])996 void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
997 size_t n,
998 const void* restrict a_ptr,
999 const void* restrict b_ptr,
1000 void* restrict y_ptr,
1001 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1002 {
1003 assert(n != 0);
1004 assert(n % sizeof(uint16_t) == 0);
1005 assert(a_ptr != NULL);
1006 assert(b_ptr != NULL);
1007 assert(y_ptr != NULL);
1008
1009 const uint16_t* a = (const uint16_t*) a_ptr;
1010 const uint16_t* b = (const uint16_t*) b_ptr;
1011 uint16_t* y = (uint16_t*) y_ptr;
1012
1013 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1014 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1015
1016 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1017 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1018 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1019 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1020 a += 16;
1021
1022 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1023 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1024
1025
1026 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1027 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1028
1029 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1030 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1031
1032 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1033 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1034 y += 16;
1035 }
1036 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1037 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1038 a += 8;
1039
1040 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1041
1042 vy = _mm256_max_ps(vy, vy_min);
1043 vy = _mm256_min_ps(vy, vy_max);
1044
1045 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1046 y += 8;
1047 }
1048 if XNN_UNLIKELY(n != 0) {
1049 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1050
1051 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1052
1053 vy = _mm256_max_ps(vy, vy_min);
1054 vy = _mm256_min_ps(vy, vy_max);
1055
1056 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1057 if (n & (4 * sizeof(uint16_t))) {
1058 _mm_storel_epi64((__m128i*) y, vh);
1059 vh = _mm_unpackhi_epi64(vh, vh);
1060 y += 4;
1061 }
1062 if (n & (2 * sizeof(uint16_t))) {
1063 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
1064 vh = _mm_srli_epi64(vh, 32);
1065 y += 2;
1066 }
1067 if (n & (1 * sizeof(uint16_t))) {
1068 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1069 }
1070 }
1071 }
1072
xnn_f16_vhswish_ukernel__f16c_x16(size_t n,const void * restrict x_ptr,void * restrict y_ptr,const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS (1)])1073 void xnn_f16_vhswish_ukernel__f16c_x16(
1074 size_t n,
1075 const void* restrict x_ptr,
1076 void* restrict y_ptr,
1077 const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1078 {
1079 assert(n != 0);
1080 assert(n % sizeof(uint16_t) == 0);
1081
1082 const uint16_t* x = (const uint16_t*) x_ptr;
1083 uint16_t* y = (uint16_t*) y_ptr;
1084
1085 const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
1086 const __m256 vthree = _mm256_load_ps(params->avx.three);
1087 const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
1088 const __m128i vzero = _mm_setzero_si128();
1089
1090 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1091 __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1092 __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
1093 x += 16;
1094
1095 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
1096 vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
1097 __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
1098 vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
1099
1100 vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
1101 vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
1102
1103 vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
1104 vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
1105
1106 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
1107 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
1108
1109 _mm_storeu_si128((__m128i*) y, vacc01234567);
1110 _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
1111 y += 16;
1112 }
1113 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1114 __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1115 x += 8;
1116 __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
1117 vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
1118 vacc = _mm_max_epi16(vacc, vzero);
1119 vacc = _mm_min_epi16(vacc, vsix);
1120 vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
1121 _mm_storeu_si128((__m128i*) y, vacc);
1122 y += 8;
1123 }
1124 if XNN_UNLIKELY(n != 0) {
1125 __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1126 __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
1127 vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
1128 vacc = _mm_max_epi16(vacc, vzero);
1129 vacc = _mm_min_epi16(vacc, vsix);
1130 vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
1131
1132 if (n & (4 * sizeof(uint16_t))) {
1133 _mm_storel_epi64((__m128i*) y, vacc);
1134 vacc = _mm_unpackhi_epi64(vacc, vacc);
1135 y += 4;
1136 }
1137 if (n & (2 * sizeof(uint16_t))) {
1138 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
1139 vacc = _mm_srli_epi64(vacc, 32);
1140 y += 2;
1141 }
1142 if (n & (1 * sizeof(uint16_t))) {
1143 *y = (uint16_t) _mm_extract_epi16(vacc, 0);
1144 }
1145 }
1146 }
1147
xnn_f32_f16_vcvt_ukernel__f16c_x16(size_t n,const float * input,void * output,const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])1148 void xnn_f32_f16_vcvt_ukernel__f16c_x16(
1149 size_t n,
1150 const float* input,
1151 void* output,
1152 const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
1153 {
1154 assert(n != 0);
1155 assert(n % sizeof(float) == 0);
1156 assert(input != NULL);
1157 assert(output != NULL);
1158
1159 uint16_t* o = (uint16_t*) output;
1160 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
1161 const __m256 vf0 = _mm256_loadu_ps(input);
1162 const __m256 vf1 = _mm256_loadu_ps(input + 8);
1163 input += 16;
1164
1165 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
1166 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
1167 o += 16;
1168 }
1169 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1170 const __m256 vf = _mm256_loadu_ps(input);
1171 input += 8;
1172
1173 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
1174 o += 8;
1175 }
1176 if XNN_UNLIKELY(n != 0) {
1177 assert(n >= 1 * sizeof(float));
1178 assert(n <= 7 * sizeof(float));
1179 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->f16c.mask_table[7] - n));
1180
1181 const __m256 vf = _mm256_maskload_ps(input, vmask);
1182
1183 __m128 vf_lo = _mm256_castps256_ps128(vf);
1184 if (n & (4 * sizeof(float))) {
1185 _mm_storel_epi64((__m128i*) o, _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC));
1186 vf_lo = _mm256_extractf128_ps(vf, 1);
1187 o += 4;
1188 }
1189 __m128i vh = _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC);
1190 if (n & (2 * sizeof(float))) {
1191 _mm_storeu_si32(o, vh);
1192 vh = _mm_srli_epi64(vh, 32);
1193 o += 2;
1194 }
1195 if (n & (1 * sizeof(float))) {
1196 *((uint16_t*) o) = _mm_extract_epi16(vh, 0);
1197 }
1198 }
1199 }
1200