1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <immintrin.h>
9 
10 #include <xnnpack/common.h>
11 #include <xnnpack/gavgpool.h>
12 #include <xnnpack/intrinsics-polyfill.h>
13 #include <xnnpack/math.h>
14 #include <xnnpack/maxpool.h>
15 #include <xnnpack/prelu.h>
16 #include <xnnpack/vbinary.h>
17 #include <xnnpack/vcvt.h>
18 #include <xnnpack/vunary.h>
19 
20 
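// Editor's note: the kernels below are XNNPACK F16C micro-kernels. They keep tensor data
// in IEEE half precision (stored as uint16_t), widen 8-element groups to fp32 with
// _mm256_cvtph_ps, do the arithmetic in fp32, and narrow results back with _mm256_cvtps_ph.
// The file covers f16<->f32 conversion, global average pooling, max pooling, PReLU,
// elementwise add/multiply (vector and scalar-broadcast variants), and hardswish.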
21 void xnn_f16_f32_vcvt_ukernel__f16c_x16(
22     size_t n,
23     const void* input,
24     float* output,
25     const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
26 {
27   assert(n != 0);
28   assert(n % sizeof(uint16_t) == 0);
29   assert(input != NULL);
30   assert(output != NULL);
31 
32   const uint16_t* i = (const uint16_t*) input;
33   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
34     const __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
35     const __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
36     i += 16;
37 
38     _mm256_storeu_ps(output, vacc0);
39     _mm256_storeu_ps(output + 8, vacc1);
40     output += 16;
41   }
42   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
43     const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
44     i += 8;
45 
46     _mm256_storeu_ps(output, vacc);
47     output += 8;
48   }
49   if XNN_UNLIKELY(n != 0) {
50     assert(n >= 1 * sizeof(uint16_t));
51     assert(n <= 7 * sizeof(uint16_t));
52     const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
53 
54     __m128 vacc_lo = _mm256_castps256_ps128(vacc);
55     if (n & (4 * sizeof(uint16_t))) {
56       _mm_storeu_ps(output, vacc_lo);
57       vacc_lo = _mm256_extractf128_ps(vacc, 1);
58       output += 4;
59     }
60     if (n & (2 * sizeof(uint16_t))) {
61       _mm_storel_pi((__m64*) output, vacc_lo);
62       vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
63       output += 2;
64     }
65     if (n & (1 * sizeof(uint16_t))) {
66       _mm_store_ss(output, vacc_lo);
67     }
68   }
69 }
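// Illustrative usage sketch (not part of the library): converting 16 half-precision values
// to fp32 with the kernel above. The batch size is passed in bytes of input, as the
// assertions require. This F16C variant never reads its params argument, so an
// uninitialized params union is passed only to satisfy the signature; note also the
// XNN_OOB_READS annotation, meaning short tails may be over-read, so buffers here are
// sized to a full multiple of 8 elements.
static void example_f16_to_f32(const uint16_t fp16_in[16], float fp32_out[16]) {
  union xnn_f16_f32_cvt_params cvt_params;  // ignored by this variant (see the body above)
  xnn_f16_f32_vcvt_ukernel__f16c_x16(16 * sizeof(uint16_t), fp16_in, fp32_out, &cvt_params);
}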
70 
71 void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
72     size_t rows,
73     size_t channels,
74     const void* input,
75     size_t input_stride,
76     const void* zero,
77     void* buffer,
78     void* output,
79     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
80 {
81   assert(rows > 7);
82   assert(channels != 0);
83 
84   const uint16_t* i0 = input;
85   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
86   const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
87   const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
88   const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
89   const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
90   const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
91   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
92 
93   uint16_t* b = buffer;
94   size_t c = channels;
95   for (; c != 0; c = doz(c, 8)) {
96     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
97     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
98 
99     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
100     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
101 
102     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
103     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
104     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
105     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
106     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
107     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
108     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
109     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
110     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
111 
112     _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
113   }
114 
115   for (rows -= 7; rows > 7; rows -= 7) {
116     i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
117     i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
118     i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
119     i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
120     i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
121     i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
122     i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
123 
124     uint16_t* b = buffer;
125     size_t c = channels;
126     for (; c != 0; c = doz(c, 8)) {
127       __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
128 
129       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
130 
131       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
132       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
133       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
134       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
135       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
136       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
137       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
138       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
139       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
140       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
141       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
142       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
143       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
144 
145       _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
146     }
147   }
148 
149   i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
150   i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
151   if XNN_UNPREDICTABLE(rows < 2) {
152     i1 = (const uint16_t*) zero;
153   }
154   i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
155   if XNN_UNPREDICTABLE(rows <= 2) {
156     i2 = (const uint16_t*) zero;
157   }
158   i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
159   if XNN_UNPREDICTABLE(rows < 4) {
160     i3 = (const uint16_t*) zero;
161   }
162   i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
163   if XNN_UNPREDICTABLE(rows <= 4) {
164     i4 = (const uint16_t*) zero;
165   }
166   i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
167   if XNN_UNPREDICTABLE(rows < 6) {
168     i5 = (const uint16_t*) zero;
169   }
170   i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
171   if XNN_UNPREDICTABLE(rows <= 6) {
172     i6 = (const uint16_t*) zero;
173   }
174 
175   const __m256 vscale = _mm256_load_ps(params->avx.scale);
176   const __m256 vmin = _mm256_load_ps(params->avx.min);
177   const __m256 vmax = _mm256_load_ps(params->avx.max);
178   for (; channels >= 8; channels -= 8) {
179     __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
180 
181     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
182 
183     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
184     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
185     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
186     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
187     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
188     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
189     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
190     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
191     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
192     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
193     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
194     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
195     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
196 
197     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
198 
199     __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
200 
201     vout01234567 = _mm256_min_ps(vout01234567, vmax);
202 
203     _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
204     output = (uint16_t*) output + 8;
205   }
206   if XNN_UNLIKELY(channels != 0) {
207     {
208       __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
209 
210       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
211       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
212       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
213       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
214       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
215       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
216       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
217       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
218       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
219       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
220       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
221       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
222       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
223       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
224 
225       vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
226       __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
227       vout01234567 = _mm256_min_ps(vout01234567, vmax);
228 
229       __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
230       if (channels & 4) {
231         _mm_storel_epi64((__m128i*) output, vh01234567);
232         output = (uint16_t*) output + 4;
233         vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
234       }
235       if (channels & 2) {
236         *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
237         output = (uint16_t*) output + 2;
238         vh01234567 = _mm_srli_epi64(vh01234567, 32);
239       }
240       if (channels & 1) {
241         *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
242       }
243     }
244   }
245 }
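// The kernel above implements multi-pass global average pooling for more than 7 rows: the
// first pass sums rows 0-6 of each channel into the fp16 scratch `buffer`, each middle pass
// folds in 7 further rows, and the final pass adds the remaining 1-7 rows (pointers for rows
// that do not exist are redirected to `zero`), multiplies by the caller-supplied scale
// (e.g. 1/rows for a plain average), and clamps the result to [min, max] before storing
// fp16 output.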
246 
247 void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
248     size_t rows,
249     size_t channels,
250     const void* input,
251     size_t input_stride,
252     const void* zero,
253     void* output,
254     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
255 {
256   assert(rows != 0);
257   assert(rows <= 7);
258   assert(channels != 0);
259 
260   const uint16_t* i0 = input;
261   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
262   if XNN_UNPREDICTABLE(rows < 2) {
263     i1 = (const uint16_t*) zero;
264   }
265   const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
266   if XNN_UNPREDICTABLE(rows <= 2) {
267     i2 = (const uint16_t*) zero;
268   }
269   const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
270   if XNN_UNPREDICTABLE(rows < 4) {
271     i3 = (const uint16_t*) zero;
272   }
273   const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
274   if XNN_UNPREDICTABLE(rows <= 4) {
275     i4 = (const uint16_t*) zero;
276   }
277   const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
278   if XNN_UNPREDICTABLE(rows < 6) {
279     i5 = (const uint16_t*) zero;
280   }
281   const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
282   if XNN_UNPREDICTABLE(rows <= 6) {
283     i6 = (const uint16_t*) zero;
284   }
285 
286   const __m256 vscale = _mm256_load_ps(params->avx.scale);
287   const __m256 vmin = _mm256_load_ps(params->avx.min);
288   const __m256 vmax = _mm256_load_ps(params->avx.max);
289   for (; channels >= 8; channels -= 8) {
290     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
291     i0 += 8;
292     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
293     i1 += 8;
294 
295     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
296     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
297     i2 += 8;
298 
299     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
300     i3 += 8;
301     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
302     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
303     i4 += 8;
304     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
305     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
306     i5 += 8;
307     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
308     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
309     i6 += 8;
310     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
311     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
312 
313     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
314 
315     __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
316 
317     vout01234567 = _mm256_min_ps(vout01234567, vmax);
318 
319     _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
320     output = (uint16_t*) output + 8;
321   }
322   if XNN_UNLIKELY(channels != 0) {
323     {
324       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
325       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
326 
327       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
328       __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
329 
330       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
331       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
332       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
333       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
334       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
335       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
336       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
337       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
338       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
339 
340       vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
341       __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
342       vout01234567 = _mm256_min_ps(vout01234567, vmax);
343 
344       __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
345       if (channels & 4) {
346         _mm_storel_epi64((__m128i*) output, vh01234567);
347         output = (uint16_t*) output + 4;
348         vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
349       }
350       if (channels & 2) {
351         *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
352         output = (uint16_t*) output + 2;
353         vh01234567 = _mm_srli_epi64(vh01234567, 32);
354       }
355       if (channels & 1) {
356         *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
357       }
358     }
359   }
360 }
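// The kernel above is the single-pass variant of the same pooling for 1-7 input rows:
// pointers for missing rows are aliased to `zero`, so the sum of up to 7 rows, the scale
// multiplication, and the [min, max] clamp all happen in one sweep over the channels,
// 8 channels per iteration with a partial-store tail for the last 1-7 channels.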
361 
362 void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
363     size_t output_pixels,
364     size_t kernel_elements,
365     size_t channels,
366     const void** input,
367     size_t input_offset,
368     void* output,
369     size_t input_increment,
370     size_t output_increment,
371     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
372 {
373   assert(output_pixels != 0);
374   assert(kernel_elements != 0);
375   assert(channels != 0);
376 
377   const __m256 voutput_min = _mm256_load_ps(params->avx.min);
378   const __m256 voutput_max = _mm256_load_ps(params->avx.max);
379   do {
380     uint16_t* o = output;
381     {
382       const uint16_t* i0 = *input++;
383       const uint16_t* i1 = *input++;
384       const uint16_t* i2 = *input++;
385       const uint16_t* i3 = *input++;
386       const uint16_t* i4 = *input++;
387       const uint16_t* i5 = *input++;
388       const uint16_t* i6 = *input++;
389       const uint16_t* i7 = *input++;
390       const uint16_t* i8 = *input++;
391       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
392       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
393       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
394       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
395       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
396       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
397       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
398       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
399       i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
400       if (kernel_elements < 2) {
401         i1 = i0;
402       }
403       if (kernel_elements <= 2) {
404         i2 = i0;
405       }
406       if (kernel_elements < 4) {
407         i3 = i0;
408       }
409       if (kernel_elements <= 4) {
410         i4 = i0;
411       }
412       if (kernel_elements < 6) {
413         i5 = i0;
414       }
415       if (kernel_elements <= 6) {
416         i6 = i0;
417       }
418       if (kernel_elements < 8) {
419         i7 = i0;
420       }
421       if (kernel_elements <= 8) {
422         i8 = i0;
423       }
424 
425       size_t c = channels;
426       for (; c >= 8; c -= 8) {
427         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
428         i0 += 8;
429         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
430         i1 += 8;
431         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
432         i2 += 8;
433         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
434         i3 += 8;
435         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
436         i4 += 8;
437         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
438         i5 += 8;
439         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
440         i6 += 8;
441         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
442         i7 += 8;
443         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
444         i8 += 8;
445 
446         const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
447         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
448         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
449         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
450 
451         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
452         const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
453         const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
454         const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
455 
456         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
457         o += 8;
458       }
459       if (c != 0) {
460         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
461         i0 += 8;
462         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
463         i1 += 8;
464         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
465         i2 += 8;
466         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
467         i3 += 8;
468         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
469         i4 += 8;
470         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
471         i5 += 8;
472         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
473         i6 += 8;
474         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
475         i7 += 8;
476         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
477         i8 += 8;
478 
479         const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
480         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
481         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
482         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
483 
484         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
485         const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
486         const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
487         __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
488 
489         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
490         if (c & 4) {
491           _mm_storel_epi64((__m128i*) o, vh);
492           vh = _mm_unpackhi_epi64(vh, vh);
493           o += 4;
494         }
495         if (c & 2) {
496           *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
497           vh = _mm_srli_epi64(vh, 32);
498           o += 2;
499         }
500         if (c & 1) {
501           *o = _mm_extract_epi16(vh, 0);
502           o += 1;
503         }
504       }
505     }
506 
507     for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
508       const uint16_t* i0 = *input++;
509       const uint16_t* i1 = *input++;
510       const uint16_t* i2 = *input++;
511       const uint16_t* i3 = *input++;
512       const uint16_t* i4 = *input++;
513       const uint16_t* i5 = *input++;
514       const uint16_t* i6 = *input++;
515       const uint16_t* i7 = *input++;
516       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
517       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
518       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
519       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
520       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
521       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
522       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
523       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
524       if (k < 2) {
525         i1 = i0;
526       }
527       if (k <= 2) {
528         i2 = i0;
529       }
530       if (k < 4) {
531         i3 = i0;
532       }
533       if (k <= 4) {
534         i4 = i0;
535       }
536       if (k < 6) {
537         i5 = i0;
538       }
539       if (k <= 6) {
540         i6 = i0;
541       }
542       if (k < 8) {
543         i7 = i0;
544       }
545 
546       o = output;
547       size_t c = channels;
548       for (; c >= 8; c -= 8) {
549         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
550         i0 += 8;
551         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
552         i1 += 8;
553         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
554         i2 += 8;
555         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
556         i3 += 8;
557         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
558         i4 += 8;
559         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
560         i5 += 8;
561         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
562         i6 += 8;
563         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
564         i7 += 8;
565         const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
566 
567         const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
568         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
569         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
570         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
571 
572         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
573         const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
574         const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
575         const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
576 
577         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
578         o += 8;
579       }
580       if (c != 0) {
581         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
582         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
583         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
584         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
585         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
586         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
587         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
588         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
589         const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
590 
591         const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
592         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
593         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
594         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
595 
596         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
597         const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
598         const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
599         __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
600 
601         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
602         if (c & 4) {
603           _mm_storel_epi64((__m128i*) o, vh);
604           vh = _mm_unpackhi_epi64(vh, vh);
605           o += 4;
606         }
607         if (c & 2) {
608           *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh);
609           vh = _mm_srli_epi64(vh, 32);
610           o += 2;
611         }
612         if (c & 1) {
613           *o = _mm_extract_epi16(vh, 0);
614           o += 1;
615         }
616       }
617     }
618     input = (const void**) ((uintptr_t) input + input_increment);
619     output = (uint16_t*) ((uintptr_t) o + output_increment);
620   } while (--output_pixels != 0);
621 }
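// The kernel above computes max pooling over up to 9 kernel elements per output pixel in a
// first pass, then folds in further elements 8 at a time, using the output row itself as the
// running maximum (`vo` is re-loaded from `o` in later passes). Pointers for missing kernel
// elements are aliased to i0, so they merely duplicate an existing operand and do not change
// the result, and the final maximum is clamped to [output_min, output_max].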
622 
623 void xnn_f16_prelu_ukernel__f16c_2x16(
624     size_t rows,
625     size_t channels,
626     const void* restrict input,
627     size_t input_stride,
628     const void* restrict weights,
629     void* restrict output,
630     size_t output_stride) XNN_OOB_READS
631 {
632   assert(rows != 0);
633   assert(channels != 0);
634   assert(channels % sizeof(uint16_t) == 0);
635 
636   const uint16_t* i0 = (const uint16_t*) input;
637   uint16_t* o0 = (uint16_t*) output;
638   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
639   uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
640 
641   const size_t input_increment = input_stride * 2 - channels;
642   const size_t output_increment = output_stride * 2 - channels;
643 
644   do {
645     if XNN_UNPREDICTABLE(rows < 2) {
646       i1 = i0;
647       o1 = o0;
648     }
649 
650     const uint16_t* w = (const uint16_t*) weights;
651     size_t c = channels;
652     for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) {
653       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
654       const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
655       w += 16;
656 
657       const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
658       const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
659       i0 += 16;
660       const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
661       const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
662       i1 += 16;
663 
664       __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567);
665       __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF);
666       __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567);
667       __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF);
668 
669       vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567);
670       vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF);
671       vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567);
672       vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF);
673 
674       _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
675       _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_NO_EXC));
676       _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
677       o0 += 16;
678       _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
679       _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_NO_EXC));
680       _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
681       o1 += 16;
682     }
683     for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
684       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
685       w += 8;
686 
687       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
688       i0 += 8;
689       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
690       i1 += 8;
691 
692       __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
693       __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
694 
695       vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
696       vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
697 
698       _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
699       o0 += 8;
700       _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
701       o1 += 8;
702     }
703     if XNN_UNLIKELY(c != 0) {
704       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
705 
706       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
707       i0 = (const uint16_t*) ((uintptr_t) i0 + c);
708       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
709       i1 = (const uint16_t*) ((uintptr_t) i1 + c);
710 
711       __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
712       __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
713 
714       vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
715       vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
716 
717       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
718       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
719       if (c & (4 * sizeof(uint16_t))) {
720         _mm_storel_epi64((__m128i*) o0, vh0x01234567);
721         _mm_storel_epi64((__m128i*) o1, vh1x01234567);
722 
723         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
724         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
725 
726         o0 += 4;
727         o1 += 4;
728       }
729       if (c & (2 * sizeof(uint16_t))) {
730         *((uint32_t*) o0) = (uint32_t) _mm_cvtsi128_si32(vh0x01234567);
731         *((uint32_t*) o1) = (uint32_t) _mm_cvtsi128_si32(vh1x01234567);
732 
733         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
734         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
735 
736         o0 += 2;
737         o1 += 2;
738       }
739       if (c & (1 * sizeof(uint16_t))) {
740         *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
741         *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
742 
743         o0 += 1;
744         o1 += 1;
745       }
746     }
747     i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
748     o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
749     i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
750     o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
751     rows = doz(rows, 2);
752   } while (rows != 0);
753 }
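// The kernel above applies per-channel PReLU to two rows at a time. The trick is in
// _mm256_blendv_ps(vi, vi * w, vi): blendv selects the scaled value wherever the sign bit of
// the input is set, so negative inputs are multiplied by the channel weight and non-negative
// inputs pass through unchanged. (In the 16-wide loop the first store to o0/o1 is immediately
// overwritten by the store to o0 + 0/o1 + 0; the final contents are still correct.) A scalar
// reference of the semantics, included only as an illustrative sketch, not an XNNPACK helper:
static inline float prelu_ref(float x, float w) {
  return x < 0.0f ? x * w : x;  // negative lanes take the scaled path, as blendv does above
}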
754 
755 void xnn_f16_vadd_minmax_ukernel__f16c_x16(
756     size_t n,
757     const void* restrict a_ptr,
758     const void* restrict b_ptr,
759     void* restrict y_ptr,
760     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
761 {
762   assert(n != 0);
763   assert(n % sizeof(uint16_t) == 0);
764   assert(a_ptr != NULL);
765   assert(b_ptr != NULL);
766   assert(y_ptr != NULL);
767 
768   const uint16_t* a = (const uint16_t*) a_ptr;
769   const uint16_t* b = (const uint16_t*) b_ptr;
770   uint16_t* y = (uint16_t*) y_ptr;
771 
772   const __m256 vy_min = _mm256_load_ps(params->avx.min);
773   const __m256 vy_max = _mm256_load_ps(params->avx.max);
774 
775   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
776     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
777     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
778     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
779     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
780     a += 16;
781     b += 16;
782 
783     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
784     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
785 
786 
787     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
788     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
789 
790     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
791     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
792 
793     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
794     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
795     y += 16;
796   }
797   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
798     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
799     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
800     a += 8;
801     b += 8;
802 
803     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
804 
805     vy = _mm256_max_ps(vy, vy_min);
806     vy = _mm256_min_ps(vy, vy_max);
807 
808     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
809     y += 8;
810   }
811   if XNN_UNLIKELY(n != 0) {
812     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
813     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
814 
815     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
816 
817     vy = _mm256_max_ps(vy, vy_min);
818     vy = _mm256_min_ps(vy, vy_max);
819 
820     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
821     if (n & (4 * sizeof(uint16_t))) {
822       _mm_storel_epi64((__m128i*) y, vh);
823       vh = _mm_unpackhi_epi64(vh, vh);
824       y += 4;
825     }
826     if (n & (2 * sizeof(uint16_t))) {
827       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
828       vh = _mm_srli_epi64(vh, 32);
829       y += 2;
830     }
831     if (n & (1 * sizeof(uint16_t))) {
832       *y = (uint16_t) _mm_extract_epi16(vh, 0);
833     }
834   }
835 }
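// The kernel above adds two fp16 vectors elementwise. Each sum is immediately narrowed to
// fp16 and widened again (_mm256_cvtps_ph / _mm256_cvtph_ps) before the min/max clamp, so
// results effectively match native half-precision addition rather than accumulating in fp32.
// The params union supplies the 8-wide min and max clamp vectors.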
836 
837 void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
838     size_t n,
839     const void* restrict a_ptr,
840     const void* restrict b_ptr,
841     void* restrict y_ptr,
842     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
843 {
844   assert(n != 0);
845   assert(n % sizeof(uint16_t) == 0);
846   assert(a_ptr != NULL);
847   assert(b_ptr != NULL);
848   assert(y_ptr != NULL);
849 
850   const uint16_t* a = (const uint16_t*) a_ptr;
851   const uint16_t* b = (const uint16_t*) b_ptr;
852   uint16_t* y = (uint16_t*) y_ptr;
853 
854   const __m256 vy_min = _mm256_load_ps(params->avx.min);
855   const __m256 vy_max = _mm256_load_ps(params->avx.max);
856 
857   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
858   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
859     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
860     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
861     a += 16;
862 
863     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
864     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
865 
866 
867     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
868     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
869 
870     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
871     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
872 
873     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
874     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
875     y += 16;
876   }
877   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
878     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
879     a += 8;
880 
881     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
882 
883     vy = _mm256_max_ps(vy, vy_min);
884     vy = _mm256_min_ps(vy, vy_max);
885 
886     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
887     y += 8;
888   }
889   if XNN_UNLIKELY(n != 0) {
890     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
891 
892     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
893 
894     vy = _mm256_max_ps(vy, vy_min);
895     vy = _mm256_min_ps(vy, vy_max);
896 
897     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
898     if (n & (4 * sizeof(uint16_t))) {
899       _mm_storel_epi64((__m128i*) y, vh);
900       vh = _mm_unpackhi_epi64(vh, vh);
901       y += 4;
902     }
903     if (n & (2 * sizeof(uint16_t))) {
904       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
905       vh = _mm_srli_epi64(vh, 32);
906       y += 2;
907     }
908     if (n & (1 * sizeof(uint16_t))) {
909       *y = (uint16_t) _mm_extract_epi16(vh, 0);
910     }
911   }
912 }
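// The kernel above is the scalar-broadcast variant of the vector add: the single fp16 value
// *b is broadcast once with _mm_set1_epi16 and added to every element of a, with the same
// fp16 rounding and [min, max] clamping.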
913 
914 void xnn_f16_vmul_minmax_ukernel__f16c_x16(
915     size_t n,
916     const void* restrict a_ptr,
917     const void* restrict b_ptr,
918     void* restrict y_ptr,
919     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
920 {
921   assert(n != 0);
922   assert(n % sizeof(uint16_t) == 0);
923   assert(a_ptr != NULL);
924   assert(b_ptr != NULL);
925   assert(y_ptr != NULL);
926 
927   const uint16_t* a = (const uint16_t*) a_ptr;
928   const uint16_t* b = (const uint16_t*) b_ptr;
929   uint16_t* y = (uint16_t*) y_ptr;
930 
931   const __m256 vy_min = _mm256_load_ps(params->avx.min);
932   const __m256 vy_max = _mm256_load_ps(params->avx.max);
933 
934   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
935     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
936     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
937     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
938     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
939     a += 16;
940     b += 16;
941 
942     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
943     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
944 
945 
946     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
947     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
948 
949     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
950     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
951 
952     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
953     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
954     y += 16;
955   }
956   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
957     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
958     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
959     a += 8;
960     b += 8;
961 
962     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
963 
964     vy = _mm256_max_ps(vy, vy_min);
965     vy = _mm256_min_ps(vy, vy_max);
966 
967     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
968     y += 8;
969   }
970   if XNN_UNLIKELY(n != 0) {
971     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
972     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
973 
974     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
975 
976     vy = _mm256_max_ps(vy, vy_min);
977     vy = _mm256_min_ps(vy, vy_max);
978 
979     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
980     if (n & (4 * sizeof(uint16_t))) {
981       _mm_storel_epi64((__m128i*) y, vh);
982       vh = _mm_unpackhi_epi64(vh, vh);
983       y += 4;
984     }
985     if (n & (2 * sizeof(uint16_t))) {
986       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
987       vh = _mm_srli_epi64(vh, 32);
988       y += 2;
989     }
990     if (n & (1 * sizeof(uint16_t))) {
991       *y = (uint16_t) _mm_extract_epi16(vh, 0);
992     }
993   }
994 }
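// The kernel above is the elementwise multiply counterpart of the add kernel: the same
// widen-multiply-narrow-clamp structure, with _mm256_mul_ps in place of _mm256_add_ps.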
995 
996 void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
997     size_t n,
998     const void* restrict a_ptr,
999     const void* restrict b_ptr,
1000     void* restrict y_ptr,
1001     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1002 {
1003   assert(n != 0);
1004   assert(n % sizeof(uint16_t) == 0);
1005   assert(a_ptr != NULL);
1006   assert(b_ptr != NULL);
1007   assert(y_ptr != NULL);
1008 
1009   const uint16_t* a = (const uint16_t*) a_ptr;
1010   const uint16_t* b = (const uint16_t*) b_ptr;
1011   uint16_t* y = (uint16_t*) y_ptr;
1012 
1013   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1014   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1015 
1016   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1017   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1018     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1019     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1020     a += 16;
1021 
1022     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1023     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1024 
1025 
1026     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1027     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1028 
1029     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1030     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1031 
1032     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1033     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1034     y += 16;
1035   }
1036   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1037     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1038     a += 8;
1039 
1040     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1041 
1042     vy = _mm256_max_ps(vy, vy_min);
1043     vy = _mm256_min_ps(vy, vy_max);
1044 
1045     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1046     y += 8;
1047   }
1048   if XNN_UNLIKELY(n != 0) {
1049     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1050 
1051     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1052 
1053     vy = _mm256_max_ps(vy, vy_min);
1054     vy = _mm256_min_ps(vy, vy_max);
1055 
1056     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1057     if (n & (4 * sizeof(uint16_t))) {
1058       _mm_storel_epi64((__m128i*) y, vh);
1059       vh = _mm_unpackhi_epi64(vh, vh);
1060       y += 4;
1061     }
1062     if (n & (2 * sizeof(uint16_t))) {
1063       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
1064       vh = _mm_srli_epi64(vh, 32);
1065       y += 2;
1066     }
1067     if (n & (1 * sizeof(uint16_t))) {
1068       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1069     }
1070   }
1071 }
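// The kernel above multiplies every element of a by the broadcast fp16 scalar *b, with the
// same fp16 rounding and [min, max] clamping as the vector-vector multiply.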
1072 
1073 void xnn_f16_vhswish_ukernel__f16c_x16(
1074     size_t n,
1075     const void* restrict x_ptr,
1076     void* restrict y_ptr,
1077     const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1078 {
1079   assert(n != 0);
1080   assert(n % sizeof(uint16_t) == 0);
1081 
1082   const uint16_t* x = (const uint16_t*) x_ptr;
1083   uint16_t* y = (uint16_t*) y_ptr;
1084 
1085   const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
1086   const __m256 vthree = _mm256_load_ps(params->avx.three);
1087   const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
1088   const __m128i vzero = _mm_setzero_si128();
1089 
1090   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1091     __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1092     __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
1093     x += 16;
1094 
1095     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
1096     vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
1097     __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
1098     vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
1099 
1100     vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
1101     vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
1102 
1103     vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
1104     vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
1105 
1106     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
1107     vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
1108 
1109     _mm_storeu_si128((__m128i*) y, vacc01234567);
1110     _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
1111     y += 16;
1112   }
1113   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1114     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1115     x += 8;
1116     __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
1117     vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
1118     vacc = _mm_max_epi16(vacc, vzero);
1119     vacc = _mm_min_epi16(vacc, vsix);
1120     vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
1121     _mm_storeu_si128((__m128i*) y, vacc);
1122     y += 8;
1123   }
1124   if XNN_UNLIKELY(n != 0) {
1125     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
1126     __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
1127     vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
1128     vacc = _mm_max_epi16(vacc, vzero);
1129     vacc = _mm_min_epi16(vacc, vsix);
1130     vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
1131 
1132     if (n & (4 * sizeof(uint16_t))) {
1133       _mm_storel_epi64((__m128i*) y, vacc);
1134       vacc = _mm_unpackhi_epi64(vacc, vacc);
1135       y += 4;
1136     }
1137     if (n & (2 * sizeof(uint16_t))) {
1138       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
1139       vacc = _mm_srli_epi64(vacc, 32);
1140       y += 2;
1141     }
1142     if (n & (1 * sizeof(uint16_t))) {
1143       *y = (uint16_t) _mm_extract_epi16(vacc, 0);
1144     }
1145   }
1146 }
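// The kernel above computes hardswish, y = x * min(max(x + 3, 0), 6) / 6, with the inner
// clamp done directly on fp16 bit patterns via _mm_max_epi16/_mm_min_epi16. That works
// because once x + 3 is non-negative its fp16 encoding orders the same way as a 16-bit
// integer, and negative encodings (sign bit set) compare below zero, so the integer max with
// 0 flushes them to +0.0. The division by 6 is applied separately via the `sixth` constant
// multiplied into x.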
1147 
1148 void xnn_f32_f16_vcvt_ukernel__f16c_x16(
1149     size_t n,
1150     const float* input,
1151     void* output,
1152     const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
1153 {
1154   assert(n != 0);
1155   assert(n % sizeof(float) == 0);
1156   assert(input != NULL);
1157   assert(output != NULL);
1158 
1159   uint16_t* o = (uint16_t*) output;
1160   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
1161     const __m256 vf0 = _mm256_loadu_ps(input);
1162     const __m256 vf1 = _mm256_loadu_ps(input + 8);
1163     input += 16;
1164 
1165     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
1166     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
1167     o += 16;
1168   }
1169   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1170     const __m256 vf = _mm256_loadu_ps(input);
1171     input += 8;
1172 
1173     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
1174     o += 8;
1175   }
1176   if XNN_UNLIKELY(n != 0) {
1177     assert(n >= 1 * sizeof(float));
1178     assert(n <= 7 * sizeof(float));
1179     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->f16c.mask_table[7] - n));
1180 
1181     const __m256 vf = _mm256_maskload_ps(input, vmask);
1182 
1183     __m128 vf_lo = _mm256_castps256_ps128(vf);
1184     if (n & (4 * sizeof(float))) {
1185       _mm_storel_epi64((__m128i*) o, _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC));
1186       vf_lo = _mm256_extractf128_ps(vf, 1);
1187       o += 4;
1188     }
1189     __m128i vh = _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC);
1190     if (n & (2 * sizeof(float))) {
1191       _mm_storeu_si32(o, vh);
1192       vh = _mm_srli_epi64(vh, 32);
1193       o += 2;
1194     }
1195     if (n & (1 * sizeof(float))) {
1196       *((uint16_t*) o) = _mm_extract_epi16(vh, 0);
1197     }
1198   }
1199 }
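// Unlike the f16 -> f32 kernel at the top of the file, the kernel above has no XNN_OOB_READS
// annotation: the 1-7 element tail is handled with _mm256_maskload_ps and the mask_table in
// params, so it never reads past the end of the input. Illustrative usage sketch (not part
// of the library), with a batch that is a multiple of 8 floats so the masked tail path, the
// only place params is read, is never taken:
static void example_f32_to_f16(const float fp32_in[8], uint16_t fp16_out[8]) {
  union xnn_f32_f16_cvt_params cvt_params;  // only consulted for tails of 1-7 elements
  xnn_f32_f16_vcvt_ukernel__f16c_x16(8 * sizeof(float), fp32_in, fp16_out, &cvt_params);
}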
1200