1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <immintrin.h>
9
10 #include <xnnpack/common.h>
11 #include <xnnpack/dwconv.h>
12 #include <xnnpack/gemm.h>
13 #include <xnnpack/igemm.h>
14 #include <xnnpack/intrinsics-polyfill.h>
15 #include <xnnpack/lut.h>
16 #include <xnnpack/math.h>
17 #include <xnnpack/vaddsub.h>
18 #include <xnnpack/vcvt.h>
19 #include <xnnpack/vunary.h>
20
21
void xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(
23 size_t mr,
24 size_t nc,
25 size_t kc,
26 const void*restrict a,
27 size_t a_stride,
28 const void*restrict w,
29 void*restrict c,
30 size_t cm_stride,
31 size_t cn_stride,
32 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34 assert(mr != 0);
35 assert(mr <= 1);
36 assert(nc != 0);
37 assert(kc != 0);
38 assert(kc % sizeof(uint16_t) == 0);
39 assert(a != NULL);
40 assert(w != NULL);
41 assert(c != NULL);
42
43 const uint16_t* a0 = a;
44 uint16_t* c0 = c;
45
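  // Process the N dimension in blocks of 16 output channels: the packed weights
  // begin with 16 FP16 bias values, followed by kc rows of 16 FP16 weights each.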
46 do {
47 __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
48 __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
49 w = (const uint16_t*) w + 16;
50
51 size_t k = kc;
52 do {
53 const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
54 a0 += 1;
55
56 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
57 const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
58 w = (const uint16_t*) w + 16;
59
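      // Multiply-accumulate in FP32, then round the accumulator back to FP16
      // after every step to emulate half-precision accumulation.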
60 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
61 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
62
63 k -= sizeof(uint16_t);
64 } while (k != 0);
65
66 const __m256 vscale = _mm256_load_ps(params->avx.scale);
67 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
68 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
69
70 const __m256 vmin = _mm256_load_ps(params->avx.min);
71 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
72 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
73
74 const __m256 vmax = _mm256_load_ps(params->avx.max);
75 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
76 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
77
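    // Full 16-wide tile: convert the accumulators back to FP16 and store them;
    // the remainder path below handles 1-15 leftover columns.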
78 if XNN_LIKELY(nc >= 16) {
79 _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
80 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
81 c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
82
83 a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
84
85 nc -= 16;
86 } else {
87 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
88 if (nc & 8) {
89 _mm_storeu_si128((__m128i*) c0, vh0x01234567);
90
91 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
92
93 c0 += 8;
94 }
95 if (nc & 4) {
96 _mm_storel_epi64((__m128i*) c0, vh0x01234567);
97
98 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
99
100 c0 += 4;
101 }
102 if (nc & 2) {
103 _mm_storeu_si32(c0, vh0x01234567);
104
105 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
106
107 c0 += 2;
108 }
109 if (nc & 1) {
110 *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
111 }
112
113 nc = 0;
114 }
115 } while (nc != 0);
116 }
117
void xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(
119 size_t mr,
120 size_t nc,
121 size_t kc,
122 const void*restrict a,
123 size_t a_stride,
124 const void*restrict w,
125 void*restrict c,
126 size_t cm_stride,
127 size_t cn_stride,
128 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
129 {
130 assert(mr != 0);
131 assert(mr <= 4);
132 assert(nc != 0);
133 assert(kc != 0);
134 assert(kc % sizeof(uint16_t) == 0);
135 assert(a != NULL);
136 assert(w != NULL);
137 assert(c != NULL);
138
139 const uint16_t* a0 = a;
140 uint16_t* c0 = c;
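  // Clamp row pointers when mr < 4: out-of-range rows alias the previous row,
  // so their loads and stores become harmless duplicates.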
141 const uint16_t* a1 = (const uint16_t*) ((uintptr_t) a0 + a_stride);
142 uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
143 if XNN_UNPREDICTABLE(mr < 2) {
144 a1 = a0;
145 c1 = c0;
146 }
147 const uint16_t* a2 = (const uint16_t*) ((uintptr_t) a1 + a_stride);
148 uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
149 if XNN_UNPREDICTABLE(mr <= 2) {
150 a2 = a1;
151 c2 = c1;
152 }
153 const uint16_t* a3 = (const uint16_t*) ((uintptr_t) a2 + a_stride);
154 uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
155 if XNN_UNPREDICTABLE(mr != 4) {
156 a3 = a2;
157 c3 = c2;
158 }
159
160 do {
161 __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
162 __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
163 __m256 vacc1x01234567 = vacc0x01234567;
164 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
165 __m256 vacc2x01234567 = vacc0x01234567;
166 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
167 __m256 vacc3x01234567 = vacc0x01234567;
168 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
169 w = (const uint16_t*) w + 16;
170
171 size_t k = kc;
172 do {
173 const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
174 a0 += 1;
175 const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
176 a1 += 1;
177 const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
178 a2 += 1;
179 const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
180 a3 += 1;
181
182 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
183 const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
184 w = (const uint16_t*) w + 16;
185
186 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
187 vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
188 vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
189 vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
190 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
191 vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
192 vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
193 vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
194
195 k -= sizeof(uint16_t);
196 } while (k != 0);
197
198 const __m256 vscale = _mm256_load_ps(params->avx.scale);
199 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
200 vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
201 vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
202 vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
203 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
204 vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
205 vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
206 vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
207
208 const __m256 vmin = _mm256_load_ps(params->avx.min);
209 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
210 vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
211 vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
212 vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
213 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
214 vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
215 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
216 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
217
218 const __m256 vmax = _mm256_load_ps(params->avx.max);
219 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
220 vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
221 vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
222 vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
223 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
224 vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
225 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
226 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
227
228 if XNN_LIKELY(nc >= 16) {
229 _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
230 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
231 c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
232 _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
233 _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
234 c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
235 _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
236 _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
237 c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
238 _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
239 _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
240 c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
241
242 a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
243 a1 = (const uint16_t*) ((uintptr_t) a1 - kc);
244 a2 = (const uint16_t*) ((uintptr_t) a2 - kc);
245 a3 = (const uint16_t*) ((uintptr_t) a3 - kc);
246
247 nc -= 16;
248 } else {
249 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
250 __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
251 __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
252 __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
253 if (nc & 8) {
254 _mm_storeu_si128((__m128i*) c0, vh0x01234567);
255 _mm_storeu_si128((__m128i*) c1, vh1x01234567);
256 _mm_storeu_si128((__m128i*) c2, vh2x01234567);
257 _mm_storeu_si128((__m128i*) c3, vh3x01234567);
258
259 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
260 vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
261 vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
262 vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
263
264 c0 += 8;
265 c1 += 8;
266 c2 += 8;
267 c3 += 8;
268 }
269 if (nc & 4) {
270 _mm_storel_epi64((__m128i*) c0, vh0x01234567);
271 _mm_storel_epi64((__m128i*) c1, vh1x01234567);
272 _mm_storel_epi64((__m128i*) c2, vh2x01234567);
273 _mm_storel_epi64((__m128i*) c3, vh3x01234567);
274
275 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
276 vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
277 vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
278 vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
279
280 c0 += 4;
281 c1 += 4;
282 c2 += 4;
283 c3 += 4;
284 }
285 if (nc & 2) {
286 _mm_storeu_si32(c0, vh0x01234567);
287 _mm_storeu_si32(c1, vh1x01234567);
288 _mm_storeu_si32(c2, vh2x01234567);
289 _mm_storeu_si32(c3, vh3x01234567);
290
291 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
292 vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
293 vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
294 vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
295
296 c0 += 2;
297 c1 += 2;
298 c2 += 2;
299 c3 += 2;
300 }
301 if (nc & 1) {
302 *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
303 *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
304 *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
305 *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
306 }
307
308 nc = 0;
309 }
310 } while (nc != 0);
311 }
312
void xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(
314 size_t mr,
315 size_t nc,
316 size_t kc,
317 size_t ks,
318 const void**restrict a,
319 const void*restrict w,
320 void*restrict c,
321 size_t cm_stride,
322 size_t cn_stride,
323 size_t a_offset,
324 const void* zero,
325 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
326 {
327 assert(mr != 0);
328 assert(mr <= 1);
329 assert(nc != 0);
330 assert(kc != 0);
331 assert(kc % sizeof(uint16_t) == 0);
332 assert(ks != 0);
333 assert(ks % (1 * sizeof(void*)) == 0);
334 assert(a_offset % sizeof(uint16_t) == 0);
335 assert(a != NULL);
336 assert(w != NULL);
337 assert(c != NULL);
338
339 uint16_t* c0 = c;
340
341 do {
342 __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
343 __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
344 w = (const uint16_t*) w + 16;
345
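    // IGEMM indirection: walk ks row pointers; each a[] entry addresses an input
    // row, or the shared zero buffer for padded taps (which skip a_offset).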
346 size_t p = ks;
347 do {
348 const uint16_t* restrict a0 = (const uint16_t*) a[0];
349 assert(a0 != NULL);
350 if XNN_UNPREDICTABLE(a0 != zero) {
351 a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
352 }
353 a += 1;
354
355 size_t k = kc;
356 do {
357 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
358 const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
359 w = (const uint16_t*) w + 16;
360
361 const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
362 a0 += 1;
363
364 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
365 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
366
367 k -= sizeof(uint16_t);
368 } while (k != 0);
369 p -= 1 * sizeof(void*);
370 } while (p != 0);
371
372 const __m256 vscale = _mm256_load_ps(params->avx.scale);
373 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
374 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
375
376 const __m256 vmin = _mm256_load_ps(params->avx.min);
377 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
378 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
379
380 const __m256 vmax = _mm256_load_ps(params->avx.max);
381 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
382 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
383
384 if XNN_LIKELY(nc >= 16) {
385 _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
386 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
387 c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
388
389 a = (const void**restrict) ((uintptr_t) a - ks);
390 nc -= 16;
391 } else {
392 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
393 if (nc & 8) {
394 _mm_storeu_si128((__m128i*) c0, vh0x01234567);
395
396 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
397
398 c0 += 8;
399 }
400 if (nc & 4) {
401 _mm_storel_epi64((__m128i*) c0, vh0x01234567);
402
403 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
404
405 c0 += 4;
406 }
407 if (nc & 2) {
408 _mm_storeu_si32(c0, vh0x01234567);
409
410 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
411
412 c0 += 2;
413 }
414 if (nc & 1) {
415 *c0 = _mm_extract_epi16(vh0x01234567, 0);
416 }
417
418 nc = 0;
419 }
420 } while (nc != 0);
421 }
422
void xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(
424 size_t mr,
425 size_t nc,
426 size_t kc,
427 size_t ks,
428 const void**restrict a,
429 const void*restrict w,
430 void*restrict c,
431 size_t cm_stride,
432 size_t cn_stride,
433 size_t a_offset,
434 const void* zero,
435 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
436 {
437 assert(mr != 0);
438 assert(mr <= 4);
439 assert(nc != 0);
440 assert(kc != 0);
441 assert(kc % sizeof(uint16_t) == 0);
442 assert(ks != 0);
443 assert(ks % (4 * sizeof(void*)) == 0);
444 assert(a_offset % sizeof(uint16_t) == 0);
445 assert(a != NULL);
446 assert(w != NULL);
447 assert(c != NULL);
448
449 uint16_t* c0 = c;
450 uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
451 if XNN_UNPREDICTABLE(mr < 2) {
452 c1 = c0;
453 }
454 uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
455 if XNN_UNPREDICTABLE(mr <= 2) {
456 c2 = c1;
457 }
458 uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
459 if XNN_UNPREDICTABLE(mr != 4) {
460 c3 = c2;
461 }
462
463 do {
464 __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
465 __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
466 __m256 vacc1x01234567 = vacc0x01234567;
467 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
468 __m256 vacc2x01234567 = vacc0x01234567;
469 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
470 __m256 vacc3x01234567 = vacc0x01234567;
471 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
472 w = (const uint16_t*) w + 16;
473
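    // Gather 4 row pointers per indirection step; pointers equal to the zero
    // buffer are left unadjusted so padded positions read zeros.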
474 size_t p = ks;
475 do {
476 const uint16_t* restrict a0 = (const uint16_t*) a[0];
477 assert(a0 != NULL);
478 if XNN_UNPREDICTABLE(a0 != zero) {
479 a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
480 }
481 const uint16_t* restrict a1 = (const uint16_t*) a[1];
482 assert(a1 != NULL);
483 if XNN_UNPREDICTABLE(a1 != zero) {
484 a1 = (const uint16_t*) ((uintptr_t) a1 + a_offset);
485 }
486 const uint16_t* restrict a2 = (const uint16_t*) a[2];
487 assert(a2 != NULL);
488 if XNN_UNPREDICTABLE(a2 != zero) {
489 a2 = (const uint16_t*) ((uintptr_t) a2 + a_offset);
490 }
491 const uint16_t* restrict a3 = (const uint16_t*) a[3];
492 assert(a3 != NULL);
493 if XNN_UNPREDICTABLE(a3 != zero) {
494 a3 = (const uint16_t*) ((uintptr_t) a3 + a_offset);
495 }
496 a += 4;
497
498 size_t k = kc;
499 do {
500 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
501 const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
502 w = (const uint16_t*) w + 16;
503
504 const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
505 a0 += 1;
506 const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
507 a1 += 1;
508 const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
509 a2 += 1;
510 const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
511 a3 += 1;
512
513 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
514 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
515 vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
516 vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
517 vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
518 vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
519 vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
520 vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
521
522 k -= sizeof(uint16_t);
523 } while (k != 0);
524 p -= 4 * sizeof(void*);
525 } while (p != 0);
526
527 const __m256 vscale = _mm256_load_ps(params->avx.scale);
528 vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
529 vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
530 vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
531 vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
532 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
533 vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
534 vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
535 vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
536
537 const __m256 vmin = _mm256_load_ps(params->avx.min);
538 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
539 vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
540 vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
541 vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
542 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
543 vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
544 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
545 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
546
547 const __m256 vmax = _mm256_load_ps(params->avx.max);
548 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
549 vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
550 vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
551 vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
552 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
553 vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
554 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
555 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
556
557 if XNN_LIKELY(nc >= 16) {
558 _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
559 _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
560 c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
561 _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
562 _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
563 c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
564 _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
565 _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
566 c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
567 _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
568 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
569 c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
570
571 a = (const void**restrict) ((uintptr_t) a - ks);
572 nc -= 16;
573 } else {
574 __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
575 __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
576 __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
577 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
578 if (nc & 8) {
579 _mm_storeu_si128((__m128i*) c3, vh3x01234567);
580 _mm_storeu_si128((__m128i*) c2, vh2x01234567);
581 _mm_storeu_si128((__m128i*) c1, vh1x01234567);
582 _mm_storeu_si128((__m128i*) c0, vh0x01234567);
583
584 vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
585 vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
586 vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
587 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
588
589 c3 += 8;
590 c2 += 8;
591 c1 += 8;
592 c0 += 8;
593 }
594 if (nc & 4) {
595 _mm_storel_epi64((__m128i*) c3, vh3x01234567);
596 _mm_storel_epi64((__m128i*) c2, vh2x01234567);
597 _mm_storel_epi64((__m128i*) c1, vh1x01234567);
598 _mm_storel_epi64((__m128i*) c0, vh0x01234567);
599
600 vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
601 vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
602 vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
603 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
604
605 c3 += 4;
606 c2 += 4;
607 c1 += 4;
608 c0 += 4;
609 }
610 if (nc & 2) {
611 _mm_storeu_si32(c3, vh3x01234567);
612 _mm_storeu_si32(c2, vh2x01234567);
613 _mm_storeu_si32(c1, vh1x01234567);
614 _mm_storeu_si32(c0, vh0x01234567);
615
616 vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
617 vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
618 vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
619 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
620
621 c3 += 2;
622 c2 += 2;
623 c1 += 2;
624 c0 += 2;
625 }
626 if (nc & 1) {
627 *c3 = _mm_extract_epi16(vh3x01234567, 0);
628 *c2 = _mm_extract_epi16(vh2x01234567, 0);
629 *c1 = _mm_extract_epi16(vh1x01234567, 0);
630 *c0 = _mm_extract_epi16(vh0x01234567, 0);
631 }
632
633 nc = 0;
634 }
635 } while (nc != 0);
636 }
637
void xnn_f32_qs8_vcvt_ukernel__avx2_x64(
639 size_t n,
640 const float* x,
641 int8_t* y,
642 const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
643 {
644 assert(n != 0);
645 assert(n % sizeof(float) == 0);
646 assert(x != NULL);
647 assert(y != NULL);
648
649 const __m256 vscale = _mm256_load_ps(params->avx2.scale);
650 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
651 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
652 const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
653 const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
654
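  // Main loop: scale, clamp against the quantization maximum, convert to int32,
  // pack to int16 and add the zero point, pack to int8, then use a cross-lane
  // permute to undo the interleaving introduced by the 256-bit pack instructions.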
655 for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
656 __m256 vx01 = _mm256_loadu_ps(x);
657 __m256 vx23 = _mm256_loadu_ps(x + 8);
658 __m256 vx45 = _mm256_loadu_ps(x + 16);
659 __m256 vx67 = _mm256_loadu_ps(x + 24);
660 __m256 vx89 = _mm256_loadu_ps(x + 32);
661 __m256 vxAB = _mm256_loadu_ps(x + 40);
662 __m256 vxCD = _mm256_loadu_ps(x + 48);
663 __m256 vxEF = _mm256_loadu_ps(x + 56);
664 x += 64;
665
666 vx01 = _mm256_mul_ps(vx01, vscale);
667 vx23 = _mm256_mul_ps(vx23, vscale);
668 vx45 = _mm256_mul_ps(vx45, vscale);
669 vx67 = _mm256_mul_ps(vx67, vscale);
670 vx89 = _mm256_mul_ps(vx89, vscale);
671 vxAB = _mm256_mul_ps(vxAB, vscale);
672 vxCD = _mm256_mul_ps(vxCD, vscale);
673 vxEF = _mm256_mul_ps(vxEF, vscale);
674
675 vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
676 vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
677 vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
678 vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
679 vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
680 vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
681 vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
682 vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
683
684 const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
685 const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
686 const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
687 const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
688 const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
689 const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
690 const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
691 const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
692
693 __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
694 __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
695 __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
696 __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
697
698 vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
699 vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
700 vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
701 vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
702
703 const __m256i vy02461357 = _mm256_packs_epi16(vacc0213, vacc4657);
704 const __m256i vy8ACE9BDF = _mm256_packs_epi16(vacc8A9B, vaccCEDF);
705
706 __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
707 __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
708
709 vy01234567 = _mm256_max_epi8(vy01234567, voutput_min);
710 vy89ABCDEF = _mm256_max_epi8(vy89ABCDEF, voutput_min);
711
712 _mm256_storeu_si256((__m256i*) y, vy01234567);
713 _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
714 y += 64;
715 }
716 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
717 __m256 vx = _mm256_loadu_ps(x);
718 vx = _mm256_mul_ps(vx, vscale);
719 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
720 x += 8;
721
722 const __m256i vacc = _mm256_cvtps_epi32(vx);
723
724 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
725 vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
726 vy = _mm_packs_epi16(vy, vy);
727 vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
728
729 _mm_storel_epi64((__m128i*) y, vy);
730 y += 8;
731 }
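  // Remainder of 1-7 floats: masked load, then store 4/2/1 bytes at a time.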
732 if XNN_UNLIKELY(n != 0) {
733 assert(n >= 1 * sizeof(float));
734 assert(n <= 7 * sizeof(float));
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
736
737 __m256 vx = _mm256_maskload_ps(x, vmask);
738 vx = _mm256_mul_ps(vx, vscale);
739 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
740
741 const __m256i vacc = _mm256_cvtps_epi32(vx);
742
743 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
744 vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
745 vy = _mm_packs_epi16(vy, vy);
746 vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
747
748 if (n & (4 * sizeof(float))) {
749 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
750 y += 4;
751 vy = _mm_srli_epi64(vy, 32);
752 }
753 if (n & (2 * sizeof(float))) {
754 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
755 y += 2;
756 vy = _mm_srli_epi32(vy, 16);
757 }
758 if (n & (1 * sizeof(float))) {
759 *y = (int8_t) _mm_extract_epi8(vy, 0);
760 }
761 }
762 }
763
void xnn_f32_qu8_vcvt_ukernel__avx2_x64(
765 size_t n,
766 const float* x,
767 uint8_t* y,
768 const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
769 {
770 assert(n != 0);
771 assert(n % sizeof(float) == 0);
772 assert(x != NULL);
773 assert(y != NULL);
774
775 const __m256 vscale = _mm256_load_ps(params->avx2.scale);
776 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
777 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
778 const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
779 const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
780
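  // Same structure as the QS8 kernel above, except the final packs use unsigned
  // saturation (packus/max_epu8) to produce uint8 outputs.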
781 for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
782 __m256 vx01 = _mm256_loadu_ps(x);
783 __m256 vx23 = _mm256_loadu_ps(x + 8);
784 __m256 vx45 = _mm256_loadu_ps(x + 16);
785 __m256 vx67 = _mm256_loadu_ps(x + 24);
786 __m256 vx89 = _mm256_loadu_ps(x + 32);
787 __m256 vxAB = _mm256_loadu_ps(x + 40);
788 __m256 vxCD = _mm256_loadu_ps(x + 48);
789 __m256 vxEF = _mm256_loadu_ps(x + 56);
790 x += 64;
791
792 vx01 = _mm256_mul_ps(vx01, vscale);
793 vx23 = _mm256_mul_ps(vx23, vscale);
794 vx45 = _mm256_mul_ps(vx45, vscale);
795 vx67 = _mm256_mul_ps(vx67, vscale);
796 vx89 = _mm256_mul_ps(vx89, vscale);
797 vxAB = _mm256_mul_ps(vxAB, vscale);
798 vxCD = _mm256_mul_ps(vxCD, vscale);
799 vxEF = _mm256_mul_ps(vxEF, vscale);
800
801 vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
802 vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
803 vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
804 vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
805 vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
806 vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
807 vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
808 vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
809
810 const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
811 const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
812 const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
813 const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
814 const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
815 const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
816 const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
817 const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
818
819 __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
820 __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
821 __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
822 __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
823
824 vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
825 vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
826 vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
827 vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
828
829 const __m256i vy02461357 = _mm256_packus_epi16(vacc0213, vacc4657);
830 const __m256i vy8ACE9BDF = _mm256_packus_epi16(vacc8A9B, vaccCEDF);
831
832 __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
833 __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
834
835 vy01234567 = _mm256_max_epu8(vy01234567, voutput_min);
836 vy89ABCDEF = _mm256_max_epu8(vy89ABCDEF, voutput_min);
837
838 _mm256_storeu_si256((__m256i*) y, vy01234567);
839 _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
840 y += 64;
841 }
842 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
843 __m256 vx = _mm256_loadu_ps(x);
844 vx = _mm256_mul_ps(vx, vscale);
845 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
846 x += 8;
847
848 const __m256i vacc = _mm256_cvtps_epi32(vx);
849
850 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
851 vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
852 vy = _mm_packus_epi16(vy, vy);
853 vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
854
855 _mm_storel_epi64((__m128i*) y, vy);
856 y += 8;
857 }
858 if XNN_UNLIKELY(n != 0) {
859 assert(n >= 1 * sizeof(float));
860 assert(n <= 7 * sizeof(float));
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
862
863 __m256 vx = _mm256_maskload_ps(x, vmask);
864 vx = _mm256_mul_ps(vx, vscale);
865 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
866
867 const __m256i vacc = _mm256_cvtps_epi32(vx);
868
869 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
870 vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
871 vy = _mm_packus_epi16(vy, vy);
872 vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
873
874 if (n & (4 * sizeof(float))) {
875 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
876 y += 4;
877 vy = _mm_srli_epi64(vy, 32);
878 }
879 if (n & (2 * sizeof(float))) {
880 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
881 y += 2;
882 vy = _mm_srli_epi32(vy, 16);
883 }
884 if (n & (1 * sizeof(float))) {
885 *y = (uint8_t) _mm_extract_epi8(vy, 0);
886 }
887 }
888 }
889
void xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56(
891 size_t n,
892 const float* x,
893 float* y,
894 const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
895 {
896 assert(n % sizeof(float) == 0);
897
898 const __m256 vprescale = _mm256_load_ps(params->avx2_rr1_lut4_p4.prescale);
899 const __m256 valpha = _mm256_load_ps(params->avx2_rr1_lut4_p4.alpha);
900 const __m256 vbeta = _mm256_load_ps(params->avx2_rr1_lut4_p4.beta);
901 const __m256 vsat_cutoff = _mm256_load_ps(params->avx2_rr1_lut4_p4.sat_cutoff);
902 const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_lut4_p4.magic_bias);
903 const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_lut4_p4.log2e);
904 const __m256 vtable = _mm256_load_ps(params->avx2_rr1_lut4_p4.table);
905 const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.minus_ln2);
906 const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c4);
907 const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c3);
908 const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c2);
909
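  // ELU: z = max(sat_cutoff, x * prescale); exp(z) is reconstructed from a
  // 4-entry LUT for the fractional power of two plus a degree-4 polynomial;
  // negative inputs yield alpha * (exp(z) - 1), non-negative inputs yield
  // x * beta, selected per lane by the sign of x.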
910 for (; n >= 56 * sizeof(float); n -= 56 * sizeof(float)) {
911 __m256 vx0 = _mm256_loadu_ps(x);
912 __m256 vx1 = _mm256_loadu_ps(x + 8);
913 __m256 vx2 = _mm256_loadu_ps(x + 16);
914 __m256 vx3 = _mm256_loadu_ps(x + 24);
915 __m256 vx4 = _mm256_loadu_ps(x + 32);
916 __m256 vx5 = _mm256_loadu_ps(x + 40);
917 __m256 vx6 = _mm256_loadu_ps(x + 48);
918 x += 56;
919
920 const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
921 const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
922 const __m256 vz2 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx2, vprescale));
923 const __m256 vz3 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx3, vprescale));
924 const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
925 const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));
926 const __m256 vz6 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx6, vprescale));
927
928 __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
929 __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
930 __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
931 __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
932 __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
933 __m256 vn5 = _mm256_fmadd_ps(vz5, vlog2e, vmagic_bias);
934 __m256 vn6 = _mm256_fmadd_ps(vz6, vlog2e, vmagic_bias);
935
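      // LUT-4 reconstruction: the low 2 bits of each n lane select one of four
      // 2^(i/4) table entries, while the remaining bits are shifted left by 21
      // so they land in the floating-point exponent field.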
936 const __m256i ven0 = _mm256_slli_epi32(_mm256_castps_si256(vn0), 21);
937 const __m256i vl0 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn0)));
938 vn0 = _mm256_sub_ps(vn0, vmagic_bias);
939 const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);
940 const __m256i vl1 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn1)));
941 vn1 = _mm256_sub_ps(vn1, vmagic_bias);
942 const __m256i ven2 = _mm256_slli_epi32(_mm256_castps_si256(vn2), 21);
943 const __m256i vl2 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn2)));
944 vn2 = _mm256_sub_ps(vn2, vmagic_bias);
945 const __m256i ven3 = _mm256_slli_epi32(_mm256_castps_si256(vn3), 21);
946 const __m256i vl3 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn3)));
947 vn3 = _mm256_sub_ps(vn3, vmagic_bias);
948 const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21);
949 const __m256i vl4 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn4)));
950 vn4 = _mm256_sub_ps(vn4, vmagic_bias);
951 const __m256i ven5 = _mm256_slli_epi32(_mm256_castps_si256(vn5), 21);
952 const __m256i vl5 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn5)));
953 vn5 = _mm256_sub_ps(vn5, vmagic_bias);
954 const __m256i ven6 = _mm256_slli_epi32(_mm256_castps_si256(vn6), 21);
955 const __m256i vl6 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn6)));
956 vn6 = _mm256_sub_ps(vn6, vmagic_bias);
957
958 __m256 vs0 = _mm256_castsi256_ps(_mm256_add_epi32(vl0, ven0));
959 __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
960 __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));
961 __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
962 __m256 vs2 = _mm256_castsi256_ps(_mm256_add_epi32(vl2, ven2));
963 __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
964 __m256 vs3 = _mm256_castsi256_ps(_mm256_add_epi32(vl3, ven3));
965 __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
966 __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4));
967 __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
968 __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));
969 __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vz5);
970 __m256 vs6 = _mm256_castsi256_ps(_mm256_add_epi32(vl6, ven6));
971 __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vz6);
972
973 __m256 vp0 = _mm256_fmadd_ps(vc4, vt0, vc3);
974 __m256 vp1 = _mm256_fmadd_ps(vc4, vt1, vc3);
975 __m256 vp2 = _mm256_fmadd_ps(vc4, vt2, vc3);
976 __m256 vp3 = _mm256_fmadd_ps(vc4, vt3, vc3);
977 __m256 vp4 = _mm256_fmadd_ps(vc4, vt4, vc3);
978 __m256 vp5 = _mm256_fmadd_ps(vc4, vt5, vc3);
979 __m256 vp6 = _mm256_fmadd_ps(vc4, vt6, vc3);
980
981 vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
982 vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
983 vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
984 vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
985 vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
986 vp5 = _mm256_fmadd_ps(vp5, vt5, vc2);
987 vp6 = _mm256_fmadd_ps(vp6, vt6, vc2);
988
989 vp0 = _mm256_mul_ps(vp0, vt0);
990 vt0 = _mm256_mul_ps(vt0, vs0);
991 vp1 = _mm256_mul_ps(vp1, vt1);
992 vt1 = _mm256_mul_ps(vt1, vs1);
993 vp2 = _mm256_mul_ps(vp2, vt2);
994 vt2 = _mm256_mul_ps(vt2, vs2);
995 vp3 = _mm256_mul_ps(vp3, vt3);
996 vt3 = _mm256_mul_ps(vt3, vs3);
997 vp4 = _mm256_mul_ps(vp4, vt4);
998 vt4 = _mm256_mul_ps(vt4, vs4);
999 vp5 = _mm256_mul_ps(vp5, vt5);
1000 vt5 = _mm256_mul_ps(vt5, vs5);
1001 vp6 = _mm256_mul_ps(vp6, vt6);
1002 vt6 = _mm256_mul_ps(vt6, vs6);
1003
1004 vs0 = _mm256_fmsub_ps(vs0, valpha, valpha);
1005 vp0 = _mm256_fmadd_ps(vp0, vt0, vt0);
1006 vs1 = _mm256_fmsub_ps(vs1, valpha, valpha);
1007 vp1 = _mm256_fmadd_ps(vp1, vt1, vt1);
1008 vs2 = _mm256_fmsub_ps(vs2, valpha, valpha);
1009 vp2 = _mm256_fmadd_ps(vp2, vt2, vt2);
1010 vs3 = _mm256_fmsub_ps(vs3, valpha, valpha);
1011 vp3 = _mm256_fmadd_ps(vp3, vt3, vt3);
1012 vs4 = _mm256_fmsub_ps(vs4, valpha, valpha);
1013 vp4 = _mm256_fmadd_ps(vp4, vt4, vt4);
1014 vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
1015 vp5 = _mm256_fmadd_ps(vp5, vt5, vt5);
1016 vs6 = _mm256_fmsub_ps(vs6, valpha, valpha);
1017 vp6 = _mm256_fmadd_ps(vp6, vt6, vt6);
1018
1019 const __m256 ve0 = _mm256_fmadd_ps(vp0, valpha, vs0);
1020 vx0 = _mm256_mul_ps(vx0, vbeta);
1021 const __m256 ve1 = _mm256_fmadd_ps(vp1, valpha, vs1);
1022 vx1 = _mm256_mul_ps(vx1, vbeta);
1023 const __m256 ve2 = _mm256_fmadd_ps(vp2, valpha, vs2);
1024 vx2 = _mm256_mul_ps(vx2, vbeta);
1025 const __m256 ve3 = _mm256_fmadd_ps(vp3, valpha, vs3);
1026 vx3 = _mm256_mul_ps(vx3, vbeta);
1027 const __m256 ve4 = _mm256_fmadd_ps(vp4, valpha, vs4);
1028 vx4 = _mm256_mul_ps(vx4, vbeta);
1029 const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
1030 vx5 = _mm256_mul_ps(vx5, vbeta);
1031 const __m256 ve6 = _mm256_fmadd_ps(vp6, valpha, vs6);
1032 vx6 = _mm256_mul_ps(vx6, vbeta);
1033
1034 const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
1035 const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
1036 const __m256 vy2 = _mm256_blendv_ps(vx2, ve2, vx2);
1037 const __m256 vy3 = _mm256_blendv_ps(vx3, ve3, vx3);
1038 const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
1039 const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);
1040 const __m256 vy6 = _mm256_blendv_ps(vx6, ve6, vx6);
1041
1042 _mm256_storeu_ps(y, vy0);
1043 _mm256_storeu_ps(y + 8, vy1);
1044 _mm256_storeu_ps(y + 16, vy2);
1045 _mm256_storeu_ps(y + 24, vy3);
1046 _mm256_storeu_ps(y + 32, vy4);
1047 _mm256_storeu_ps(y + 40, vy5);
1048 _mm256_storeu_ps(y + 48, vy6);
1049 y += 56;
1050 }
1051 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1052 __m256 vx = _mm256_loadu_ps(x);
1053 x += 8;
1054
1055 const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1056
1057 __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1058 const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
1059 const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
1060 __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
1061 vn = _mm256_sub_ps(vn, vmagic_bias);
1062
1063 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1064
1065 __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
1066 vp = _mm256_fmadd_ps(vp, vt, vc2);
1067 vp = _mm256_mul_ps(vp, vt);
1068
1069 vt = _mm256_mul_ps(vt, vs);
1070 vs = _mm256_fmsub_ps(vs, valpha, valpha);
1071 vp = _mm256_fmadd_ps(vp, vt, vt);
1072 const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
1073
1074 vx = _mm256_mul_ps(vx, vbeta);
1075 const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1076
1077 _mm256_storeu_ps(y, vy);
1078 y += 8;
1079 }
1080 if XNN_UNLIKELY(n != 0) {
1081 assert(n >= 1 * sizeof(float));
1082 assert(n <= 7 * sizeof(float));
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_lut4_p4.mask_table[7] - n));
1084
1085 __m256 vx = _mm256_maskload_ps(x, vmask);
1086
1087 const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1088
1089 __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1090 const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
1091 const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
1092 __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
1093 vn = _mm256_sub_ps(vn, vmagic_bias);
1094
1095 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1096
1097 __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
1098 vp = _mm256_fmadd_ps(vp, vt, vc2);
1099 vp = _mm256_mul_ps(vp, vt);
1100
1101 vt = _mm256_mul_ps(vt, vs);
1102 vs = _mm256_fmsub_ps(vs, valpha, valpha);
1103 vp = _mm256_fmadd_ps(vp, vt, vt);
1104 const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
1105
1106 vx = _mm256_mul_ps(vx, vbeta);
1107 const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1108
1109 __m128 vy_lo = _mm256_castps256_ps128(vy);
1110 if (n & (4 * sizeof(float))) {
1111 _mm_storeu_ps(y, vy_lo);
1112 vy_lo = _mm256_extractf128_ps(vy, 1);
1113 y += 4;
1114 }
1115 if (n & (2 * sizeof(float))) {
1116 _mm_storel_pi((__m64*) y, vy_lo);
1117 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
1118 y += 2;
1119 }
1120 if (n & (1 * sizeof(float))) {
1121 _mm_store_ss(y, vy_lo);
1122 }
1123 }
1124 }
1125
void xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40(
1127 size_t n,
1128 const float* x,
1129 float* y,
1130 const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
1131 {
1132 assert(n % sizeof(float) == 0);
1133
1134 const __m256 vsign_mask = _mm256_load_ps(params->avx2_rr1_p5.sign_mask);
1135 const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p5.magic_bias);
1136 const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p5.log2e);
1137 const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p5.minus_ln2);
1138 const __m256 vc5 = _mm256_load_ps(params->avx2_rr1_p5.c5);
1139 const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_p5.c4);
1140 const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_p5.c3);
1141 const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p5.c2);
1142 const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p5.c1);
1143 const __m256 vone = _mm256_load_ps(params->avx2_rr1_p5.one);
1144 const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx2_rr1_p5.denorm_cutoff);
1145
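  // Sigmoid: z = -|x|; e = exp(z) via a degree-5 polynomial; f = e / (e + 1);
  // results below the denormal cutoff are flushed to zero, and positive inputs
  // use the reflection f(x) = 1 - f(-x).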
1146 for (; n >= 40 * sizeof(float); n -= 40 * sizeof(float)) {
1147 const __m256 vx0 = _mm256_loadu_ps(x);
1148 const __m256 vx1 = _mm256_loadu_ps(x + 8);
1149 const __m256 vx2 = _mm256_loadu_ps(x + 16);
1150 const __m256 vx3 = _mm256_loadu_ps(x + 24);
1151 const __m256 vx4 = _mm256_loadu_ps(x + 32);
1152 x += 40;
1153
1154 const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
1155 const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
1156 const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
1157 const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
1158 const __m256 vz4 = _mm256_or_ps(vx4, vsign_mask);
1159
1160 __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
1161 __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
1162 __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
1163 __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
1164 __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
1165
1166 const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
1167 const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
1168 const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
1169 const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));
1170 const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23));
1171
1172 vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1173 vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1174 vn2 = _mm256_sub_ps(vn2, vmagic_bias);
1175 vn3 = _mm256_sub_ps(vn3, vmagic_bias);
1176 vn4 = _mm256_sub_ps(vn4, vmagic_bias);
1177
1178 __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
1179 __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
1180 __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
1181 __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
1182 __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
1183
1184 __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4);
1185 __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4);
1186 __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4);
1187 __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4);
1188 __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
1189
1190 vp0 = _mm256_fmadd_ps(vp0, vt0, vc3);
1191 vp1 = _mm256_fmadd_ps(vp1, vt1, vc3);
1192 vp2 = _mm256_fmadd_ps(vp2, vt2, vc3);
1193 vp3 = _mm256_fmadd_ps(vp3, vt3, vc3);
1194 vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
1195
1196 vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
1197 vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
1198 vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
1199 vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
1200 vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
1201
1202 vp0 = _mm256_fmadd_ps(vp0, vt0, vc1);
1203 vp1 = _mm256_fmadd_ps(vp1, vt1, vc1);
1204 vp2 = _mm256_fmadd_ps(vp2, vt2, vc1);
1205 vp3 = _mm256_fmadd_ps(vp3, vt3, vc1);
1206 vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
1207
1208 vt0 = _mm256_mul_ps(vt0, vs0);
1209 vt1 = _mm256_mul_ps(vt1, vs1);
1210 vt2 = _mm256_mul_ps(vt2, vs2);
1211 vt3 = _mm256_mul_ps(vt3, vs3);
1212 vt4 = _mm256_mul_ps(vt4, vs4);
1213
1214 const __m256 ve0 = _mm256_fmadd_ps(vt0, vp0, vs0);
1215 const __m256 ve1 = _mm256_fmadd_ps(vt1, vp1, vs1);
1216 const __m256 ve2 = _mm256_fmadd_ps(vt2, vp2, vs2);
1217 const __m256 ve3 = _mm256_fmadd_ps(vt3, vp3, vs3);
1218 const __m256 ve4 = _mm256_fmadd_ps(vt4, vp4, vs4);
1219
1220 const __m256 vd0 = _mm256_add_ps(ve0, vone);
1221 const __m256 vd1 = _mm256_add_ps(ve1, vone);
1222 const __m256 vd2 = _mm256_add_ps(ve2, vone);
1223 const __m256 vd3 = _mm256_add_ps(ve3, vone);
1224 const __m256 vd4 = _mm256_add_ps(ve4, vone);
1225
1226 __m256 vf0 = _mm256_div_ps(ve0, vd0);
1227 __m256 vf1 = _mm256_div_ps(ve1, vd1);
1228 __m256 vf2 = _mm256_div_ps(ve2, vd2);
1229 __m256 vf3 = _mm256_div_ps(ve3, vd3);
1230 __m256 vf4 = _mm256_div_ps(ve4, vd4);
1231
1232 vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
1233 vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
1234 vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
1235 vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
1236 vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vz4, vdenorm_cutoff, _CMP_LT_OS), vf4);
1237
1238 vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
1239 vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
1240 vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
1241 vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
1242 vf4 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf4), vf4, vx4);
1243
1244 _mm256_storeu_ps(y, vf0);
1245 _mm256_storeu_ps(y + 8, vf1);
1246 _mm256_storeu_ps(y + 16, vf2);
1247 _mm256_storeu_ps(y + 24, vf3);
1248 _mm256_storeu_ps(y + 32, vf4);
1249 y += 40;
1250 }
1251 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1252 const __m256 vx = _mm256_loadu_ps(x);
1253 x += 8;
1254
1255 const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1256
1257 __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1258 const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1259 vn = _mm256_sub_ps(vn, vmagic_bias);
1260
1261 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1262
1263 __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
1264 vp = _mm256_fmadd_ps(vp, vt, vc3);
1265 vp = _mm256_fmadd_ps(vp, vt, vc2);
1266 vp = _mm256_fmadd_ps(vp, vt, vc1);
1267
1268 vt = _mm256_mul_ps(vt, vs);
1269 const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1270
1271 const __m256 vd = _mm256_add_ps(ve, vone);
1272 __m256 vf = _mm256_div_ps(ve, vd);
1273
1274 vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1275 vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1276
1277 _mm256_storeu_ps(y, vf);
1278 y += 8;
1279 }
1280 if XNN_UNLIKELY(n != 0) {
1281 assert(n >= 1 * sizeof(float));
1282 assert(n <= 7 * sizeof(float));
    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_p5.mask_table[7] - n));
1284
1285 const __m256 vx = _mm256_maskload_ps(x, vmask);
1286
1287 const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1288
1289 __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1290 const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1291 vn = _mm256_sub_ps(vn, vmagic_bias);
1292
1293 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1294
1295 __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
1296 vp = _mm256_fmadd_ps(vp, vt, vc3);
1297 vp = _mm256_fmadd_ps(vp, vt, vc2);
1298 vp = _mm256_fmadd_ps(vp, vt, vc1);
1299
1300 vt = _mm256_mul_ps(vt, vs);
1301 const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1302
1303 const __m256 vd = _mm256_add_ps(ve, vone);
1304 __m256 vf = _mm256_div_ps(ve, vd);
1305
1306 vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1307 vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1308
1309 __m128 vf_lo = _mm256_castps256_ps128(vf);
1310 if (n & (4 * sizeof(float))) {
1311 _mm_storeu_ps(y, vf_lo);
1312 vf_lo = _mm256_extractf128_ps(vf, 1);
1313 y += 4;
1314 }
1315 if (n & (2 * sizeof(float))) {
1316 _mm_storel_pi((__m64*) y, vf_lo);
1317 vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
1318 y += 2;
1319 }
1320 if (n & (1 * sizeof(float))) {
1321 _mm_store_ss(y, vf_lo);
1322 }
1323 }
1324 }
1325
void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
1327 size_t channels,
1328 size_t output_width,
1329 const int8_t** input,
1330 const void* weights,
1331 int8_t* output,
1332 size_t input_stride,
1333 size_t output_increment,
1334 size_t input_offset,
1335 const int8_t* zero,
1336 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1337 {
1338 assert(channels != 0);
1339 assert(output_width != 0);
1340
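  // For each output pixel: fetch the 25 per-tap input-row pointers (entries equal
  // to `zero` address a shared zero buffer so padded taps read zeros); the
  // per-channel accumulation that follows processes 16 channels per iteration,
  // as the up16x25 naming indicates.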
1341 do {
1342 const int8_t* i0 = input[0];
1343 assert(i0 != NULL);
1344 if XNN_UNPREDICTABLE(i0 != zero) {
1345 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1346 }
1347 const int8_t* i1 = input[1];
1348 assert(i1 != NULL);
1349 if XNN_UNPREDICTABLE(i1 != zero) {
1350 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1351 }
1352 const int8_t* i2 = input[2];
1353 assert(i2 != NULL);
1354 if XNN_UNPREDICTABLE(i2 != zero) {
1355 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1356 }
1357 const int8_t* i3 = input[3];
1358 assert(i3 != NULL);
1359 if XNN_UNPREDICTABLE(i3 != zero) {
1360 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
1361 }
1362 const int8_t* i4 = input[4];
1363 assert(i4 != NULL);
1364 if XNN_UNPREDICTABLE(i4 != zero) {
1365 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
1366 }
1367 const int8_t* i5 = input[5];
1368 assert(i5 != NULL);
1369 if XNN_UNPREDICTABLE(i5 != zero) {
1370 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
1371 }
1372 const int8_t* i6 = input[6];
1373 assert(i6 != NULL);
1374 if XNN_UNPREDICTABLE(i6 != zero) {
1375 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
1376 }
1377 const int8_t* i7 = input[7];
1378 assert(i7 != NULL);
1379 if XNN_UNPREDICTABLE(i7 != zero) {
1380 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
1381 }
1382 const int8_t* i8 = input[8];
1383 assert(i8 != NULL);
1384 if XNN_UNPREDICTABLE(i8 != zero) {
1385 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
1386 }
1387 const int8_t* i9 = input[9];
1388 assert(i9 != NULL);
1389 if XNN_UNPREDICTABLE(i9 != zero) {
1390 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
1391 }
1392 const int8_t* i10 = input[10];
1393 assert(i10 != NULL);
1394 if XNN_UNPREDICTABLE(i10 != zero) {
1395 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
1396 }
1397 const int8_t* i11 = input[11];
1398 assert(i11 != NULL);
1399 if XNN_UNPREDICTABLE(i11 != zero) {
1400 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
1401 }
1402 const int8_t* i12 = input[12];
1403 assert(i12 != NULL);
1404 if XNN_UNPREDICTABLE(i12 != zero) {
1405 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
1406 }
1407 const int8_t* i13 = input[13];
1408 assert(i13 != NULL);
1409 if XNN_UNPREDICTABLE(i13 != zero) {
1410 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
1411 }
1412 const int8_t* i14 = input[14];
1413 assert(i14 != NULL);
1414 if XNN_UNPREDICTABLE(i14 != zero) {
1415 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
1416 }
1417 const int8_t* i15 = input[15];
1418 assert(i15 != NULL);
1419 if XNN_UNPREDICTABLE(i15 != zero) {
1420 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
1421 }
1422 const int8_t* i16 = input[16];
1423 assert(i16 != NULL);
1424 if XNN_UNPREDICTABLE(i16 != zero) {
1425 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
1426 }
1427 const int8_t* i17 = input[17];
1428 assert(i17 != NULL);
1429 if XNN_UNPREDICTABLE(i17 != zero) {
1430 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
1431 }
1432 const int8_t* i18 = input[18];
1433 assert(i18 != NULL);
1434 if XNN_UNPREDICTABLE(i18 != zero) {
1435 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
1436 }
1437 const int8_t* i19 = input[19];
1438 assert(i19 != NULL);
1439 if XNN_UNPREDICTABLE(i19 != zero) {
1440 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
1441 }
1442 const int8_t* i20 = input[20];
1443 assert(i20 != NULL);
1444 if XNN_UNPREDICTABLE(i20 != zero) {
1445 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
1446 }
1447 const int8_t* i21 = input[21];
1448 assert(i21 != NULL);
1449 if XNN_UNPREDICTABLE(i21 != zero) {
1450 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
1451 }
1452 const int8_t* i22 = input[22];
1453 assert(i22 != NULL);
1454 if XNN_UNPREDICTABLE(i22 != zero) {
1455 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
1456 }
1457 const int8_t* i23 = input[23];
1458 assert(i23 != NULL);
1459 if XNN_UNPREDICTABLE(i23 != zero) {
1460 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
1461 }
1462 const int8_t* i24 = input[24];
1463 assert(i24 != NULL);
1464 if XNN_UNPREDICTABLE(i24 != zero) {
1465 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
1466 }
1467 input = (const int8_t**) ((uintptr_t) input + input_stride);
1468
1469 size_t c = channels;
1470 const void* w = weights;
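    // Main channel loop: 16 channels at a time. The packed weights for each group
    // are 16 int32 biases, then 25 x 16 int8 taps, then 16 fp32 per-channel scales.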
1471 for (; c >= 16; c -= 16) {
1472 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1473 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
1474
1475
1476 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
1477 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
1478 const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
1479 const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
1480 i0 += 16;
1481
1482 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
1483 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
1484
1485 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
1486 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
1487 const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
1488 const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
1489 i1 += 16;
1490
1491 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
1492 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
1493
1494 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
1495 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
1496 const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
1497 const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
1498 i2 += 16;
1499
1500 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
1501 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
1502
1503 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
1504 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
1505 const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
1506 const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
1507 i3 += 16;
1508
1509 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
1510 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
1511
1512 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
1513 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
1514 const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
1515 const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
1516 i4 += 16;
1517
1518 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
1519 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
1520
1521 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
1522 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
1523 const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
1524 const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
1525 i5 += 16;
1526
1527 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
1528 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
1529
1530 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
1531 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
1532 const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
1533 const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
1534 i6 += 16;
1535
1536 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
1537 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
1538
1539 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
1540 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
1541 const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
1542 const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
1543 i7 += 16;
1544
1545 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
1546 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
1547
1548 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
1549 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
1550 const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
1551 const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
1552 i8 += 16;
1553
1554 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
1555 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
1556
1557 const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
1558 const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
1559 const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
1560 const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
1561 i9 += 16;
1562
1563 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
1564 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
1565
1566 const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
1567 const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
1568 const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
1569 const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
1570 i10 += 16;
1571
1572 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
1573 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
1574
1575 const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
1576 const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
1577 const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
1578 const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
1579 i11 += 16;
1580
1581 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
1582 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
1583
1584 const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
1585 const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
1586 const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
1587 const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
1588 i12 += 16;
1589
1590 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
1591 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
1592
1593 const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
1594 const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
1595 const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
1596 const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
1597 i13 += 16;
1598
1599 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
1600 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
1601
1602 const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
1603 const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
1604 const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
1605 const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
1606 i14 += 16;
1607
1608 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
1609 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
1610
1611 const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
1612 const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
1613 const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
1614 const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
1615 i15 += 16;
1616
1617 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
1618 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
1619
1620 const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
1621 const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
1622 const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
1623 const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
1624 i16 += 16;
1625
1626 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
1627 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
1628
1629 const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
1630 const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
1631 const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
1632 const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
1633 i17 += 16;
1634
1635 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
1636 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
1637
1638 const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
1639 const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
1640 const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
1641 const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
1642 i18 += 16;
1643
1644 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
1645 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
1646
1647 const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
1648 const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
1649 const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
1650 const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
1651 i19 += 16;
1652
1653 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
1654 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
1655
1656 const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
1657 const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
1658 const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
1659 const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
1660 i20 += 16;
1661
1662 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
1663 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
1664
1665 const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
1666 const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
1667 const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
1668 const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
1669 i21 += 16;
1670
1671 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
1672 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
1673
1674 const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
1675 const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
1676 const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
1677 const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
1678 i22 += 16;
1679
1680 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
1681 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
1682
1683 const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
1684 const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
1685 const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
1686 const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
1687 i23 += 16;
1688
1689 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
1690 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
1691
1692 const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
1693 const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
1694 const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
1695 const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
1696 i24 += 16;
1697
1698 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
1699 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
1700
1701 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
1702
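      // fp32 requantization: scale the int32 accumulators, clamp against the output
      // maximum, convert back to int32, add the zero point while packing to int16,
      // then pack to int8 and clamp against the output minimum.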
1703 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
1704 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
1705
1706 const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
1707 const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
1708 w = (const void*) ((const float*) w + 16);
1709 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
1710 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
1711
1712 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
1713 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
1714 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
1715
1716 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
1717 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
1718
1719 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
1720 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
1721
1722 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
1723
1724 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
1725 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
1726
1727 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
1728 output += 16;
1729 }
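    // Remainder channels (1..15): processed 8 at a time; the taps are read from k
    // with a 16-byte stride per tap to match the 16-channel weight packing.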
1730 if XNN_UNLIKELY(c != 0) {
1731 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
1732 do {
1733 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1734
1735
1736 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
1737 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
1738 i0 += 8;
1739
1740 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
1741
1742 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
1743 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
1744 i1 += 8;
1745
1746 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
1747
1748 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
1749 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
1750 i2 += 8;
1751
1752 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
1753
1754 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
1755 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
1756 i3 += 8;
1757
1758 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
1759
1760 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
1761 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
1762 i4 += 8;
1763
1764 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
1765
1766 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
1767 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
1768 i5 += 8;
1769
1770 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
1771
1772 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
1773 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
1774 i6 += 8;
1775
1776 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
1777
1778 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
1779 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
1780 i7 += 8;
1781
1782 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
1783
1784 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
1785 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
1786 i8 += 8;
1787
1788 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
1789
1790 const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
1791 const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
1792 i9 += 8;
1793
1794 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
1795
1796 const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
1797 const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
1798 i10 += 8;
1799
1800 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
1801
1802 const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
1803 const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
1804 i11 += 8;
1805
1806 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
1807
1808 const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
1809 const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
1810 i12 += 8;
1811
1812 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
1813
1814 const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
1815 const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
1816 i13 += 8;
1817
1818 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
1819
1820 const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
1821 const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
1822 i14 += 8;
1823
1824 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
1825
1826 const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
1827 const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
1828 i15 += 8;
1829
1830 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
1831
1832 const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
1833 const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
1834 i16 += 8;
1835
1836 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
1837
1838 const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
1839 const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
1840 i17 += 8;
1841
1842 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
1843
1844 const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
1845 const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
1846 i18 += 8;
1847
1848 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
1849
1850 const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
1851 const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
1852 i19 += 8;
1853
1854 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
1855
1856 const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
1857 const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
1858 i20 += 8;
1859
1860 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
1861
1862 const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
1863 const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
1864 i21 += 8;
1865
1866 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
1867
1868 const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
1869 const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
1870 i22 += 8;
1871
1872 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
1873
1874 const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
1875 const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
1876 i23 += 8;
1877
1878 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
1879
1880 const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
1881 const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
1882 i24 += 8;
1883
1884 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
1885
1886 k += 8;
1887
1888 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
1889 const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
1890 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
1891 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->avx2.output_max_less_zero_point));
1892 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
1893
1894 w = (const void*) ((const int32_t*) w + 8);
1895
1896 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
1897 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
1898
1899 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1900
1901 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
1902 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1903
1904 if XNN_LIKELY(c >= 8) {
1905 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1906 output += 8;
1907 c -= 8;
1908 } else {
1909 if (c & 4) {
1910 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
1911 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1912 output += 4;
1913 }
1914 if (c & 2) {
1915 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
1916 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1917 output += 2;
1918 }
1919 if (c & 1) {
1920 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1921 output += 1;
1922 }
1923 c = 0;
1924 }
1925 } while (c != 0);
1926 }
1927
1928 output = (int8_t*) ((uintptr_t) output + output_increment);
1929 } while (--output_width != 0);
1930 }
1931
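// Same structure as the 25-tap kernel above, specialized for 9 taps (e.g. 3x3 filters).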
1932 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
1933 size_t channels,
1934 size_t output_width,
1935 const int8_t** input,
1936 const void* weights,
1937 int8_t* output,
1938 size_t input_stride,
1939 size_t output_increment,
1940 size_t input_offset,
1941 const int8_t* zero,
1942 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1943 {
1944 assert(channels != 0);
1945 assert(output_width != 0);
1946
1947 do {
1948 const int8_t* i0 = input[0];
1949 assert(i0 != NULL);
1950 if XNN_UNPREDICTABLE(i0 != zero) {
1951 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1952 }
1953 const int8_t* i1 = input[1];
1954 assert(i1 != NULL);
1955 if XNN_UNPREDICTABLE(i1 != zero) {
1956 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1957 }
1958 const int8_t* i2 = input[2];
1959 assert(i2 != NULL);
1960 if XNN_UNPREDICTABLE(i2 != zero) {
1961 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1962 }
1963 const int8_t* i3 = input[3];
1964 assert(i3 != NULL);
1965 if XNN_UNPREDICTABLE(i3 != zero) {
1966 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
1967 }
1968 const int8_t* i4 = input[4];
1969 assert(i4 != NULL);
1970 if XNN_UNPREDICTABLE(i4 != zero) {
1971 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
1972 }
1973 const int8_t* i5 = input[5];
1974 assert(i5 != NULL);
1975 if XNN_UNPREDICTABLE(i5 != zero) {
1976 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
1977 }
1978 const int8_t* i6 = input[6];
1979 assert(i6 != NULL);
1980 if XNN_UNPREDICTABLE(i6 != zero) {
1981 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
1982 }
1983 const int8_t* i7 = input[7];
1984 assert(i7 != NULL);
1985 if XNN_UNPREDICTABLE(i7 != zero) {
1986 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
1987 }
1988 const int8_t* i8 = input[8];
1989 assert(i8 != NULL);
1990 if XNN_UNPREDICTABLE(i8 != zero) {
1991 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
1992 }
1993 input = (const int8_t**) ((uintptr_t) input + input_stride);
1994
1995 size_t c = channels;
1996 const void* w = weights;
1997 for (; c >= 16; c -= 16) {
1998 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1999 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
2000
2001
2002 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2003 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2004 const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
2005 const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
2006 i0 += 16;
2007
2008 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2009 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
2010
2011 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2012 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2013 const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
2014 const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
2015 i1 += 16;
2016
2017 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2018 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
2019
2020 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2021 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2022 const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
2023 const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
2024 i2 += 16;
2025
2026 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2027 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
2028
2029 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2030 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
2031 const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
2032 const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
2033 i3 += 16;
2034
2035 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2036 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
2037
2038 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2039 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
2040 const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
2041 const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
2042 i4 += 16;
2043
2044 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2045 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
2046
2047 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2048 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
2049 const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
2050 const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
2051 i5 += 16;
2052
2053 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2054 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
2055
2056 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2057 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
2058 const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
2059 const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
2060 i6 += 16;
2061
2062 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2063 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
2064
2065 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2066 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
2067 const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
2068 const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
2069 i7 += 16;
2070
2071 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2072 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
2073
2074 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2075 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
2076 const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
2077 const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
2078 i8 += 16;
2079
2080 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2081 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
2082
2083 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
2084
2085 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2086 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
2087
2088 const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
2089 const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
2090 w = (const void*) ((const float*) w + 16);
2091 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2092 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
2093
2094 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2095 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
2096 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
2097
2098 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2099 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
2100
2101 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2102 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
2103
2104 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
2105
2106 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
2107 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
2108
2109 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2110 output += 16;
2111 }
2112 if XNN_UNLIKELY(c != 0) {
2113 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
2114 do {
2115 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2116
2117
2118 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2119 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
2120 i0 += 8;
2121
2122 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2123
2124 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2125 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
2126 i1 += 8;
2127
2128 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2129
2130 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2131 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
2132 i2 += 8;
2133
2134 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2135
2136 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2137 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
2138 i3 += 8;
2139
2140 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2141
2142 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2143 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
2144 i4 += 8;
2145
2146 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2147
2148 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2149 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
2150 i5 += 8;
2151
2152 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2153
2154 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2155 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
2156 i6 += 8;
2157
2158 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2159
2160 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2161 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
2162 i7 += 8;
2163
2164 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2165
2166 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2167 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
2168 i8 += 8;
2169
2170 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2171
2172 k += 8;
2173
2174 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2175 const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
2176 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2177 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->avx2.output_max_less_zero_point));
2178 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2179
2180 w = (const void*) ((const int32_t*) w + 8);
2181
2182 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
2183 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
2184
2185 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2186
2187 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
2188 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
2189
2190 if XNN_LIKELY(c >= 8) {
2191 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
2192 output += 8;
2193 c -= 8;
2194 } else {
2195 if (c & 4) {
2196 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
2197 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
2198 output += 4;
2199 }
2200 if (c & 2) {
2201 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
2202 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
2203 output += 2;
2204 }
2205 if (c & 1) {
2206 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
2207 output += 1;
2208 }
2209 c = 0;
2210 }
2211 } while (c != 0);
2212 }
2213
2214 output = (int8_t*) ((uintptr_t) output + output_increment);
2215 } while (--output_width != 0);
2216 }
2217
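// QC8 GEMM microkernel: 1 row of A by 8 columns of B, consuming 8 int8 values of K
// per step (c8 packing), with per-channel fp32 requantization.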
2218 void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
2219 size_t mr,
2220 size_t nc,
2221 size_t kc,
2222 const int8_t* restrict a,
2223 size_t a_stride,
2224 const void* restrict w,
2225 int8_t* restrict c,
2226 size_t cm_stride,
2227 size_t cn_stride,
2228 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2229 {
2230 assert(mr != 0);
2231 assert(mr <= 1);
2232 assert(nc != 0);
2233 assert(kc != 0);
2234 assert(kc % sizeof(int8_t) == 0);
2235 assert(a != NULL);
2236 assert(w != NULL);
2237 assert(c != NULL);
2238
2239 kc = round_up_po2(kc, 8);
2240 const int8_t* a0 = a;
2241 int8_t* c0 = c;
2242
2243 do {
2244 const __m128i vbias0x0 = _mm_loadu_si32(w);
2245 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2246 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2247 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2248 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2249 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2250 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2251 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2252 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2253 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2254 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2255 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2256 w = (const void*) ((const int32_t*) w + 8);
2257
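    // K loop: broadcast 8 int8 values of a0 into both 128-bit lanes, sign-extend the
    // A and B bytes to int16, and accumulate with _mm256_madd_epi16; each 256-bit
    // accumulator covers two output columns.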
2258 size_t k = 0;
2259 while (k < kc) {
2260 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2261 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2262 a0 += 8;
2263
2264 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2265 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2266
2267 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2268 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2269 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2270
2271 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2272 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2273 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2274
2275 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2276 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2277 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2278
2279 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2280
2281 w = (const void*) ((const int8_t*) w + 64);
2282 k += 8 * sizeof(int8_t);
2283 }
2284
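    // Reduce the paired accumulators with horizontal adds, then permute the
    // interleaved lanes back into column order 0..7.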
2285 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2286 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2287
2288 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2289
2290 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2291 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2292
2293 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2294
2295 const __m256 vscale01234567 = _mm256_load_ps(w);
2296 w = (const void*) ((const float*) w + 8);
2297 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2298
2299 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2300 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2301
2302 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2303
2304 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2305 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
2306
2307 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2308
2309 __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
2310
2311 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2312
2313 __m128i vout_lo = _mm256_castsi256_si128(vout);
2314 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2315
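    // Full tiles store 8 int8 outputs; partial tiles store 4, 2 and 1 bytes,
    // shifting the packed result between stores.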
2316 if (nc >= 8) {
2317 _mm_storel_epi64((__m128i*) c0, vout_lo);
2318
2319 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2320
2321 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2322
2323 nc -= 8;
2324 } else {
2325 if (nc & 4) {
2326 _mm_storeu_si32(c0, vout_lo);
2327
2328 c0 += 4;
2329
2330 vout_lo = _mm_srli_epi64(vout_lo, 32);
2331 vout_hi = _mm_srli_epi64(vout_hi, 32);
2332 }
2333 if (nc & 2) {
2334 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2335
2336 c0 += 2;
2337
2338 vout_lo = _mm_srli_epi32(vout_lo, 16);
2339 vout_hi = _mm_srli_epi32(vout_hi, 16);
2340 }
2341 if (nc & 1) {
2342 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2343 }
2344
2345 nc = 0;
2346 }
2347 } while (nc != 0);
2348 }
2349
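// 3-row variant of the QC8 GEMM above: when mr < 3, the extra row pointers alias
// lower rows so a single code path handles mr = 1..3.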
2350 void xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
2351 size_t mr,
2352 size_t nc,
2353 size_t kc,
2354 const int8_t* restrict a,
2355 size_t a_stride,
2356 const void* restrict w,
2357 int8_t* restrict c,
2358 size_t cm_stride,
2359 size_t cn_stride,
2360 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2361 {
2362 assert(mr != 0);
2363 assert(mr <= 3);
2364 assert(nc != 0);
2365 assert(kc != 0);
2366 assert(kc % sizeof(int8_t) == 0);
2367 assert(a != NULL);
2368 assert(w != NULL);
2369 assert(c != NULL);
2370
2371 kc = round_up_po2(kc, 8);
2372 const int8_t* a0 = a;
2373 int8_t* c0 = c;
2374 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
2375 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2376 if XNN_UNPREDICTABLE(mr < 2) {
2377 a1 = a0;
2378 c1 = c0;
2379 }
2380 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
2381 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2382 if XNN_UNPREDICTABLE(mr <= 2) {
2383 a2 = a1;
2384 c2 = c1;
2385 }
2386
2387 do {
2388 const __m128i vbias0x0 = _mm_loadu_si32(w);
2389 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2390 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2391 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2392 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2393 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2394 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2395 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2396 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2397 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2398 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2399 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2400 __m256i vacc1x01 = vacc0x01;
2401 __m256i vacc1x23 = vacc0x23;
2402 __m256i vacc1x45 = vacc0x45;
2403 __m256i vacc1x67 = vacc0x67;
2404 __m256i vacc2x01 = vacc0x01;
2405 __m256i vacc2x23 = vacc0x23;
2406 __m256i vacc2x45 = vacc0x45;
2407 __m256i vacc2x67 = vacc0x67;
2408 w = (const void*) ((const int32_t*) w + 8);
2409
2410 size_t k = 0;
2411 while (k < kc) {
2412 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2413 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2414 a0 += 8;
2415 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
2416 const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
2417 a1 += 8;
2418 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
2419 const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
2420 a2 += 8;
2421
2422 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2423 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2424
2425 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2426 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
2427 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
2428 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2429 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2430
2431 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2432 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
2433 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
2434 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2435 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2436
2437 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2438 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
2439 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
2440 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2441 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2442
2443 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2444 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
2445 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
2446
2447 w = (const void*) ((const int8_t*) w + 64);
2448 k += 8 * sizeof(int8_t);
2449 }
2450
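    // Each accumulator holds four int32 partial sums per column, with the even column of
    // a pair in the low 128-bit lane and the odd column in the high lane. Two rounds of
    // horizontal adds collapse them to one sum per column in 0 2 4 6 1 3 5 7 order, and
    // the cross-lane permute below restores the natural 0..7 order.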
2451 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2452 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2453 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
2454 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
2455 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
2456 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
2457
2458 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2459 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
2460 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
2461
2462 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2463 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2464 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
2465 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
2466
2467 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2468 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
2469 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
2470
2471 const __m256 vscale01234567 = _mm256_load_ps(w);
2472 w = (const void*) ((const float*) w + 8);
2473 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2474 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
2475 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
2476
2477 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2478 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2479 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
2480 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
2481
2482 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2483 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
2484 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
2485
2486 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2487 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
2488 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
2489
2490 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2491 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2492
2493 __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
2494
2495 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2496
2497 __m128i vout_lo = _mm256_castsi256_si128(vout);
2498 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2499
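    // After the int8 pack, row 0 occupies bytes 0-7 of vout_lo, row 1 bytes 0-7 of
    // vout_hi, and row 2 bytes 8-15 of vout_lo; the stores below pick the rows out
    // accordingly.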
2500 if (nc >= 8) {
2501 _mm_storel_epi64((__m128i*) c0, vout_lo);
2502 _mm_storel_epi64((__m128i*) c1, vout_hi);
2503 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
2504
2505 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2506 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2507 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2508
2509 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2510 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
2511 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
2512
2513 nc -= 8;
2514 } else {
2515 if (nc & 4) {
2516 _mm_storeu_si32(c0, vout_lo);
2517 _mm_storeu_si32(c1, vout_hi);
2518 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
2519
2520 c0 += 4;
2521 c1 += 4;
2522 c2 += 4;
2523
2524 vout_lo = _mm_srli_epi64(vout_lo, 32);
2525 vout_hi = _mm_srli_epi64(vout_hi, 32);
2526 }
2527 if (nc & 2) {
2528 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2529 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
2530 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
2531
2532 c0 += 2;
2533 c1 += 2;
2534 c2 += 2;
2535
2536 vout_lo = _mm_srli_epi32(vout_lo, 16);
2537 vout_hi = _mm_srli_epi32(vout_hi, 16);
2538 }
2539 if (nc & 1) {
2540 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2541 *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
2542 *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
2543 }
2544
2545 nc = 0;
2546 }
2547 } while (nc != 0);
2548 }
2549
2550 void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
2551 size_t mr,
2552 size_t nc,
2553 size_t kc,
2554 size_t ks,
2555 const int8_t** restrict a,
2556 const void* restrict w,
2557 int8_t* restrict c,
2558 size_t cm_stride,
2559 size_t cn_stride,
2560 size_t a_offset,
2561 const int8_t* zero,
2562 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2563 {
2564 assert(mr != 0);
2565 assert(mr <= 1);
2566 assert(nc != 0);
2567 assert(kc != 0);
2568 assert(ks != 0);
2569 assert(ks % (1 * sizeof(void*)) == 0);
2570 assert(a_offset % sizeof(int8_t) == 0);
2571 assert(a != NULL);
2572 assert(w != NULL);
2573 assert(c != NULL);
2574
2575 kc = round_up_po2(kc, 8);
2576 int8_t* c0 = c;
2577
2578 do {
2579 const __m128i vbias0x0 = _mm_loadu_si32(w);
2580 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2581 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2582 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2583 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2584 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2585 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2586 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2587 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2588 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2589 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2590 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2591 w = (const void*) ((const int32_t*) w + 8);
2592
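    // ks counts the indirection pointers consumed per output pixel; pointers equal to the
    // `zero` buffer are used as-is, all others are rebased by a_offset.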
2593 size_t p = ks;
2594 do {
2595 const int8_t* restrict a0 = a[0];
2596 if XNN_UNPREDICTABLE(a0 != zero) {
2597 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2598 }
2599 a += 1;
2600
2601 size_t k = 0;
2602 while (k < kc) {
2603 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2604 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2605 a0 += 8;
2606
2607 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2608 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2609
2610 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2611 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2612 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2613
2614 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2615 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2616 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2617
2618 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2619 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2620 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2621
2622 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2623
2624 w = (const void*) ((const int8_t*) w + 64);
2625 k += 8 * sizeof(int8_t);
2626 }
2627 p -= 1 * sizeof(void*);
2628 } while (p != 0);
2629
2630 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2631 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2632
2633 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2634
2635 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2636 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2637
2638 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2639
2640 const __m256 vscale01234567 = _mm256_load_ps(w);
2641 w = (const void*) ((const float*) w + 8);
2642 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2643
2644 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2645 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2646
2647 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2648
2649 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2650 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
2651
2652 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2653
2654 __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
2655
2656 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2657
2658 __m128i vout_lo = _mm256_castsi256_si128(vout);
2659 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2660
2661 if (nc >= 8) {
2662 _mm_storel_epi64((__m128i*) c0, vout_lo);
2663
2664 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2665
2666 a = (const int8_t**restrict) ((uintptr_t) a - ks);
2667
2668 nc -= 8;
2669 } else {
2670 if (nc & 4) {
2671 _mm_storeu_si32(c0, vout_lo);
2672
2673 c0 += 4;
2674
2675 vout_lo = _mm_srli_epi64(vout_lo, 32);
2676 vout_hi = _mm_srli_epi64(vout_hi, 32);
2677 }
2678 if (nc & 2) {
2679 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2680
2681 c0 += 2;
2682
2683 vout_lo = _mm_srli_epi32(vout_lo, 16);
2684 vout_hi = _mm_srli_epi32(vout_hi, 16);
2685 }
2686 if (nc & 1) {
2687 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2688 }
2689
2690 nc = 0;
2691 }
2692 } while (nc != 0);
2693 }
2694
2695 void xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
2696 size_t mr,
2697 size_t nc,
2698 size_t kc,
2699 size_t ks,
2700 const int8_t** restrict a,
2701 const void* restrict w,
2702 int8_t* restrict c,
2703 size_t cm_stride,
2704 size_t cn_stride,
2705 size_t a_offset,
2706 const int8_t* zero,
2707 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2708 {
2709 assert(mr != 0);
2710 assert(mr <= 3);
2711 assert(nc != 0);
2712 assert(kc != 0);
2713 assert(ks != 0);
2714 assert(ks % (3 * sizeof(void*)) == 0);
2715 assert(a_offset % sizeof(int8_t) == 0);
2716 assert(a != NULL);
2717 assert(w != NULL);
2718 assert(c != NULL);
2719
2720 kc = round_up_po2(kc, 8);
2721 int8_t* c0 = c;
2722 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2723 if XNN_UNPREDICTABLE(mr < 2) {
2724 c1 = c0;
2725 }
2726 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2727 if XNN_UNPREDICTABLE(mr <= 2) {
2728 c2 = c1;
2729 }
2730
2731 do {
2732 const __m128i vbias0x0 = _mm_loadu_si32(w);
2733 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2734 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2735 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2736 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2737 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2738 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2739 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2740 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2741 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2742 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2743 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2744 __m256i vacc1x01 = vacc0x01;
2745 __m256i vacc1x23 = vacc0x23;
2746 __m256i vacc1x45 = vacc0x45;
2747 __m256i vacc1x67 = vacc0x67;
2748 __m256i vacc2x01 = vacc0x01;
2749 __m256i vacc2x23 = vacc0x23;
2750 __m256i vacc2x45 = vacc0x45;
2751 __m256i vacc2x67 = vacc0x67;
2752 w = (const void*) ((const int32_t*) w + 8);
2753
2754 size_t p = ks;
2755 do {
2756 const int8_t* restrict a0 = a[0];
2757 if XNN_UNPREDICTABLE(a0 != zero) {
2758 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2759 }
2760 const int8_t* restrict a1 = a[1];
2761 if XNN_UNPREDICTABLE(a1 != zero) {
2762 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
2763 }
2764 const int8_t* restrict a2 = a[2];
2765 if XNN_UNPREDICTABLE(a2 != zero) {
2766 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
2767 }
2768 a += 3;
2769
2770 size_t k = 0;
2771 while (k < kc) {
2772 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2773 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2774 a0 += 8;
2775 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
2776 const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
2777 a1 += 8;
2778 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
2779 const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
2780 a2 += 8;
2781
2782 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2783 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2784
2785 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2786 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
2787 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
2788 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2789 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2790
2791 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2792 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
2793 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
2794 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2795 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2796
2797 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2798 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
2799 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
2800 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2801 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2802
2803 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2804 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
2805 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
2806
2807 w = (const void*) ((const int8_t*) w + 64);
2808 k += 8 * sizeof(int8_t);
2809 }
2810 p -= 3 * sizeof(void*);
2811 } while (p != 0);
2812
2813 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2814 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2815 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
2816 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
2817 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
2818 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
2819
2820 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2821 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
2822 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
2823
2824 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2825 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2826 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
2827 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
2828
2829 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2830 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
2831 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
2832
2833 const __m256 vscale01234567 = _mm256_load_ps(w);
2834 w = (const void*) ((const float*) w + 8);
2835 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2836 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
2837 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
2838
2839 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2840 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2841 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
2842 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
2843
2844 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2845 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
2846 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
2847
2848 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2849 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
2850 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
2851
2852 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2853 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2854
2855 __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
2856
2857 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2858
2859 __m128i vout_lo = _mm256_castsi256_si128(vout);
2860 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2861
2862 if (nc >= 8) {
2863 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
2864 _mm_storel_epi64((__m128i*) c1, vout_hi);
2865 _mm_storel_epi64((__m128i*) c0, vout_lo);
2866
2867 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2868 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2869 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2870
2871 a = (const int8_t**restrict) ((uintptr_t) a - ks);
2872
2873 nc -= 8;
2874 } else {
2875 if (nc & 4) {
2876 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
2877 _mm_storeu_si32(c1, vout_hi);
2878 _mm_storeu_si32(c0, vout_lo);
2879
2880 c2 += 4;
2881 c1 += 4;
2882 c0 += 4;
2883
2884 vout_lo = _mm_srli_epi64(vout_lo, 32);
2885 vout_hi = _mm_srli_epi64(vout_hi, 32);
2886 }
2887 if (nc & 2) {
2888 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
2889 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
2890 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2891
2892 c2 += 2;
2893 c1 += 2;
2894 c0 += 2;
2895
2896 vout_lo = _mm_srli_epi32(vout_lo, 16);
2897 vout_hi = _mm_srli_epi32(vout_hi, 16);
2898 }
2899 if (nc & 1) {
2900 *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
2901 *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
2902 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2903 }
2904
2905 nc = 0;
2906 }
2907 } while (nc != 0);
2908 }
2909
2910 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
2911 size_t channels,
2912 size_t output_width,
2913 const int8_t** input,
2914 const void* weights,
2915 int8_t* output,
2916 size_t input_stride,
2917 size_t output_increment,
2918 size_t input_offset,
2919 const int8_t* zero,
2920 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2921 {
2922 assert(channels != 0);
2923 assert(output_width != 0);
2924
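  // One iteration per output pixel: fetch the 25 input row pointers from the indirection
  // buffer, rebasing each by input_offset unless it points at the zero buffer.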
2925 do {
2926 const int8_t* i0 = input[0];
2927 assert(i0 != NULL);
2928 if XNN_UNPREDICTABLE(i0 != zero) {
2929 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2930 }
2931 const int8_t* i1 = input[1];
2932 assert(i1 != NULL);
2933 if XNN_UNPREDICTABLE(i1 != zero) {
2934 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2935 }
2936 const int8_t* i2 = input[2];
2937 assert(i2 != NULL);
2938 if XNN_UNPREDICTABLE(i2 != zero) {
2939 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2940 }
2941 const int8_t* i3 = input[3];
2942 assert(i3 != NULL);
2943 if XNN_UNPREDICTABLE(i3 != zero) {
2944 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2945 }
2946 const int8_t* i4 = input[4];
2947 assert(i4 != NULL);
2948 if XNN_UNPREDICTABLE(i4 != zero) {
2949 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2950 }
2951 const int8_t* i5 = input[5];
2952 assert(i5 != NULL);
2953 if XNN_UNPREDICTABLE(i5 != zero) {
2954 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2955 }
2956 const int8_t* i6 = input[6];
2957 assert(i6 != NULL);
2958 if XNN_UNPREDICTABLE(i6 != zero) {
2959 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2960 }
2961 const int8_t* i7 = input[7];
2962 assert(i7 != NULL);
2963 if XNN_UNPREDICTABLE(i7 != zero) {
2964 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2965 }
2966 const int8_t* i8 = input[8];
2967 assert(i8 != NULL);
2968 if XNN_UNPREDICTABLE(i8 != zero) {
2969 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2970 }
2971 const int8_t* i9 = input[9];
2972 assert(i9 != NULL);
2973 if XNN_UNPREDICTABLE(i9 != zero) {
2974 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2975 }
2976 const int8_t* i10 = input[10];
2977 assert(i10 != NULL);
2978 if XNN_UNPREDICTABLE(i10 != zero) {
2979 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2980 }
2981 const int8_t* i11 = input[11];
2982 assert(i11 != NULL);
2983 if XNN_UNPREDICTABLE(i11 != zero) {
2984 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2985 }
2986 const int8_t* i12 = input[12];
2987 assert(i12 != NULL);
2988 if XNN_UNPREDICTABLE(i12 != zero) {
2989 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2990 }
2991 const int8_t* i13 = input[13];
2992 assert(i13 != NULL);
2993 if XNN_UNPREDICTABLE(i13 != zero) {
2994 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2995 }
2996 const int8_t* i14 = input[14];
2997 assert(i14 != NULL);
2998 if XNN_UNPREDICTABLE(i14 != zero) {
2999 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
3000 }
3001 const int8_t* i15 = input[15];
3002 assert(i15 != NULL);
3003 if XNN_UNPREDICTABLE(i15 != zero) {
3004 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
3005 }
3006 const int8_t* i16 = input[16];
3007 assert(i16 != NULL);
3008 if XNN_UNPREDICTABLE(i16 != zero) {
3009 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
3010 }
3011 const int8_t* i17 = input[17];
3012 assert(i17 != NULL);
3013 if XNN_UNPREDICTABLE(i17 != zero) {
3014 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
3015 }
3016 const int8_t* i18 = input[18];
3017 assert(i18 != NULL);
3018 if XNN_UNPREDICTABLE(i18 != zero) {
3019 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
3020 }
3021 const int8_t* i19 = input[19];
3022 assert(i19 != NULL);
3023 if XNN_UNPREDICTABLE(i19 != zero) {
3024 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
3025 }
3026 const int8_t* i20 = input[20];
3027 assert(i20 != NULL);
3028 if XNN_UNPREDICTABLE(i20 != zero) {
3029 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
3030 }
3031 const int8_t* i21 = input[21];
3032 assert(i21 != NULL);
3033 if XNN_UNPREDICTABLE(i21 != zero) {
3034 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
3035 }
3036 const int8_t* i22 = input[22];
3037 assert(i22 != NULL);
3038 if XNN_UNPREDICTABLE(i22 != zero) {
3039 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
3040 }
3041 const int8_t* i23 = input[23];
3042 assert(i23 != NULL);
3043 if XNN_UNPREDICTABLE(i23 != zero) {
3044 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
3045 }
3046 const int8_t* i24 = input[24];
3047 assert(i24 != NULL);
3048 if XNN_UNPREDICTABLE(i24 != zero) {
3049 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
3050 }
3051 input = (const int8_t**) ((uintptr_t) input + input_stride);
3052
3053 size_t c = channels;
3054 const void* w = weights;
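    // Weights are packed per 16-channel group: 16 int32 bias values followed by
    // 25 * 16 int8 filter taps, so w advances by 16*4 + 400 bytes per group.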
3055 for (; c >= 16; c -= 16) {
3056 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3057 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
3058
3059
3060 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3061 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
3062 const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
3063 const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
3064 i0 += 16;
3065
3066 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3067 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
3068
3069 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3070 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
3071 const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
3072 const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
3073 i1 += 16;
3074
3075 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3076 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
3077
3078 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3079 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
3080 const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
3081 const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
3082 i2 += 16;
3083
3084 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3085 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
3086
3087 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3088 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
3089 const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
3090 const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
3091 i3 += 16;
3092
3093 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3094 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
3095
3096 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3097 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
3098 const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
3099 const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
3100 i4 += 16;
3101
3102 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3103 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
3104
3105 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3106 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
3107 const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
3108 const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
3109 i5 += 16;
3110
3111 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3112 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
3113
3114 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3115 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
3116 const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
3117 const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
3118 i6 += 16;
3119
3120 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3121 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
3122
3123 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3124 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
3125 const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
3126 const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
3127 i7 += 16;
3128
3129 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3130 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
3131
3132 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3133 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
3134 const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
3135 const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
3136 i8 += 16;
3137
3138 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3139 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
3140
3141 const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
3142 const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
3143 const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
3144 const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
3145 i9 += 16;
3146
3147 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
3148 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
3149
3150 const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
3151 const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
3152 const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
3153 const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
3154 i10 += 16;
3155
3156 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
3157 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
3158
3159 const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
3160 const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
3161 const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
3162 const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
3163 i11 += 16;
3164
3165 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
3166 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
3167
3168 const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
3169 const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
3170 const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
3171 const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
3172 i12 += 16;
3173
3174 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
3175 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
3176
3177 const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
3178 const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
3179 const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
3180 const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
3181 i13 += 16;
3182
3183 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
3184 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
3185
3186 const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
3187 const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
3188 const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
3189 const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
3190 i14 += 16;
3191
3192 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
3193 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
3194
3195 const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
3196 const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
3197 const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
3198 const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
3199 i15 += 16;
3200
3201 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
3202 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
3203
3204 const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
3205 const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
3206 const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
3207 const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
3208 i16 += 16;
3209
3210 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
3211 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
3212
3213 const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
3214 const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
3215 const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
3216 const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
3217 i17 += 16;
3218
3219 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
3220 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
3221
3222 const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
3223 const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
3224 const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
3225 const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
3226 i18 += 16;
3227
3228 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
3229 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
3230
3231 const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
3232 const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
3233 const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
3234 const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
3235 i19 += 16;
3236
3237 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
3238 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
3239
3240 const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
3241 const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
3242 const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
3243 const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
3244 i20 += 16;
3245
3246 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
3247 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
3248
3249 const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
3250 const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
3251 const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
3252 const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
3253 i21 += 16;
3254
3255 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
3256 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
3257
3258 const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
3259 const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
3260 const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
3261 const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
3262 i22 += 16;
3263
3264 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
3265 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
3266
3267 const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
3268 const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
3269 const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
3270 const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
3271 i23 += 16;
3272
3273 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
3274 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
3275
3276 const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
3277 const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
3278 const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
3279 const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
3280 i24 += 16;
3281
3282 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
3283 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
3284
3285 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
3286
3287 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3288 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
3289
3290 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3291 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
3292 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
3293
3294 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3295 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
3296 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
3297
3298 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3299 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
3300
3301 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3302 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
3303
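      // vout012389AB4567CDEF interleaves channels 0-3/8-B in the low lane and 4-7/C-F in
      // the high lane; the 32-bit shuffle below restores channels 0..15 in order.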
3304 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3305
3306 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3307 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3308
3309 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3310 output += 16;
3311 }
3312 if XNN_UNLIKELY(c != 0) {
3313 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
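      // Remainder loop for the last 1-15 channels, processed 8 at a time; k walks the
      // int8 taps, which stay strided for the full 16-channel tile (offsets in steps of 16).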
3314 do {
3315 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3316
3317
3318 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3319 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
3320 i0 += 8;
3321
3322 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3323
3324 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3325 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
3326 i1 += 8;
3327
3328 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3329
3330 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3331 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
3332 i2 += 8;
3333
3334 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3335
3336 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3337 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
3338 i3 += 8;
3339
3340 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3341
3342 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3343 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
3344 i4 += 8;
3345
3346 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3347
3348 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3349 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
3350 i5 += 8;
3351
3352 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3353
3354 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3355 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
3356 i6 += 8;
3357
3358 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3359
3360 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3361 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
3362 i7 += 8;
3363
3364 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3365
3366 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3367 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
3368 i8 += 8;
3369
3370 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3371
3372 const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
3373 const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
3374 i9 += 8;
3375
3376 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
3377
3378 const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
3379 const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
3380 i10 += 8;
3381
3382 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
3383
3384 const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
3385 const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
3386 i11 += 8;
3387
3388 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
3389
3390 const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
3391 const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
3392 i12 += 8;
3393
3394 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
3395
3396 const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
3397 const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
3398 i13 += 8;
3399
3400 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
3401
3402 const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
3403 const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
3404 i14 += 8;
3405
3406 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
3407
3408 const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
3409 const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
3410 i15 += 8;
3411
3412 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
3413
3414 const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
3415 const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
3416 i16 += 8;
3417
3418 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
3419
3420 const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
3421 const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
3422 i17 += 8;
3423
3424 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
3425
3426 const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
3427 const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
3428 i18 += 8;
3429
3430 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
3431
3432 const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
3433 const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
3434 i19 += 8;
3435
3436 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
3437
3438 const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
3439 const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
3440 i20 += 8;
3441
3442 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
3443
3444 const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
3445 const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
3446 i21 += 8;
3447
3448 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
3449
3450 const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
3451 const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
3452 i22 += 8;
3453
3454 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
3455
3456 const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
3457 const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
3458 i23 += 8;
3459
3460 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
3461
3462 const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
3463 const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
3464 i24 += 8;
3465
3466 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
3467
3468 k += 8;
3469
3470 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3471 vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
3472 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3473 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3474
3475 w = (const void*) ((const int32_t*) w + 8);
3476
3477 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3478 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3479
3480 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3481
3482 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3483 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3484
3485 if XNN_LIKELY(c >= 8) {
3486 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3487 output += 8;
3488 c -= 8;
3489 } else {
3490 if (c & 4) {
3491 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
3492 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3493 output += 4;
3494 }
3495 if (c & 2) {
3496 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
3497 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3498 output += 2;
3499 }
3500 if (c & 1) {
3501 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3502 output += 1;
3503 }
3504 c = 0;
3505 }
3506 } while (c != 0);
3507 }
3508
3509 output = (int8_t*) ((uintptr_t) output + output_increment);
3510 } while (--output_width != 0);
3511 }
3512
3513 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
3514 size_t channels,
3515 size_t output_width,
3516 const int8_t** input,
3517 const void* weights,
3518 int8_t* output,
3519 size_t input_stride,
3520 size_t output_increment,
3521 size_t input_offset,
3522 const int8_t* zero,
3523 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3524 {
3525 assert(channels != 0);
3526 assert(output_width != 0);
3527
3528 do {
3529 const int8_t* i0 = input[0];
3530 assert(i0 != NULL);
3531 if XNN_UNPREDICTABLE(i0 != zero) {
3532 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3533 }
3534 const int8_t* i1 = input[1];
3535 assert(i1 != NULL);
3536 if XNN_UNPREDICTABLE(i1 != zero) {
3537 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3538 }
3539 const int8_t* i2 = input[2];
3540 assert(i2 != NULL);
3541 if XNN_UNPREDICTABLE(i2 != zero) {
3542 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3543 }
3544 const int8_t* i3 = input[3];
3545 assert(i3 != NULL);
3546 if XNN_UNPREDICTABLE(i3 != zero) {
3547 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3548 }
3549 const int8_t* i4 = input[4];
3550 assert(i4 != NULL);
3551 if XNN_UNPREDICTABLE(i4 != zero) {
3552 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3553 }
3554 const int8_t* i5 = input[5];
3555 assert(i5 != NULL);
3556 if XNN_UNPREDICTABLE(i5 != zero) {
3557 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3558 }
3559 const int8_t* i6 = input[6];
3560 assert(i6 != NULL);
3561 if XNN_UNPREDICTABLE(i6 != zero) {
3562 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3563 }
3564 const int8_t* i7 = input[7];
3565 assert(i7 != NULL);
3566 if XNN_UNPREDICTABLE(i7 != zero) {
3567 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3568 }
3569 const int8_t* i8 = input[8];
3570 assert(i8 != NULL);
3571 if XNN_UNPREDICTABLE(i8 != zero) {
3572 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3573 }
3574 input = (const int8_t**) ((uintptr_t) input + input_stride);
3575
3576 size_t c = channels;
3577 const void* w = weights;
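    // Main loop: 16 channels per iteration. Each of the 9 taps sign-extends 8+8 int8 inputs and weights
    // to int32 and accumulates their products; the packed weight block holds 16 int32 biases followed by
    // 16 int8 weights per tap.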
3578 for (; c >= 16; c -= 16) {
3579 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3580 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
3581
3582
3583 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3584 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
3585 const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
3586 const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
3587 i0 += 16;
3588
3589 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3590 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
3591
3592 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3593 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
3594 const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
3595 const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
3596 i1 += 16;
3597
3598 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3599 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
3600
3601 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3602 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
3603 const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
3604 const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
3605 i2 += 16;
3606
3607 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3608 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
3609
3610 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3611 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
3612 const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
3613 const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
3614 i3 += 16;
3615
3616 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3617 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
3618
3619 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3620 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
3621 const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
3622 const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
3623 i4 += 16;
3624
3625 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3626 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
3627
3628 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3629 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
3630 const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
3631 const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
3632 i5 += 16;
3633
3634 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3635 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
3636
3637 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3638 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
3639 const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
3640 const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
3641 i6 += 16;
3642
3643 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3644 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
3645
3646 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3647 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
3648 const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
3649 const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
3650 i7 += 16;
3651
3652 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3653 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
3654
3655 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3656 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
3657 const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
3658 const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
3659 i8 += 16;
3660
3661 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3662 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
3663
3664 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
3665
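      // Requantization: convert to float, apply the per-tensor scale, clamp against (output_max - output_zero_point),
      // round back to int32, then pack down to int8 with saturation while adding the output zero point and applying
      // the output minimum; the final 32-bit shuffle undoes the per-lane interleaving of the AVX2 pack instructions.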
3666 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3667 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
3668
3669 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3670 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
3671 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
3672
3673 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3674 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
3675 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
3676
3677 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3678 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
3679
3680 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3681 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
3682
3683 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3684
3685 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3686 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3687
3688 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3689 output += 16;
3690 }
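    // Remainder path: up to 15 leftover channels are handled 8 at a time (XNN_OOB_READS permits reading past
    // the last valid channel); biases are still taken from w, while the per-tap int8 weights are addressed
    // through k at 16-byte strides, matching the 16-channel packing of the weight block.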
3691 if XNN_UNLIKELY(c != 0) {
3692 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
3693 do {
3694 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3695
3696
3697 const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3698 const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
3699 i0 += 8;
3700
3701 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3702
3703 const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3704 const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
3705 i1 += 8;
3706
3707 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3708
3709 const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3710 const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
3711 i2 += 8;
3712
3713 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3714
3715 const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3716 const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
3717 i3 += 8;
3718
3719 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3720
3721 const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3722 const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
3723 i4 += 8;
3724
3725 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3726
3727 const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3728 const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
3729 i5 += 8;
3730
3731 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3732
3733 const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3734 const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
3735 i6 += 8;
3736
3737 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3738
3739 const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3740 const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
3741 i7 += 8;
3742
3743 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3744
3745 const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3746 const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
3747 i8 += 8;
3748
3749 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3750
3751 k += 8;
3752
3753 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3754 vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
3755 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3756 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3757
3758 w = (const void*) ((const int32_t*) w + 8);
3759
3760 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3761 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3762
3763 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3764
3765 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3766 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3767
3768 if XNN_LIKELY(c >= 8) {
3769 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3770 output += 8;
3771 c -= 8;
3772 } else {
3773 if (c & 4) {
3774 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
3775 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3776 output += 4;
3777 }
3778 if (c & 2) {
3779 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
3780 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3781 output += 2;
3782 }
3783 if (c & 1) {
3784 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3785 output += 1;
3786 }
3787 c = 0;
3788 }
3789 } while (c != 0);
3790 }
3791
3792 output = (int8_t*) ((uintptr_t) output + output_increment);
3793 } while (--output_width != 0);
3794 }
3795
3796 void xnn_qs8_f32_vcvt_ukernel__avx2_x16(
3797 size_t n,
3798 const int8_t* x,
3799 float* y,
3800 const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3801 {
3802 assert(n != 0);
3803 assert(n % sizeof(int8_t) == 0);
3804 assert(x != NULL);
3805 assert(y != NULL);
3806
3807 const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
3808 const __m256 vscale = _mm256_load_ps(params->avx.scale);
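  // Sign-extend 16 int8 values to int32, add the precomputed negative zero point, convert to float, and scale.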
3809 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
3810 __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3811 __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
3812 x += 16;
3813
3814 vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
3815 vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
3816
3817 __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
3818 __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
3819
3820 vy01234567 = _mm256_mul_ps(vy01234567, vscale);
3821 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
3822
3823 _mm256_storeu_ps(y, vy01234567);
3824 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3825 y += 16;
3826 }
3827 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
3828 __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3829 vx = _mm256_add_epi32(vx, vminus_zero_point);
3830 x += 8;
3831
3832 __m256 vy = _mm256_cvtepi32_ps(vx);
3833 vy = _mm256_mul_ps(vy, vscale);
3834
3835 _mm256_storeu_ps(y, vy);
3836 y += 8;
3837 }
3838 if XNN_UNLIKELY(n != 0) {
3839 assert(n >= 1 * sizeof(int8_t));
3840 assert(n <= 7 * sizeof(int8_t));
3841
3842 __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3843 vx = _mm256_add_epi32(vx, vminus_zero_point);
3844
3845 __m256 vy = _mm256_cvtepi32_ps(vx);
3846 vy = _mm256_mul_ps(vy, vscale);
3847
3848 __m128 vy_lo = _mm256_castps256_ps128(vy);
3849 if (n & (4 * sizeof(int8_t))) {
3850 _mm_storeu_ps(y, vy_lo);
3851 vy_lo = _mm256_extractf128_ps(vy, 1);
3852 y += 4;
3853 }
3854 if (n & (2 * sizeof(int8_t))) {
3855 _mm_storel_pi((__m64*) y, vy_lo);
3856 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3857 y += 2;
3858 }
3859 if (n & (1 * sizeof(int8_t))) {
3860 _mm_store_ss(y, vy_lo);
3861 }
3862 }
3863 }
3864
3865 void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
3866 size_t mr,
3867 size_t nc,
3868 size_t kc,
3869 const int8_t* restrict a,
3870 size_t a_stride,
3871 const void* restrict w,
3872 int8_t* restrict c,
3873 size_t cm_stride,
3874 size_t cn_stride,
3875 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3876 {
3877 assert(mr != 0);
3878 assert(mr <= 1);
3879 assert(nc != 0);
3880 assert(kc != 0);
3881 assert(kc % sizeof(int8_t) == 0);
3882 assert(a != NULL);
3883 assert(w != NULL);
3884 assert(c != NULL);
3885
3886 kc = round_up_po2(kc, 8);
3887 const int8_t* a0 = a;
3888 int8_t* c0 = c;
3889
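  // Each 256-bit accumulator covers a pair of output channels: the low and high 128-bit lanes hold the two
  // channels' partial sums, produced by _mm256_madd_epi16 over 8 elements of k per step.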
3890 do {
3891 const __m128i vbias0x0 = _mm_loadu_si32(w);
3892 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
3893 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3894 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
3895 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
3896 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3897 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
3898 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
3899 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3900 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
3901 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
3902 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3903 w = (const void*) ((const int32_t*) w + 8);
3904
3905 size_t k = 0;
3906 while (k < kc) {
3907 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3908 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3909 a0 += 8;
3910
3911 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3912 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3913
3914 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3915 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3916 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3917
3918 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3919 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3920 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3921
3922 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3923 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3924 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3925
3926 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3927
3928 w = (const void*) ((const int8_t*) w + 64);
3929 k += 8 * sizeof(int8_t);
3930 }
3931
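    // Reduce the paired accumulators: two rounds of _mm256_hadd_epi32 leave the channel sums in
    // 0 2 4 6 1 3 5 7 order, and the permute restores 0..7 before requantization.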
3932 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3933 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3934
3935 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3936
3937 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3938 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3939
3940 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3941
3942 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3943 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
3944
3945 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3946 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3947
3948 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3949
3950 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3951 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
3952
3953 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3954
3955 __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
3956
3957 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3958
3959 __m128i vout_lo = _mm256_castsi256_si128(vout);
3960 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3961
3962 if (nc >= 8) {
3963 _mm_storel_epi64((__m128i*) c0, vout_lo);
3964
3965 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3966
3967 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
3968
3969 nc -= 8;
3970 } else {
3971 if (nc & 4) {
3972 _mm_storeu_si32(c0, vout_lo);
3973
3974 c0 += 4;
3975
3976 vout_lo = _mm_srli_epi64(vout_lo, 32);
3977 vout_hi = _mm_srli_epi64(vout_hi, 32);
3978 }
3979 if (nc & 2) {
3980 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
3981
3982 c0 += 2;
3983
3984 vout_lo = _mm_srli_epi32(vout_lo, 16);
3985 vout_hi = _mm_srli_epi32(vout_hi, 16);
3986 }
3987 if (nc & 1) {
3988 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
3989 }
3990
3991 nc = 0;
3992 }
3993 } while (nc != 0);
3994 }
3995
3996 void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
3997 size_t mr,
3998 size_t nc,
3999 size_t kc,
4000 const int8_t* restrict a,
4001 size_t a_stride,
4002 const void* restrict w,
4003 int8_t* restrict c,
4004 size_t cm_stride,
4005 size_t cn_stride,
4006 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4007 {
4008 assert(mr != 0);
4009 assert(mr <= 3);
4010 assert(nc != 0);
4011 assert(kc != 0);
4012 assert(kc % sizeof(int8_t) == 0);
4013 assert(a != NULL);
4014 assert(w != NULL);
4015 assert(c != NULL);
4016
4017 kc = round_up_po2(kc, 8);
4018 const int8_t* a0 = a;
4019 int8_t* c0 = c;
4020 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
4021 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4022 if XNN_UNPREDICTABLE(mr < 2) {
4023 a1 = a0;
4024 c1 = c0;
4025 }
4026 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
4027 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4028 if XNN_UNPREDICTABLE(mr <= 2) {
4029 a2 = a1;
4030 c2 = c1;
4031 }
4032
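  // Rows 1 and 2 start as copies of row 0's bias-initialized accumulators, since all rows share the same bias.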
4033 do {
4034 const __m128i vbias0x0 = _mm_loadu_si32(w);
4035 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4036 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4037 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4038 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4039 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4040 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4041 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4042 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4043 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4044 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4045 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4046 __m256i vacc1x01 = vacc0x01;
4047 __m256i vacc1x23 = vacc0x23;
4048 __m256i vacc1x45 = vacc0x45;
4049 __m256i vacc1x67 = vacc0x67;
4050 __m256i vacc2x01 = vacc0x01;
4051 __m256i vacc2x23 = vacc0x23;
4052 __m256i vacc2x45 = vacc0x45;
4053 __m256i vacc2x67 = vacc0x67;
4054 w = (const void*) ((const int32_t*) w + 8);
4055
4056 size_t k = 0;
4057 while (k < kc) {
4058 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4059 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4060 a0 += 8;
4061 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
4062 const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
4063 a1 += 8;
4064 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
4065 const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
4066 a2 += 8;
4067
4068 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4069 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4070
4071 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4072 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
4073 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
4074 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4075 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4076
4077 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4078 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
4079 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
4080 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4081 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4082
4083 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4084 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
4085 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
4086 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4087 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4088
4089 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4090 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
4091 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
4092
4093 w = (const void*) ((const int8_t*) w + 64);
4094 k += 8 * sizeof(int8_t);
4095 }
4096
4097 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4098 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4099 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
4100 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
4101 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
4102 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
4103
4104 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4105 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
4106 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
4107
4108 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4109 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4110 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
4111 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
4112
4113 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4114 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
4115 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
4116
4117 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4118 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4119 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
4120 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
4121
4122 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4123 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4124 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
4125 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
4126
4127 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4128 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
4129 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
4130
4131 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4132 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
4133 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
4134
4135 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4136 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4137
4138 __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
4139
4140 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4141
4142 __m128i vout_lo = _mm256_castsi256_si128(vout);
4143 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4144
4145 if (nc >= 8) {
4146 _mm_storel_epi64((__m128i*) c0, vout_lo);
4147 _mm_storel_epi64((__m128i*) c1, vout_hi);
4148 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
4149
4150 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4151 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4152 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4153
4154 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4155 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
4156 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
4157
4158 nc -= 8;
4159 } else {
4160 if (nc & 4) {
4161 _mm_storeu_si32(c0, vout_lo);
4162 _mm_storeu_si32(c1, vout_hi);
4163 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
4164
4165 c0 += 4;
4166 c1 += 4;
4167 c2 += 4;
4168
4169 vout_lo = _mm_srli_epi64(vout_lo, 32);
4170 vout_hi = _mm_srli_epi64(vout_hi, 32);
4171 }
4172 if (nc & 2) {
4173 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4174 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
4175 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
4176
4177 c0 += 2;
4178 c1 += 2;
4179 c2 += 2;
4180
4181 vout_lo = _mm_srli_epi32(vout_lo, 16);
4182 vout_hi = _mm_srli_epi32(vout_hi, 16);
4183 }
4184 if (nc & 1) {
4185 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4186 *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
4187 *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
4188 }
4189
4190 nc = 0;
4191 }
4192 } while (nc != 0);
4193 }
4194
4195 void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
4196 size_t mr,
4197 size_t nc,
4198 size_t kc,
4199 size_t ks,
4200 const int8_t** restrict a,
4201 const void* restrict w,
4202 int8_t* restrict c,
4203 size_t cm_stride,
4204 size_t cn_stride,
4205 size_t a_offset,
4206 const int8_t* zero,
4207 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4208 {
4209 assert(mr != 0);
4210 assert(mr <= 1);
4211 assert(nc != 0);
4212 assert(kc != 0);
4213 assert(ks != 0);
4214 assert(ks % (1 * sizeof(void*)) == 0);
4215 assert(a_offset % sizeof(int8_t) == 0);
4216 assert(a != NULL);
4217 assert(w != NULL);
4218 assert(c != NULL);
4219
4220 kc = round_up_po2(kc, 8);
4221 int8_t* c0 = c;
4222
4223 do {
4224 const __m128i vbias0x0 = _mm_loadu_si32(w);
4225 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4226 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4227 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4228 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4229 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4230 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4231 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4232 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4233 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4234 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4235 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4236 w = (const void*) ((const int32_t*) w + 8);
4237
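    // Indirection loop: one input pointer per ks step. Pointers equal to `zero` reference the zero buffer and
    // are not offset by a_offset, so padded taps contribute nothing to the accumulators.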
4238 size_t p = ks;
4239 do {
4240 const int8_t* restrict a0 = a[0];
4241 if XNN_UNPREDICTABLE(a0 != zero) {
4242 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4243 }
4244 a += 1;
4245
4246 size_t k = 0;
4247 while (k < kc) {
4248 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4249 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4250 a0 += 8;
4251
4252 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4253 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4254
4255 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4256 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4257 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4258
4259 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4260 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4261 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4262
4263 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4264 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4265 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4266
4267 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4268
4269 w = (const void*) ((const int8_t*) w + 64);
4270 k += 8 * sizeof(int8_t);
4271 }
4272 p -= 1 * sizeof(void*);
4273 } while (p != 0);
4274
4275 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4276 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4277
4278 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4279
4280 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4281 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4282
4283 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4284
4285 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4286 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4287
4288 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4289 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4290
4291 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4292
4293 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4294 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
4295
4296 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4297
4298 __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
4299
4300 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4301
4302 __m128i vout_lo = _mm256_castsi256_si128(vout);
4303 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4304
4305 if (nc >= 8) {
4306 _mm_storel_epi64((__m128i*) c0, vout_lo);
4307
4308 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4309
4310 a = (const int8_t**restrict) ((uintptr_t) a - ks);
4311
4312 nc -= 8;
4313 } else {
4314 if (nc & 4) {
4315 _mm_storeu_si32(c0, vout_lo);
4316
4317 c0 += 4;
4318
4319 vout_lo = _mm_srli_epi64(vout_lo, 32);
4320 vout_hi = _mm_srli_epi64(vout_hi, 32);
4321 }
4322 if (nc & 2) {
4323 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4324
4325 c0 += 2;
4326
4327 vout_lo = _mm_srli_epi32(vout_lo, 16);
4328 vout_hi = _mm_srli_epi32(vout_hi, 16);
4329 }
4330 if (nc & 1) {
4331 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4332 }
4333
4334 nc = 0;
4335 }
4336 } while (nc != 0);
4337 }
4338
4339 void xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
4340 size_t mr,
4341 size_t nc,
4342 size_t kc,
4343 size_t ks,
4344 const int8_t** restrict a,
4345 const void* restrict w,
4346 int8_t* restrict c,
4347 size_t cm_stride,
4348 size_t cn_stride,
4349 size_t a_offset,
4350 const int8_t* zero,
4351 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4352 {
4353 assert(mr != 0);
4354 assert(mr <= 3);
4355 assert(nc != 0);
4356 assert(kc != 0);
4357 assert(ks != 0);
4358 assert(ks % (3 * sizeof(void*)) == 0);
4359 assert(a_offset % sizeof(int8_t) == 0);
4360 assert(a != NULL);
4361 assert(w != NULL);
4362 assert(c != NULL);
4363
4364 kc = round_up_po2(kc, 8);
4365 int8_t* c0 = c;
4366 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4367 if XNN_UNPREDICTABLE(mr < 2) {
4368 c1 = c0;
4369 }
4370 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4371 if XNN_UNPREDICTABLE(mr <= 2) {
4372 c2 = c1;
4373 }
4374
4375 do {
4376 const __m128i vbias0x0 = _mm_loadu_si32(w);
4377 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4378 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4379 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4380 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4381 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4382 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4383 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4384 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4385 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4386 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4387 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4388 __m256i vacc1x01 = vacc0x01;
4389 __m256i vacc1x23 = vacc0x23;
4390 __m256i vacc1x45 = vacc0x45;
4391 __m256i vacc1x67 = vacc0x67;
4392 __m256i vacc2x01 = vacc0x01;
4393 __m256i vacc2x23 = vacc0x23;
4394 __m256i vacc2x45 = vacc0x45;
4395 __m256i vacc2x67 = vacc0x67;
4396 w = (const void*) ((const int32_t*) w + 8);
4397
4398 size_t p = ks;
4399 do {
4400 const int8_t* restrict a0 = a[0];
4401 if XNN_UNPREDICTABLE(a0 != zero) {
4402 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4403 }
4404 const int8_t* restrict a1 = a[1];
4405 if XNN_UNPREDICTABLE(a1 != zero) {
4406 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
4407 }
4408 const int8_t* restrict a2 = a[2];
4409 if XNN_UNPREDICTABLE(a2 != zero) {
4410 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
4411 }
4412 a += 3;
4413
4414 size_t k = 0;
4415 while (k < kc) {
4416 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4417 const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4418 a0 += 8;
4419 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
4420 const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
4421 a1 += 8;
4422 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
4423 const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
4424 a2 += 8;
4425
4426 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4427 const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4428
4429 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4430 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
4431 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
4432 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4433 const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4434
4435 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4436 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
4437 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
4438 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4439 const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4440
4441 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4442 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
4443 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
4444 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4445 const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4446
4447 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4448 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
4449 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
4450
4451 w = (const void*) ((const int8_t*) w + 64);
4452 k += 8 * sizeof(int8_t);
4453 }
4454 p -= 3 * sizeof(void*);
4455 } while (p != 0);
4456
4457 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4458 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4459 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
4460 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
4461 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
4462 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
4463
4464 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4465 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
4466 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
4467
4468 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4469 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4470 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
4471 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
4472
4473 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4474 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
4475 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
4476
4477 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4478 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4479 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
4480 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
4481
4482 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4483 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4484 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
4485 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
4486
4487 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4488 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
4489 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
4490
4491 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4492 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
4493 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
4494
4495 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4496 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4497
4498 __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
4499
4500 vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4501
4502 __m128i vout_lo = _mm256_castsi256_si128(vout);
4503 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4504
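    // After packing, vout_lo carries row 0 in its low 8 bytes and row 2 in its high 8 bytes, while vout_hi
    // carries row 1; the stores below select the matching halves.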
4505 if (nc >= 8) {
4506 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
4507 _mm_storel_epi64((__m128i*) c1, vout_hi);
4508 _mm_storel_epi64((__m128i*) c0, vout_lo);
4509
4510 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4511 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4512 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4513
4514 a = (const int8_t**restrict) ((uintptr_t) a - ks);
4515
4516 nc -= 8;
4517 } else {
4518 if (nc & 4) {
4519 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
4520 _mm_storeu_si32(c1, vout_hi);
4521 _mm_storeu_si32(c0, vout_lo);
4522
4523 c2 += 4;
4524 c1 += 4;
4525 c0 += 4;
4526
4527 vout_lo = _mm_srli_epi64(vout_lo, 32);
4528 vout_hi = _mm_srli_epi64(vout_hi, 32);
4529 }
4530 if (nc & 2) {
4531 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
4532 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
4533 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4534
4535 c2 += 2;
4536 c1 += 2;
4537 c0 += 2;
4538
4539 vout_lo = _mm_srli_epi32(vout_lo, 16);
4540 vout_hi = _mm_srli_epi32(vout_hi, 16);
4541 }
4542 if (nc & 1) {
4543 *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
4544 *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
4545 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4546 }
4547
4548 nc = 0;
4549 }
4550 } while (nc != 0);
4551 }
4552
4553 void xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
4554 size_t n,
4555 const int8_t* input_a,
4556 const int8_t* input_b,
4557 int8_t* output,
4558 const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4559 {
4560 const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
4561 const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
4562 const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
4563 const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
4564 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
4565 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
4566 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
4567
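  // Fixed-point add: sign-extend both operands to int32, scale each by its multiplier, accumulate into the
  // rounding bias, arithmetic-shift right, then pack with saturation around the output zero point and clamp.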
4568 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4569 const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4570 const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
4571 const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
4572 const __m256i vb89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
4573 input_a += 16;
4574 input_b += 16;
4575
4576 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4577 __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
4578
4579 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
4580 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
4581
4582 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4583 vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
4584
4585 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4586
4587 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4588
4589 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4590
4591 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
4592
4593 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4594 output += 16;
4595 }
4596 if XNN_UNLIKELY(n != 0) {
4597 do {
4598 const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4599 const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
4600 input_a += 8;
4601 input_b += 8;
4602
4603 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4604
4605 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
4606
4607 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4608
4609 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
4610 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4611 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4612 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
4613
4614 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
4615 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4616 output += 8;
4617 n -= 8 * sizeof(int8_t);
4618 } else {
4619 if (n & (4 * sizeof(int8_t))) {
4620 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
4621 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4622 output += 4;
4623 }
4624 if (n & (2 * sizeof(int8_t))) {
4625 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
4626 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4627 output += 2;
4628 }
4629 if (n & (1 * sizeof(int8_t))) {
4630 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4631 }
4632 n = 0;
4633 }
4634 } while (n != 0);
4635 }
4636 }
4637
4638 void xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
4639 size_t n,
4640 const int8_t* input_a,
4641 const int8_t* input_b,
4642 int8_t* output,
4643 const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4644 {
4645 const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
4646 const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
4647 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
4648 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
4649 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
4650
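  // Scalar-broadcast variant: the second operand is a single value, so its scaled contribution is folded
  // into the rounding bias once before the loop.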
4651 const __m256i vbias = _mm256_add_epi32(
4652 _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
4653 _mm256_load_si256((const __m256i*) params->avx2.bias));
4654 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4655 const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4656 const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
4657 input_a += 16;
4658
4659 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4660 __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
4661
4662 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4663 vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
4664
4665 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4666
4667 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4668
4669 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4670
4671 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
4672
4673 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4674 output += 16;
4675 }
4676 if XNN_UNLIKELY(n != 0) {
4677 do {
4678 const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4679 input_a += 8;
4680
4681 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4682
4683 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4684
4685 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
4686 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4687 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4688 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
4689
4690 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
4691 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4692 output += 8;
4693 n -= 8 * sizeof(int8_t);
4694 } else {
4695 if (n & (4 * sizeof(int8_t))) {
4696 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
4697 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4698 output += 4;
4699 }
4700 if (n & (2 * sizeof(int8_t))) {
4701 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
4702 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4703 output += 2;
4704 }
4705 if (n & (1 * sizeof(int8_t))) {
4706 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4707 }
4708 n = 0;
4709 }
4710 } while (n != 0);
4711 }
4712 }
4713
4714 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
4715 size_t channels,
4716 size_t output_width,
4717 const uint8_t** input,
4718 const void* weights,
4719 uint8_t* output,
4720 size_t input_stride,
4721 size_t output_increment,
4722 size_t input_offset,
4723 const uint8_t* zero,
4724 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4725 {
4726 assert(channels != 0);
4727 assert(output_width != 0);
4728
4729 const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
4730 do {
4731 const uint8_t* i0 = input[0];
4732 assert(i0 != NULL);
4733 if XNN_UNPREDICTABLE(i0 != zero) {
4734 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
4735 }
4736 const uint8_t* i1 = input[1];
4737 assert(i1 != NULL);
4738 if XNN_UNPREDICTABLE(i1 != zero) {
4739 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
4740 }
4741 const uint8_t* i2 = input[2];
4742 assert(i2 != NULL);
4743 if XNN_UNPREDICTABLE(i2 != zero) {
4744 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
4745 }
4746 const uint8_t* i3 = input[3];
4747 assert(i3 != NULL);
4748 if XNN_UNPREDICTABLE(i3 != zero) {
4749 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
4750 }
4751 const uint8_t* i4 = input[4];
4752 assert(i4 != NULL);
4753 if XNN_UNPREDICTABLE(i4 != zero) {
4754 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
4755 }
4756 const uint8_t* i5 = input[5];
4757 assert(i5 != NULL);
4758 if XNN_UNPREDICTABLE(i5 != zero) {
4759 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
4760 }
4761 const uint8_t* i6 = input[6];
4762 assert(i6 != NULL);
4763 if XNN_UNPREDICTABLE(i6 != zero) {
4764 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
4765 }
4766 const uint8_t* i7 = input[7];
4767 assert(i7 != NULL);
4768 if XNN_UNPREDICTABLE(i7 != zero) {
4769 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
4770 }
4771 const uint8_t* i8 = input[8];
4772 assert(i8 != NULL);
4773 if XNN_UNPREDICTABLE(i8 != zero) {
4774 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
4775 }
4776 const uint8_t* i9 = input[9];
4777 assert(i9 != NULL);
4778 if XNN_UNPREDICTABLE(i9 != zero) {
4779 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
4780 }
4781 const uint8_t* i10 = input[10];
4782 assert(i10 != NULL);
4783 if XNN_UNPREDICTABLE(i10 != zero) {
4784 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
4785 }
4786 const uint8_t* i11 = input[11];
4787 assert(i11 != NULL);
4788 if XNN_UNPREDICTABLE(i11 != zero) {
4789 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
4790 }
4791 const uint8_t* i12 = input[12];
4792 assert(i12 != NULL);
4793 if XNN_UNPREDICTABLE(i12 != zero) {
4794 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
4795 }
4796 const uint8_t* i13 = input[13];
4797 assert(i13 != NULL);
4798 if XNN_UNPREDICTABLE(i13 != zero) {
4799 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
4800 }
4801 const uint8_t* i14 = input[14];
4802 assert(i14 != NULL);
4803 if XNN_UNPREDICTABLE(i14 != zero) {
4804 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
4805 }
4806 const uint8_t* i15 = input[15];
4807 assert(i15 != NULL);
4808 if XNN_UNPREDICTABLE(i15 != zero) {
4809 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
4810 }
4811 const uint8_t* i16 = input[16];
4812 assert(i16 != NULL);
4813 if XNN_UNPREDICTABLE(i16 != zero) {
4814 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
4815 }
4816 const uint8_t* i17 = input[17];
4817 assert(i17 != NULL);
4818 if XNN_UNPREDICTABLE(i17 != zero) {
4819 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
4820 }
4821 const uint8_t* i18 = input[18];
4822 assert(i18 != NULL);
4823 if XNN_UNPREDICTABLE(i18 != zero) {
4824 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
4825 }
4826 const uint8_t* i19 = input[19];
4827 assert(i19 != NULL);
4828 if XNN_UNPREDICTABLE(i19 != zero) {
4829 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
4830 }
4831 const uint8_t* i20 = input[20];
4832 assert(i20 != NULL);
4833 if XNN_UNPREDICTABLE(i20 != zero) {
4834 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
4835 }
4836 const uint8_t* i21 = input[21];
4837 assert(i21 != NULL);
4838 if XNN_UNPREDICTABLE(i21 != zero) {
4839 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
4840 }
4841 const uint8_t* i22 = input[22];
4842 assert(i22 != NULL);
4843 if XNN_UNPREDICTABLE(i22 != zero) {
4844 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
4845 }
4846 const uint8_t* i23 = input[23];
4847 assert(i23 != NULL);
4848 if XNN_UNPREDICTABLE(i23 != zero) {
4849 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
4850 }
4851 const uint8_t* i24 = input[24];
4852 assert(i24 != NULL);
4853 if XNN_UNPREDICTABLE(i24 != zero) {
4854 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
4855 }
4856 input = (const uint8_t**) ((uintptr_t) input + input_stride);
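    // Main loop over groups of 16 channels. The packed weights start with 16 int32
    // biases, followed by 25 rows of 16 uint8 kernel taps. Each tap accumulates
    // input * (kernel - kernel_zero_point) into the int32 accumulators; the input
    // zero-point correction is presumably folded into the biases when the weights are
    // packed.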
4857
4858 size_t c = channels;
4859 const void* w = weights;
4860 for (; c >= 16; c -= 16) {
4861 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4862 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
4863
4864
4865 const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4866 const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
4867 const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
4868 const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
4869 i0 += 16;
4870
4871 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4872 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
4873
4874 const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4875 const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
4876 const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
4877 const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
4878 i1 += 16;
4879
4880 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4881 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
4882
4883 const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4884 const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
4885 const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
4886 const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
4887 i2 += 16;
4888
4889 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4890 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
4891
4892 const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4893 const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
4894 const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
4895 const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
4896 i3 += 16;
4897
4898 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4899 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
4900
4901 const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4902 const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
4903 const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
4904 const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
4905 i4 += 16;
4906
4907 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4908 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
4909
4910 const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4911 const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
4912 const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
4913 const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
4914 i5 += 16;
4915
4916 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4917 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
4918
4919 const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4920 const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
4921 const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
4922 const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
4923 i6 += 16;
4924
4925 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4926 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
4927
4928 const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4929 const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
4930 const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
4931 const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
4932 i7 += 16;
4933
4934 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4935 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
4936
4937 const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4938 const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
4939 const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
4940 const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
4941 i8 += 16;
4942
4943 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4944 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
4945
4946 const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
4947 const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
4948 const __m256i vi9x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
4949 const __m256i vk9x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
4950 i9 += 16;
4951
4952 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
4953 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
4954
4955 const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
4956 const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
4957 const __m256i vi10x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
4958 const __m256i vk10x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
4959 i10 += 16;
4960
4961 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
4962 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
4963
4964 const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
4965 const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
4966 const __m256i vi11x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
4967 const __m256i vk11x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
4968 i11 += 16;
4969
4970 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
4971 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
4972
4973 const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
4974 const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
4975 const __m256i vi12x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
4976 const __m256i vk12x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
4977 i12 += 16;
4978
4979 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
4980 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
4981
4982 const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
4983 const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
4984 const __m256i vi13x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
4985 const __m256i vk13x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
4986 i13 += 16;
4987
4988 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
4989 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
4990
4991 const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
4992 const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
4993 const __m256i vi14x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
4994 const __m256i vk14x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
4995 i14 += 16;
4996
4997 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
4998 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
4999
5000 const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
5001 const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
5002 const __m256i vi15x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
5003 const __m256i vk15x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
5004 i15 += 16;
5005
5006 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
5007 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
5008
5009 const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
5010 const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
5011 const __m256i vi16x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
5012 const __m256i vk16x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
5013 i16 += 16;
5014
5015 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
5016 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
5017
5018 const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
5019 const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
5020 const __m256i vi17x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
5021 const __m256i vk17x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
5022 i17 += 16;
5023
5024 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
5025 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
5026
5027 const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
5028 const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
5029 const __m256i vi18x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
5030 const __m256i vk18x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
5031 i18 += 16;
5032
5033 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
5034 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
5035
5036 const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
5037 const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
5038 const __m256i vi19x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
5039 const __m256i vk19x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
5040 i19 += 16;
5041
5042 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
5043 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
5044
5045 const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
5046 const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
5047 const __m256i vi20x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
5048 const __m256i vk20x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
5049 i20 += 16;
5050
5051 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
5052 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
5053
5054 const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
5055 const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
5056 const __m256i vi21x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
5057 const __m256i vk21x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
5058 i21 += 16;
5059
5060 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
5061 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
5062
5063 const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
5064 const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
5065 const __m256i vi22x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
5066 const __m256i vk22x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
5067 i22 += 16;
5068
5069 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
5070 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
5071
5072 const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
5073 const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
5074 const __m256i vi23x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
5075 const __m256i vk23x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
5076 i23 += 16;
5077
5078 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
5079 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
5080
5081 const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
5082 const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
5083 const __m256i vi24x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
5084 const __m256i vk24x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
5085 i24 += 16;
5086
5087 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
5088 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
5089
5090 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
5091
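      // fp32 requantization: convert the accumulators to float, scale, clamp from
      // above at (output_max - output_zero_point), convert back to int32 with
      // round-to-nearest, then add the zero point via a saturating int16 pack and
      // pack to uint8. The 256-bit packs interleave lanes, so _MM_SHUFFLE(3, 1, 2, 0)
      // restores channel order before the final clamp at output_min.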
5092 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5093 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
5094
5095 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5096 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
5097 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
5098
5099 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5100 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
5101 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
5102
5103 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5104 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
5105
5106 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5107 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5108
5109 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5110
5111 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5112 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5113
5114 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5115 output += 16;
5116 }
5117 if XNN_UNLIKELY(c != 0) {
5118 const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
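      // Remainder of 1-15 channels, handled 8 at a time. The taps for each row sit
      // 16 bytes apart in k because the weights were packed for 16-channel blocks.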
5119 do {
5120 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5121
5122
5123 const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5124 const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
5125 i0 += 8;
5126
5127 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5128
5129 const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5130 const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
5131 i1 += 8;
5132
5133 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5134
5135 const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5136 const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
5137 i2 += 8;
5138
5139 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5140
5141 const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5142 const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
5143 i3 += 8;
5144
5145 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5146
5147 const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5148 const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
5149 i4 += 8;
5150
5151 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5152
5153 const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5154 const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
5155 i5 += 8;
5156
5157 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5158
5159 const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5160 const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
5161 i6 += 8;
5162
5163 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5164
5165 const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5166 const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
5167 i7 += 8;
5168
5169 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5170
5171 const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5172 const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
5173 i8 += 8;
5174
5175 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5176
5177 const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
5178 const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144))), vk_zero_point);
5179 i9 += 8;
5180
5181 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
5182
5183 const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
5184 const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160))), vk_zero_point);
5185 i10 += 8;
5186
5187 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
5188
5189 const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
5190 const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176))), vk_zero_point);
5191 i11 += 8;
5192
5193 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
5194
5195 const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
5196 const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192))), vk_zero_point);
5197 i12 += 8;
5198
5199 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
5200
5201 const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
5202 const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208))), vk_zero_point);
5203 i13 += 8;
5204
5205 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
5206
5207 const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
5208 const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224))), vk_zero_point);
5209 i14 += 8;
5210
5211 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
5212
5213 const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
5214 const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240))), vk_zero_point);
5215 i15 += 8;
5216
5217 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
5218
5219 const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
5220 const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256))), vk_zero_point);
5221 i16 += 8;
5222
5223 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
5224
5225 const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
5226 const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272))), vk_zero_point);
5227 i17 += 8;
5228
5229 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
5230
5231 const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
5232 const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288))), vk_zero_point);
5233 i18 += 8;
5234
5235 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
5236
5237 const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
5238 const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304))), vk_zero_point);
5239 i19 += 8;
5240
5241 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
5242
5243 const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
5244 const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320))), vk_zero_point);
5245 i20 += 8;
5246
5247 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
5248
5249 const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
5250 const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336))), vk_zero_point);
5251 i21 += 8;
5252
5253 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
5254
5255 const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
5256 const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352))), vk_zero_point);
5257 i22 += 8;
5258
5259 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
5260
5261 const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
5262 const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368))), vk_zero_point);
5263 i23 += 8;
5264
5265 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
5266
5267 const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
5268 const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384))), vk_zero_point);
5269 i24 += 8;
5270
5271 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
5272
5273 k += 8;
5274
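        // Same fp32 requantization as the main loop, applied to one 8-channel group;
        // the final group may store only 1-7 channels via the partial stores below.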
5275 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5276 vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
5277 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
5278 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5279
5280 w = (const void*) ((const int32_t*) w + 8);
5281
5282 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
5283 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
5284
5285 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
5286
5287 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5288 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
5289
5290 if XNN_LIKELY(c >= 8) {
5291 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5292 output += 8;
5293 c -= 8;
5294 } else {
5295 if (c & 4) {
5296 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
5297 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5298 output += 4;
5299 }
5300 if (c & 2) {
5301 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
5302 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5303 output += 2;
5304 }
5305 if (c & 1) {
5306 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
5307 output += 1;
5308 }
5309 c = 0;
5310 }
5311 } while (c != 0);
5312 }
5313
5314 output = (uint8_t*) ((uintptr_t) output + output_increment);
5315 } while (--output_width != 0);
5316 }
5317
5318 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
5319 size_t channels,
5320 size_t output_width,
5321 const uint8_t** input,
5322 const void* weights,
5323 uint8_t* output,
5324 size_t input_stride,
5325 size_t output_increment,
5326 size_t input_offset,
5327 const uint8_t* zero,
5328 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5329 {
5330 assert(channels != 0);
5331 assert(output_width != 0);
5332
5333 const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
5334 do {
5335 const uint8_t* i0 = input[0];
5336 assert(i0 != NULL);
5337 if XNN_UNPREDICTABLE(i0 != zero) {
5338 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
5339 }
5340 const uint8_t* i1 = input[1];
5341 assert(i1 != NULL);
5342 if XNN_UNPREDICTABLE(i1 != zero) {
5343 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
5344 }
5345 const uint8_t* i2 = input[2];
5346 assert(i2 != NULL);
5347 if XNN_UNPREDICTABLE(i2 != zero) {
5348 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
5349 }
5350 const uint8_t* i3 = input[3];
5351 assert(i3 != NULL);
5352 if XNN_UNPREDICTABLE(i3 != zero) {
5353 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
5354 }
5355 const uint8_t* i4 = input[4];
5356 assert(i4 != NULL);
5357 if XNN_UNPREDICTABLE(i4 != zero) {
5358 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
5359 }
5360 const uint8_t* i5 = input[5];
5361 assert(i5 != NULL);
5362 if XNN_UNPREDICTABLE(i5 != zero) {
5363 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
5364 }
5365 const uint8_t* i6 = input[6];
5366 assert(i6 != NULL);
5367 if XNN_UNPREDICTABLE(i6 != zero) {
5368 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
5369 }
5370 const uint8_t* i7 = input[7];
5371 assert(i7 != NULL);
5372 if XNN_UNPREDICTABLE(i7 != zero) {
5373 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
5374 }
5375 const uint8_t* i8 = input[8];
5376 assert(i8 != NULL);
5377 if XNN_UNPREDICTABLE(i8 != zero) {
5378 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
5379 }
5380 input = (const uint8_t**) ((uintptr_t) input + input_stride);
5381
5382 size_t c = channels;
5383 const void* w = weights;
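    // 9-tap variant of the same up16 depthwise pattern: 16 int32 biases followed by
    // 9 rows of 16 uint8 taps, with the identical fp32 requantization at the end.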
5384 for (; c >= 16; c -= 16) {
5385 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5386 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
5387
5388
5389 const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5390 const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
5391 const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
5392 const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
5393 i0 += 16;
5394
5395 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5396 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
5397
5398 const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5399 const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
5400 const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
5401 const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
5402 i1 += 16;
5403
5404 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5405 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
5406
5407 const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5408 const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
5409 const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
5410 const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
5411 i2 += 16;
5412
5413 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5414 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
5415
5416 const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5417 const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
5418 const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
5419 const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
5420 i3 += 16;
5421
5422 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5423 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
5424
5425 const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5426 const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
5427 const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
5428 const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
5429 i4 += 16;
5430
5431 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5432 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
5433
5434 const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5435 const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
5436 const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
5437 const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
5438 i5 += 16;
5439
5440 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5441 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
5442
5443 const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5444 const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
5445 const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
5446 const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
5447 i6 += 16;
5448
5449 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5450 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
5451
5452 const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5453 const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
5454 const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
5455 const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
5456 i7 += 16;
5457
5458 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5459 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
5460
5461 const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5462 const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
5463 const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
5464 const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
5465 i8 += 16;
5466
5467 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5468 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
5469
5470 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
5471
5472 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5473 __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
5474
5475 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5476 vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
5477 vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
5478
5479 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5480 vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
5481 vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
5482
5483 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5484 vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
5485
5486 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5487 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5488
5489 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5490
5491 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5492 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5493
5494 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5495 output += 16;
5496 }
5497 if XNN_UNLIKELY(c != 0) {
5498 const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
5499 do {
5500 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5501
5502
5503 const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5504 const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
5505 i0 += 8;
5506
5507 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5508
5509 const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5510 const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
5511 i1 += 8;
5512
5513 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5514
5515 const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5516 const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
5517 i2 += 8;
5518
5519 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5520
5521 const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5522 const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
5523 i3 += 8;
5524
5525 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5526
5527 const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5528 const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
5529 i4 += 8;
5530
5531 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5532
5533 const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5534 const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
5535 i5 += 8;
5536
5537 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5538
5539 const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5540 const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
5541 i6 += 8;
5542
5543 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5544
5545 const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5546 const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
5547 i7 += 8;
5548
5549 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5550
5551 const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5552 const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
5553 i8 += 8;
5554
5555 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5556
5557 k += 8;
5558
5559 __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5560 vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
5561 vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
5562 vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5563
5564 w = (const void*) ((const int32_t*) w + 8);
5565
5566 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
5567 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
5568
5569 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
5570
5571 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5572 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
5573
5574 if XNN_LIKELY(c >= 8) {
5575 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5576 output += 8;
5577 c -= 8;
5578 } else {
5579 if (c & 4) {
5580 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
5581 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5582 output += 4;
5583 }
5584 if (c & 2) {
5585 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
5586 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5587 output += 2;
5588 }
5589 if (c & 1) {
5590 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
5591 output += 1;
5592 }
5593 c = 0;
5594 }
5595 } while (c != 0);
5596 }
5597
5598 output = (uint8_t*) ((uintptr_t) output + output_increment);
5599 } while (--output_width != 0);
5600 }
5601
5602 void xnn_qu8_f32_vcvt_ukernel__avx2_x16(
5603 size_t n,
5604 const uint8_t* x,
5605 float* y,
5606 const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5607 {
5608 assert(n != 0);
5609 assert(n % sizeof(uint8_t) == 0);
5610 assert(x != NULL);
5611 assert(y != NULL);
5612
5613 const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
5614 const __m256 vscale = _mm256_load_ps(params->avx.scale);
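  // Dequantize y = scale * (x - zero_point): widen uint8 to int32, add the
  // precomputed negative zero point, convert to float, and multiply by scale.
  // Elements are processed 16, then 8, then 1-7 at a time.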
5615 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
5616 __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5617 __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
5618 x += 16;
5619
5620 vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
5621 vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
5622
5623 __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
5624 __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
5625
5626 vy01234567 = _mm256_mul_ps(vy01234567, vscale);
5627 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
5628
5629 _mm256_storeu_ps(y, vy01234567);
5630 _mm256_storeu_ps(y + 8, vy89ABCDEF);
5631 y += 16;
5632 }
5633 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
5634 __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5635 vx = _mm256_add_epi32(vx, vminus_zero_point);
5636 x += 8;
5637
5638 __m256 vy = _mm256_cvtepi32_ps(vx);
5639 vy = _mm256_mul_ps(vy, vscale);
5640
5641 _mm256_storeu_ps(y, vy);
5642 y += 8;
5643 }
5644 if XNN_UNLIKELY(n != 0) {
5645 assert(n >= 1 * sizeof(uint8_t));
5646 assert(n <= 7 * sizeof(uint8_t));
5647
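    // Tail of 1-7 elements: the 8-byte load may read past the end of x (allowed by
    // the XNN_OOB_READS annotation on this kernel); only the live floats are stored.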
5648 __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5649 vx = _mm256_add_epi32(vx, vminus_zero_point);
5650
5651 __m256 vy = _mm256_cvtepi32_ps(vx);
5652 vy = _mm256_mul_ps(vy, vscale);
5653
5654 __m128 vy_lo = _mm256_castps256_ps128(vy);
5655 if (n & (4 * sizeof(uint8_t))) {
5656 _mm_storeu_ps(y, vy_lo);
5657 vy_lo = _mm256_extractf128_ps(vy, 1);
5658 y += 4;
5659 }
5660 if (n & (2 * sizeof(uint8_t))) {
5661 _mm_storel_pi((__m64*) y, vy_lo);
5662 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
5663 y += 2;
5664 }
5665 if (n & (1 * sizeof(uint8_t))) {
5666 _mm_store_ss(y, vy_lo);
5667 }
5668 }
5669 }
5670
5671 void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
5672 size_t mr,
5673 size_t nc,
5674 size_t kc,
5675 const uint8_t* restrict a,
5676 size_t a_stride,
5677 const void* restrict w,
5678 uint8_t* restrict c,
5679 size_t cm_stride,
5680 size_t cn_stride,
5681 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5682 {
5683 assert(mr != 0);
5684 assert(mr <= 1);
5685 assert(nc != 0);
5686 assert(kc != 0);
5687 assert(kc % sizeof(uint8_t) == 0);
5688 assert(a != NULL);
5689 assert(w != NULL);
5690 assert(c != NULL);
5691
5692 kc = round_up_po2(kc, 8);
5693 const uint8_t* a0 = a;
5694 uint8_t* c0 = c;
5695
5696 do {
5697 const __m128i vbias0x0 = _mm_loadu_si32(w);
5698 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
5699 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5700 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
5701 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
5702 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5703 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
5704 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
5705 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5706 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
5707 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
5708 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5709 w = (const void*) ((const int32_t*) w + 8);
5710
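    // Inner loop over K in steps of 8. The packed weights interleave two output columns
    // per 16-byte load (8 bytes of column n, then 8 bytes of column n+1); the 8 A bytes
    // are broadcast to both 128-bit lanes so one VPMADDWD accumulates both columns at once.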
5711 size_t k = 0;
5712 const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
5713 while (k < kc) {
5714 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5715 const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
5716 a0 += 8;
5717
5718 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5719 const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
5720
5721 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5722 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
5723 const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
5724
5725 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5726 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
5727 const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
5728
5729 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5730 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
5731 const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
5732
5733 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5734
5735 w = (const void*) ((const uint8_t*) w + 64);
5736 k += 8 * sizeof(uint8_t);
5737 }
5738
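    // Each accumulator holds two columns (4 int32 partial sums per column). Two rounds of
    // VPHADDD reduce them, leaving the columns in 0,2,4,6,1,3,5,7 order within the
    // register; the permute below restores the natural 0..7 order.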
5739 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5740 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5741
5742 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5743
5744 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5745 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5746
5747 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5748
5749 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5750 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5751
5752 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5753 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5754
5755 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5756
5757 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5758 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
5759
5760 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5761
5762 __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
5763
5764 vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5765
5766 __m128i vout_lo = _mm256_castsi256_si128(vout);
5767 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5768
5769 if (nc >= 8) {
5770 _mm_storel_epi64((__m128i*) c0, vout_lo);
5771
5772 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
5773
5774 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
5775
5776 nc -= 8;
5777 } else {
5778 if (nc & 4) {
5779 _mm_storeu_si32(c0, vout_lo);
5780
5781 c0 += 4;
5782
5783 vout_lo = _mm_srli_epi64(vout_lo, 32);
5784 vout_hi = _mm_srli_epi64(vout_hi, 32);
5785 }
5786 if (nc & 2) {
5787 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
5788
5789 c0 += 2;
5790
5791 vout_lo = _mm_srli_epi32(vout_lo, 16);
5792 vout_hi = _mm_srli_epi32(vout_hi, 16);
5793 }
5794 if (nc & 1) {
5795 *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
5796 }
5797
5798 nc = 0;
5799 }
5800 } while (nc != 0);
5801 }
5802
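// Same computation as the 1x8c8 kernel above, unrolled over 3 rows of A and C.
// After requantization, rows 0 and 1 are packed into one 256-bit register and row 2 into
// another; row 2's bytes therefore land in the upper half of vout_lo and are written out
// via _mm_storeh_pi and high-lane extracts.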
5803 void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
5804 size_t mr,
5805 size_t nc,
5806 size_t kc,
5807 const uint8_t* restrict a,
5808 size_t a_stride,
5809 const void* restrict w,
5810 uint8_t* restrict c,
5811 size_t cm_stride,
5812 size_t cn_stride,
5813 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5814 {
5815 assert(mr != 0);
5816 assert(mr <= 3);
5817 assert(nc != 0);
5818 assert(kc != 0);
5819 assert(kc % sizeof(uint8_t) == 0);
5820 assert(a != NULL);
5821 assert(w != NULL);
5822 assert(c != NULL);
5823
5824 kc = round_up_po2(kc, 8);
5825 const uint8_t* a0 = a;
5826 uint8_t* c0 = c;
5827 const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
5828 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
5829 if XNN_UNPREDICTABLE(mr < 2) {
5830 a1 = a0;
5831 c1 = c0;
5832 }
5833 const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
5834 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
5835 if XNN_UNPREDICTABLE(mr <= 2) {
5836 a2 = a1;
5837 c2 = c1;
5838 }
5839
5840 do {
5841 const __m128i vbias0x0 = _mm_loadu_si32(w);
5842 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
5843 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5844 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
5845 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
5846 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5847 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
5848 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
5849 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5850 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
5851 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
5852 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5853 __m256i vacc1x01 = vacc0x01;
5854 __m256i vacc1x23 = vacc0x23;
5855 __m256i vacc1x45 = vacc0x45;
5856 __m256i vacc1x67 = vacc0x67;
5857 __m256i vacc2x01 = vacc0x01;
5858 __m256i vacc2x23 = vacc0x23;
5859 __m256i vacc2x45 = vacc0x45;
5860 __m256i vacc2x67 = vacc0x67;
5861 w = (const void*) ((const int32_t*) w + 8);
5862
5863 size_t k = 0;
5864 const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
5865 while (k < kc) {
5866 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5867 const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
5868 a0 += 8;
5869 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
5870 const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
5871 a1 += 8;
5872 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
5873 const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
5874 a2 += 8;
5875
5876 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5877 const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
5878
5879 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5880 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
5881 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
5882 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
5883 const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
5884
5885 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5886 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
5887 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
5888 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
5889 const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
5890
5891 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5892 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
5893 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
5894 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
5895 const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
5896
5897 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5898 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
5899 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
5900
5901 w = (const void*) ((const uint8_t*) w + 64);
5902 k += 8 * sizeof(uint8_t);
5903 }
5904
5905 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5906 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5907 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
5908 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
5909 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
5910 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
5911
5912 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5913 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
5914 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
5915
5916 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5917 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5918 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
5919 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
5920
5921 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5922 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
5923 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
5924
5925 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5926 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5927 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
5928 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
5929
5930 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5931 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5932 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
5933 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
5934
5935 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5936 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
5937 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
5938
5939 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5940 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
5941 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
5942
5943 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5944 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5945
5946 __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
5947
5948 vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5949
5950 __m128i vout_lo = _mm256_castsi256_si128(vout);
5951 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5952
5953 if (nc >= 8) {
5954 _mm_storel_epi64((__m128i*) c0, vout_lo);
5955 _mm_storel_epi64((__m128i*) c1, vout_hi);
5956 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
5957
5958 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
5959 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
5960 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
5961
5962 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
5963 a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
5964 a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
5965
5966 nc -= 8;
5967 } else {
5968 if (nc & 4) {
5969 _mm_storeu_si32(c0, vout_lo);
5970 _mm_storeu_si32(c1, vout_hi);
5971 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
5972
5973 c0 += 4;
5974 c1 += 4;
5975 c2 += 4;
5976
5977 vout_lo = _mm_srli_epi64(vout_lo, 32);
5978 vout_hi = _mm_srli_epi64(vout_hi, 32);
5979 }
5980 if (nc & 2) {
5981 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
5982 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
5983 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
5984
5985 c0 += 2;
5986 c1 += 2;
5987 c2 += 2;
5988
5989 vout_lo = _mm_srli_epi32(vout_lo, 16);
5990 vout_hi = _mm_srli_epi32(vout_hi, 16);
5991 }
5992 if (nc & 1) {
5993 *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
5994 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
5995 *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
5996 }
5997
5998 nc = 0;
5999 }
6000 } while (nc != 0);
6001 }
6002
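// Indirect-GEMM (IGEMM) variant of the 1x8c8 kernel: A is supplied as an array of row
// pointers, and pointers equal to `zero` are not adjusted by a_offset, so padding taps
// read from the shared zero buffer. Accumulation and requantization match the direct
// GEMM kernel above.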
6003 void xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
6004 size_t mr,
6005 size_t nc,
6006 size_t kc,
6007 size_t ks,
6008 const uint8_t** restrict a,
6009 const void* restrict w,
6010 uint8_t* restrict c,
6011 size_t cm_stride,
6012 size_t cn_stride,
6013 size_t a_offset,
6014 const uint8_t* zero,
6015 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6016 {
6017 assert(mr != 0);
6018 assert(mr <= 1);
6019 assert(nc != 0);
6020 assert(kc != 0);
6021 assert(ks != 0);
6022 assert(ks % (1 * sizeof(void*)) == 0);
6023 assert(a_offset % sizeof(uint8_t) == 0);
6024 assert(a != NULL);
6025 assert(w != NULL);
6026 assert(c != NULL);
6027
6028 kc = round_up_po2(kc, 8);
6029 uint8_t* c0 = c;
6030
6031 do {
6032 const __m128i vbias0x0 = _mm_loadu_si32(w);
6033 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
6034 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
6035 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
6036 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
6037 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
6038 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
6039 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
6040 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
6041 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
6042 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
6043 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
6044 w = (const void*) ((const int32_t*) w + 8);
6045
6046 size_t p = ks;
6047 const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
6048 do {
6049 const uint8_t* restrict a0 = a[0];
6050 if XNN_UNPREDICTABLE(a0 != zero) {
6051 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
6052 }
6053 a += 1;
6054
6055 size_t k = 0;
6056 while (k < kc) {
6057 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
6058 const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
6059 a0 += 8;
6060
6061 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6062 const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
6063
6064 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
6065 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
6066 const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
6067
6068 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
6069 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
6070 const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
6071
6072 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
6073 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
6074 const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
6075
6076 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
6077
6078 w = (const void*) ((const uint8_t*) w + 64);
6079 k += 8 * sizeof(uint8_t);
6080 }
6081 p -= 1 * sizeof(void*);
6082 } while (p != 0);
6083
6084 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
6085 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
6086
6087 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
6088
6089 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
6090 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
6091
6092 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
6093
6094 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6095 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
6096
6097 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6098 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
6099
6100 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
6101
6102 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6103 __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
6104
6105 vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6106
6107 __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
6108
6109 vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
6110
6111 __m128i vout_lo = _mm256_castsi256_si128(vout);
6112 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
6113
6114 if (nc >= 8) {
6115 _mm_storel_epi64((__m128i*) c0, vout_lo);
6116
6117 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
6118
6119 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
6120
6121 nc -= 8;
6122 } else {
6123 if (nc & 4) {
6124 _mm_storeu_si32(c0, vout_lo);
6125
6126 c0 += 4;
6127
6128 vout_lo = _mm_srli_epi64(vout_lo, 32);
6129 vout_hi = _mm_srli_epi64(vout_hi, 32);
6130 }
6131 if (nc & 2) {
6132 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
6133
6134 c0 += 2;
6135
6136 vout_lo = _mm_srli_epi32(vout_lo, 16);
6137 vout_hi = _mm_srli_epi32(vout_hi, 16);
6138 }
6139 if (nc & 1) {
6140 *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
6141 }
6142
6143 nc = 0;
6144 }
6145 } while (nc != 0);
6146 }
6147
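// 3-row indirect-GEMM counterpart: combines the row-pointer handling of the 1x8c8 IGEMM
// kernel with the 3-row accumulation and store scheme of the 3x8c8 GEMM kernel above.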
6148 void xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
6149 size_t mr,
6150 size_t nc,
6151 size_t kc,
6152 size_t ks,
6153 const uint8_t** restrict a,
6154 const void* restrict w,
6155 uint8_t* restrict c,
6156 size_t cm_stride,
6157 size_t cn_stride,
6158 size_t a_offset,
6159 const uint8_t* zero,
6160 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6161 {
6162 assert(mr != 0);
6163 assert(mr <= 3);
6164 assert(nc != 0);
6165 assert(kc != 0);
6166 assert(ks != 0);
6167 assert(ks % (3 * sizeof(void*)) == 0);
6168 assert(a_offset % sizeof(uint8_t) == 0);
6169 assert(a != NULL);
6170 assert(w != NULL);
6171 assert(c != NULL);
6172
6173 kc = round_up_po2(kc, 8);
6174 uint8_t* c0 = c;
6175 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
6176 if XNN_UNPREDICTABLE(mr < 2) {
6177 c1 = c0;
6178 }
6179 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
6180 if XNN_UNPREDICTABLE(mr <= 2) {
6181 c2 = c1;
6182 }
6183
6184 do {
6185 const __m128i vbias0x0 = _mm_loadu_si32(w);
6186 const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
6187 __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
6188 const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
6189 const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
6190 __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
6191 const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
6192 const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
6193 __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
6194 const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
6195 const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
6196 __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
6197 __m256i vacc1x01 = vacc0x01;
6198 __m256i vacc1x23 = vacc0x23;
6199 __m256i vacc1x45 = vacc0x45;
6200 __m256i vacc1x67 = vacc0x67;
6201 __m256i vacc2x01 = vacc0x01;
6202 __m256i vacc2x23 = vacc0x23;
6203 __m256i vacc2x45 = vacc0x45;
6204 __m256i vacc2x67 = vacc0x67;
6205 w = (const void*) ((const int32_t*) w + 8);
6206
6207 size_t p = ks;
6208 const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
6209 do {
6210 const uint8_t* restrict a0 = a[0];
6211 if XNN_UNPREDICTABLE(a0 != zero) {
6212 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
6213 }
6214 const uint8_t* restrict a1 = a[1];
6215 if XNN_UNPREDICTABLE(a1 != zero) {
6216 a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
6217 }
6218 const uint8_t* restrict a2 = a[2];
6219 if XNN_UNPREDICTABLE(a2 != zero) {
6220 a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
6221 }
6222 a += 3;
6223
6224 size_t k = 0;
6225 while (k < kc) {
6226 const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
6227 const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
6228 a0 += 8;
6229 const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
6230 const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
6231 a1 += 8;
6232 const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
6233 const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
6234 a2 += 8;
6235
6236 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6237 const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
6238
6239 vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
6240 vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
6241 vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
6242 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
6243 const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
6244
6245 vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
6246 vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
6247 vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
6248 const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
6249 const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
6250
6251 vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
6252 vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
6253 vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
6254 const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
6255 const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
6256
6257 vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
6258 vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
6259 vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
6260
6261 w = (const void*) ((const uint8_t*) w + 64);
6262 k += 8 * sizeof(uint8_t);
6263 }
6264 p -= 3 * sizeof(void*);
6265 } while (p != 0);
6266
6267 const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
6268 const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
6269 const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
6270 const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
6271 const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
6272 const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
6273
6274 const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
6275 const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
6276 const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
6277
6278 const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
6279 __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
6280 __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
6281 __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
6282
6283 __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
6284 __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
6285 __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
6286
6287 const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6288 vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
6289 vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
6290 vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
6291
6292 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6293 vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
6294 vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
6295 vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
6296
6297 vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
6298 vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
6299 vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
6300
6301 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6302 __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
6303 __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
6304
6305 vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6306 vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6307
6308 __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
6309
6310 vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
6311
6312 __m128i vout_lo = _mm256_castsi256_si128(vout);
6313 __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
6314
6315 if (nc >= 8) {
6316 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
6317 _mm_storel_epi64((__m128i*) c1, vout_hi);
6318 _mm_storel_epi64((__m128i*) c0, vout_lo);
6319
6320 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
6321 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
6322 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
6323
6324 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
6325
6326 nc -= 8;
6327 } else {
6328 if (nc & 4) {
6329 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
6330 _mm_storeu_si32(c1, vout_hi);
6331 _mm_storeu_si32(c0, vout_lo);
6332
6333 c2 += 4;
6334 c1 += 4;
6335 c0 += 4;
6336
6337 vout_lo = _mm_srli_epi64(vout_lo, 32);
6338 vout_hi = _mm_srli_epi64(vout_hi, 32);
6339 }
6340 if (nc & 2) {
6341 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
6342 *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
6343 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
6344
6345 c2 += 2;
6346 c1 += 2;
6347 c0 += 2;
6348
6349 vout_lo = _mm_srli_epi32(vout_lo, 16);
6350 vout_hi = _mm_srli_epi32(vout_hi, 16);
6351 }
6352 if (nc & 1) {
6353 *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
6354 *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
6355 *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
6356 }
6357
6358 nc = 0;
6359 }
6360 } while (nc != 0);
6361 }
6362
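// Elementwise QU8 addition, 16 elements per main-loop iteration, using 32-bit multiplies.
// Per element, roughly:
//   acc = bias + (int32_t) a * a_multiplier + (int32_t) b * b_multiplier;
//   out = clamp((acc >> shift) + output_zero_point, output_min, output_max);
// with the shift applied as an arithmetic right shift and the zero-point addition done
// with signed 16-bit saturation before packing to uint8.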
6363 void xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
6364 size_t n,
6365 const uint8_t* input_a,
6366 const uint8_t* input_b,
6367 uint8_t* output,
6368 const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6369 {
6370 const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
6371 const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
6372 const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
6373 const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
6374 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
6375 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
6376 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
6377
6378 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6379 const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6380 const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
6381 const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
6382 const __m256i vb89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
6383 input_a += 16;
6384 input_b += 16;
6385
6386 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6387 __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
6388
6389 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
6390 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
6391
6392 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6393 vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
6394
6395 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6396
6397 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6398
6399 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6400
6401 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
6402
6403 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6404 output += 16;
6405 }
6406 if XNN_UNLIKELY(n != 0) {
6407 do {
6408 const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6409 const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
6410 input_a += 8;
6411 input_b += 8;
6412
6413 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6414
6415 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
6416
6417 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6418
6419 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
6420 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6421 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6422 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
6423
6424 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
6425 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6426 output += 8;
6427 n -= 8 * sizeof(uint8_t);
6428 } else {
6429 if (n & (4 * sizeof(uint8_t))) {
6430 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6431 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6432 output += 4;
6433 }
6434 if (n & (2 * sizeof(uint8_t))) {
6435 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
6436 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6437 output += 2;
6438 }
6439 if (n & (1 * sizeof(uint8_t))) {
6440 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6441 }
6442 n = 0;
6443 }
6444 } while (n != 0);
6445 }
6446 }
6447
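// Add-constant variant of the kernel above: input_b is a single scalar, so its
// contribution (b_multiplier * *input_b) is folded into the bias once before the loop.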
6448 void xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
6449 size_t n,
6450 const uint8_t* input_a,
6451 const uint8_t* input_b,
6452 uint8_t* output,
6453 const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6454 {
6455 const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
6456 const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
6457 const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
6458 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
6459 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
6460
6461 const __m256i vbias = _mm256_add_epi32(
6462 _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
6463 _mm256_load_si256((const __m256i*) params->avx2.bias));
6464 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6465 const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6466 const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
6467 input_a += 16;
6468
6469 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6470 __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
6471
6472 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6473 vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
6474
6475 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6476
6477 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6478
6479 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6480
6481 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
6482
6483 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6484 output += 16;
6485 }
6486 if XNN_UNLIKELY(n != 0) {
6487 do {
6488 const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6489 input_a += 8;
6490
6491 __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6492
6493 vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6494
6495 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
6496 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6497 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6498 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
6499
6500 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
6501 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6502 output += 8;
6503 n -= 8 * sizeof(uint8_t);
6504 } else {
6505 if (n & (4 * sizeof(uint8_t))) {
6506 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6507 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6508 output += 4;
6509 }
6510 if (n & (2 * sizeof(uint8_t))) {
6511 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
6512 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6513 output += 2;
6514 }
6515 if (n & (1 * sizeof(uint8_t))) {
6516 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6517 }
6518 n = 0;
6519 }
6520 } while (n != 0);
6521 }
6522 }
6523
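// 256-entry byte lookup table, 128 bytes per main-loop iteration.
// The table t is split into sixteen 16-byte blocks. vtable0..vtableF hold XOR
// combinations of adjacent blocks, so XOR-ing the VPSHUFB results of all sixteen stages
// telescopes back to the original entry t[x]. The index bytes are lowered by 16 before
// each subsequent stage (plain subtraction for the first eight steps, saturating
// subtraction for the remaining seven, so indices that have gone negative stay negative);
// VPSHUFB yields zero for bytes with the sign bit set, so out-of-range stages leave the
// accumulated XOR unchanged.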
6524 void xnn_x8_lut_ukernel__avx2_x128(
6525 size_t n,
6526 const uint8_t* x,
6527 uint8_t* y,
6528 const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
6529 {
6530 assert(n != 0);
6531 assert(x != NULL);
6532 assert(y != NULL);
6533
6534 const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
6535 const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
6536 const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
6537 const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
6538 const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
6539 const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
6540 const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
6541 const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
6542 const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
6543 const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
6544 const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
6545 const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
6546 const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
6547 const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
6548 const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
6549 const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
6550
6551 const __m256i vtable0 = vt0;
6552 const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
6553 const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
6554 const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
6555 const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
6556 const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
6557 const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
6558 const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
6559 const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
6560 const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
6561 const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
6562 const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
6563 const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
6564 const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
6565 const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
6566 const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
6567
6568 const __m256i voffset = _mm256_set1_epi8(16);
6569 for (; n >= 128 * sizeof(uint8_t); n -= 128 * sizeof(uint8_t)) {
6570 __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
6571 __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
6572 __m256i vx2 = _mm256_loadu_si256((const __m256i*) (x + 64));
6573 __m256i vx3 = _mm256_loadu_si256((const __m256i*) (x + 96));
6574 x += 128;
6575
6576 __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
6577 __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
6578 __m256i vy2 = _mm256_shuffle_epi8(vtable0, vx2);
6579 __m256i vy3 = _mm256_shuffle_epi8(vtable0, vx3);
6580
6581 vx0 = _mm256_sub_epi8(vx0, voffset);
6582 vx1 = _mm256_sub_epi8(vx1, voffset);
6583 vx2 = _mm256_sub_epi8(vx2, voffset);
6584 vx3 = _mm256_sub_epi8(vx3, voffset);
6585 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
6586 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
6587 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable1, vx2));
6588 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable1, vx3));
6589 vx0 = _mm256_sub_epi8(vx0, voffset);
6590 vx1 = _mm256_sub_epi8(vx1, voffset);
6591 vx2 = _mm256_sub_epi8(vx2, voffset);
6592 vx3 = _mm256_sub_epi8(vx3, voffset);
6593 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
6594 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
6595 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable2, vx2));
6596 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable2, vx3));
6597 vx0 = _mm256_sub_epi8(vx0, voffset);
6598 vx1 = _mm256_sub_epi8(vx1, voffset);
6599 vx2 = _mm256_sub_epi8(vx2, voffset);
6600 vx3 = _mm256_sub_epi8(vx3, voffset);
6601 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
6602 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
6603 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable3, vx2));
6604 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable3, vx3));
6605 vx0 = _mm256_sub_epi8(vx0, voffset);
6606 vx1 = _mm256_sub_epi8(vx1, voffset);
6607 vx2 = _mm256_sub_epi8(vx2, voffset);
6608 vx3 = _mm256_sub_epi8(vx3, voffset);
6609 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
6610 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
6611 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable4, vx2));
6612 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable4, vx3));
6613 vx0 = _mm256_sub_epi8(vx0, voffset);
6614 vx1 = _mm256_sub_epi8(vx1, voffset);
6615 vx2 = _mm256_sub_epi8(vx2, voffset);
6616 vx3 = _mm256_sub_epi8(vx3, voffset);
6617 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
6618 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
6619 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable5, vx2));
6620 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable5, vx3));
6621 vx0 = _mm256_sub_epi8(vx0, voffset);
6622 vx1 = _mm256_sub_epi8(vx1, voffset);
6623 vx2 = _mm256_sub_epi8(vx2, voffset);
6624 vx3 = _mm256_sub_epi8(vx3, voffset);
6625 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
6626 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
6627 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable6, vx2));
6628 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable6, vx3));
6629 vx0 = _mm256_sub_epi8(vx0, voffset);
6630 vx1 = _mm256_sub_epi8(vx1, voffset);
6631 vx2 = _mm256_sub_epi8(vx2, voffset);
6632 vx3 = _mm256_sub_epi8(vx3, voffset);
6633 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
6634 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
6635 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable7, vx2));
6636 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable7, vx3));
6637 vx0 = _mm256_sub_epi8(vx0, voffset);
6638 vx1 = _mm256_sub_epi8(vx1, voffset);
6639 vx2 = _mm256_sub_epi8(vx2, voffset);
6640 vx3 = _mm256_sub_epi8(vx3, voffset);
6641 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
6642 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
6643 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable8, vx2));
6644 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable8, vx3));
6645
6646 vx0 = _mm256_subs_epi8(vx0, voffset);
6647 vx1 = _mm256_subs_epi8(vx1, voffset);
6648 vx2 = _mm256_subs_epi8(vx2, voffset);
6649 vx3 = _mm256_subs_epi8(vx3, voffset);
6650 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
6651 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
6652 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable9, vx2));
6653 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable9, vx3));
6654 vx0 = _mm256_subs_epi8(vx0, voffset);
6655 vx1 = _mm256_subs_epi8(vx1, voffset);
6656 vx2 = _mm256_subs_epi8(vx2, voffset);
6657 vx3 = _mm256_subs_epi8(vx3, voffset);
6658 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
6659 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
6660 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableA, vx2));
6661 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableA, vx3));
6662 vx0 = _mm256_subs_epi8(vx0, voffset);
6663 vx1 = _mm256_subs_epi8(vx1, voffset);
6664 vx2 = _mm256_subs_epi8(vx2, voffset);
6665 vx3 = _mm256_subs_epi8(vx3, voffset);
6666 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
6667 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
6668 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableB, vx2));
6669 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableB, vx3));
6670 vx0 = _mm256_subs_epi8(vx0, voffset);
6671 vx1 = _mm256_subs_epi8(vx1, voffset);
6672 vx2 = _mm256_subs_epi8(vx2, voffset);
6673 vx3 = _mm256_subs_epi8(vx3, voffset);
6674 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
6675 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
6676 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableC, vx2));
6677 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableC, vx3));
6678 vx0 = _mm256_subs_epi8(vx0, voffset);
6679 vx1 = _mm256_subs_epi8(vx1, voffset);
6680 vx2 = _mm256_subs_epi8(vx2, voffset);
6681 vx3 = _mm256_subs_epi8(vx3, voffset);
6682 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
6683 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
6684 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableD, vx2));
6685 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableD, vx3));
6686 vx0 = _mm256_subs_epi8(vx0, voffset);
6687 vx1 = _mm256_subs_epi8(vx1, voffset);
6688 vx2 = _mm256_subs_epi8(vx2, voffset);
6689 vx3 = _mm256_subs_epi8(vx3, voffset);
6690 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
6691 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
6692 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableE, vx2));
6693 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableE, vx3));
6694 vx0 = _mm256_subs_epi8(vx0, voffset);
6695 vx1 = _mm256_subs_epi8(vx1, voffset);
6696 vx2 = _mm256_subs_epi8(vx2, voffset);
6697 vx3 = _mm256_subs_epi8(vx3, voffset);
6698 vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
6699 vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
6700 vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableF, vx2));
6701 vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableF, vx3));
6702
6703 _mm256_storeu_si256((__m256i*) y, vy0);
6704 _mm256_storeu_si256((__m256i*) (y + 32), vy1);
6705 _mm256_storeu_si256((__m256i*) (y + 64), vy2);
6706 _mm256_storeu_si256((__m256i*) (y + 96), vy3);
6707 y += 128;
6708 }
6709 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6710 __m128i vx = _mm_loadu_si128((const __m128i*) x);
6711 x += 16;
6712
6713 __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
6714
6715 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6716 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
6717 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6718 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
6719 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6720 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
6721 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6722 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
6723 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6724 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
6725 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6726 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
6727 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6728 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
6729 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6730 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
6731
6732 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6733 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
6734 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6735 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
6736 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6737 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
6738 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6739 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
6740 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6741 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
6742 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6743 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
6744 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6745 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
6746
6747 _mm_storeu_si128((__m128i*) y, vy);
6748 y += 16;
6749 }
6750 if XNN_UNLIKELY(n != 0) {
6751 __m128i vx = _mm_loadu_si128((const __m128i*) x);
6752
6753 __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
6754
6755 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6756 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
6757 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6758 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
6759 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6760 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
6761 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6762 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
6763 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6764 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
6765 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6766 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
6767 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6768 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
6769 vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6770 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
6771
6772 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6773 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
6774 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6775 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
6776 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6777 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
6778 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6779 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
6780 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6781 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
6782 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6783 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
6784 vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6785 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
6786
6787 if (n & (8 * sizeof(uint8_t))) {
6788 _mm_storel_epi64((__m128i*) y, vy);
6789 vy = _mm_unpackhi_epi64(vy, vy);
6790 y += 8;
6791 }
6792 if (n & (4 * sizeof(uint8_t))) {
6793 _mm_storeu_si32(y, vy);
6794 vy = _mm_srli_epi64(vy, 32);
6795 y += 4;
6796 }
6797 if (n & (2 * sizeof(uint8_t))) {
6798 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
6799 vy = _mm_srli_epi32(vy, 16);
6800 y += 2;
6801 }
6802 if (n & (1 * sizeof(uint8_t))) {
6803 *y = (uint8_t) _mm_extract_epi8(vy, 0);
6804 }
6805 }
6806 }
6807