1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <immintrin.h>
9 
10 #include <xnnpack/common.h>
11 #include <xnnpack/dwconv.h>
12 #include <xnnpack/gemm.h>
13 #include <xnnpack/igemm.h>
14 #include <xnnpack/intrinsics-polyfill.h>
15 #include <xnnpack/lut.h>
16 #include <xnnpack/math.h>
17 #include <xnnpack/vaddsub.h>
18 #include <xnnpack/vcvt.h>
19 #include <xnnpack/vunary.h>
20 
21 
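// F16 GEMM microkernel with min/max clamping, computing a 1x16 output tile.
// Inputs, weights, and outputs are IEEE half-precision values stored as uint16_t
// bit patterns; arithmetic uses F16C conversions and f32 FMA, rounding the
// accumulators back to f16 after every operation to match native f16 behavior.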
22 void xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(
23     size_t mr,
24     size_t nc,
25     size_t kc,
26     const void*restrict a,
27     size_t a_stride,
28     const void*restrict w,
29     void*restrict c,
30     size_t cm_stride,
31     size_t cn_stride,
32     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34   assert(mr != 0);
35   assert(mr <= 1);
36   assert(nc != 0);
37   assert(kc != 0);
38   assert(kc % sizeof(uint16_t) == 0);
39   assert(a != NULL);
40   assert(w != NULL);
41   assert(c != NULL);
42 
43   const uint16_t* a0 = a;
44   uint16_t* c0 = c;
45 
46   do {
47     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
48     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
49     w = (const uint16_t*) w + 16;
50 
51     size_t k = kc;
52     do {
53       const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
54       a0 += 1;
55 
56       const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
57       const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
58       w = (const uint16_t*) w + 16;
59 
60       vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
61       vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
62 
63       k -= sizeof(uint16_t);
64     } while (k != 0);
65 
66     const __m256 vscale = _mm256_load_ps(params->avx.scale);
67     vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
68     vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
69 
70     const __m256 vmin = _mm256_load_ps(params->avx.min);
71     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
72     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
73 
74     const __m256 vmax = _mm256_load_ps(params->avx.max);
75     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
76     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
77 
78     if XNN_LIKELY(nc >= 16) {
79       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
80       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
81       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
82 
83       a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
84 
85       nc -= 16;
86     } else {
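      // Final partial tile: store the remaining 1-15 f16 outputs 8/4/2/1 elements at a time.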
87       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
88       if (nc & 8) {
89         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
90 
91         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
92 
93         c0 += 8;
94       }
95       if (nc & 4) {
96         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
97 
98         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
99 
100         c0 += 4;
101       }
102       if (nc & 2) {
103         _mm_storeu_si32(c0, vh0x01234567);
104 
105         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
106 
107         c0 += 2;
108       }
109       if (nc & 1) {
110         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
111       }
112 
113       nc = 0;
114     }
115   } while (nc != 0);
116 }
117 
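// F16 GEMM microkernel computing a 4x16 output tile. Same structure as the 1x16
// kernel above, with four A rows broadcast per K-loop iteration; when mr < 4 the
// unused row pointers alias the previous row so stores stay in bounds.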
118 void xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(
119     size_t mr,
120     size_t nc,
121     size_t kc,
122     const void*restrict a,
123     size_t a_stride,
124     const void*restrict w,
125     void*restrict c,
126     size_t cm_stride,
127     size_t cn_stride,
128     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
129 {
130   assert(mr != 0);
131   assert(mr <= 4);
132   assert(nc != 0);
133   assert(kc != 0);
134   assert(kc % sizeof(uint16_t) == 0);
135   assert(a != NULL);
136   assert(w != NULL);
137   assert(c != NULL);
138 
139   const uint16_t* a0 = a;
140   uint16_t* c0 = c;
141   const uint16_t* a1 = (const uint16_t*) ((uintptr_t) a0 + a_stride);
142   uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
143   if XNN_UNPREDICTABLE(mr < 2) {
144     a1 = a0;
145     c1 = c0;
146   }
147   const uint16_t* a2 = (const uint16_t*) ((uintptr_t) a1 + a_stride);
148   uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
149   if XNN_UNPREDICTABLE(mr <= 2) {
150     a2 = a1;
151     c2 = c1;
152   }
153   const uint16_t* a3 = (const uint16_t*) ((uintptr_t) a2 + a_stride);
154   uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
155   if XNN_UNPREDICTABLE(mr != 4) {
156     a3 = a2;
157     c3 = c2;
158   }
159 
160   do {
161     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
162     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
163     __m256 vacc1x01234567 = vacc0x01234567;
164     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
165     __m256 vacc2x01234567 = vacc0x01234567;
166     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
167     __m256 vacc3x01234567 = vacc0x01234567;
168     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
169     w = (const uint16_t*) w + 16;
170 
171     size_t k = kc;
172     do {
173       const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
174       a0 += 1;
175       const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
176       a1 += 1;
177       const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
178       a2 += 1;
179       const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
180       a3 += 1;
181 
182       const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
183       const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
184       w = (const uint16_t*) w + 16;
185 
186       vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
187       vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
188       vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
189       vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
190       vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
191       vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
192       vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
193       vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
194 
195       k -= sizeof(uint16_t);
196     } while (k != 0);
197 
198     const __m256 vscale = _mm256_load_ps(params->avx.scale);
199     vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
200     vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
201     vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
202     vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
203     vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
204     vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
205     vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
206     vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
207 
208     const __m256 vmin = _mm256_load_ps(params->avx.min);
209     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
210     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
211     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
212     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
213     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
214     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
215     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
216     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
217 
218     const __m256 vmax = _mm256_load_ps(params->avx.max);
219     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
220     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
221     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
222     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
223     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
224     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
225     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
226     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
227 
228     if XNN_LIKELY(nc >= 16) {
229       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
230       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
231       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
232       _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
233       _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
234       c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
235       _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
236       _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
237       c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
238       _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
239       _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
240       c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
241 
242       a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
243       a1 = (const uint16_t*) ((uintptr_t) a1 - kc);
244       a2 = (const uint16_t*) ((uintptr_t) a2 - kc);
245       a3 = (const uint16_t*) ((uintptr_t) a3 - kc);
246 
247       nc -= 16;
248     } else {
249       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
250       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
251       __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
252       __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
253       if (nc & 8) {
254         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
255         _mm_storeu_si128((__m128i*) c1, vh1x01234567);
256         _mm_storeu_si128((__m128i*) c2, vh2x01234567);
257         _mm_storeu_si128((__m128i*) c3, vh3x01234567);
258 
259         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
260         vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
261         vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
262         vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
263 
264         c0 += 8;
265         c1 += 8;
266         c2 += 8;
267         c3 += 8;
268       }
269       if (nc & 4) {
270         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
271         _mm_storel_epi64((__m128i*) c1, vh1x01234567);
272         _mm_storel_epi64((__m128i*) c2, vh2x01234567);
273         _mm_storel_epi64((__m128i*) c3, vh3x01234567);
274 
275         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
276         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
277         vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
278         vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
279 
280         c0 += 4;
281         c1 += 4;
282         c2 += 4;
283         c3 += 4;
284       }
285       if (nc & 2) {
286         _mm_storeu_si32(c0, vh0x01234567);
287         _mm_storeu_si32(c1, vh1x01234567);
288         _mm_storeu_si32(c2, vh2x01234567);
289         _mm_storeu_si32(c3, vh3x01234567);
290 
291         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
292         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
293         vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
294         vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
295 
296         c0 += 2;
297         c1 += 2;
298         c2 += 2;
299         c3 += 2;
300       }
301       if (nc & 1) {
302         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
303         *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
304         *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
305         *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
306       }
307 
308       nc = 0;
309     }
310   } while (nc != 0);
311 }
312 
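// F16 IGEMM (indirect GEMM) microkernel computing a 1x16 output tile. A is read
// through an indirection buffer of row pointers; pointers equal to `zero` are left
// unadjusted by a_offset so they keep referencing the shared zero buffer.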
313 void xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(
314     size_t mr,
315     size_t nc,
316     size_t kc,
317     size_t ks,
318     const void**restrict a,
319     const void*restrict w,
320     void*restrict c,
321     size_t cm_stride,
322     size_t cn_stride,
323     size_t a_offset,
324     const void* zero,
325     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
326 {
327   assert(mr != 0);
328   assert(mr <= 1);
329   assert(nc != 0);
330   assert(kc != 0);
331   assert(kc % sizeof(uint16_t) == 0);
332   assert(ks != 0);
333   assert(ks % (1 * sizeof(void*)) == 0);
334   assert(a_offset % sizeof(uint16_t) == 0);
335   assert(a != NULL);
336   assert(w != NULL);
337   assert(c != NULL);
338 
339   uint16_t* c0 = c;
340 
341   do {
342     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
343     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
344     w = (const uint16_t*) w + 16;
345 
346     size_t p = ks;
347     do {
348       const uint16_t* restrict a0 = (const uint16_t*) a[0];
349       assert(a0 != NULL);
350       if XNN_UNPREDICTABLE(a0 != zero) {
351         a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
352       }
353       a += 1;
354 
355       size_t k = kc;
356       do {
357         const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
358         const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
359         w = (const uint16_t*) w + 16;
360 
361         const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
362         a0 += 1;
363 
364         vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
365         vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
366 
367         k -= sizeof(uint16_t);
368       } while (k != 0);
369       p -= 1 * sizeof(void*);
370     } while (p != 0);
371 
372     const __m256 vscale = _mm256_load_ps(params->avx.scale);
373     vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
374     vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
375 
376     const __m256 vmin = _mm256_load_ps(params->avx.min);
377     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
378     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
379 
380     const __m256 vmax = _mm256_load_ps(params->avx.max);
381     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
382     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
383 
384     if XNN_LIKELY(nc >= 16) {
385       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
386       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
387       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
388 
389       a = (const void**restrict) ((uintptr_t) a - ks);
390       nc -= 16;
391     } else {
392       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
393       if (nc & 8) {
394         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
395 
396         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
397 
398         c0 += 8;
399       }
400       if (nc & 4) {
401         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
402 
403         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
404 
405         c0 += 4;
406       }
407       if (nc & 2) {
408         _mm_storeu_si32(c0, vh0x01234567);
409 
410         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
411 
412         c0 += 2;
413       }
414       if (nc & 1) {
415         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
416       }
417 
418       nc = 0;
419     }
420   } while (nc != 0);
421 }
422 
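// F16 IGEMM microkernel computing a 4x16 output tile: four row pointers are
// fetched from the indirection buffer per outer-loop iteration.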
423 void xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(
424     size_t mr,
425     size_t nc,
426     size_t kc,
427     size_t ks,
428     const void**restrict a,
429     const void*restrict w,
430     void*restrict c,
431     size_t cm_stride,
432     size_t cn_stride,
433     size_t a_offset,
434     const void* zero,
435     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
436 {
437   assert(mr != 0);
438   assert(mr <= 4);
439   assert(nc != 0);
440   assert(kc != 0);
441   assert(kc % sizeof(uint16_t) == 0);
442   assert(ks != 0);
443   assert(ks % (4 * sizeof(void*)) == 0);
444   assert(a_offset % sizeof(uint16_t) == 0);
445   assert(a != NULL);
446   assert(w != NULL);
447   assert(c != NULL);
448 
449   uint16_t* c0 = c;
450   uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
451   if XNN_UNPREDICTABLE(mr < 2) {
452     c1 = c0;
453   }
454   uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
455   if XNN_UNPREDICTABLE(mr <= 2) {
456     c2 = c1;
457   }
458   uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
459   if XNN_UNPREDICTABLE(mr != 4) {
460     c3 = c2;
461   }
462 
463   do {
464     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
465     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
466     __m256 vacc1x01234567 = vacc0x01234567;
467     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
468     __m256 vacc2x01234567 = vacc0x01234567;
469     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
470     __m256 vacc3x01234567 = vacc0x01234567;
471     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
472     w = (const uint16_t*) w + 16;
473 
474     size_t p = ks;
475     do {
476       const uint16_t* restrict a0 = (const uint16_t*) a[0];
477       assert(a0 != NULL);
478       if XNN_UNPREDICTABLE(a0 != zero) {
479         a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
480       }
481       const uint16_t* restrict a1 = (const uint16_t*) a[1];
482       assert(a1 != NULL);
483       if XNN_UNPREDICTABLE(a1 != zero) {
484         a1 = (const uint16_t*) ((uintptr_t) a1 + a_offset);
485       }
486       const uint16_t* restrict a2 = (const uint16_t*) a[2];
487       assert(a2 != NULL);
488       if XNN_UNPREDICTABLE(a2 != zero) {
489         a2 = (const uint16_t*) ((uintptr_t) a2 + a_offset);
490       }
491       const uint16_t* restrict a3 = (const uint16_t*) a[3];
492       assert(a3 != NULL);
493       if XNN_UNPREDICTABLE(a3 != zero) {
494         a3 = (const uint16_t*) ((uintptr_t) a3 + a_offset);
495       }
496       a += 4;
497 
498       size_t k = kc;
499       do {
500         const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
501         const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
502         w = (const uint16_t*) w + 16;
503 
504         const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
505         a0 += 1;
506         const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
507         a1 += 1;
508         const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
509         a2 += 1;
510         const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
511         a3 += 1;
512 
513         vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
514         vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
515         vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
516         vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
517         vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
518         vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
519         vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
520         vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
521 
522         k -= sizeof(uint16_t);
523       } while (k != 0);
524       p -= 4 * sizeof(void*);
525     } while (p != 0);
526 
527     const __m256 vscale = _mm256_load_ps(params->avx.scale);
528     vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
529     vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
530     vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
531     vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
532     vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
533     vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
534     vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
535     vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
536 
537     const __m256 vmin = _mm256_load_ps(params->avx.min);
538     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
539     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
540     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
541     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
542     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
543     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
544     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
545     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
546 
547     const __m256 vmax = _mm256_load_ps(params->avx.max);
548     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
549     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
550     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
551     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
552     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
553     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
554     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
555     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
556 
557     if XNN_LIKELY(nc >= 16) {
558       _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
559       _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
560       c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
561       _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
562       _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
563       c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
564       _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
565       _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
566       c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
567       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
568       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
569       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
570 
571       a = (const void**restrict) ((uintptr_t) a - ks);
572       nc -= 16;
573     } else {
574       __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
575       __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
576       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
577       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
578       if (nc & 8) {
579         _mm_storeu_si128((__m128i*) c3, vh3x01234567);
580         _mm_storeu_si128((__m128i*) c2, vh2x01234567);
581         _mm_storeu_si128((__m128i*) c1, vh1x01234567);
582         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
583 
584         vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
585         vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
586         vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
587         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
588 
589         c3 += 8;
590         c2 += 8;
591         c1 += 8;
592         c0 += 8;
593       }
594       if (nc & 4) {
595         _mm_storel_epi64((__m128i*) c3, vh3x01234567);
596         _mm_storel_epi64((__m128i*) c2, vh2x01234567);
597         _mm_storel_epi64((__m128i*) c1, vh1x01234567);
598         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
599 
600         vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
601         vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
602         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
603         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
604 
605         c3 += 4;
606         c2 += 4;
607         c1 += 4;
608         c0 += 4;
609       }
610       if (nc & 2) {
611         _mm_storeu_si32(c3, vh3x01234567);
612         _mm_storeu_si32(c2, vh2x01234567);
613         _mm_storeu_si32(c1, vh1x01234567);
614         _mm_storeu_si32(c0, vh0x01234567);
615 
616         vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
617         vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
618         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
619         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
620 
621         c3 += 2;
622         c2 += 2;
623         c1 += 2;
624         c0 += 2;
625       }
626       if (nc & 1) {
627         *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
628         *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
629         *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
630         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
631       }
632 
633       nc = 0;
634     }
635   } while (nc != 0);
636 }
637 
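// F32 -> QS8 (signed 8-bit) conversion microkernel, 64 elements per main-loop
// iteration: scale, clamp to the representable maximum, round-convert to int32,
// pack down to int16, add the output zero point, pack to int8 with signed
// saturation, and clamp to the output minimum.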
638 void xnn_f32_qs8_vcvt_ukernel__avx2_x64(
639     size_t n,
640     const float* x,
641     int8_t* y,
642     const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
643 {
644   assert(n != 0);
645   assert(n % sizeof(float) == 0);
646   assert(x != NULL);
647   assert(y != NULL);
648 
649   const __m256 vscale = _mm256_load_ps(params->avx2.scale);
650   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
651   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
652   const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
653   const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
654 
655   for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
656     __m256 vx01 = _mm256_loadu_ps(x);
657     __m256 vx23 = _mm256_loadu_ps(x + 8);
658     __m256 vx45 = _mm256_loadu_ps(x + 16);
659     __m256 vx67 = _mm256_loadu_ps(x + 24);
660     __m256 vx89 = _mm256_loadu_ps(x + 32);
661     __m256 vxAB = _mm256_loadu_ps(x + 40);
662     __m256 vxCD = _mm256_loadu_ps(x + 48);
663     __m256 vxEF = _mm256_loadu_ps(x + 56);
664     x += 64;
665 
666     vx01 = _mm256_mul_ps(vx01, vscale);
667     vx23 = _mm256_mul_ps(vx23, vscale);
668     vx45 = _mm256_mul_ps(vx45, vscale);
669     vx67 = _mm256_mul_ps(vx67, vscale);
670     vx89 = _mm256_mul_ps(vx89, vscale);
671     vxAB = _mm256_mul_ps(vxAB, vscale);
672     vxCD = _mm256_mul_ps(vxCD, vscale);
673     vxEF = _mm256_mul_ps(vxEF, vscale);
674 
675     vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
676     vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
677     vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
678     vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
679     vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
680     vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
681     vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
682     vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
683 
684     const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
685     const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
686     const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
687     const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
688     const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
689     const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
690     const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
691     const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
692 
693     __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
694     __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
695     __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
696     __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
697 
698     vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
699     vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
700     vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
701     vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
702 
703     const __m256i vy02461357 = _mm256_packs_epi16(vacc0213, vacc4657);
704     const __m256i vy8ACE9BDF = _mm256_packs_epi16(vacc8A9B, vaccCEDF);
705 
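    // The 16-bit -> 8-bit pack works within 128-bit lanes, so the 32-bit output groups
    // come out interleaved; the cross-lane permute with vshuffle_mask restores sequential order.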
706     __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
707     __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
708 
709     vy01234567 = _mm256_max_epi8(vy01234567, voutput_min);
710     vy89ABCDEF = _mm256_max_epi8(vy89ABCDEF, voutput_min);
711 
712     _mm256_storeu_si256((__m256i*) y, vy01234567);
713     _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
714     y += 64;
715   }
716   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
717     __m256 vx = _mm256_loadu_ps(x);
718     vx = _mm256_mul_ps(vx, vscale);
719     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
720     x += 8;
721 
722     const __m256i vacc = _mm256_cvtps_epi32(vx);
723 
724     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
725     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
726     vy = _mm_packs_epi16(vy, vy);
727     vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
728 
729     _mm_storel_epi64((__m128i*) y, vy);
730     y += 8;
731   }
732   if XNN_UNLIKELY(n != 0) {
733     assert(n >= 1 * sizeof(float));
734     assert(n <= 7 * sizeof(float));
735     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
736 
737     __m256 vx = _mm256_maskload_ps(x, vmask);
738     vx = _mm256_mul_ps(vx, vscale);
739     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
740 
741     const __m256i vacc = _mm256_cvtps_epi32(vx);
742 
743     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
744     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
745     vy = _mm_packs_epi16(vy, vy);
746     vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
747 
748     if (n & (4 * sizeof(float))) {
749       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
750       y += 4;
751       vy = _mm_srli_epi64(vy, 32);
752     }
753     if (n & (2 * sizeof(float))) {
754       *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
755       y += 2;
756       vy = _mm_srli_epi32(vy, 16);
757     }
758     if (n & (1 * sizeof(float))) {
759       *y = (int8_t) _mm_extract_epi8(vy, 0);
760     }
761   }
762 }
763 
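// F32 -> QU8 (unsigned 8-bit) conversion microkernel. Same structure as the QS8
// kernel above, but the final narrowing uses unsigned saturation (packus) and the
// minimum clamp uses unsigned byte comparison (max_epu8).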
764 void xnn_f32_qu8_vcvt_ukernel__avx2_x64(
765     size_t n,
766     const float* x,
767     uint8_t* y,
768     const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
769 {
770   assert(n != 0);
771   assert(n % sizeof(float) == 0);
772   assert(x != NULL);
773   assert(y != NULL);
774 
775   const __m256 vscale = _mm256_load_ps(params->avx2.scale);
776   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
777   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
778   const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
779   const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
780 
781   for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
782     __m256 vx01 = _mm256_loadu_ps(x);
783     __m256 vx23 = _mm256_loadu_ps(x + 8);
784     __m256 vx45 = _mm256_loadu_ps(x + 16);
785     __m256 vx67 = _mm256_loadu_ps(x + 24);
786     __m256 vx89 = _mm256_loadu_ps(x + 32);
787     __m256 vxAB = _mm256_loadu_ps(x + 40);
788     __m256 vxCD = _mm256_loadu_ps(x + 48);
789     __m256 vxEF = _mm256_loadu_ps(x + 56);
790     x += 64;
791 
792     vx01 = _mm256_mul_ps(vx01, vscale);
793     vx23 = _mm256_mul_ps(vx23, vscale);
794     vx45 = _mm256_mul_ps(vx45, vscale);
795     vx67 = _mm256_mul_ps(vx67, vscale);
796     vx89 = _mm256_mul_ps(vx89, vscale);
797     vxAB = _mm256_mul_ps(vxAB, vscale);
798     vxCD = _mm256_mul_ps(vxCD, vscale);
799     vxEF = _mm256_mul_ps(vxEF, vscale);
800 
801     vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
802     vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
803     vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
804     vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
805     vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
806     vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
807     vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
808     vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
809 
810     const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
811     const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
812     const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
813     const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
814     const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
815     const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
816     const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
817     const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
818 
819     __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
820     __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
821     __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
822     __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
823 
824     vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
825     vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
826     vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
827     vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
828 
829     const __m256i vy02461357 = _mm256_packus_epi16(vacc0213, vacc4657);
830     const __m256i vy8ACE9BDF = _mm256_packus_epi16(vacc8A9B, vaccCEDF);
831 
832     __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
833     __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
834 
835     vy01234567 = _mm256_max_epu8(vy01234567, voutput_min);
836     vy89ABCDEF = _mm256_max_epu8(vy89ABCDEF, voutput_min);
837 
838     _mm256_storeu_si256((__m256i*) y, vy01234567);
839     _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
840     y += 64;
841   }
842   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
843     __m256 vx = _mm256_loadu_ps(x);
844     vx = _mm256_mul_ps(vx, vscale);
845     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
846     x += 8;
847 
848     const __m256i vacc = _mm256_cvtps_epi32(vx);
849 
850     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
851     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
852     vy = _mm_packus_epi16(vy, vy);
853     vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
854 
855     _mm_storel_epi64((__m128i*) y, vy);
856     y += 8;
857   }
858   if XNN_UNLIKELY(n != 0) {
859     assert(n >= 1 * sizeof(float));
860     assert(n <= 7 * sizeof(float));
861     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
862 
863     __m256 vx = _mm256_maskload_ps(x, vmask);
864     vx = _mm256_mul_ps(vx, vscale);
865     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
866 
867     const __m256i vacc = _mm256_cvtps_epi32(vx);
868 
869     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
870     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
871     vy = _mm_packus_epi16(vy, vy);
872     vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
873 
874     if (n & (4 * sizeof(float))) {
875       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
876       y += 4;
877       vy = _mm_srli_epi64(vy, 32);
878     }
879     if (n & (2 * sizeof(float))) {
880       *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
881       y += 2;
882       vy = _mm_srli_epi32(vy, 16);
883     }
884     if (n & (1 * sizeof(float))) {
885       *y = (uint8_t) _mm_extract_epi8(vy, 0);
886     }
887   }
888 }
889 
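// F32 ELU microkernel, 56 elements per main-loop iteration (rr1_lut4_p4 scheme):
// exp is approximated with one-step range reduction, a 4-entry lookup table selected
// by an in-lane permute, and a degree-4 polynomial. Negative inputs produce
// alpha * expm1(prescale * x); non-negative inputs pass through scaled by beta.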
890 void xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56(
891     size_t n,
892     const float* x,
893     float* y,
894     const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
895 {
896   assert(n % sizeof(float) == 0);
897 
898   const __m256 vprescale = _mm256_load_ps(params->avx2_rr1_lut4_p4.prescale);
899   const __m256 valpha = _mm256_load_ps(params->avx2_rr1_lut4_p4.alpha);
900   const __m256 vbeta = _mm256_load_ps(params->avx2_rr1_lut4_p4.beta);
901   const __m256 vsat_cutoff = _mm256_load_ps(params->avx2_rr1_lut4_p4.sat_cutoff);
902   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_lut4_p4.magic_bias);
903   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_lut4_p4.log2e);
904   const __m256 vtable = _mm256_load_ps(params->avx2_rr1_lut4_p4.table);
905   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.minus_ln2);
906   const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c4);
907   const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c3);
908   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c2);
909 
910   for (; n >= 56 * sizeof(float); n -= 56 * sizeof(float)) {
911     __m256 vx0 = _mm256_loadu_ps(x);
912     __m256 vx1 = _mm256_loadu_ps(x + 8);
913     __m256 vx2 = _mm256_loadu_ps(x + 16);
914     __m256 vx3 = _mm256_loadu_ps(x + 24);
915     __m256 vx4 = _mm256_loadu_ps(x + 32);
916     __m256 vx5 = _mm256_loadu_ps(x + 40);
917     __m256 vx6 = _mm256_loadu_ps(x + 48);
918     x += 56;
919 
920     const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
921     const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
922     const __m256 vz2 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx2, vprescale));
923     const __m256 vz3 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx3, vprescale));
924     const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
925     const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));
926     const __m256 vz6 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx6, vprescale));
927 
928     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
929     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
930     __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
931     __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
932     __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
933     __m256 vn5 = _mm256_fmadd_ps(vz5, vlog2e, vmagic_bias);
934     __m256 vn6 = _mm256_fmadd_ps(vz6, vlog2e, vmagic_bias);
935 
936     const __m256i ven0 = _mm256_slli_epi32(_mm256_castps_si256(vn0), 21);
937     const __m256i vl0 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn0)));
938     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
939     const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);
940     const __m256i vl1 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn1)));
941     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
942     const __m256i ven2 = _mm256_slli_epi32(_mm256_castps_si256(vn2), 21);
943     const __m256i vl2 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn2)));
944     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
945     const __m256i ven3 = _mm256_slli_epi32(_mm256_castps_si256(vn3), 21);
946     const __m256i vl3 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn3)));
947     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
948     const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21);
949     const __m256i vl4 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn4)));
950     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
951     const __m256i ven5 = _mm256_slli_epi32(_mm256_castps_si256(vn5), 21);
952     const __m256i vl5 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn5)));
953     vn5 = _mm256_sub_ps(vn5, vmagic_bias);
954     const __m256i ven6 = _mm256_slli_epi32(_mm256_castps_si256(vn6), 21);
955     const __m256i vl6 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn6)));
956     vn6 = _mm256_sub_ps(vn6, vmagic_bias);
957 
958     __m256 vs0 = _mm256_castsi256_ps(_mm256_add_epi32(vl0, ven0));
959     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
960     __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));
961     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
962     __m256 vs2 = _mm256_castsi256_ps(_mm256_add_epi32(vl2, ven2));
963     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
964     __m256 vs3 = _mm256_castsi256_ps(_mm256_add_epi32(vl3, ven3));
965     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
966     __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4));
967     __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
968     __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));
969     __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vz5);
970     __m256 vs6 = _mm256_castsi256_ps(_mm256_add_epi32(vl6, ven6));
971     __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vz6);
972 
973     __m256 vp0 = _mm256_fmadd_ps(vc4, vt0, vc3);
974     __m256 vp1 = _mm256_fmadd_ps(vc4, vt1, vc3);
975     __m256 vp2 = _mm256_fmadd_ps(vc4, vt2, vc3);
976     __m256 vp3 = _mm256_fmadd_ps(vc4, vt3, vc3);
977     __m256 vp4 = _mm256_fmadd_ps(vc4, vt4, vc3);
978     __m256 vp5 = _mm256_fmadd_ps(vc4, vt5, vc3);
979     __m256 vp6 = _mm256_fmadd_ps(vc4, vt6, vc3);
980 
981     vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
982     vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
983     vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
984     vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
985     vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
986     vp5 = _mm256_fmadd_ps(vp5, vt5, vc2);
987     vp6 = _mm256_fmadd_ps(vp6, vt6, vc2);
988 
989     vp0 = _mm256_mul_ps(vp0, vt0);
990     vt0 = _mm256_mul_ps(vt0, vs0);
991     vp1 = _mm256_mul_ps(vp1, vt1);
992     vt1 = _mm256_mul_ps(vt1, vs1);
993     vp2 = _mm256_mul_ps(vp2, vt2);
994     vt2 = _mm256_mul_ps(vt2, vs2);
995     vp3 = _mm256_mul_ps(vp3, vt3);
996     vt3 = _mm256_mul_ps(vt3, vs3);
997     vp4 = _mm256_mul_ps(vp4, vt4);
998     vt4 = _mm256_mul_ps(vt4, vs4);
999     vp5 = _mm256_mul_ps(vp5, vt5);
1000     vt5 = _mm256_mul_ps(vt5, vs5);
1001     vp6 = _mm256_mul_ps(vp6, vt6);
1002     vt6 = _mm256_mul_ps(vt6, vs6);
1003 
1004     vs0 = _mm256_fmsub_ps(vs0, valpha, valpha);
1005     vp0 = _mm256_fmadd_ps(vp0, vt0, vt0);
1006     vs1 = _mm256_fmsub_ps(vs1, valpha, valpha);
1007     vp1 = _mm256_fmadd_ps(vp1, vt1, vt1);
1008     vs2 = _mm256_fmsub_ps(vs2, valpha, valpha);
1009     vp2 = _mm256_fmadd_ps(vp2, vt2, vt2);
1010     vs3 = _mm256_fmsub_ps(vs3, valpha, valpha);
1011     vp3 = _mm256_fmadd_ps(vp3, vt3, vt3);
1012     vs4 = _mm256_fmsub_ps(vs4, valpha, valpha);
1013     vp4 = _mm256_fmadd_ps(vp4, vt4, vt4);
1014     vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
1015     vp5 = _mm256_fmadd_ps(vp5, vt5, vt5);
1016     vs6 = _mm256_fmsub_ps(vs6, valpha, valpha);
1017     vp6 = _mm256_fmadd_ps(vp6, vt6, vt6);
1018 
1019     const __m256 ve0 = _mm256_fmadd_ps(vp0, valpha, vs0);
1020     vx0 = _mm256_mul_ps(vx0, vbeta);
1021     const __m256 ve1 = _mm256_fmadd_ps(vp1, valpha, vs1);
1022     vx1 = _mm256_mul_ps(vx1, vbeta);
1023     const __m256 ve2 = _mm256_fmadd_ps(vp2, valpha, vs2);
1024     vx2 = _mm256_mul_ps(vx2, vbeta);
1025     const __m256 ve3 = _mm256_fmadd_ps(vp3, valpha, vs3);
1026     vx3 = _mm256_mul_ps(vx3, vbeta);
1027     const __m256 ve4 = _mm256_fmadd_ps(vp4, valpha, vs4);
1028     vx4 = _mm256_mul_ps(vx4, vbeta);
1029     const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
1030     vx5 = _mm256_mul_ps(vx5, vbeta);
1031     const __m256 ve6 = _mm256_fmadd_ps(vp6, valpha, vs6);
1032     vx6 = _mm256_mul_ps(vx6, vbeta);
1033 
1034     const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
1035     const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
1036     const __m256 vy2 = _mm256_blendv_ps(vx2, ve2, vx2);
1037     const __m256 vy3 = _mm256_blendv_ps(vx3, ve3, vx3);
1038     const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
1039     const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);
1040     const __m256 vy6 = _mm256_blendv_ps(vx6, ve6, vx6);
1041 
1042     _mm256_storeu_ps(y, vy0);
1043     _mm256_storeu_ps(y + 8, vy1);
1044     _mm256_storeu_ps(y + 16, vy2);
1045     _mm256_storeu_ps(y + 24, vy3);
1046     _mm256_storeu_ps(y + 32, vy4);
1047     _mm256_storeu_ps(y + 40, vy5);
1048     _mm256_storeu_ps(y + 48, vy6);
1049     y += 56;
1050   }
1051   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1052     __m256 vx = _mm256_loadu_ps(x);
1053     x += 8;
1054 
1055     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1056 
1057     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1058     const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
1059     const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
1060     __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
1061     vn = _mm256_sub_ps(vn, vmagic_bias);
1062 
1063     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1064 
1065     __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
1066     vp = _mm256_fmadd_ps(vp, vt, vc2);
1067     vp = _mm256_mul_ps(vp, vt);
1068 
1069     vt = _mm256_mul_ps(vt, vs);
1070     vs = _mm256_fmsub_ps(vs, valpha, valpha);
1071     vp = _mm256_fmadd_ps(vp, vt, vt);
1072     const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
1073 
1074     vx = _mm256_mul_ps(vx, vbeta);
1075     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1076 
1077     _mm256_storeu_ps(y, vy);
1078     y += 8;
1079   }
1080   if XNN_UNLIKELY(n != 0) {
1081     assert(n >= 1 * sizeof(float));
1082     assert(n <= 7 * sizeof(float));
1083     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_lut4_p4.mask_table[7] - n));
1084 
1085     __m256 vx = _mm256_maskload_ps(x, vmask);
1086 
1087     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1088 
1089     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1090     const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
1091     const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
1092     __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
1093     vn = _mm256_sub_ps(vn, vmagic_bias);
1094 
1095     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1096 
1097     __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
1098     vp = _mm256_fmadd_ps(vp, vt, vc2);
1099     vp = _mm256_mul_ps(vp, vt);
1100 
1101     vt = _mm256_mul_ps(vt, vs);
1102     vs = _mm256_fmsub_ps(vs, valpha, valpha);
1103     vp = _mm256_fmadd_ps(vp, vt, vt);
1104     const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
1105 
1106     vx = _mm256_mul_ps(vx, vbeta);
1107     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1108 
1109     __m128 vy_lo = _mm256_castps256_ps128(vy);
1110     if (n & (4 * sizeof(float))) {
1111       _mm_storeu_ps(y, vy_lo);
1112       vy_lo = _mm256_extractf128_ps(vy, 1);
1113       y += 4;
1114     }
1115     if (n & (2 * sizeof(float))) {
1116       _mm_storel_pi((__m64*) y, vy_lo);
1117       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
1118       y += 2;
1119     }
1120     if (n & (1 * sizeof(float))) {
1121       _mm_store_ss(y, vy_lo);
1122     }
1123   }
1124 }
1125 
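// F32 sigmoid microkernel, 40 elements per main-loop iteration (rr1_p5 scheme):
// e = exp(-|x|) is evaluated with one-step range reduction and a degree-5 polynomial,
// f = e / (e + 1) is flushed to zero below the denormal cutoff, and the result is
// reflected to 1 - f for non-negative inputs via a sign-based blend.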
1126 void xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40(
1127     size_t n,
1128     const float* x,
1129     float* y,
1130     const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
1131 {
1132   assert(n % sizeof(float) == 0);
1133 
1134   const __m256 vsign_mask = _mm256_load_ps(params->avx2_rr1_p5.sign_mask);
1135   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p5.magic_bias);
1136   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p5.log2e);
1137   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p5.minus_ln2);
1138   const __m256 vc5 = _mm256_load_ps(params->avx2_rr1_p5.c5);
1139   const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_p5.c4);
1140   const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_p5.c3);
1141   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p5.c2);
1142   const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p5.c1);
1143   const __m256 vone = _mm256_load_ps(params->avx2_rr1_p5.one);
1144   const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx2_rr1_p5.denorm_cutoff);
1145 
1146   for (; n >= 40 * sizeof(float); n -= 40 * sizeof(float)) {
1147     const __m256 vx0 = _mm256_loadu_ps(x);
1148     const __m256 vx1 = _mm256_loadu_ps(x + 8);
1149     const __m256 vx2 = _mm256_loadu_ps(x + 16);
1150     const __m256 vx3 = _mm256_loadu_ps(x + 24);
1151     const __m256 vx4 = _mm256_loadu_ps(x + 32);
1152     x += 40;
1153 
1154     const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
1155     const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
1156     const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
1157     const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
1158     const __m256 vz4 = _mm256_or_ps(vx4, vsign_mask);
1159 
1160     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
1161     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
1162     __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
1163     __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
1164     __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
1165 
1166     const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
1167     const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
1168     const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
1169     const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));
1170     const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23));
1171 
1172     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1173     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1174     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
1175     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
1176     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
1177 
1178     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
1179     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
1180     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
1181     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
1182     __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
1183 
1184     __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4);
1185     __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4);
1186     __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4);
1187     __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4);
1188     __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
1189 
1190     vp0 = _mm256_fmadd_ps(vp0, vt0, vc3);
1191     vp1 = _mm256_fmadd_ps(vp1, vt1, vc3);
1192     vp2 = _mm256_fmadd_ps(vp2, vt2, vc3);
1193     vp3 = _mm256_fmadd_ps(vp3, vt3, vc3);
1194     vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
1195 
1196     vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
1197     vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
1198     vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
1199     vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
1200     vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
1201 
1202     vp0 = _mm256_fmadd_ps(vp0, vt0, vc1);
1203     vp1 = _mm256_fmadd_ps(vp1, vt1, vc1);
1204     vp2 = _mm256_fmadd_ps(vp2, vt2, vc1);
1205     vp3 = _mm256_fmadd_ps(vp3, vt3, vc1);
1206     vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
1207 
1208     vt0 = _mm256_mul_ps(vt0, vs0);
1209     vt1 = _mm256_mul_ps(vt1, vs1);
1210     vt2 = _mm256_mul_ps(vt2, vs2);
1211     vt3 = _mm256_mul_ps(vt3, vs3);
1212     vt4 = _mm256_mul_ps(vt4, vs4);
1213 
1214     const __m256 ve0 = _mm256_fmadd_ps(vt0, vp0, vs0);
1215     const __m256 ve1 = _mm256_fmadd_ps(vt1, vp1, vs1);
1216     const __m256 ve2 = _mm256_fmadd_ps(vt2, vp2, vs2);
1217     const __m256 ve3 = _mm256_fmadd_ps(vt3, vp3, vs3);
1218     const __m256 ve4 = _mm256_fmadd_ps(vt4, vp4, vs4);
1219 
1220     const __m256 vd0 = _mm256_add_ps(ve0, vone);
1221     const __m256 vd1 = _mm256_add_ps(ve1, vone);
1222     const __m256 vd2 = _mm256_add_ps(ve2, vone);
1223     const __m256 vd3 = _mm256_add_ps(ve3, vone);
1224     const __m256 vd4 = _mm256_add_ps(ve4, vone);
1225 
1226     __m256 vf0 = _mm256_div_ps(ve0, vd0);
1227     __m256 vf1 = _mm256_div_ps(ve1, vd1);
1228     __m256 vf2 = _mm256_div_ps(ve2, vd2);
1229     __m256 vf3 = _mm256_div_ps(ve3, vd3);
1230     __m256 vf4 = _mm256_div_ps(ve4, vd4);
1231 
1232     vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
1233     vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
1234     vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
1235     vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
1236     vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vz4, vdenorm_cutoff, _CMP_LT_OS), vf4);
1237 
1238     vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
1239     vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
1240     vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
1241     vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
1242     vf4 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf4), vf4, vx4);
1243 
1244     _mm256_storeu_ps(y, vf0);
1245     _mm256_storeu_ps(y + 8, vf1);
1246     _mm256_storeu_ps(y + 16, vf2);
1247     _mm256_storeu_ps(y + 24, vf3);
1248     _mm256_storeu_ps(y + 32, vf4);
1249     y += 40;
1250   }
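    // The unrolled 40-element loop above is followed by an 8-element loop and,
    // last, a masked-load / partial-store path for the final 1-7 floats.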
1251   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1252     const __m256 vx = _mm256_loadu_ps(x);
1253     x += 8;
1254 
1255     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1256 
1257     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1258     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1259     vn = _mm256_sub_ps(vn, vmagic_bias);
1260 
1261     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1262 
1263     __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
1264     vp = _mm256_fmadd_ps(vp, vt, vc3);
1265     vp = _mm256_fmadd_ps(vp, vt, vc2);
1266     vp = _mm256_fmadd_ps(vp, vt, vc1);
1267 
1268     vt = _mm256_mul_ps(vt, vs);
1269     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1270 
1271     const __m256 vd = _mm256_add_ps(ve, vone);
1272     __m256 vf = _mm256_div_ps(ve, vd);
1273 
1274     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1275     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1276 
1277     _mm256_storeu_ps(y, vf);
1278     y += 8;
1279   }
1280   if XNN_UNLIKELY(n != 0) {
1281     assert(n >= 1 * sizeof(float));
1282     assert(n <= 7 * sizeof(float));
1283     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_p5.mask_table[7] - n));
1284 
1285     const __m256 vx = _mm256_maskload_ps(x, vmask);
1286 
1287     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1288 
1289     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1290     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1291     vn = _mm256_sub_ps(vn, vmagic_bias);
1292 
1293     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1294 
1295     __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
1296     vp = _mm256_fmadd_ps(vp, vt, vc3);
1297     vp = _mm256_fmadd_ps(vp, vt, vc2);
1298     vp = _mm256_fmadd_ps(vp, vt, vc1);
1299 
1300     vt = _mm256_mul_ps(vt, vs);
1301     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1302 
1303     const __m256 vd = _mm256_add_ps(ve, vone);
1304     __m256 vf = _mm256_div_ps(ve, vd);
1305 
1306     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1307     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1308 
1309     __m128 vf_lo = _mm256_castps256_ps128(vf);
1310     if (n & (4 * sizeof(float))) {
1311       _mm_storeu_ps(y, vf_lo);
1312       vf_lo = _mm256_extractf128_ps(vf, 1);
1313       y += 4;
1314     }
1315     if (n & (2 * sizeof(float))) {
1316       _mm_storel_pi((__m64*) y, vf_lo);
1317       vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
1318       y += 2;
1319     }
1320     if (n & (1 * sizeof(float))) {
1321       _mm_store_ss(y, vf_lo);
1322     }
1323   }
1324 }
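// Editorial note: the kernel above computes sigmoid(x) = 1 / (1 + exp(-x)) by
// approximating e = exp(-|x|) with the rr1_p5 scheme: z = -|x|, n = round(z *
// log2(e)) via the magic-bias trick, s = 2**n reconstructed through the float
// exponent field, t = z - n*ln2, a degree-5 polynomial p(t), e = s + (t*s)*p,
// then f = e / (e + 1), with inputs below the denormal cutoff flushed to zero
// and f mirrored to 1 - f for non-negative x. A minimal scalar sketch of the
// same end result (using libm's expf in place of the polynomial; the helper
// name is illustrative, assumes <math.h>, and is not part of XNNPACK):
//
//   static float sigmoid_ref(float x) {
//     const float e = expf(-fabsf(x));    // the kernel approximates this term
//     const float f = e / (e + 1.0f);     // sigmoid(-|x|)
//     return signbit(x) ? f : 1.0f - f;   // mirror for x >= 0
//   }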
1325 
1326 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
1327     size_t channels,
1328     size_t output_width,
1329     const int8_t** input,
1330     const void* weights,
1331     int8_t* output,
1332     size_t input_stride,
1333     size_t output_increment,
1334     size_t input_offset,
1335     const int8_t* zero,
1336     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1337 {
1338   assert(channels != 0);
1339   assert(output_width != 0);
1340 
1341   do {
1342     const int8_t* i0 = input[0];
1343     assert(i0 != NULL);
1344     if XNN_UNPREDICTABLE(i0 != zero) {
1345       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1346     }
1347     const int8_t* i1 = input[1];
1348     assert(i1 != NULL);
1349     if XNN_UNPREDICTABLE(i1 != zero) {
1350       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1351     }
1352     const int8_t* i2 = input[2];
1353     assert(i2 != NULL);
1354     if XNN_UNPREDICTABLE(i2 != zero) {
1355       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1356     }
1357     const int8_t* i3 = input[3];
1358     assert(i3 != NULL);
1359     if XNN_UNPREDICTABLE(i3 != zero) {
1360       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
1361     }
1362     const int8_t* i4 = input[4];
1363     assert(i4 != NULL);
1364     if XNN_UNPREDICTABLE(i4 != zero) {
1365       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
1366     }
1367     const int8_t* i5 = input[5];
1368     assert(i5 != NULL);
1369     if XNN_UNPREDICTABLE(i5 != zero) {
1370       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
1371     }
1372     const int8_t* i6 = input[6];
1373     assert(i6 != NULL);
1374     if XNN_UNPREDICTABLE(i6 != zero) {
1375       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
1376     }
1377     const int8_t* i7 = input[7];
1378     assert(i7 != NULL);
1379     if XNN_UNPREDICTABLE(i7 != zero) {
1380       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
1381     }
1382     const int8_t* i8 = input[8];
1383     assert(i8 != NULL);
1384     if XNN_UNPREDICTABLE(i8 != zero) {
1385       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
1386     }
1387     const int8_t* i9 = input[9];
1388     assert(i9 != NULL);
1389     if XNN_UNPREDICTABLE(i9 != zero) {
1390       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
1391     }
1392     const int8_t* i10 = input[10];
1393     assert(i10 != NULL);
1394     if XNN_UNPREDICTABLE(i10 != zero) {
1395       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
1396     }
1397     const int8_t* i11 = input[11];
1398     assert(i11 != NULL);
1399     if XNN_UNPREDICTABLE(i11 != zero) {
1400       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
1401     }
1402     const int8_t* i12 = input[12];
1403     assert(i12 != NULL);
1404     if XNN_UNPREDICTABLE(i12 != zero) {
1405       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
1406     }
1407     const int8_t* i13 = input[13];
1408     assert(i13 != NULL);
1409     if XNN_UNPREDICTABLE(i13 != zero) {
1410       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
1411     }
1412     const int8_t* i14 = input[14];
1413     assert(i14 != NULL);
1414     if XNN_UNPREDICTABLE(i14 != zero) {
1415       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
1416     }
1417     const int8_t* i15 = input[15];
1418     assert(i15 != NULL);
1419     if XNN_UNPREDICTABLE(i15 != zero) {
1420       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
1421     }
1422     const int8_t* i16 = input[16];
1423     assert(i16 != NULL);
1424     if XNN_UNPREDICTABLE(i16 != zero) {
1425       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
1426     }
1427     const int8_t* i17 = input[17];
1428     assert(i17 != NULL);
1429     if XNN_UNPREDICTABLE(i17 != zero) {
1430       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
1431     }
1432     const int8_t* i18 = input[18];
1433     assert(i18 != NULL);
1434     if XNN_UNPREDICTABLE(i18 != zero) {
1435       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
1436     }
1437     const int8_t* i19 = input[19];
1438     assert(i19 != NULL);
1439     if XNN_UNPREDICTABLE(i19 != zero) {
1440       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
1441     }
1442     const int8_t* i20 = input[20];
1443     assert(i20 != NULL);
1444     if XNN_UNPREDICTABLE(i20 != zero) {
1445       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
1446     }
1447     const int8_t* i21 = input[21];
1448     assert(i21 != NULL);
1449     if XNN_UNPREDICTABLE(i21 != zero) {
1450       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
1451     }
1452     const int8_t* i22 = input[22];
1453     assert(i22 != NULL);
1454     if XNN_UNPREDICTABLE(i22 != zero) {
1455       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
1456     }
1457     const int8_t* i23 = input[23];
1458     assert(i23 != NULL);
1459     if XNN_UNPREDICTABLE(i23 != zero) {
1460       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
1461     }
1462     const int8_t* i24 = input[24];
1463     assert(i24 != NULL);
1464     if XNN_UNPREDICTABLE(i24 != zero) {
1465       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
1466     }
1467     input = (const int8_t**) ((uintptr_t) input + input_stride);
1468 
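    // Main loop below: 16 channels per iteration. For each of the 25 taps,
    // 8 int8 inputs and 8 int8 weights are sign-extended to int32 lanes
    // (_mm256_cvtepi8_epi32) and multiply-accumulated into two int32x8
    // accumulators initialized from the per-channel biases at the start of w.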
1469     size_t c = channels;
1470     const void* w = weights;
1471     for (; c >= 16; c -= 16) {
1472       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1473       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
1474 
1475 
1476       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
1477       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
1478       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
1479       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
1480       i0 += 16;
1481 
1482       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
1483       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
1484 
1485       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
1486       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
1487       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
1488       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
1489       i1 += 16;
1490 
1491       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
1492       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
1493 
1494       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
1495       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
1496       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
1497       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
1498       i2 += 16;
1499 
1500       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
1501       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
1502 
1503       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
1504       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
1505       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
1506       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
1507       i3 += 16;
1508 
1509       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
1510       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
1511 
1512       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
1513       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
1514       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
1515       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
1516       i4 += 16;
1517 
1518       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
1519       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
1520 
1521       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
1522       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
1523       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
1524       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
1525       i5 += 16;
1526 
1527       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
1528       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
1529 
1530       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
1531       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
1532       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
1533       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
1534       i6 += 16;
1535 
1536       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
1537       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
1538 
1539       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
1540       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
1541       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
1542       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
1543       i7 += 16;
1544 
1545       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
1546       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
1547 
1548       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
1549       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
1550       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
1551       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
1552       i8 += 16;
1553 
1554       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
1555       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
1556 
1557       const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
1558       const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
1559       const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
1560       const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
1561       i9 += 16;
1562 
1563       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
1564       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
1565 
1566       const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
1567       const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
1568       const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
1569       const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
1570       i10 += 16;
1571 
1572       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
1573       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
1574 
1575       const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
1576       const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
1577       const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
1578       const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
1579       i11 += 16;
1580 
1581       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
1582       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
1583 
1584       const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
1585       const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
1586       const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
1587       const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
1588       i12 += 16;
1589 
1590       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
1591       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
1592 
1593       const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
1594       const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
1595       const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
1596       const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
1597       i13 += 16;
1598 
1599       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
1600       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
1601 
1602       const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
1603       const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
1604       const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
1605       const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
1606       i14 += 16;
1607 
1608       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
1609       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
1610 
1611       const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
1612       const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
1613       const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
1614       const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
1615       i15 += 16;
1616 
1617       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
1618       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
1619 
1620       const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
1621       const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
1622       const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
1623       const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
1624       i16 += 16;
1625 
1626       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
1627       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
1628 
1629       const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
1630       const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
1631       const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
1632       const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
1633       i17 += 16;
1634 
1635       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
1636       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
1637 
1638       const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
1639       const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
1640       const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
1641       const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
1642       i18 += 16;
1643 
1644       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
1645       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
1646 
1647       const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
1648       const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
1649       const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
1650       const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
1651       i19 += 16;
1652 
1653       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
1654       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
1655 
1656       const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
1657       const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
1658       const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
1659       const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
1660       i20 += 16;
1661 
1662       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
1663       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
1664 
1665       const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
1666       const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
1667       const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
1668       const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
1669       i21 += 16;
1670 
1671       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
1672       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
1673 
1674       const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
1675       const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
1676       const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
1677       const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
1678       i22 += 16;
1679 
1680       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
1681       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
1682 
1683       const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
1684       const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
1685       const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
1686       const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
1687       i23 += 16;
1688 
1689       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
1690       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
1691 
1692       const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
1693       const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
1694       const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
1695       const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
1696       i24 += 16;
1697 
1698       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
1699       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
1700 
1701       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
1702 
1703       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
1704       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
1705 
1706       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
1707       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
1708       w = (const void*) ((const float*) w + 16);
1709       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
1710       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
1711 
1712       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
1713       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
1714       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
1715 
1716       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
1717       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
1718 
1719       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
1720       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
1721 
1722       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
1723 
1724       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
1725       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
1726 
1727       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
1728       output += 16;
1729     }
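    // Remainder path: the final 1-15 channels are handled 8 at a time with the
    // same 25-tap accumulation; partial stores below write 4/2/1 bytes.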
1730     if XNN_UNLIKELY(c != 0) {
1731       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
1732       do {
1733         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1734 
1735 
1736         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
1737         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
1738         i0 += 8;
1739 
1740         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
1741 
1742         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
1743         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
1744         i1 += 8;
1745 
1746         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
1747 
1748         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
1749         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
1750         i2 += 8;
1751 
1752         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
1753 
1754         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
1755         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
1756         i3 += 8;
1757 
1758         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
1759 
1760         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
1761         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
1762         i4 += 8;
1763 
1764         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
1765 
1766         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
1767         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
1768         i5 += 8;
1769 
1770         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
1771 
1772         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
1773         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
1774         i6 += 8;
1775 
1776         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
1777 
1778         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
1779         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
1780         i7 += 8;
1781 
1782         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
1783 
1784         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
1785         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
1786         i8 += 8;
1787 
1788         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
1789 
1790         const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
1791         const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
1792         i9 += 8;
1793 
1794         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
1795 
1796         const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
1797         const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
1798         i10 += 8;
1799 
1800         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
1801 
1802         const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
1803         const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
1804         i11 += 8;
1805 
1806         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
1807 
1808         const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
1809         const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
1810         i12 += 8;
1811 
1812         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
1813 
1814         const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
1815         const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
1816         i13 += 8;
1817 
1818         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
1819 
1820         const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
1821         const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
1822         i14 += 8;
1823 
1824         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
1825 
1826         const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
1827         const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
1828         i15 += 8;
1829 
1830         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
1831 
1832         const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
1833         const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
1834         i16 += 8;
1835 
1836         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
1837 
1838         const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
1839         const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
1840         i17 += 8;
1841 
1842         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
1843 
1844         const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
1845         const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
1846         i18 += 8;
1847 
1848         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
1849 
1850         const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
1851         const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
1852         i19 += 8;
1853 
1854         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
1855 
1856         const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
1857         const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
1858         i20 += 8;
1859 
1860         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
1861 
1862         const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
1863         const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
1864         i21 += 8;
1865 
1866         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
1867 
1868         const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
1869         const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
1870         i22 += 8;
1871 
1872         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
1873 
1874         const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
1875         const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
1876         i23 += 8;
1877 
1878         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
1879 
1880         const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
1881         const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
1882         i24 += 8;
1883 
1884         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
1885 
1886         k += 8;
1887 
1888         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
1889         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
1890         vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
1891         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->avx2.output_max_less_zero_point));
1892         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
1893 
1894         w = (const void*) ((const int32_t*) w + 8);
1895 
1896         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
1897         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
1898 
1899         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1900 
1901         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
1902         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1903 
1904         if XNN_LIKELY(c >= 8) {
1905           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1906           output += 8;
1907           c -= 8;
1908         } else {
1909           if (c & 4) {
1910             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
1911             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1912             output += 4;
1913           }
1914           if (c & 2) {
1915             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
1916             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1917             output += 2;
1918           }
1919           if (c & 1) {
1920             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1921             output += 1;
1922           }
1923           c = 0;
1924         }
1925       } while (c != 0);
1926     }
1927 
1928     output = (int8_t*) ((uintptr_t) output + output_increment);
1929   } while (--output_width != 0);
1930 }
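// Editorial note: the fp32 requantization shared by the qc8 kernels in this
// file scales each int32 accumulator by a per-channel float, clamps against
// output_max_less_zero_point, converts with round-to-nearest-even, adds the
// output zero point with saturation, packs to int8 with saturation, and clamps
// against output_min. A scalar sketch of one output lane (assumes <math.h>;
// the helper is illustrative, not an XNNPACK API, and its parameters mirror
// the params->avx2 fields):
//
//   static int8_t requantize_fp32_ref(int32_t acc, float scale,
//                                     float output_max_less_zero_point,
//                                     int32_t output_zero_point,
//                                     int8_t output_min) {
//     float scaled = (float) acc * scale;
//     if (scaled > output_max_less_zero_point) scaled = output_max_less_zero_point;
//     long out = lrintf(scaled) + output_zero_point;  // round-to-nearest-even
//     if (out > 127) out = 127;                       // saturate to int8
//     if (out < -128) out = -128;
//     if (out < output_min) out = output_min;
//     return (int8_t) out;
//   }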
1931 
1932 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
1933     size_t channels,
1934     size_t output_width,
1935     const int8_t** input,
1936     const void* weights,
1937     int8_t* output,
1938     size_t input_stride,
1939     size_t output_increment,
1940     size_t input_offset,
1941     const int8_t* zero,
1942     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1943 {
1944   assert(channels != 0);
1945   assert(output_width != 0);
1946 
1947   do {
1948     const int8_t* i0 = input[0];
1949     assert(i0 != NULL);
1950     if XNN_UNPREDICTABLE(i0 != zero) {
1951       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1952     }
1953     const int8_t* i1 = input[1];
1954     assert(i1 != NULL);
1955     if XNN_UNPREDICTABLE(i1 != zero) {
1956       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1957     }
1958     const int8_t* i2 = input[2];
1959     assert(i2 != NULL);
1960     if XNN_UNPREDICTABLE(i2 != zero) {
1961       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1962     }
1963     const int8_t* i3 = input[3];
1964     assert(i3 != NULL);
1965     if XNN_UNPREDICTABLE(i3 != zero) {
1966       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
1967     }
1968     const int8_t* i4 = input[4];
1969     assert(i4 != NULL);
1970     if XNN_UNPREDICTABLE(i4 != zero) {
1971       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
1972     }
1973     const int8_t* i5 = input[5];
1974     assert(i5 != NULL);
1975     if XNN_UNPREDICTABLE(i5 != zero) {
1976       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
1977     }
1978     const int8_t* i6 = input[6];
1979     assert(i6 != NULL);
1980     if XNN_UNPREDICTABLE(i6 != zero) {
1981       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
1982     }
1983     const int8_t* i7 = input[7];
1984     assert(i7 != NULL);
1985     if XNN_UNPREDICTABLE(i7 != zero) {
1986       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
1987     }
1988     const int8_t* i8 = input[8];
1989     assert(i8 != NULL);
1990     if XNN_UNPREDICTABLE(i8 != zero) {
1991       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
1992     }
1993     input = (const int8_t**) ((uintptr_t) input + input_stride);
1994 
1995     size_t c = channels;
1996     const void* w = weights;
1997     for (; c >= 16; c -= 16) {
1998       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
1999       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
2000 
2001 
2002       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2003       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2004       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
2005       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
2006       i0 += 16;
2007 
2008       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2009       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
2010 
2011       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2012       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2013       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
2014       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
2015       i1 += 16;
2016 
2017       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2018       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
2019 
2020       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2021       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2022       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
2023       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
2024       i2 += 16;
2025 
2026       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2027       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
2028 
2029       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2030       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
2031       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
2032       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
2033       i3 += 16;
2034 
2035       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2036       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
2037 
2038       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2039       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
2040       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
2041       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
2042       i4 += 16;
2043 
2044       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2045       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
2046 
2047       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2048       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
2049       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
2050       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
2051       i5 += 16;
2052 
2053       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2054       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
2055 
2056       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2057       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
2058       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
2059       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
2060       i6 += 16;
2061 
2062       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2063       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
2064 
2065       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2066       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
2067       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
2068       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
2069       i7 += 16;
2070 
2071       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2072       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
2073 
2074       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2075       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
2076       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
2077       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
2078       i8 += 16;
2079 
2080       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2081       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
2082 
2083       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
2084 
2085       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2086       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
2087 
2088       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
2089       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
2090       w = (const void*) ((const float*) w + 16);
2091       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2092       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
2093 
2094       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2095       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
2096       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
2097 
2098       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2099       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
2100 
2101       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2102       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
2103 
2104       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
2105 
2106       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
2107       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
2108 
2109       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2110       output += 16;
2111     }
2112     if XNN_UNLIKELY(c != 0) {
2113       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
2114       do {
2115         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2116 
2117 
2118         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2119         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
2120         i0 += 8;
2121 
2122         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2123 
2124         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2125         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
2126         i1 += 8;
2127 
2128         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2129 
2130         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2131         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
2132         i2 += 8;
2133 
2134         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2135 
2136         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2137         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
2138         i3 += 8;
2139 
2140         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2141 
2142         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2143         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
2144         i4 += 8;
2145 
2146         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2147 
2148         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2149         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
2150         i5 += 8;
2151 
2152         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2153 
2154         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2155         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
2156         i6 += 8;
2157 
2158         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2159 
2160         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2161         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
2162         i7 += 8;
2163 
2164         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2165 
2166         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2167         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
2168         i8 += 8;
2169 
2170         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2171 
2172         k += 8;
2173 
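        // fp32 requantization of this remainder group: convert the int32 sums to
        // float, multiply by the per-channel scale factors located 16 int32 biases
        // plus 144 kernel bytes past w, clamp against the output max, and convert
        // back to int32.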
2174         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2175         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
2176         vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2177         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->avx2.output_max_less_zero_point));
2178         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2179 
2180         w = (const void*) ((const int32_t*) w + 8);
2181 
2182         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
2183         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
2184 
2185         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2186 
2187         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
2188         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
2189 
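        // Write out this group of up to 8 channels: a full 8-byte store while at
        // least 8 channels remain, otherwise 4-/2-/1-byte pieces for the final
        // partial group.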
2190         if XNN_LIKELY(c >= 8) {
2191           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
2192           output += 8;
2193           c -= 8;
2194         } else {
2195           if (c & 4) {
2196             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
2197             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
2198             output += 4;
2199           }
2200           if (c & 2) {
2201             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
2202             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
2203             output += 2;
2204           }
2205           if (c & 1) {
2206             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
2207             output += 1;
2208           }
2209           c = 0;
2210         }
2211       } while (c != 0);
2212     }
2213 
2214     output = (int8_t*) ((uintptr_t) output + output_increment);
2215   } while (--output_width != 0);
2216 }
2217 
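// QC8 GEMM microkernel, 1 row x 8 output channels, K unrolled by 8 ("1x8c8"):
// int8 activations and weights are sign-extended to int16 and accumulated into
// int32 lanes with _mm256_madd_epi16; each 256-bit accumulator carries the
// partial sums for a pair of output channels. A tree of _mm256_hadd_epi32
// followed by a cross-lane permute collapses them into one vector of 8
// per-channel sums, which is then requantized in fp32 using the per-channel
// scales stored inline in the packed weights.
//
// In scalar terms the requantization below is roughly (saturation omitted):
//   float scaled = (float) acc[n] * scale[n];          // per-channel scale
//   scaled = min(scaled, output_max - zero_point);     // upper clamp in fp32
//   int8_t out = max((int8_t) (lrintf(scaled) + zero_point), output_min);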
2218 void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
2219     size_t mr,
2220     size_t nc,
2221     size_t kc,
2222     const int8_t* restrict a,
2223     size_t a_stride,
2224     const void* restrict w,
2225     int8_t* restrict c,
2226     size_t cm_stride,
2227     size_t cn_stride,
2228     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2229 {
2230   assert(mr != 0);
2231   assert(mr <= 1);
2232   assert(nc != 0);
2233   assert(kc != 0);
2234   assert(kc % sizeof(int8_t) == 0);
2235   assert(a != NULL);
2236   assert(w != NULL);
2237   assert(c != NULL);
2238 
2239   kc = round_up_po2(kc, 8);
2240   const int8_t* a0 = a;
2241   int8_t* c0 = c;
2242 
2243   do {
2244     const __m128i vbias0x0 = _mm_loadu_si32(w);
2245     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2246     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2247     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2248     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2249     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2250     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2251     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2252     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2253     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2254     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2255     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2256     w = (const void*) ((const int32_t*) w + 8);
2257 
2258     size_t k = 0;
2259     while (k < kc) {
2260       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2261       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2262       a0 += 8;
2263 
2264       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2265       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2266 
2267       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2268       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2269       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2270 
2271       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2272       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2273       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2274 
2275       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2276       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2277       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2278 
2279       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2280 
2281       w = (const void*) ((const int8_t*) w + 64);
2282       k += 8 * sizeof(int8_t);
2283     }
2284 
2285     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2286     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2287 
2288     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2289 
2290     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2291     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2292 
2293     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2294 
2295     const __m256 vscale01234567 = _mm256_load_ps(w);
2296     w = (const void*) ((const float*) w + 8);
2297     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2298 
2299     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2300     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2301 
2302     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2303 
2304     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2305     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
2306 
2307     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2308 
2309     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
2310 
2311     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2312 
2313     __m128i vout_lo = _mm256_castsi256_si128(vout);
2314     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2315 
2316     if (nc >= 8) {
2317       _mm_storel_epi64((__m128i*) c0, vout_lo);
2318 
2319       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2320 
2321       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2322 
2323       nc -= 8;
2324     } else {
2325       if (nc & 4) {
2326         _mm_storeu_si32(c0, vout_lo);
2327 
2328         c0 += 4;
2329 
2330         vout_lo = _mm_srli_epi64(vout_lo, 32);
2331         vout_hi = _mm_srli_epi64(vout_hi, 32);
2332       }
2333       if (nc & 2) {
2334         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2335 
2336         c0 += 2;
2337 
2338         vout_lo = _mm_srli_epi32(vout_lo, 16);
2339         vout_hi = _mm_srli_epi32(vout_hi, 16);
2340       }
2341       if (nc & 1) {
2342         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2343       }
2344 
2345       nc = 0;
2346     }
2347   } while (nc != 0);
2348 }
2349 
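// Same QC8 GEMM scheme as above, widened to 3 rows (mr up to 3). The a/c
// pointers for rows beyond mr are aliased onto the previous row, so one code
// path handles any mr in [1, 3] without branching in the inner loop.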
2350 void xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
2351     size_t mr,
2352     size_t nc,
2353     size_t kc,
2354     const int8_t* restrict a,
2355     size_t a_stride,
2356     const void* restrict w,
2357     int8_t* restrict c,
2358     size_t cm_stride,
2359     size_t cn_stride,
2360     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2361 {
2362   assert(mr != 0);
2363   assert(mr <= 3);
2364   assert(nc != 0);
2365   assert(kc != 0);
2366   assert(kc % sizeof(int8_t) == 0);
2367   assert(a != NULL);
2368   assert(w != NULL);
2369   assert(c != NULL);
2370 
2371   kc = round_up_po2(kc, 8);
2372   const int8_t* a0 = a;
2373   int8_t* c0 = c;
2374   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
2375   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2376   if XNN_UNPREDICTABLE(mr < 2) {
2377     a1 = a0;
2378     c1 = c0;
2379   }
2380   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
2381   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2382   if XNN_UNPREDICTABLE(mr <= 2) {
2383     a2 = a1;
2384     c2 = c1;
2385   }
2386 
2387   do {
2388     const __m128i vbias0x0 = _mm_loadu_si32(w);
2389     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2390     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2391     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2392     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2393     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2394     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2395     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2396     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2397     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2398     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2399     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2400     __m256i vacc1x01 = vacc0x01;
2401     __m256i vacc1x23 = vacc0x23;
2402     __m256i vacc1x45 = vacc0x45;
2403     __m256i vacc1x67 = vacc0x67;
2404     __m256i vacc2x01 = vacc0x01;
2405     __m256i vacc2x23 = vacc0x23;
2406     __m256i vacc2x45 = vacc0x45;
2407     __m256i vacc2x67 = vacc0x67;
2408     w = (const void*) ((const int32_t*) w + 8);
2409 
2410     size_t k = 0;
2411     while (k < kc) {
2412       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2413       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2414       a0 += 8;
2415       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
2416       const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
2417       a1 += 8;
2418       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
2419       const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
2420       a2 += 8;
2421 
2422       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2423       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2424 
2425       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2426       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
2427       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
2428       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2429       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2430 
2431       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2432       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
2433       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
2434       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2435       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2436 
2437       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2438       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
2439       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
2440       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2441       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2442 
2443       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2444       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
2445       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
2446 
2447       w = (const void*) ((const int8_t*) w + 64);
2448       k += 8 * sizeof(int8_t);
2449     }
2450 
2451     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2452     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2453     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
2454     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
2455     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
2456     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
2457 
2458     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2459     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
2460     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
2461 
2462     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2463     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2464     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
2465     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
2466 
2467     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2468     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
2469     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
2470 
2471     const __m256 vscale01234567 = _mm256_load_ps(w);
2472     w = (const void*) ((const float*) w + 8);
2473     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2474     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
2475     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
2476 
2477     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2478     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2479     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
2480     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
2481 
2482     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2483     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
2484     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
2485 
2486     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2487     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
2488     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
2489 
2490     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2491     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2492 
2493     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
2494 
2495     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2496 
2497     __m128i vout_lo = _mm256_castsi256_si128(vout);
2498     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2499 
2500     if (nc >= 8) {
2501       _mm_storel_epi64((__m128i*) c0, vout_lo);
2502       _mm_storel_epi64((__m128i*) c1, vout_hi);
2503       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
2504 
2505       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2506       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2507       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2508 
2509       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2510       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
2511       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
2512 
2513       nc -= 8;
2514     } else {
2515       if (nc & 4) {
2516         _mm_storeu_si32(c0, vout_lo);
2517         _mm_storeu_si32(c1, vout_hi);
2518         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
2519 
2520         c0 += 4;
2521         c1 += 4;
2522         c2 += 4;
2523 
2524         vout_lo = _mm_srli_epi64(vout_lo, 32);
2525         vout_hi = _mm_srli_epi64(vout_hi, 32);
2526       }
2527       if (nc & 2) {
2528         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2529         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
2530         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
2531 
2532         c0 += 2;
2533         c1 += 2;
2534         c2 += 2;
2535 
2536         vout_lo = _mm_srli_epi32(vout_lo, 16);
2537         vout_hi = _mm_srli_epi32(vout_hi, 16);
2538       }
2539       if (nc & 1) {
2540         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2541         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
2542         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
2543       }
2544 
2545       nc = 0;
2546     }
2547   } while (nc != 0);
2548 }
2549 
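// Indirect-GEMM (IGEMM) variant of the 1x8c8 QC8 kernel: instead of a dense A
// matrix, the activation rows arrive as an array of ks pointers; entries equal
// to the `zero` buffer are left un-offset so padded elements read zeros. The
// accumulation and fp32 requantization steps match the direct GEMM above.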
2550 void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
2551     size_t mr,
2552     size_t nc,
2553     size_t kc,
2554     size_t ks,
2555     const int8_t** restrict a,
2556     const void* restrict w,
2557     int8_t* restrict c,
2558     size_t cm_stride,
2559     size_t cn_stride,
2560     size_t a_offset,
2561     const int8_t* zero,
2562     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2563 {
2564   assert(mr != 0);
2565   assert(mr <= 1);
2566   assert(nc != 0);
2567   assert(kc != 0);
2568   assert(ks != 0);
2569   assert(ks % (1 * sizeof(void*)) == 0);
2570   assert(a_offset % sizeof(int8_t) == 0);
2571   assert(a != NULL);
2572   assert(w != NULL);
2573   assert(c != NULL);
2574 
2575   kc = round_up_po2(kc, 8);
2576   int8_t* c0 = c;
2577 
2578   do {
2579     const __m128i vbias0x0 = _mm_loadu_si32(w);
2580     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2581     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2582     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2583     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2584     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2585     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2586     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2587     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2588     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2589     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2590     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2591     w = (const void*) ((const int32_t*) w + 8);
2592 
2593     size_t p = ks;
2594     do {
2595       const int8_t* restrict a0 = a[0];
2596       if XNN_UNPREDICTABLE(a0 != zero) {
2597         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2598       }
2599       a += 1;
2600 
2601       size_t k = 0;
2602       while (k < kc) {
2603         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2604         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2605         a0 += 8;
2606 
2607         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2608         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2609 
2610         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2611         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2612         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2613 
2614         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2615         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2616         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2617 
2618         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2619         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2620         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2621 
2622         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2623 
2624         w = (const void*) ((const int8_t*) w + 64);
2625         k += 8 * sizeof(int8_t);
2626       }
2627       p -= 1 * sizeof(void*);
2628     } while (p != 0);
2629 
2630     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2631     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2632 
2633     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2634 
2635     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2636     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2637 
2638     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2639 
2640     const __m256 vscale01234567 = _mm256_load_ps(w);
2641     w = (const void*) ((const float*) w + 8);
2642     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2643 
2644     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2645     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2646 
2647     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2648 
2649     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2650     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
2651 
2652     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2653 
2654     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
2655 
2656     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2657 
2658     __m128i vout_lo = _mm256_castsi256_si128(vout);
2659     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2660 
2661     if (nc >= 8) {
2662       _mm_storel_epi64((__m128i*) c0, vout_lo);
2663 
2664       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2665 
2666       a = (const int8_t**restrict) ((uintptr_t) a - ks);
2667 
2668       nc -= 8;
2669     } else {
2670       if (nc & 4) {
2671         _mm_storeu_si32(c0, vout_lo);
2672 
2673         c0 += 4;
2674 
2675         vout_lo = _mm_srli_epi64(vout_lo, 32);
2676         vout_hi = _mm_srli_epi64(vout_hi, 32);
2677       }
2678       if (nc & 2) {
2679         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2680 
2681         c0 += 2;
2682 
2683         vout_lo = _mm_srli_epi32(vout_lo, 16);
2684         vout_hi = _mm_srli_epi32(vout_hi, 16);
2685       }
2686       if (nc & 1) {
2687         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2688       }
2689 
2690       nc = 0;
2691     }
2692   } while (nc != 0);
2693 }
2694 
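// 3-row variant of the QC8 IGEMM kernel: consumes 3 indirection pointers per
// ks step and writes up to 3 output rows, with the c pointers for rows beyond
// mr aliased downward as in the 3x8c8 GEMM.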
2695 void xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
2696     size_t mr,
2697     size_t nc,
2698     size_t kc,
2699     size_t ks,
2700     const int8_t** restrict a,
2701     const void* restrict w,
2702     int8_t* restrict c,
2703     size_t cm_stride,
2704     size_t cn_stride,
2705     size_t a_offset,
2706     const int8_t* zero,
2707     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2708 {
2709   assert(mr != 0);
2710   assert(mr <= 3);
2711   assert(nc != 0);
2712   assert(kc != 0);
2713   assert(ks != 0);
2714   assert(ks % (3 * sizeof(void*)) == 0);
2715   assert(a_offset % sizeof(int8_t) == 0);
2716   assert(a != NULL);
2717   assert(w != NULL);
2718   assert(c != NULL);
2719 
2720   kc = round_up_po2(kc, 8);
2721   int8_t* c0 = c;
2722   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2723   if XNN_UNPREDICTABLE(mr < 2) {
2724     c1 = c0;
2725   }
2726   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2727   if XNN_UNPREDICTABLE(mr <= 2) {
2728     c2 = c1;
2729   }
2730 
2731   do {
2732     const __m128i vbias0x0 = _mm_loadu_si32(w);
2733     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
2734     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
2735     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
2736     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
2737     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
2738     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
2739     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
2740     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
2741     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
2742     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
2743     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
2744     __m256i vacc1x01 = vacc0x01;
2745     __m256i vacc1x23 = vacc0x23;
2746     __m256i vacc1x45 = vacc0x45;
2747     __m256i vacc1x67 = vacc0x67;
2748     __m256i vacc2x01 = vacc0x01;
2749     __m256i vacc2x23 = vacc0x23;
2750     __m256i vacc2x45 = vacc0x45;
2751     __m256i vacc2x67 = vacc0x67;
2752     w = (const void*) ((const int32_t*) w + 8);
2753 
2754     size_t p = ks;
2755     do {
2756       const int8_t* restrict a0 = a[0];
2757       if XNN_UNPREDICTABLE(a0 != zero) {
2758         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2759       }
2760       const int8_t* restrict a1 = a[1];
2761       if XNN_UNPREDICTABLE(a1 != zero) {
2762         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
2763       }
2764       const int8_t* restrict a2 = a[2];
2765       if XNN_UNPREDICTABLE(a2 != zero) {
2766         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
2767       }
2768       a += 3;
2769 
2770       size_t k = 0;
2771       while (k < kc) {
2772         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
2773         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
2774         a0 += 8;
2775         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
2776         const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
2777         a1 += 8;
2778         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
2779         const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
2780         a2 += 8;
2781 
2782         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
2783         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
2784 
2785         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
2786         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
2787         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
2788         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
2789         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
2790 
2791         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
2792         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
2793         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
2794         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
2795         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
2796 
2797         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
2798         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
2799         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
2800         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
2801         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
2802 
2803         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
2804         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
2805         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
2806 
2807         w = (const void*) ((const int8_t*) w + 64);
2808         k += 8 * sizeof(int8_t);
2809       }
2810       p -= 3 * sizeof(void*);
2811     } while (p != 0);
2812 
2813     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
2814     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
2815     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
2816     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
2817     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
2818     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
2819 
2820     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
2821     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
2822     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
2823 
2824     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
2825     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
2826     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
2827     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
2828 
2829     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
2830     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
2831     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
2832 
2833     const __m256 vscale01234567 = _mm256_load_ps(w);
2834     w = (const void*) ((const float*) w + 8);
2835     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
2836     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
2837     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
2838 
2839     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
2840     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
2841     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
2842     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
2843 
2844     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
2845     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
2846     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
2847 
2848     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
2849     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
2850     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
2851 
2852     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2853     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
2854 
2855     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
2856 
2857     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
2858 
2859     __m128i vout_lo = _mm256_castsi256_si128(vout);
2860     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
2861 
2862     if (nc >= 8) {
2863       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
2864       _mm_storel_epi64((__m128i*) c1, vout_hi);
2865       _mm_storel_epi64((__m128i*) c0, vout_lo);
2866 
2867       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2868       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2869       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2870 
2871       a = (const int8_t**restrict) ((uintptr_t) a - ks);
2872 
2873       nc -= 8;
2874     } else {
2875       if (nc & 4) {
2876         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
2877         _mm_storeu_si32(c1, vout_hi);
2878         _mm_storeu_si32(c0, vout_lo);
2879 
2880         c2 += 4;
2881         c1 += 4;
2882         c0 += 4;
2883 
2884         vout_lo = _mm_srli_epi64(vout_lo, 32);
2885         vout_hi = _mm_srli_epi64(vout_hi, 32);
2886       }
2887       if (nc & 2) {
2888         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
2889         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
2890         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
2891 
2892         c2 += 2;
2893         c1 += 2;
2894         c0 += 2;
2895 
2896         vout_lo = _mm_srli_epi32(vout_lo, 16);
2897         vout_hi = _mm_srli_epi32(vout_hi, 16);
2898       }
2899       if (nc & 1) {
2900         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
2901         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
2902         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
2903       }
2904 
2905       nc = 0;
2906     }
2907   } while (nc != 0);
2908 }
2909 
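// QS8 depthwise convolution microkernel: 25 taps (e.g. a 5x5 kernel), 16
// channels per iteration ("up16x25"), using the mul32 scheme in which each
// int8 input and weight is widened to int32 and multiplied with
// _mm256_mullo_epi32 before being added to the accumulators.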
2910 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
2911     size_t channels,
2912     size_t output_width,
2913     const int8_t** input,
2914     const void* weights,
2915     int8_t* output,
2916     size_t input_stride,
2917     size_t output_increment,
2918     size_t input_offset,
2919     const int8_t* zero,
2920     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2921 {
2922   assert(channels != 0);
2923   assert(output_width != 0);
2924 
2925   do {
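    // Gather the 25 input row pointers for this output pixel; entries equal to
    // `zero` skip the input_offset adjustment so padding taps read from the
    // zero buffer.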
2926     const int8_t* i0 = input[0];
2927     assert(i0 != NULL);
2928     if XNN_UNPREDICTABLE(i0 != zero) {
2929       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2930     }
2931     const int8_t* i1 = input[1];
2932     assert(i1 != NULL);
2933     if XNN_UNPREDICTABLE(i1 != zero) {
2934       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2935     }
2936     const int8_t* i2 = input[2];
2937     assert(i2 != NULL);
2938     if XNN_UNPREDICTABLE(i2 != zero) {
2939       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2940     }
2941     const int8_t* i3 = input[3];
2942     assert(i3 != NULL);
2943     if XNN_UNPREDICTABLE(i3 != zero) {
2944       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2945     }
2946     const int8_t* i4 = input[4];
2947     assert(i4 != NULL);
2948     if XNN_UNPREDICTABLE(i4 != zero) {
2949       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2950     }
2951     const int8_t* i5 = input[5];
2952     assert(i5 != NULL);
2953     if XNN_UNPREDICTABLE(i5 != zero) {
2954       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2955     }
2956     const int8_t* i6 = input[6];
2957     assert(i6 != NULL);
2958     if XNN_UNPREDICTABLE(i6 != zero) {
2959       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2960     }
2961     const int8_t* i7 = input[7];
2962     assert(i7 != NULL);
2963     if XNN_UNPREDICTABLE(i7 != zero) {
2964       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2965     }
2966     const int8_t* i8 = input[8];
2967     assert(i8 != NULL);
2968     if XNN_UNPREDICTABLE(i8 != zero) {
2969       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2970     }
2971     const int8_t* i9 = input[9];
2972     assert(i9 != NULL);
2973     if XNN_UNPREDICTABLE(i9 != zero) {
2974       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2975     }
2976     const int8_t* i10 = input[10];
2977     assert(i10 != NULL);
2978     if XNN_UNPREDICTABLE(i10 != zero) {
2979       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2980     }
2981     const int8_t* i11 = input[11];
2982     assert(i11 != NULL);
2983     if XNN_UNPREDICTABLE(i11 != zero) {
2984       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2985     }
2986     const int8_t* i12 = input[12];
2987     assert(i12 != NULL);
2988     if XNN_UNPREDICTABLE(i12 != zero) {
2989       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2990     }
2991     const int8_t* i13 = input[13];
2992     assert(i13 != NULL);
2993     if XNN_UNPREDICTABLE(i13 != zero) {
2994       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2995     }
2996     const int8_t* i14 = input[14];
2997     assert(i14 != NULL);
2998     if XNN_UNPREDICTABLE(i14 != zero) {
2999       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
3000     }
3001     const int8_t* i15 = input[15];
3002     assert(i15 != NULL);
3003     if XNN_UNPREDICTABLE(i15 != zero) {
3004       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
3005     }
3006     const int8_t* i16 = input[16];
3007     assert(i16 != NULL);
3008     if XNN_UNPREDICTABLE(i16 != zero) {
3009       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
3010     }
3011     const int8_t* i17 = input[17];
3012     assert(i17 != NULL);
3013     if XNN_UNPREDICTABLE(i17 != zero) {
3014       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
3015     }
3016     const int8_t* i18 = input[18];
3017     assert(i18 != NULL);
3018     if XNN_UNPREDICTABLE(i18 != zero) {
3019       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
3020     }
3021     const int8_t* i19 = input[19];
3022     assert(i19 != NULL);
3023     if XNN_UNPREDICTABLE(i19 != zero) {
3024       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
3025     }
3026     const int8_t* i20 = input[20];
3027     assert(i20 != NULL);
3028     if XNN_UNPREDICTABLE(i20 != zero) {
3029       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
3030     }
3031     const int8_t* i21 = input[21];
3032     assert(i21 != NULL);
3033     if XNN_UNPREDICTABLE(i21 != zero) {
3034       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
3035     }
3036     const int8_t* i22 = input[22];
3037     assert(i22 != NULL);
3038     if XNN_UNPREDICTABLE(i22 != zero) {
3039       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
3040     }
3041     const int8_t* i23 = input[23];
3042     assert(i23 != NULL);
3043     if XNN_UNPREDICTABLE(i23 != zero) {
3044       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
3045     }
3046     const int8_t* i24 = input[24];
3047     assert(i24 != NULL);
3048     if XNN_UNPREDICTABLE(i24 != zero) {
3049       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
3050     }
3051     input = (const int8_t**) ((uintptr_t) input + input_stride);
3052 
3053     size_t c = channels;
3054     const void* w = weights;
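    // Main channel loop, 16 channels per iteration. The packed weights hold 16
    // int32 biases followed by 25 groups of 16 int8 kernel taps, so tap n for
    // this channel group starts at offset 16 * sizeof(int32_t) + n * 16 bytes.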
3055     for (; c >= 16; c -= 16) {
3056       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3057       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
3058 
3059 
3060       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3061       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
3062       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
3063       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
3064       i0 += 16;
3065 
3066       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3067       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
3068 
3069       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3070       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
3071       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
3072       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
3073       i1 += 16;
3074 
3075       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3076       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
3077 
3078       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3079       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
3080       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
3081       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
3082       i2 += 16;
3083 
3084       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3085       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
3086 
3087       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3088       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
3089       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
3090       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
3091       i3 += 16;
3092 
3093       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3094       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
3095 
3096       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3097       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
3098       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
3099       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
3100       i4 += 16;
3101 
3102       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3103       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
3104 
3105       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3106       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
3107       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
3108       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
3109       i5 += 16;
3110 
3111       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3112       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
3113 
3114       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3115       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
3116       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
3117       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
3118       i6 += 16;
3119 
3120       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3121       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
3122 
3123       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3124       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
3125       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
3126       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
3127       i7 += 16;
3128 
3129       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3130       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
3131 
3132       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3133       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
3134       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
3135       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
3136       i8 += 16;
3137 
3138       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3139       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
3140 
3141       const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
3142       const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
3143       const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
3144       const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
3145       i9 += 16;
3146 
3147       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
3148       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
3149 
3150       const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
3151       const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
3152       const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
3153       const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
3154       i10 += 16;
3155 
3156       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
3157       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
3158 
3159       const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
3160       const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
3161       const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
3162       const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
3163       i11 += 16;
3164 
3165       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
3166       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
3167 
3168       const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
3169       const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
3170       const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
3171       const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
3172       i12 += 16;
3173 
3174       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
3175       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
3176 
3177       const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
3178       const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
3179       const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
3180       const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
3181       i13 += 16;
3182 
3183       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
3184       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
3185 
3186       const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
3187       const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
3188       const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
3189       const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
3190       i14 += 16;
3191 
3192       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
3193       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
3194 
3195       const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
3196       const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
3197       const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
3198       const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
3199       i15 += 16;
3200 
3201       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
3202       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
3203 
3204       const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
3205       const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
3206       const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
3207       const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
3208       i16 += 16;
3209 
3210       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
3211       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
3212 
3213       const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
3214       const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
3215       const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
3216       const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
3217       i17 += 16;
3218 
3219       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
3220       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
3221 
3222       const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
3223       const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
3224       const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
3225       const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
3226       i18 += 16;
3227 
3228       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
3229       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
3230 
3231       const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
3232       const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
3233       const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
3234       const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
3235       i19 += 16;
3236 
3237       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
3238       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
3239 
3240       const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
3241       const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
3242       const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
3243       const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
3244       i20 += 16;
3245 
3246       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
3247       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
3248 
3249       const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
3250       const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
3251       const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
3252       const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
3253       i21 += 16;
3254 
3255       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
3256       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
3257 
3258       const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
3259       const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
3260       const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
3261       const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
3262       i22 += 16;
3263 
3264       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
3265       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
3266 
3267       const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
3268       const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
3269       const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
3270       const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
3271       i23 += 16;
3272 
3273       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
3274       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
3275 
3276       const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
3277       const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
3278       const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
3279       const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
3280       i24 += 16;
3281 
3282       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
3283       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
3284 
3285       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
3286 
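      // Requantize the 16 int32 accumulators: convert to float, apply the
      // per-tensor scale, clamp from above at (output_max - zero_point), and
      // round back to int32 with _mm256_cvtps_epi32 (round-to-nearest-even).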
3287       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3288       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
3289 
3290       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3291       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
3292       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
3293 
3294       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3295       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
3296       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
3297 
3298       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3299       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
3300 
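      // Pack to int16 with saturation and add the output zero point, then pack
      // to int8. _mm256_packs_epi32 interleaves the 128-bit lanes, producing
      // the 0123 89AB 4567 CDEF order that the 32-bit shuffle below restores.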
3301       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3302       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
3303 
3304       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3305 
3306       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3307       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3308 
3309       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3310       output += 16;
3311     }
3312     if XNN_UNLIKELY(c != 0) {
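      // Remainder of 1-15 channels: handled 8 channels per pass. The kernel
      // taps keep the 16-channel stride, so tap t is read at k + 16 * t while
      // k itself advances by 8 per pass.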
3313       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
3314       do {
3315         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3316 
3317 
3318         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3319         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
3320         i0 += 8;
3321 
3322         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3323 
3324         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3325         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
3326         i1 += 8;
3327 
3328         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3329 
3330         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3331         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
3332         i2 += 8;
3333 
3334         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3335 
3336         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3337         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
3338         i3 += 8;
3339 
3340         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3341 
3342         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3343         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
3344         i4 += 8;
3345 
3346         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3347 
3348         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3349         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
3350         i5 += 8;
3351 
3352         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3353 
3354         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3355         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
3356         i6 += 8;
3357 
3358         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3359 
3360         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3361         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
3362         i7 += 8;
3363 
3364         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3365 
3366         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3367         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
3368         i8 += 8;
3369 
3370         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3371 
3372         const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
3373         const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
3374         i9 += 8;
3375 
3376         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
3377 
3378         const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
3379         const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
3380         i10 += 8;
3381 
3382         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
3383 
3384         const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
3385         const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
3386         i11 += 8;
3387 
3388         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
3389 
3390         const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
3391         const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
3392         i12 += 8;
3393 
3394         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
3395 
3396         const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
3397         const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
3398         i13 += 8;
3399 
3400         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
3401 
3402         const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
3403         const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
3404         i14 += 8;
3405 
3406         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
3407 
3408         const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
3409         const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
3410         i15 += 8;
3411 
3412         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
3413 
3414         const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
3415         const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
3416         i16 += 8;
3417 
3418         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
3419 
3420         const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
3421         const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
3422         i17 += 8;
3423 
3424         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
3425 
3426         const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
3427         const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
3428         i18 += 8;
3429 
3430         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
3431 
3432         const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
3433         const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
3434         i19 += 8;
3435 
3436         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
3437 
3438         const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
3439         const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
3440         i20 += 8;
3441 
3442         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
3443 
3444         const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
3445         const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
3446         i21 += 8;
3447 
3448         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
3449 
3450         const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
3451         const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
3452         i22 += 8;
3453 
3454         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
3455 
3456         const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
3457         const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
3458         i23 += 8;
3459 
3460         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
3461 
3462         const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
3463         const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
3464         i24 += 8;
3465 
3466         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
3467 
3468         k += 8;
3469 
3470         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3471         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
3472         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3473         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3474 
3475         w = (const void*) ((const int32_t*) w + 8);
3476 
3477         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3478         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3479 
3480         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3481 
3482         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3483         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3484 
3485         if XNN_LIKELY(c >= 8) {
3486           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3487           output += 8;
3488           c -= 8;
3489         } else {
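          // Final 1-7 outputs: store 4, 2, then 1 byte(s), shifting the
          // consumed lanes out of the vector between stores.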
3490           if (c & 4) {
3491             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
3492             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3493             output += 4;
3494           }
3495           if (c & 2) {
3496             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
3497             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3498             output += 2;
3499           }
3500           if (c & 1) {
3501             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3502             output += 1;
3503           }
3504           c = 0;
3505         }
3506       } while (c != 0);
3507     }
3508 
3509     output = (int8_t*) ((uintptr_t) output + output_increment);
3510   } while (--output_width != 0);
3511 }
3512 
3513 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(

3514     size_t channels,
3515     size_t output_width,
3516     const int8_t** input,
3517     const void* weights,
3518     int8_t* output,
3519     size_t input_stride,
3520     size_t output_increment,
3521     size_t input_offset,
3522     const int8_t* zero,
3523     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3524 {
3525   assert(channels != 0);
3526   assert(output_width != 0);
3527 
3528   do {
3529     const int8_t* i0 = input[0];
3530     assert(i0 != NULL);
3531     if XNN_UNPREDICTABLE(i0 != zero) {
3532       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3533     }
3534     const int8_t* i1 = input[1];
3535     assert(i1 != NULL);
3536     if XNN_UNPREDICTABLE(i1 != zero) {
3537       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3538     }
3539     const int8_t* i2 = input[2];
3540     assert(i2 != NULL);
3541     if XNN_UNPREDICTABLE(i2 != zero) {
3542       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3543     }
3544     const int8_t* i3 = input[3];
3545     assert(i3 != NULL);
3546     if XNN_UNPREDICTABLE(i3 != zero) {
3547       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3548     }
3549     const int8_t* i4 = input[4];
3550     assert(i4 != NULL);
3551     if XNN_UNPREDICTABLE(i4 != zero) {
3552       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3553     }
3554     const int8_t* i5 = input[5];
3555     assert(i5 != NULL);
3556     if XNN_UNPREDICTABLE(i5 != zero) {
3557       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3558     }
3559     const int8_t* i6 = input[6];
3560     assert(i6 != NULL);
3561     if XNN_UNPREDICTABLE(i6 != zero) {
3562       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3563     }
3564     const int8_t* i7 = input[7];
3565     assert(i7 != NULL);
3566     if XNN_UNPREDICTABLE(i7 != zero) {
3567       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3568     }
3569     const int8_t* i8 = input[8];
3570     assert(i8 != NULL);
3571     if XNN_UNPREDICTABLE(i8 != zero) {
3572       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3573     }
3574     input = (const int8_t**) ((uintptr_t) input + input_stride);
3575 
3576     size_t c = channels;
3577     const void* w = weights;
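    // Per 16-channel group the packed weights hold 16 int32 biases followed by
    // 9 taps of 16 int8 kernel values (hence the w advance of
    // 16 * sizeof(int32_t) + 144 * sizeof(int8_t) after the taps).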
3578     for (; c >= 16; c -= 16) {
3579       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3580       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
3581 
3582 
3583       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3584       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
3585       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
3586       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
3587       i0 += 16;
3588 
3589       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3590       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
3591 
3592       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3593       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
3594       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
3595       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
3596       i1 += 16;
3597 
3598       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3599       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
3600 
3601       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3602       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
3603       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
3604       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
3605       i2 += 16;
3606 
3607       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3608       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
3609 
3610       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3611       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
3612       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
3613       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
3614       i3 += 16;
3615 
3616       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3617       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
3618 
3619       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3620       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
3621       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
3622       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
3623       i4 += 16;
3624 
3625       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3626       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
3627 
3628       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3629       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
3630       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
3631       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
3632       i5 += 16;
3633 
3634       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3635       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
3636 
3637       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3638       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
3639       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
3640       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
3641       i6 += 16;
3642 
3643       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3644       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
3645 
3646       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3647       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
3648       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
3649       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
3650       i7 += 16;
3651 
3652       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3653       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
3654 
3655       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3656       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
3657       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
3658       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
3659       i8 += 16;
3660 
3661       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3662       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
3663 
3664       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
3665 
3666       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3667       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
3668 
3669       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3670       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
3671       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
3672 
3673       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3674       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
3675       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
3676 
3677       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3678       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
3679 
3680       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3681       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
3682 
3683       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3684 
3685       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3686       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3687 
3688       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3689       output += 16;
3690     }
3691     if XNN_UNLIKELY(c != 0) {
3692       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
3693       do {
3694         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3695 
3696 
3697         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3698         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
3699         i0 += 8;
3700 
3701         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3702 
3703         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3704         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
3705         i1 += 8;
3706 
3707         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3708 
3709         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3710         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
3711         i2 += 8;
3712 
3713         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3714 
3715         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3716         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
3717         i3 += 8;
3718 
3719         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3720 
3721         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3722         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
3723         i4 += 8;
3724 
3725         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3726 
3727         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3728         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
3729         i5 += 8;
3730 
3731         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3732 
3733         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3734         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
3735         i6 += 8;
3736 
3737         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3738 
3739         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3740         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
3741         i7 += 8;
3742 
3743         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3744 
3745         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3746         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
3747         i8 += 8;
3748 
3749         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3750 
3751         k += 8;
3752 
3753         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3754         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
3755         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3756         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3757 
3758         w = (const void*) ((const int32_t*) w + 8);
3759 
3760         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3761         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3762 
3763         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3764 
3765         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3766         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3767 
3768         if XNN_LIKELY(c >= 8) {
3769           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3770           output += 8;
3771           c -= 8;
3772         } else {
3773           if (c & 4) {
3774             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
3775             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3776             output += 4;
3777           }
3778           if (c & 2) {
3779             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
3780             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3781             output += 2;
3782           }
3783           if (c & 1) {
3784             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3785             output += 1;
3786           }
3787           c = 0;
3788         }
3789       } while (c != 0);
3790     }
3791 
3792     output = (int8_t*) ((uintptr_t) output + output_increment);
3793   } while (--output_width != 0);
3794 }
3795 
3796 void xnn_qs8_f32_vcvt_ukernel__avx2_x16(
3797     size_t n,
3798     const int8_t* x,
3799     float* y,
3800     const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3801 {
3802   assert(n != 0);
3803   assert(n % sizeof(int8_t) == 0);
3804   assert(x != NULL);
3805   assert(y != NULL);
3806 
3807   const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
3808   const __m256 vscale = _mm256_load_ps(params->avx.scale);
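  // Dequantize: sign-extend int8 to int32, add the (negated) zero point,
  // convert to float, and multiply by the scale; 16 elements per iteration.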
3809   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
3810     __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3811     __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
3812     x += 16;
3813 
3814     vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
3815     vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
3816 
3817     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
3818     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
3819 
3820     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
3821     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
3822 
3823     _mm256_storeu_ps(y, vy01234567);
3824     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3825     y += 16;
3826   }
3827   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
3828     __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3829     vx = _mm256_add_epi32(vx, vminus_zero_point);
3830     x += 8;
3831 
3832     __m256 vy = _mm256_cvtepi32_ps(vx);
3833     vy = _mm256_mul_ps(vy, vscale);
3834 
3835     _mm256_storeu_ps(y, vy);
3836     y += 8;
3837   }
3838   if XNN_UNLIKELY(n != 0) {
3839     assert(n >= 1 * sizeof(int8_t));
3840     assert(n <= 7 * sizeof(int8_t));
3841 
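    // 1-7 leftover elements: the 8-byte load may read past the end of x
    // (the kernel is declared XNN_OOB_READS); only the valid lanes are stored.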
3842     __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
3843     vx = _mm256_add_epi32(vx, vminus_zero_point);
3844 
3845     __m256 vy = _mm256_cvtepi32_ps(vx);
3846     vy = _mm256_mul_ps(vy, vscale);
3847 
3848     __m128 vy_lo = _mm256_castps256_ps128(vy);
3849     if (n & (4 * sizeof(int8_t))) {
3850       _mm_storeu_ps(y, vy_lo);
3851       vy_lo = _mm256_extractf128_ps(vy, 1);
3852       y += 4;
3853     }
3854     if (n & (2 * sizeof(int8_t))) {
3855       _mm_storel_pi((__m64*) y, vy_lo);
3856       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3857       y += 2;
3858     }
3859     if (n & (1 * sizeof(int8_t))) {
3860       _mm_store_ss(y, vy_lo);
3861     }
3862   }
3863 }
3864 
3865 void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
3866     size_t mr,
3867     size_t nc,
3868     size_t kc,
3869     const int8_t* restrict a,
3870     size_t a_stride,
3871     const void* restrict w,
3872     int8_t* restrict c,
3873     size_t cm_stride,
3874     size_t cn_stride,
3875     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3876 {
3877   assert(mr != 0);
3878   assert(mr <= 1);
3879   assert(nc != 0);
3880   assert(kc != 0);
3881   assert(kc % sizeof(int8_t) == 0);
3882   assert(a != NULL);
3883   assert(w != NULL);
3884   assert(c != NULL);
3885 
3886   kc = round_up_po2(kc, 8);
3887   const int8_t* a0 = a;
3888   int8_t* c0 = c;
3889 
3890   do {
3891     const __m128i vbias0x0 = _mm_loadu_si32(w);
3892     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
3893     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3894     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
3895     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
3896     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3897     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
3898     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
3899     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3900     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
3901     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
3902     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3903     w = (const void*) ((const int32_t*) w + 8);
3904 
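    // c8 layout: each __m256i accumulator holds the partial sums for a pair of
    // output columns; 8 int8 values per column are multiply-accumulated with
    // _mm256_madd_epi16 after widening to int16.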
3905     size_t k = 0;
3906     while (k < kc) {
3907       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3908       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3909       a0 += 8;
3910 
3911       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3912       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3913 
3914       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3915       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3916       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3917 
3918       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3919       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3920       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3921 
3922       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3923       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3924       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3925 
3926       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3927 
3928       w = (const void*) ((const int8_t*) w + 64);
3929       k += 8 * sizeof(int8_t);
3930     }
3931 
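    // Reduce the pairwise accumulators: two rounds of per-lane hadd leave the
    // column sums in 0 2 4 6 1 3 5 7 order, which the cross-lane permute below
    // restores to 0..7.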
3932     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3933     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3934 
3935     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3936 
3937     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3938     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3939 
3940     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3941 
3942     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
3943     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
3944 
3945     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3946     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3947 
3948     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3949 
3950     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3951     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
3952 
3953     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3954 
3955     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
3956 
3957     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3958 
3959     __m128i vout_lo = _mm256_castsi256_si128(vout);
3960     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3961 
3962     if (nc >= 8) {
3963       _mm_storel_epi64((__m128i*) c0, vout_lo);
3964 
3965       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3966 
3967       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
3968 
3969       nc -= 8;
3970     } else {
3971       if (nc & 4) {
3972         _mm_storeu_si32(c0, vout_lo);
3973 
3974         c0 += 4;
3975 
3976         vout_lo = _mm_srli_epi64(vout_lo, 32);
3977         vout_hi = _mm_srli_epi64(vout_hi, 32);
3978       }
3979       if (nc & 2) {
3980         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
3981 
3982         c0 += 2;
3983 
3984         vout_lo = _mm_srli_epi32(vout_lo, 16);
3985         vout_hi = _mm_srli_epi32(vout_hi, 16);
3986       }
3987       if (nc & 1) {
3988         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
3989       }
3990 
3991       nc = 0;
3992     }
3993   } while (nc != 0);
3994 }
3995 
3996 void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
3997     size_t mr,
3998     size_t nc,
3999     size_t kc,
4000     const int8_t* restrict a,
4001     size_t a_stride,
4002     const void* restrict w,
4003     int8_t* restrict c,
4004     size_t cm_stride,
4005     size_t cn_stride,
4006     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4007 {
4008   assert(mr != 0);
4009   assert(mr <= 3);
4010   assert(nc != 0);
4011   assert(kc != 0);
4012   assert(kc % sizeof(int8_t) == 0);
4013   assert(a != NULL);
4014   assert(w != NULL);
4015   assert(c != NULL);
4016 
4017   kc = round_up_po2(kc, 8);
4018   const int8_t* a0 = a;
4019   int8_t* c0 = c;
4020   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
4021   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4022   if XNN_UNPREDICTABLE(mr < 2) {
4023     a1 = a0;
4024     c1 = c0;
4025   }
4026   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
4027   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4028   if XNN_UNPREDICTABLE(mr <= 2) {
4029     a2 = a1;
4030     c2 = c1;
4031   }
4032 
4033   do {
4034     const __m128i vbias0x0 = _mm_loadu_si32(w);
4035     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4036     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4037     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4038     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4039     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4040     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4041     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4042     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4043     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4044     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4045     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4046     __m256i vacc1x01 = vacc0x01;
4047     __m256i vacc1x23 = vacc0x23;
4048     __m256i vacc1x45 = vacc0x45;
4049     __m256i vacc1x67 = vacc0x67;
4050     __m256i vacc2x01 = vacc0x01;
4051     __m256i vacc2x23 = vacc0x23;
4052     __m256i vacc2x45 = vacc0x45;
4053     __m256i vacc2x67 = vacc0x67;
4054     w = (const void*) ((const int32_t*) w + 8);
4055 
4056     size_t k = 0;
4057     while (k < kc) {
4058       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4059       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4060       a0 += 8;
4061       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
4062       const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
4063       a1 += 8;
4064       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
4065       const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
4066       a2 += 8;
4067 
4068       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4069       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4070 
4071       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4072       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
4073       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
4074       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4075       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4076 
4077       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4078       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
4079       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
4080       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4081       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4082 
4083       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4084       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
4085       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
4086       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4087       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4088 
4089       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4090       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
4091       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
4092 
4093       w = (const void*) ((const int8_t*) w + 64);
4094       k += 8 * sizeof(int8_t);
4095     }
4096 
4097     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4098     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4099     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
4100     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
4101     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
4102     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
4103 
4104     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4105     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
4106     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
4107 
4108     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4109     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4110     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
4111     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
4112 
4113     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4114     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
4115     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
4116 
4117     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4118     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4119     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
4120     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
4121 
4122     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4123     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4124     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
4125     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
4126 
4127     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4128     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
4129     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
4130 
4131     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4132     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
4133     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
4134 
4135     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4136     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4137 
4138     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
4139 
4140     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4141 
4142     __m128i vout_lo = _mm256_castsi256_si128(vout);
4143     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4144 
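    // After the int8 pack, vout_lo holds row 0 (low 8 bytes) and row 2 (high
    // 8 bytes), while vout_hi holds row 1 in its low 8 bytes.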
4145     if (nc >= 8) {
4146       _mm_storel_epi64((__m128i*) c0, vout_lo);
4147       _mm_storel_epi64((__m128i*) c1, vout_hi);
4148       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
4149 
4150       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4151       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4152       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4153 
4154       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4155       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
4156       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
4157 
4158       nc -= 8;
4159     } else {
4160       if (nc & 4) {
4161         _mm_storeu_si32(c0, vout_lo);
4162         _mm_storeu_si32(c1, vout_hi);
4163         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
4164 
4165         c0 += 4;
4166         c1 += 4;
4167         c2 += 4;
4168 
4169         vout_lo = _mm_srli_epi64(vout_lo, 32);
4170         vout_hi = _mm_srli_epi64(vout_hi, 32);
4171       }
4172       if (nc & 2) {
4173         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4174         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
4175         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
4176 
4177         c0 += 2;
4178         c1 += 2;
4179         c2 += 2;
4180 
4181         vout_lo = _mm_srli_epi32(vout_lo, 16);
4182         vout_hi = _mm_srli_epi32(vout_hi, 16);
4183       }
4184       if (nc & 1) {
4185         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4186         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
4187         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
4188       }
4189 
4190       nc = 0;
4191     }
4192   } while (nc != 0);
4193 }
4194 
4195 void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
4196     size_t mr,
4197     size_t nc,
4198     size_t kc,
4199     size_t ks,
4200     const int8_t** restrict a,
4201     const void* restrict w,
4202     int8_t* restrict c,
4203     size_t cm_stride,
4204     size_t cn_stride,
4205     size_t a_offset,
4206     const int8_t* zero,
4207     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4208 {
4209   assert(mr != 0);
4210   assert(mr <= 1);
4211   assert(nc != 0);
4212   assert(kc != 0);
4213   assert(ks != 0);
4214   assert(ks % (1 * sizeof(void*)) == 0);
4215   assert(a_offset % sizeof(int8_t) == 0);
4216   assert(a != NULL);
4217   assert(w != NULL);
4218   assert(c != NULL);
4219 
4220   kc = round_up_po2(kc, 8);
4221   int8_t* c0 = c;
4222 
4223   do {
4224     const __m128i vbias0x0 = _mm_loadu_si32(w);
4225     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4226     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4227     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4228     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4229     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4230     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4231     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4232     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4233     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4234     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4235     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4236     w = (const void*) ((const int32_t*) w + 8);
4237 
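    // Indirect GEMM: a[] supplies ks input-row pointers per output pixel;
    // pointers equal to `zero` reference the padding row and skip the
    // a_offset adjustment.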
4238     size_t p = ks;
4239     do {
4240       const int8_t* restrict a0 = a[0];
4241       if XNN_UNPREDICTABLE(a0 != zero) {
4242         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4243       }
4244       a += 1;
4245 
4246       size_t k = 0;
4247       while (k < kc) {
4248         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4249         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4250         a0 += 8;
4251 
4252         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4253         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4254 
4255         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4256         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4257         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4258 
4259         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4260         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4261         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4262 
4263         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4264         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4265         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4266 
4267         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4268 
4269         w = (const void*) ((const int8_t*) w + 64);
4270         k += 8 * sizeof(int8_t);
4271       }
4272       p -= 1 * sizeof(void*);
4273     } while (p != 0);
4274 
4275     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4276     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4277 
4278     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4279 
4280     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4281     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4282 
4283     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4284 
4285     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4286     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4287 
4288     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4289     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4290 
4291     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4292 
4293     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4294     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
4295 
4296     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4297 
4298     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
4299 
4300     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4301 
4302     __m128i vout_lo = _mm256_castsi256_si128(vout);
4303     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4304 
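    // Full tile: store 8 int8 outputs and rewind the indirection buffer by ks;
    // otherwise write the remaining nc columns in 4-, 2- and 1-byte pieces.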
4305     if (nc >= 8) {
4306       _mm_storel_epi64((__m128i*) c0, vout_lo);
4307 
4308       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4309 
4310       a = (const int8_t**restrict) ((uintptr_t) a - ks);
4311 
4312       nc -= 8;
4313     } else {
4314       if (nc & 4) {
4315         _mm_storeu_si32(c0, vout_lo);
4316 
4317         c0 += 4;
4318 
4319         vout_lo = _mm_srli_epi64(vout_lo, 32);
4320         vout_hi = _mm_srli_epi64(vout_hi, 32);
4321       }
4322       if (nc & 2) {
4323         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4324 
4325         c0 += 2;
4326 
4327         vout_lo = _mm_srli_epi32(vout_lo, 16);
4328         vout_hi = _mm_srli_epi32(vout_hi, 16);
4329       }
4330       if (nc & 1) {
4331         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4332       }
4333 
4334       nc = 0;
4335     }
4336   } while (nc != 0);
4337 }
4338 
4339 void xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
4340     size_t mr,
4341     size_t nc,
4342     size_t kc,
4343     size_t ks,
4344     const int8_t** restrict a,
4345     const void* restrict w,
4346     int8_t* restrict c,
4347     size_t cm_stride,
4348     size_t cn_stride,
4349     size_t a_offset,
4350     const int8_t* zero,
4351     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4352 {
4353   assert(mr != 0);
4354   assert(mr <= 3);
4355   assert(nc != 0);
4356   assert(kc != 0);
4357   assert(ks != 0);
4358   assert(ks % (3 * sizeof(void*)) == 0);
4359   assert(a_offset % sizeof(int8_t) == 0);
4360   assert(a != NULL);
4361   assert(w != NULL);
4362   assert(c != NULL);
4363 
4364   kc = round_up_po2(kc, 8);
4365   int8_t* c0 = c;
4366   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4367   if XNN_UNPREDICTABLE(mr < 2) {
4368     c1 = c0;
4369   }
4370   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4371   if XNN_UNPREDICTABLE(mr <= 2) {
4372     c2 = c1;
4373   }
4374 
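  // Per output tile: load 8 int32 biases into four 2-column accumulators for row 0
  // and replicate them across rows 1 and 2.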
4375   do {
4376     const __m128i vbias0x0 = _mm_loadu_si32(w);
4377     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
4378     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
4379     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
4380     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
4381     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
4382     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
4383     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
4384     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
4385     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
4386     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
4387     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
4388     __m256i vacc1x01 = vacc0x01;
4389     __m256i vacc1x23 = vacc0x23;
4390     __m256i vacc1x45 = vacc0x45;
4391     __m256i vacc1x67 = vacc0x67;
4392     __m256i vacc2x01 = vacc0x01;
4393     __m256i vacc2x23 = vacc0x23;
4394     __m256i vacc2x45 = vacc0x45;
4395     __m256i vacc2x67 = vacc0x67;
4396     w = (const void*) ((const int32_t*) w + 8);
4397 
4398     size_t p = ks;
4399     do {
4400       const int8_t* restrict a0 = a[0];
4401       if XNN_UNPREDICTABLE(a0 != zero) {
4402         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4403       }
4404       const int8_t* restrict a1 = a[1];
4405       if XNN_UNPREDICTABLE(a1 != zero) {
4406         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
4407       }
4408       const int8_t* restrict a2 = a[2];
4409       if XNN_UNPREDICTABLE(a2 != zero) {
4410         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
4411       }
4412       a += 3;
4413 
4414       size_t k = 0;
4415       while (k < kc) {
4416         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
4417         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
4418         a0 += 8;
4419         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
4420         const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
4421         a1 += 8;
4422         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
4423         const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
4424         a2 += 8;
4425 
4426         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
4427         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
4428 
4429         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
4430         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
4431         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
4432         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
4433         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
4434 
4435         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
4436         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
4437         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
4438         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
4439         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
4440 
4441         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
4442         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
4443         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
4444         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
4445         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
4446 
4447         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
4448         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
4449         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
4450 
4451         w = (const void*) ((const int8_t*) w + 64);
4452         k += 8 * sizeof(int8_t);
4453       }
4454       p -= 3 * sizeof(void*);
4455     } while (p != 0);
4456 
4457     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
4458     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
4459     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
4460     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
4461     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
4462     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
4463 
4464     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
4465     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
4466     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
4467 
4468     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
4469     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
4470     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
4471     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
4472 
4473     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
4474     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
4475     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
4476 
4477     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4478     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
4479     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
4480     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
4481 
4482     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4483     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
4484     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
4485     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
4486 
4487     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
4488     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
4489     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
4490 
4491     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4492     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
4493     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
4494 
4495     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4496     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
4497 
4498     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
4499 
4500     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
4501 
4502     __m128i vout_lo = _mm256_castsi256_si128(vout);
4503     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
4504 
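    // After packing, row 0 occupies the low half of vout_lo, row 1 the low half of
    // vout_hi, and row 2 the high halves; each store below extracts one row.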
4505     if (nc >= 8) {
4506       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
4507       _mm_storel_epi64((__m128i*) c1, vout_hi);
4508       _mm_storel_epi64((__m128i*) c0, vout_lo);
4509 
4510       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4511       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4512       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4513 
4514       a = (const int8_t**restrict) ((uintptr_t) a - ks);
4515 
4516       nc -= 8;
4517     } else {
4518       if (nc & 4) {
4519         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
4520         _mm_storeu_si32(c1, vout_hi);
4521         _mm_storeu_si32(c0, vout_lo);
4522 
4523         c2 += 4;
4524         c1 += 4;
4525         c0 += 4;
4526 
4527         vout_lo = _mm_srli_epi64(vout_lo, 32);
4528         vout_hi = _mm_srli_epi64(vout_hi, 32);
4529       }
4530       if (nc & 2) {
4531         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
4532         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
4533         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
4534 
4535         c2 += 2;
4536         c1 += 2;
4537         c0 += 2;
4538 
4539         vout_lo = _mm_srli_epi32(vout_lo, 16);
4540         vout_hi = _mm_srli_epi32(vout_hi, 16);
4541       }
4542       if (nc & 1) {
4543         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
4544         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
4545         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4546       }
4547 
4548       nc = 0;
4549     }
4550   } while (nc != 0);
4551 }
4552 
4553 void xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
4554     size_t n,
4555     const int8_t* input_a,
4556     const int8_t* input_b,
4557     int8_t* output,
4558     const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4559 {
4560   const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
4561   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
4562   const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
4563   const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
4564   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
4565   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
4566   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
4567 
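  // Main loop: widen 16 int8 elements of each input to int32, apply the per-input
  // multipliers and the bias, arithmetic-shift right, add the zero point with
  // saturation, then pack back to int8 and clamp to [output_min, output_max].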
4568   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4569     const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4570     const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
4571     const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
4572     const __m256i vb89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
4573     input_a += 16;
4574     input_b += 16;
4575 
4576     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4577     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
4578 
4579     vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
4580     vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
4581 
4582     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4583     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
4584 
4585     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4586 
4587     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4588 
4589     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4590 
4591     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
4592 
4593     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4594     output += 16;
4595   }
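  // Remainder: process up to 8 elements per pass; a final partial group is written
  // out in 4-, 2- and 1-byte pieces.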
4596   if XNN_UNLIKELY(n != 0) {
4597     do {
4598       const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4599       const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
4600       input_a += 8;
4601       input_b += 8;
4602 
4603       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4604 
4605       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
4606 
4607       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4608 
4609       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
4610       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4611       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4612       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
4613 
4614       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
4615         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4616         output += 8;
4617         n -= 8 * sizeof(int8_t);
4618       } else {
4619         if (n & (4 * sizeof(int8_t))) {
4620           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
4621           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4622           output += 4;
4623         }
4624         if (n & (2 * sizeof(int8_t))) {
4625           *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
4626           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4627           output += 2;
4628         }
4629         if (n & (1 * sizeof(int8_t))) {
4630           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4631         }
4632         n = 0;
4633       }
4634     } while (n != 0);
4635   }
4636 }
4637 
4638 void xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
4639     size_t n,
4640     const int8_t* input_a,
4641     const int8_t* input_b,
4642     int8_t* output,
4643     const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4644 {
4645   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
4646   const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
4647   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
4648   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
4649   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
4650 
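  // input_b is a single scalar here, so b_multiplier * (*input_b) is folded into the
  // bias once instead of being recomputed inside the loop.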
4651   const __m256i vbias = _mm256_add_epi32(
4652     _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
4653     _mm256_load_si256((const __m256i*) params->avx2.bias));
4654   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4655     const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4656     const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
4657     input_a += 16;
4658 
4659     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4660     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
4661 
4662     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4663     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
4664 
4665     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4666 
4667     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4668 
4669     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4670 
4671     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
4672 
4673     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4674     output += 16;
4675   }
4676   if XNN_UNLIKELY(n != 0) {
4677     do {
4678       const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
4679       input_a += 8;
4680 
4681       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
4682 
4683       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
4684 
4685       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
4686       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4687       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4688       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
4689 
4690       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
4691         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4692         output += 8;
4693         n -= 8 * sizeof(int8_t);
4694       } else {
4695         if (n & (4 * sizeof(int8_t))) {
4696           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
4697           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4698           output += 4;
4699         }
4700         if (n & (2 * sizeof(int8_t))) {
4701           *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
4702           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4703           output += 2;
4704         }
4705         if (n & (1 * sizeof(int8_t))) {
4706           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4707         }
4708         n = 0;
4709       }
4710     } while (n != 0);
4711   }
4712 }
4713 
4714 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
4715     size_t channels,
4716     size_t output_width,
4717     const uint8_t** input,
4718     const void* weights,
4719     uint8_t* output,
4720     size_t input_stride,
4721     size_t output_increment,
4722     size_t input_offset,
4723     const uint8_t* zero,
4724     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4725 {
4726   assert(channels != 0);
4727   assert(output_width != 0);
4728 
4729   const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
4730   do {
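    // Gather the 25 input row pointers for this output pixel; entries equal to
    // `zero` are left as-is (not shifted by input_offset).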
4731     const uint8_t* i0 = input[0];
4732     assert(i0 != NULL);
4733     if XNN_UNPREDICTABLE(i0 != zero) {
4734       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
4735     }
4736     const uint8_t* i1 = input[1];
4737     assert(i1 != NULL);
4738     if XNN_UNPREDICTABLE(i1 != zero) {
4739       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
4740     }
4741     const uint8_t* i2 = input[2];
4742     assert(i2 != NULL);
4743     if XNN_UNPREDICTABLE(i2 != zero) {
4744       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
4745     }
4746     const uint8_t* i3 = input[3];
4747     assert(i3 != NULL);
4748     if XNN_UNPREDICTABLE(i3 != zero) {
4749       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
4750     }
4751     const uint8_t* i4 = input[4];
4752     assert(i4 != NULL);
4753     if XNN_UNPREDICTABLE(i4 != zero) {
4754       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
4755     }
4756     const uint8_t* i5 = input[5];
4757     assert(i5 != NULL);
4758     if XNN_UNPREDICTABLE(i5 != zero) {
4759       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
4760     }
4761     const uint8_t* i6 = input[6];
4762     assert(i6 != NULL);
4763     if XNN_UNPREDICTABLE(i6 != zero) {
4764       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
4765     }
4766     const uint8_t* i7 = input[7];
4767     assert(i7 != NULL);
4768     if XNN_UNPREDICTABLE(i7 != zero) {
4769       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
4770     }
4771     const uint8_t* i8 = input[8];
4772     assert(i8 != NULL);
4773     if XNN_UNPREDICTABLE(i8 != zero) {
4774       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
4775     }
4776     const uint8_t* i9 = input[9];
4777     assert(i9 != NULL);
4778     if XNN_UNPREDICTABLE(i9 != zero) {
4779       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
4780     }
4781     const uint8_t* i10 = input[10];
4782     assert(i10 != NULL);
4783     if XNN_UNPREDICTABLE(i10 != zero) {
4784       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
4785     }
4786     const uint8_t* i11 = input[11];
4787     assert(i11 != NULL);
4788     if XNN_UNPREDICTABLE(i11 != zero) {
4789       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
4790     }
4791     const uint8_t* i12 = input[12];
4792     assert(i12 != NULL);
4793     if XNN_UNPREDICTABLE(i12 != zero) {
4794       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
4795     }
4796     const uint8_t* i13 = input[13];
4797     assert(i13 != NULL);
4798     if XNN_UNPREDICTABLE(i13 != zero) {
4799       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
4800     }
4801     const uint8_t* i14 = input[14];
4802     assert(i14 != NULL);
4803     if XNN_UNPREDICTABLE(i14 != zero) {
4804       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
4805     }
4806     const uint8_t* i15 = input[15];
4807     assert(i15 != NULL);
4808     if XNN_UNPREDICTABLE(i15 != zero) {
4809       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
4810     }
4811     const uint8_t* i16 = input[16];
4812     assert(i16 != NULL);
4813     if XNN_UNPREDICTABLE(i16 != zero) {
4814       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
4815     }
4816     const uint8_t* i17 = input[17];
4817     assert(i17 != NULL);
4818     if XNN_UNPREDICTABLE(i17 != zero) {
4819       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
4820     }
4821     const uint8_t* i18 = input[18];
4822     assert(i18 != NULL);
4823     if XNN_UNPREDICTABLE(i18 != zero) {
4824       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
4825     }
4826     const uint8_t* i19 = input[19];
4827     assert(i19 != NULL);
4828     if XNN_UNPREDICTABLE(i19 != zero) {
4829       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
4830     }
4831     const uint8_t* i20 = input[20];
4832     assert(i20 != NULL);
4833     if XNN_UNPREDICTABLE(i20 != zero) {
4834       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
4835     }
4836     const uint8_t* i21 = input[21];
4837     assert(i21 != NULL);
4838     if XNN_UNPREDICTABLE(i21 != zero) {
4839       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
4840     }
4841     const uint8_t* i22 = input[22];
4842     assert(i22 != NULL);
4843     if XNN_UNPREDICTABLE(i22 != zero) {
4844       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
4845     }
4846     const uint8_t* i23 = input[23];
4847     assert(i23 != NULL);
4848     if XNN_UNPREDICTABLE(i23 != zero) {
4849       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
4850     }
4851     const uint8_t* i24 = input[24];
4852     assert(i24 != NULL);
4853     if XNN_UNPREDICTABLE(i24 != zero) {
4854       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
4855     }
4856     input = (const uint8_t**) ((uintptr_t) input + input_stride);
4857 
4858     size_t c = channels;
4859     const void* w = weights;
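    // Main loop over groups of 16 channels; the weight block holds 16 int32 biases
    // followed by 25 taps of 16 uint8 kernel values each.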
4860     for (; c >= 16; c -= 16) {
4861       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4862       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
4863 
4864 
4865       const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4866       const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
4867       const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
4868       const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
4869       i0 += 16;
4870 
4871       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4872       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
4873 
4874       const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4875       const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
4876       const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
4877       const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
4878       i1 += 16;
4879 
4880       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4881       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
4882 
4883       const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4884       const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
4885       const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
4886       const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
4887       i2 += 16;
4888 
4889       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4890       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
4891 
4892       const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4893       const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
4894       const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
4895       const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
4896       i3 += 16;
4897 
4898       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4899       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
4900 
4901       const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4902       const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
4903       const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
4904       const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
4905       i4 += 16;
4906 
4907       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4908       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
4909 
4910       const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4911       const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
4912       const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
4913       const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
4914       i5 += 16;
4915 
4916       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4917       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
4918 
4919       const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4920       const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
4921       const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
4922       const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
4923       i6 += 16;
4924 
4925       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4926       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
4927 
4928       const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4929       const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
4930       const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
4931       const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
4932       i7 += 16;
4933 
4934       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4935       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
4936 
4937       const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4938       const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
4939       const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
4940       const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
4941       i8 += 16;
4942 
4943       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4944       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
4945 
4946       const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
4947       const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
4948       const __m256i vi9x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
4949       const __m256i vk9x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
4950       i9 += 16;
4951 
4952       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
4953       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
4954 
4955       const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
4956       const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
4957       const __m256i vi10x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
4958       const __m256i vk10x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
4959       i10 += 16;
4960 
4961       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
4962       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
4963 
4964       const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
4965       const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
4966       const __m256i vi11x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
4967       const __m256i vk11x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
4968       i11 += 16;
4969 
4970       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
4971       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
4972 
4973       const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
4974       const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
4975       const __m256i vi12x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
4976       const __m256i vk12x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
4977       i12 += 16;
4978 
4979       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
4980       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
4981 
4982       const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
4983       const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
4984       const __m256i vi13x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
4985       const __m256i vk13x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
4986       i13 += 16;
4987 
4988       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
4989       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
4990 
4991       const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
4992       const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
4993       const __m256i vi14x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
4994       const __m256i vk14x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
4995       i14 += 16;
4996 
4997       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
4998       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
4999 
5000       const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
5001       const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
5002       const __m256i vi15x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
5003       const __m256i vk15x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
5004       i15 += 16;
5005 
5006       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
5007       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
5008 
5009       const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
5010       const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
5011       const __m256i vi16x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
5012       const __m256i vk16x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
5013       i16 += 16;
5014 
5015       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
5016       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
5017 
5018       const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
5019       const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
5020       const __m256i vi17x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
5021       const __m256i vk17x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
5022       i17 += 16;
5023 
5024       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
5025       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
5026 
5027       const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
5028       const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
5029       const __m256i vi18x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
5030       const __m256i vk18x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
5031       i18 += 16;
5032 
5033       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
5034       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
5035 
5036       const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
5037       const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
5038       const __m256i vi19x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
5039       const __m256i vk19x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
5040       i19 += 16;
5041 
5042       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
5043       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
5044 
5045       const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
5046       const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
5047       const __m256i vi20x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
5048       const __m256i vk20x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
5049       i20 += 16;
5050 
5051       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
5052       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
5053 
5054       const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
5055       const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
5056       const __m256i vi21x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
5057       const __m256i vk21x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
5058       i21 += 16;
5059 
5060       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
5061       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
5062 
5063       const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
5064       const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
5065       const __m256i vi22x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
5066       const __m256i vk22x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
5067       i22 += 16;
5068 
5069       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
5070       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
5071 
5072       const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
5073       const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
5074       const __m256i vi23x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
5075       const __m256i vk23x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
5076       i23 += 16;
5077 
5078       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
5079       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
5080 
5081       const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
5082       const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
5083       const __m256i vi24x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
5084       const __m256i vk24x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
5085       i24 += 16;
5086 
5087       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
5088       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
5089 
5090       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
5091 
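      // Requantize to uint8: scale in float, clamp to output_max - zero_point,
      // convert back to int32, add the zero point, then pack with unsigned
      // saturation and clamp to output_min.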
5092       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5093       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
5094 
5095       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5096       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
5097       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
5098 
5099       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5100       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
5101       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
5102 
5103       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5104       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
5105 
5106       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5107       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5108 
5109       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5110 
5111       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5112       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5113 
5114       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5115       output += 16;
5116     }
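    // Remainder channels: reuse the same weight block, processing 8 channels per
    // pass with `k` pointing just past the 16 int32 bias values.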
5117     if XNN_UNLIKELY(c != 0) {
5118       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
5119       do {
5120         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5121 
5122 
5123         const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5124         const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
5125         i0 += 8;
5126 
5127         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5128 
5129         const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5130         const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
5131         i1 += 8;
5132 
5133         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5134 
5135         const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5136         const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
5137         i2 += 8;
5138 
5139         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5140 
5141         const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5142         const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
5143         i3 += 8;
5144 
5145         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5146 
5147         const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5148         const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
5149         i4 += 8;
5150 
5151         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5152 
5153         const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5154         const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
5155         i5 += 8;
5156 
5157         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5158 
5159         const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5160         const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
5161         i6 += 8;
5162 
5163         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5164 
5165         const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5166         const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
5167         i7 += 8;
5168 
5169         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5170 
5171         const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5172         const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
5173         i8 += 8;
5174 
5175         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5176 
5177         const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
5178         const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144))), vk_zero_point);
5179         i9 += 8;
5180 
5181         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
5182 
5183         const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
5184         const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160))), vk_zero_point);
5185         i10 += 8;
5186 
5187         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
5188 
5189         const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
5190         const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176))), vk_zero_point);
5191         i11 += 8;
5192 
5193         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
5194 
5195         const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
5196         const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192))), vk_zero_point);
5197         i12 += 8;
5198 
5199         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
5200 
5201         const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
5202         const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208))), vk_zero_point);
5203         i13 += 8;
5204 
5205         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
5206 
5207         const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
5208         const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224))), vk_zero_point);
5209         i14 += 8;
5210 
5211         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
5212 
5213         const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
5214         const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240))), vk_zero_point);
5215         i15 += 8;
5216 
5217         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
5218 
5219         const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
5220         const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256))), vk_zero_point);
5221         i16 += 8;
5222 
5223         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
5224 
5225         const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
5226         const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272))), vk_zero_point);
5227         i17 += 8;
5228 
5229         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
5230 
5231         const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
5232         const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288))), vk_zero_point);
5233         i18 += 8;
5234 
5235         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
5236 
5237         const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
5238         const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304))), vk_zero_point);
5239         i19 += 8;
5240 
5241         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
5242 
5243         const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
5244         const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320))), vk_zero_point);
5245         i20 += 8;
5246 
5247         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
5248 
5249         const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
5250         const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336))), vk_zero_point);
5251         i21 += 8;
5252 
5253         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
5254 
5255         const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
5256         const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352))), vk_zero_point);
5257         i22 += 8;
5258 
5259         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
5260 
5261         const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
5262         const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368))), vk_zero_point);
5263         i23 += 8;
5264 
5265         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
5266 
5267         const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
5268         const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384))), vk_zero_point);
5269         i24 += 8;
5270 
5271         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
5272 
5273         k += 8;
5274 
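        // Requantize the remaining 8 channels with the fp32 path: convert the
        // int32 accumulators to float, multiply by the scale, clamp against
        // output_max - output_zero_point, and round back to int32.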
5275         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5276         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
5277         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
5278         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5279 
5280         w = (const void*) ((const int32_t*) w + 8);
5281 
5282         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
5283         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
5284 
5285         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
5286 
5287         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5288         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
5289 
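        // Store a full 8-byte tile when at least 8 channels remain; otherwise
        // write 4, 2, and 1 bytes according to the low bits of c, shifting the
        // packed result down after each partial store.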
5290         if XNN_LIKELY(c >= 8) {
5291           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5292           output += 8;
5293           c -= 8;
5294         } else {
5295           if (c & 4) {
5296             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
5297             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5298             output += 4;
5299           }
5300           if (c & 2) {
5301             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
5302             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5303             output += 2;
5304           }
5305           if (c & 1) {
5306             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
5307             output += 1;
5308           }
5309           c = 0;
5310         }
5311       } while (c != 0);
5312     }
5313 
5314     output = (uint8_t*) ((uintptr_t) output + output_increment);
5315   } while (--output_width != 0);
5316 }
5317 
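// Depthwise convolution, 9 taps, 16 channels per iteration, QU8 with fp32
// requantization. The packed weights hold 16 int32 bias values followed by
// 9 groups of 16 uint8 kernel taps; the kernel zero point is subtracted from
// each tap before the 32-bit multiply-accumulate.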
5318 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
5319     size_t channels,
5320     size_t output_width,
5321     const uint8_t** input,
5322     const void* weights,
5323     uint8_t* output,
5324     size_t input_stride,
5325     size_t output_increment,
5326     size_t input_offset,
5327     const uint8_t* zero,
5328     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5329 {
5330   assert(channels != 0);
5331   assert(output_width != 0);
5332 
5333   const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
5334   do {
5335     const uint8_t* i0 = input[0];
5336     assert(i0 != NULL);
5337     if XNN_UNPREDICTABLE(i0 != zero) {
5338       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
5339     }
5340     const uint8_t* i1 = input[1];
5341     assert(i1 != NULL);
5342     if XNN_UNPREDICTABLE(i1 != zero) {
5343       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
5344     }
5345     const uint8_t* i2 = input[2];
5346     assert(i2 != NULL);
5347     if XNN_UNPREDICTABLE(i2 != zero) {
5348       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
5349     }
5350     const uint8_t* i3 = input[3];
5351     assert(i3 != NULL);
5352     if XNN_UNPREDICTABLE(i3 != zero) {
5353       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
5354     }
5355     const uint8_t* i4 = input[4];
5356     assert(i4 != NULL);
5357     if XNN_UNPREDICTABLE(i4 != zero) {
5358       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
5359     }
5360     const uint8_t* i5 = input[5];
5361     assert(i5 != NULL);
5362     if XNN_UNPREDICTABLE(i5 != zero) {
5363       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
5364     }
5365     const uint8_t* i6 = input[6];
5366     assert(i6 != NULL);
5367     if XNN_UNPREDICTABLE(i6 != zero) {
5368       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
5369     }
5370     const uint8_t* i7 = input[7];
5371     assert(i7 != NULL);
5372     if XNN_UNPREDICTABLE(i7 != zero) {
5373       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
5374     }
5375     const uint8_t* i8 = input[8];
5376     assert(i8 != NULL);
5377     if XNN_UNPREDICTABLE(i8 != zero) {
5378       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
5379     }
5380     input = (const uint8_t**) ((uintptr_t) input + input_stride);
5381 
5382     size_t c = channels;
5383     const void* w = weights;
5384     for (; c >= 16; c -= 16) {
5385       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5386       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
5387 
5388 
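      // For each of the 9 taps: zero-extend 8 input bytes and 8 kernel bytes
      // to 32 bits, subtract the kernel zero point from the kernel values,
      // multiply element-wise, and accumulate into the int32 lanes.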
5389       const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5390       const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
5391       const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
5392       const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
5393       i0 += 16;
5394 
5395       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5396       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
5397 
5398       const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5399       const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
5400       const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
5401       const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
5402       i1 += 16;
5403 
5404       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5405       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
5406 
5407       const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5408       const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
5409       const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
5410       const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
5411       i2 += 16;
5412 
5413       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5414       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
5415 
5416       const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5417       const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
5418       const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
5419       const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
5420       i3 += 16;
5421 
5422       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5423       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
5424 
5425       const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5426       const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
5427       const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
5428       const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
5429       i4 += 16;
5430 
5431       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5432       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
5433 
5434       const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5435       const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
5436       const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
5437       const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
5438       i5 += 16;
5439 
5440       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5441       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
5442 
5443       const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5444       const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
5445       const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
5446       const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
5447       i6 += 16;
5448 
5449       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5450       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
5451 
5452       const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5453       const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
5454       const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
5455       const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
5456       i7 += 16;
5457 
5458       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5459       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
5460 
5461       const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5462       const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
5463       const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
5464       const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
5465       i8 += 16;
5466 
5467       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5468       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
5469 
5470       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
5471 
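      // fp32 requantization: scale the accumulators in float, clamp against
      // output_max - output_zero_point, convert back to int32, add the output
      // zero point with saturation, and pack down to uint8. The 256-bit pack
      // works per 128-bit lane, producing the 0123 89AB 4567 CDEF order that
      // the final 32-bit shuffle puts back into channel order.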
5472       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5473       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
5474 
5475       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5476       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
5477       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
5478 
5479       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5480       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
5481       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
5482 
5483       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5484       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
5485 
5486       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5487       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5488 
5489       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5490 
5491       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5492       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5493 
5494       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5495       output += 16;
5496     }
5497     if XNN_UNLIKELY(c != 0) {
5498       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
5499       do {
5500         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
5501 
5502 
5503         const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
5504         const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
5505         i0 += 8;
5506 
5507         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
5508 
5509         const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
5510         const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
5511         i1 += 8;
5512 
5513         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
5514 
5515         const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
5516         const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
5517         i2 += 8;
5518 
5519         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
5520 
5521         const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
5522         const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
5523         i3 += 8;
5524 
5525         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
5526 
5527         const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
5528         const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
5529         i4 += 8;
5530 
5531         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
5532 
5533         const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
5534         const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
5535         i5 += 8;
5536 
5537         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
5538 
5539         const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
5540         const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
5541         i6 += 8;
5542 
5543         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
5544 
5545         const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
5546         const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
5547         i7 += 8;
5548 
5549         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
5550 
5551         const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
5552         const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
5553         i8 += 8;
5554 
5555         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
5556 
5557         k += 8;
5558 
5559         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
5560         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
5561         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
5562         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
5563 
5564         w = (const void*) ((const int32_t*) w + 8);
5565 
5566         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
5567         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
5568 
5569         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
5570 
5571         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
5572         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
5573 
5574         if XNN_LIKELY(c >= 8) {
5575           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5576           output += 8;
5577           c -= 8;
5578         } else {
5579           if (c & 4) {
5580             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
5581             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5582             output += 4;
5583           }
5584           if (c & 2) {
5585             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
5586             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5587             output += 2;
5588           }
5589           if (c & 1) {
5590             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
5591             output += 1;
5592           }
5593           c = 0;
5594         }
5595       } while (c != 0);
5596     }
5597 
5598     output = (uint8_t*) ((uintptr_t) output + output_increment);
5599   } while (--output_width != 0);
5600 }
5601 
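// Convert QU8 to F32: y = scale * ((int32_t) x - zero_point). The zero point
// is pre-negated in the params, so each element is widened to int32, offset
// by minus_zero_point, converted to float, and multiplied by the scale.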
5602 void xnn_qu8_f32_vcvt_ukernel__avx2_x16(
5603     size_t n,
5604     const uint8_t* x,
5605     float* y,
5606     const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5607 {
5608   assert(n != 0);
5609   assert(n % sizeof(uint8_t) == 0);
5610   assert(x != NULL);
5611   assert(y != NULL);
5612 
5613   const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
5614   const __m256 vscale = _mm256_load_ps(params->avx.scale);
5615   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
5616     __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5617     __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
5618     x += 16;
5619 
5620     vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
5621     vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
5622 
5623     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
5624     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
5625 
5626     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
5627     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
5628 
5629     _mm256_storeu_ps(y, vy01234567);
5630     _mm256_storeu_ps(y + 8, vy89ABCDEF);
5631     y += 16;
5632   }
5633   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
5634     __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5635     vx = _mm256_add_epi32(vx, vminus_zero_point);
5636     x += 8;
5637 
5638     __m256 vy = _mm256_cvtepi32_ps(vx);
5639     vy = _mm256_mul_ps(vy, vscale);
5640 
5641     _mm256_storeu_ps(y, vy);
5642     y += 8;
5643   }
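  // Tail of 1-7 elements: one 8-wide load (reading past the end is permitted
  // by XNN_OOB_READS), then 4, 2, and 1 floats are stored according to the
  // low bits of n.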
5644   if XNN_UNLIKELY(n != 0) {
5645     assert(n >= 1 * sizeof(uint8_t));
5646     assert(n <= 7 * sizeof(uint8_t));
5647 
5648     __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
5649     vx = _mm256_add_epi32(vx, vminus_zero_point);
5650 
5651     __m256 vy = _mm256_cvtepi32_ps(vx);
5652     vy = _mm256_mul_ps(vy, vscale);
5653 
5654     __m128 vy_lo = _mm256_castps256_ps128(vy);
5655     if (n & (4 * sizeof(uint8_t))) {
5656       _mm_storeu_ps(y, vy_lo);
5657       vy_lo = _mm256_extractf128_ps(vy, 1);
5658       y += 4;
5659     }
5660     if (n & (2 * sizeof(uint8_t))) {
5661       _mm_storel_pi((__m64*) y, vy_lo);
5662       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
5663       y += 2;
5664     }
5665     if (n & (1 * sizeof(uint8_t))) {
5666       _mm_store_ss(y, vy_lo);
5667     }
5668   }
5669 }
5670 
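// QU8 GEMM, 1 row by 8 columns, 8 elements of K per step (c8 layout), fp32
// requantization. The packed weights start with 8 int32 bias values followed,
// per K step, by 8 bytes for each of the 8 columns; activations and weights
// are widened to 16 bits, the kernel zero point is subtracted from the
// weights, and _mm256_madd_epi16 accumulates pairwise products into int32.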
5671 void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
5672     size_t mr,
5673     size_t nc,
5674     size_t kc,
5675     const uint8_t* restrict a,
5676     size_t a_stride,
5677     const void* restrict w,
5678     uint8_t* restrict c,
5679     size_t cm_stride,
5680     size_t cn_stride,
5681     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5682 {
5683   assert(mr != 0);
5684   assert(mr <= 1);
5685   assert(nc != 0);
5686   assert(kc != 0);
5687   assert(kc % sizeof(uint8_t) == 0);
5688   assert(a != NULL);
5689   assert(w != NULL);
5690   assert(c != NULL);
5691 
5692   kc = round_up_po2(kc, 8);
5693   const uint8_t* a0 = a;
5694   uint8_t* c0 = c;
5695 
5696   do {
5697     const __m128i vbias0x0 = _mm_loadu_si32(w);
5698     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
5699     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5700     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
5701     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
5702     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5703     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
5704     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
5705     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5706     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
5707     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
5708     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5709     w = (const void*) ((const int32_t*) w + 8);
5710 
5711     size_t k = 0;
5712     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
5713     while (k < kc) {
5714       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5715       const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
5716       a0 += 8;
5717 
5718       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5719       const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
5720 
5721       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5722       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
5723       const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
5724 
5725       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5726       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
5727       const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
5728 
5729       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5730       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
5731       const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
5732 
5733       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5734 
5735       w = (const void*) ((const uint8_t*) w + 64);
5736       k += 8 * sizeof(uint8_t);
5737     }
5738 
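    // Each vacc0xMN register holds partial sums for two output columns, one
    // per 128-bit lane. Two rounds of horizontal adds reduce them to a single
    // vector in 0 2 4 6 1 3 5 7 order, and the cross-lane permute restores
    // columns 0-7 in sequence.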
5739     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5740     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5741 
5742     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5743 
5744     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5745     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5746 
5747     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5748 
5749     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5750     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5751 
5752     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5753     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5754 
5755     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5756 
5757     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5758     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
5759 
5760     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5761 
5762     __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
5763 
5764     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5765 
5766     __m128i vout_lo = _mm256_castsi256_si128(vout);
5767     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5768 
5769     if (nc >= 8) {
5770       _mm_storel_epi64((__m128i*) c0, vout_lo);
5771 
5772       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
5773 
5774       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
5775 
5776       nc -= 8;
5777     } else {
5778       if (nc & 4) {
5779         _mm_storeu_si32(c0, vout_lo);
5780 
5781         c0 += 4;
5782 
5783         vout_lo = _mm_srli_epi64(vout_lo, 32);
5784         vout_hi = _mm_srli_epi64(vout_hi, 32);
5785       }
5786       if (nc & 2) {
5787         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
5788 
5789         c0 += 2;
5790 
5791         vout_lo = _mm_srli_epi32(vout_lo, 16);
5792         vout_hi = _mm_srli_epi32(vout_hi, 16);
5793       }
5794       if (nc & 1) {
5795         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
5796       }
5797 
5798       nc = 0;
5799     }
5800   } while (nc != 0);
5801 }
5802 
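// QU8 GEMM, 3 rows by 8 columns, c8 layout with fp32 requantization. Same
// reduction scheme as the 1x8 kernel above; the row 1 and row 2 pointers
// fall back to the previous row when mr is less than 3, so the redundant
// computation is harmless.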
5803 void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
5804     size_t mr,
5805     size_t nc,
5806     size_t kc,
5807     const uint8_t* restrict a,
5808     size_t a_stride,
5809     const void* restrict w,
5810     uint8_t* restrict c,
5811     size_t cm_stride,
5812     size_t cn_stride,
5813     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5814 {
5815   assert(mr != 0);
5816   assert(mr <= 3);
5817   assert(nc != 0);
5818   assert(kc != 0);
5819   assert(kc % sizeof(uint8_t) == 0);
5820   assert(a != NULL);
5821   assert(w != NULL);
5822   assert(c != NULL);
5823 
5824   kc = round_up_po2(kc, 8);
5825   const uint8_t* a0 = a;
5826   uint8_t* c0 = c;
5827   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
5828   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
5829   if XNN_UNPREDICTABLE(mr < 2) {
5830     a1 = a0;
5831     c1 = c0;
5832   }
5833   const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
5834   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
5835   if XNN_UNPREDICTABLE(mr <= 2) {
5836     a2 = a1;
5837     c2 = c1;
5838   }
5839 
5840   do {
5841     const __m128i vbias0x0 = _mm_loadu_si32(w);
5842     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
5843     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5844     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
5845     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
5846     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5847     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
5848     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
5849     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5850     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
5851     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
5852     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5853     __m256i vacc1x01 = vacc0x01;
5854     __m256i vacc1x23 = vacc0x23;
5855     __m256i vacc1x45 = vacc0x45;
5856     __m256i vacc1x67 = vacc0x67;
5857     __m256i vacc2x01 = vacc0x01;
5858     __m256i vacc2x23 = vacc0x23;
5859     __m256i vacc2x45 = vacc0x45;
5860     __m256i vacc2x67 = vacc0x67;
5861     w = (const void*) ((const int32_t*) w + 8);
5862 
5863     size_t k = 0;
5864     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
5865     while (k < kc) {
5866       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5867       const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
5868       a0 += 8;
5869       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
5870       const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
5871       a1 += 8;
5872       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
5873       const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
5874       a2 += 8;
5875 
5876       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5877       const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
5878 
5879       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5880       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
5881       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
5882       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
5883       const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
5884 
5885       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5886       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
5887       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
5888       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
5889       const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
5890 
5891       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5892       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
5893       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
5894       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
5895       const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
5896 
5897       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5898       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
5899       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
5900 
5901       w = (const void*) ((const uint8_t*) w + 64);
5902       k += 8 * sizeof(uint8_t);
5903     }
5904 
5905     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5906     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5907     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
5908     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
5909     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
5910     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
5911 
5912     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5913     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
5914     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
5915 
5916     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5917     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5918     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
5919     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
5920 
5921     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5922     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
5923     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
5924 
5925     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5926     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5927     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
5928     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
5929 
5930     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5931     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5932     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
5933     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
5934 
5935     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5936     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
5937     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
5938 
5939     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5940     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
5941     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
5942 
5943     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5944     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5945 
5946     __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
5947 
5948     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5949 
5950     __m128i vout_lo = _mm256_castsi256_si128(vout);
5951     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5952 
5953     if (nc >= 8) {
5954       _mm_storel_epi64((__m128i*) c0, vout_lo);
5955       _mm_storel_epi64((__m128i*) c1, vout_hi);
5956       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
5957 
5958       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
5959       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
5960       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
5961 
5962       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
5963       a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
5964       a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
5965 
5966       nc -= 8;
5967     } else {
5968       if (nc & 4) {
5969         _mm_storeu_si32(c0, vout_lo);
5970         _mm_storeu_si32(c1, vout_hi);
5971         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
5972 
5973         c0 += 4;
5974         c1 += 4;
5975         c2 += 4;
5976 
5977         vout_lo = _mm_srli_epi64(vout_lo, 32);
5978         vout_hi = _mm_srli_epi64(vout_hi, 32);
5979       }
5980       if (nc & 2) {
5981         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
5982         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
5983         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
5984 
5985         c0 += 2;
5986         c1 += 2;
5987         c2 += 2;
5988 
5989         vout_lo = _mm_srli_epi32(vout_lo, 16);
5990         vout_hi = _mm_srli_epi32(vout_hi, 16);
5991       }
5992       if (nc & 1) {
5993         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
5994         *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
5995         *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
5996       }
5997 
5998       nc = 0;
5999     }
6000   } while (nc != 0);
6001 }
6002 
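// QU8 IGEMM (indirect GEMM), 1 row by 8 columns, c8 layout. Instead of a
// dense A matrix, `a` is an indirection buffer of ks row pointers; entries
// equal to `zero` are not adjusted by a_offset, which lets padding taps
// reference a shared zero buffer.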
6003 void xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
6004     size_t mr,
6005     size_t nc,
6006     size_t kc,
6007     size_t ks,
6008     const uint8_t** restrict a,
6009     const void* restrict w,
6010     uint8_t* restrict c,
6011     size_t cm_stride,
6012     size_t cn_stride,
6013     size_t a_offset,
6014     const uint8_t* zero,
6015     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6016 {
6017   assert(mr != 0);
6018   assert(mr <= 1);
6019   assert(nc != 0);
6020   assert(kc != 0);
6021   assert(ks != 0);
6022   assert(ks % (1 * sizeof(void*)) == 0);
6023   assert(a_offset % sizeof(uint8_t) == 0);
6024   assert(a != NULL);
6025   assert(w != NULL);
6026   assert(c != NULL);
6027 
6028   kc = round_up_po2(kc, 8);
6029   uint8_t* c0 = c;
6030 
6031   do {
6032     const __m128i vbias0x0 = _mm_loadu_si32(w);
6033     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
6034     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
6035     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
6036     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
6037     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
6038     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
6039     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
6040     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
6041     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
6042     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
6043     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
6044     w = (const void*) ((const int32_t*) w + 8);
6045 
6046     size_t p = ks;
6047     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
6048     do {
6049       const uint8_t* restrict a0 = a[0];
6050       if XNN_UNPREDICTABLE(a0 != zero) {
6051         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
6052       }
6053       a += 1;
6054 
6055       size_t k = 0;
6056       while (k < kc) {
6057         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
6058         const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
6059         a0 += 8;
6060 
6061         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6062         const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
6063 
6064         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
6065         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
6066         const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
6067 
6068         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
6069         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
6070         const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
6071 
6072         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
6073         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
6074         const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
6075 
6076         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
6077 
6078         w = (const void*) ((const uint8_t*) w + 64);
6079         k += 8 * sizeof(uint8_t);
6080       }
6081       p -= 1 * sizeof(void*);
6082     } while (p != 0);
6083 
6084     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
6085     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
6086 
6087     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
6088 
6089     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
6090     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
6091 
6092     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
6093 
6094     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6095     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
6096 
6097     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6098     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
6099 
6100     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
6101 
6102     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6103     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
6104 
6105     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6106 
6107     __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
6108 
6109     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
6110 
6111     __m128i vout_lo = _mm256_castsi256_si128(vout);
6112     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
6113 
6114     if (nc >= 8) {
6115       _mm_storel_epi64((__m128i*) c0, vout_lo);
6116 
6117       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
6118 
6119       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
6120 
6121       nc -= 8;
6122     } else {
6123       if (nc & 4) {
6124         _mm_storeu_si32(c0, vout_lo);
6125 
6126         c0 += 4;
6127 
6128         vout_lo = _mm_srli_epi64(vout_lo, 32);
6129         vout_hi = _mm_srli_epi64(vout_hi, 32);
6130       }
6131       if (nc & 2) {
6132         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
6133 
6134         c0 += 2;
6135 
6136         vout_lo = _mm_srli_epi32(vout_lo, 16);
6137         vout_hi = _mm_srli_epi32(vout_hi, 16);
6138       }
6139       if (nc & 1) {
6140         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
6141       }
6142 
6143       nc = 0;
6144     }
6145   } while (nc != 0);
6146 }
6147 
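// QU8 IGEMM, 3 rows by 8 columns, c8 layout: the 3-row counterpart of the
// kernel above, consuming three indirection pointers per ks step and
// clamping rows 1 and 2 to the previous row when mr is less than 3.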
6148 void xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
6149     size_t mr,
6150     size_t nc,
6151     size_t kc,
6152     size_t ks,
6153     const uint8_t** restrict a,
6154     const void* restrict w,
6155     uint8_t* restrict c,
6156     size_t cm_stride,
6157     size_t cn_stride,
6158     size_t a_offset,
6159     const uint8_t* zero,
6160     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6161 {
6162   assert(mr != 0);
6163   assert(mr <= 3);
6164   assert(nc != 0);
6165   assert(kc != 0);
6166   assert(ks != 0);
6167   assert(ks % (3 * sizeof(void*)) == 0);
6168   assert(a_offset % sizeof(uint8_t) == 0);
6169   assert(a != NULL);
6170   assert(w != NULL);
6171   assert(c != NULL);
6172 
6173   kc = round_up_po2(kc, 8);
6174   uint8_t* c0 = c;
6175   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
6176   if XNN_UNPREDICTABLE(mr < 2) {
6177     c1 = c0;
6178   }
6179   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
6180   if XNN_UNPREDICTABLE(mr <= 2) {
6181     c2 = c1;
6182   }
6183 
6184   do {
6185     const __m128i vbias0x0 = _mm_loadu_si32(w);
6186     const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
6187     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
6188     const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
6189     const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
6190     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
6191     const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
6192     const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
6193     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
6194     const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
6195     const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
6196     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
6197     __m256i vacc1x01 = vacc0x01;
6198     __m256i vacc1x23 = vacc0x23;
6199     __m256i vacc1x45 = vacc0x45;
6200     __m256i vacc1x67 = vacc0x67;
6201     __m256i vacc2x01 = vacc0x01;
6202     __m256i vacc2x23 = vacc0x23;
6203     __m256i vacc2x45 = vacc0x45;
6204     __m256i vacc2x67 = vacc0x67;
6205     w = (const void*) ((const int32_t*) w + 8);
6206 
6207     size_t p = ks;
6208     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
6209     do {
6210       const uint8_t* restrict a0 = a[0];
6211       if XNN_UNPREDICTABLE(a0 != zero) {
6212         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
6213       }
6214       const uint8_t* restrict a1 = a[1];
6215       if XNN_UNPREDICTABLE(a1 != zero) {
6216         a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
6217       }
6218       const uint8_t* restrict a2 = a[2];
6219       if XNN_UNPREDICTABLE(a2 != zero) {
6220         a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
6221       }
6222       a += 3;
6223 
6224       size_t k = 0;
6225       while (k < kc) {
6226         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
6227         const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
6228         a0 += 8;
6229         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
6230         const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
6231         a1 += 8;
6232         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
6233         const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
6234         a2 += 8;
6235 
6236         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6237         const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
6238 
6239         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
6240         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
6241         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
6242         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
6243         const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
6244 
6245         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
6246         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
6247         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
6248         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
6249         const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
6250 
6251         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
6252         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
6253         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
6254         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
6255         const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
6256 
6257         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
6258         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
6259         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
6260 
6261         w = (const void*) ((const uint8_t*) w + 64);
6262         k += 8 * sizeof(uint8_t);
6263       }
6264       p -= 3 * sizeof(void*);
6265     } while (p != 0);
6266 
6267     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
6268     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
6269     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
6270     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
6271     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
6272     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
6273 
6274     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
6275     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
6276     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
6277 
6278     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
6279     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
6280     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
6281     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
6282 
6283     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
6284     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
6285     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
6286 
6287     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6288     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
6289     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
6290     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
6291 
6292     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6293     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
6294     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
6295     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
6296 
6297     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
6298     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
6299     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
6300 
6301     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6302     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
6303     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
6304 
6305     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6306     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
6307 
6308     __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
6309 
6310     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
6311 
6312     __m128i vout_lo = _mm256_castsi256_si128(vout);
6313     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
6314 
6315     if (nc >= 8) {
6316       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
6317       _mm_storel_epi64((__m128i*) c1, vout_hi);
6318       _mm_storel_epi64((__m128i*) c0, vout_lo);
6319 
6320       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
6321       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
6322       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
6323 
6324       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
6325 
6326       nc -= 8;
6327     } else {
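      // Partial store of the last 1-7 columns: write 4-, 2-, and 1-byte pieces
      // per row, shifting the packed vectors right after each piece.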
6328       if (nc & 4) {
6329         *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
6330         _mm_storeu_si32(c1, vout_hi);
6331         _mm_storeu_si32(c0, vout_lo);
6332 
6333         c2 += 4;
6334         c1 += 4;
6335         c0 += 4;
6336 
6337         vout_lo = _mm_srli_epi64(vout_lo, 32);
6338         vout_hi = _mm_srli_epi64(vout_hi, 32);
6339       }
6340       if (nc & 2) {
6341         *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
6342         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
6343         *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
6344 
6345         c2 += 2;
6346         c1 += 2;
6347         c0 += 2;
6348 
6349         vout_lo = _mm_srli_epi32(vout_lo, 16);
6350         vout_hi = _mm_srli_epi32(vout_hi, 16);
6351       }
6352       if (nc & 1) {
6353         *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
6354         *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
6355         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
6356       }
6357 
6358       nc = 0;
6359     }
6360   } while (nc != 0);
6361 }
6362 
6363 void xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
6364     size_t n,
6365     const uint8_t* input_a,
6366     const uint8_t* input_b,
6367     uint8_t* output,
6368     const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6369 {
6370   const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
6371   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
6372   const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
6373   const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
6374   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
6375   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
6376   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
6377 
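  // Main loop: widen 8 uint8 elements per operand to int32, compute
  // bias + a*a_multiplier + b*b_multiplier, arithmetic-shift-right by the
  // requantization shift, add the output zero point with int16 saturation,
  // pack to uint8, and clamp to [output_min, output_max]; two groups of 8
  // elements are processed per iteration.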
6378   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6379     const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6380     const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
6381     const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
6382     const __m256i vb89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
6383     input_a += 16;
6384     input_b += 16;
6385 
6386     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6387     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
6388 
6389     vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
6390     vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
6391 
6392     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6393     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
6394 
6395     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6396 
6397     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6398 
6399     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6400 
6401     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
6402 
6403     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6404     output += 16;
6405   }
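  // Remainder: process 8 elements at a time; the final group of fewer than 8
  // is written out in 4-, 2-, and 1-byte pieces.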
6406   if XNN_UNLIKELY(n != 0) {
6407     do {
6408       const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6409       const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
6410       input_a += 8;
6411       input_b += 8;
6412 
6413       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6414 
6415       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
6416 
6417       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6418 
6419       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
6420       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6421       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6422       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
6423 
6424       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
6425         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6426         output += 8;
6427         n -= 8 * sizeof(uint8_t);
6428       } else {
6429         if (n & (4 * sizeof(uint8_t))) {
6430           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6431           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6432           output += 4;
6433         }
6434         if (n & (2 * sizeof(uint8_t))) {
6435           *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
6436           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6437           output += 2;
6438         }
6439         if (n & (1 * sizeof(uint8_t))) {
6440           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6441         }
6442         n = 0;
6443       }
6444     } while (n != 0);
6445   }
6446 }
6447 
6448 void xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
6449     size_t n,
6450     const uint8_t* input_a,
6451     const uint8_t* input_b,
6452     uint8_t* output,
6453     const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6454 {
6455   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
6456   const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
6457   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
6458   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
6459   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
6460 
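  // The second operand is a single broadcast value, so its contribution
  // (b_multiplier * *input_b) is folded into the bias once, outside the loop.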
6461   const __m256i vbias = _mm256_add_epi32(
6462     _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
6463     _mm256_load_si256((const __m256i*) params->avx2.bias));
6464   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6465     const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6466     const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
6467     input_a += 16;
6468 
6469     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6470     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
6471 
6472     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6473     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
6474 
6475     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6476 
6477     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6478 
6479     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6480 
6481     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
6482 
6483     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6484     output += 16;
6485   }
6486   if XNN_UNLIKELY(n != 0) {
6487     do {
6488       const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
6489       input_a += 8;
6490 
6491       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
6492 
6493       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
6494 
6495       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
6496       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6497       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6498       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
6499 
6500       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
6501         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6502         output += 8;
6503         n -= 8 * sizeof(uint8_t);
6504       } else {
6505         if (n & (4 * sizeof(uint8_t))) {
6506           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6507           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6508           output += 4;
6509         }
6510         if (n & (2 * sizeof(uint8_t))) {
6511           *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
6512           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6513           output += 2;
6514         }
6515         if (n & (1 * sizeof(uint8_t))) {
6516           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6517         }
6518         n = 0;
6519       }
6520     } while (n != 0);
6521   }
6522 }
6523 
6524 void xnn_x8_lut_ukernel__avx2_x128(
6525     size_t n,
6526     const uint8_t* x,
6527     uint8_t* y,
6528     const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
6529 {
6530   assert(n != 0);
6531   assert(x != NULL);
6532   assert(y != NULL);
6533 
6534   const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
6535   const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
6536   const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
6537   const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
6538   const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
6539   const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
6540   const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
6541   const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
6542   const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
6543   const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
6544   const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
6545   const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
6546   const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
6547   const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
6548   const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
6549   const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
6550 
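  // Split the 256-entry table into sixteen 16-byte sub-tables and pre-XOR
  // adjacent ones, so the lookup can be computed as a running XOR of
  // _mm256_shuffle_epi8 results while the index is shifted into each
  // sub-table's range by repeatedly subtracting 16.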
6551   const __m256i vtable0 = vt0;
6552   const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
6553   const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
6554   const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
6555   const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
6556   const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
6557   const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
6558   const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
6559   const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
6560   const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
6561   const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
6562   const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
6563   const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
6564   const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
6565   const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
6566   const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
6567 
6568   const __m256i voffset = _mm256_set1_epi8(16);
6569   for (; n >= 128 * sizeof(uint8_t); n -= 128 * sizeof(uint8_t)) {
6570     __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
6571     __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
6572     __m256i vx2 = _mm256_loadu_si256((const __m256i*) (x + 64));
6573     __m256i vx3 = _mm256_loadu_si256((const __m256i*) (x + 96));
6574     x += 128;
6575 
6576     __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
6577     __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
6578     __m256i vy2 = _mm256_shuffle_epi8(vtable0, vx2);
6579     __m256i vy3 = _mm256_shuffle_epi8(vtable0, vx3);
6580 
6581     vx0 = _mm256_sub_epi8(vx0, voffset);
6582     vx1 = _mm256_sub_epi8(vx1, voffset);
6583     vx2 = _mm256_sub_epi8(vx2, voffset);
6584     vx3 = _mm256_sub_epi8(vx3, voffset);
6585     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
6586     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
6587     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable1, vx2));
6588     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable1, vx3));
6589     vx0 = _mm256_sub_epi8(vx0, voffset);
6590     vx1 = _mm256_sub_epi8(vx1, voffset);
6591     vx2 = _mm256_sub_epi8(vx2, voffset);
6592     vx3 = _mm256_sub_epi8(vx3, voffset);
6593     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
6594     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
6595     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable2, vx2));
6596     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable2, vx3));
6597     vx0 = _mm256_sub_epi8(vx0, voffset);
6598     vx1 = _mm256_sub_epi8(vx1, voffset);
6599     vx2 = _mm256_sub_epi8(vx2, voffset);
6600     vx3 = _mm256_sub_epi8(vx3, voffset);
6601     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
6602     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
6603     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable3, vx2));
6604     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable3, vx3));
6605     vx0 = _mm256_sub_epi8(vx0, voffset);
6606     vx1 = _mm256_sub_epi8(vx1, voffset);
6607     vx2 = _mm256_sub_epi8(vx2, voffset);
6608     vx3 = _mm256_sub_epi8(vx3, voffset);
6609     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
6610     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
6611     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable4, vx2));
6612     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable4, vx3));
6613     vx0 = _mm256_sub_epi8(vx0, voffset);
6614     vx1 = _mm256_sub_epi8(vx1, voffset);
6615     vx2 = _mm256_sub_epi8(vx2, voffset);
6616     vx3 = _mm256_sub_epi8(vx3, voffset);
6617     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
6618     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
6619     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable5, vx2));
6620     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable5, vx3));
6621     vx0 = _mm256_sub_epi8(vx0, voffset);
6622     vx1 = _mm256_sub_epi8(vx1, voffset);
6623     vx2 = _mm256_sub_epi8(vx2, voffset);
6624     vx3 = _mm256_sub_epi8(vx3, voffset);
6625     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
6626     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
6627     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable6, vx2));
6628     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable6, vx3));
6629     vx0 = _mm256_sub_epi8(vx0, voffset);
6630     vx1 = _mm256_sub_epi8(vx1, voffset);
6631     vx2 = _mm256_sub_epi8(vx2, voffset);
6632     vx3 = _mm256_sub_epi8(vx3, voffset);
6633     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
6634     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
6635     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable7, vx2));
6636     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable7, vx3));
6637     vx0 = _mm256_sub_epi8(vx0, voffset);
6638     vx1 = _mm256_sub_epi8(vx1, voffset);
6639     vx2 = _mm256_sub_epi8(vx2, voffset);
6640     vx3 = _mm256_sub_epi8(vx3, voffset);
6641     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
6642     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
6643     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable8, vx2));
6644     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable8, vx3));
6645 
6646     vx0 = _mm256_subs_epi8(vx0, voffset);
6647     vx1 = _mm256_subs_epi8(vx1, voffset);
6648     vx2 = _mm256_subs_epi8(vx2, voffset);
6649     vx3 = _mm256_subs_epi8(vx3, voffset);
6650     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
6651     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
6652     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable9, vx2));
6653     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable9, vx3));
6654     vx0 = _mm256_subs_epi8(vx0, voffset);
6655     vx1 = _mm256_subs_epi8(vx1, voffset);
6656     vx2 = _mm256_subs_epi8(vx2, voffset);
6657     vx3 = _mm256_subs_epi8(vx3, voffset);
6658     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
6659     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
6660     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableA, vx2));
6661     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableA, vx3));
6662     vx0 = _mm256_subs_epi8(vx0, voffset);
6663     vx1 = _mm256_subs_epi8(vx1, voffset);
6664     vx2 = _mm256_subs_epi8(vx2, voffset);
6665     vx3 = _mm256_subs_epi8(vx3, voffset);
6666     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
6667     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
6668     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableB, vx2));
6669     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableB, vx3));
6670     vx0 = _mm256_subs_epi8(vx0, voffset);
6671     vx1 = _mm256_subs_epi8(vx1, voffset);
6672     vx2 = _mm256_subs_epi8(vx2, voffset);
6673     vx3 = _mm256_subs_epi8(vx3, voffset);
6674     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
6675     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
6676     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableC, vx2));
6677     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableC, vx3));
6678     vx0 = _mm256_subs_epi8(vx0, voffset);
6679     vx1 = _mm256_subs_epi8(vx1, voffset);
6680     vx2 = _mm256_subs_epi8(vx2, voffset);
6681     vx3 = _mm256_subs_epi8(vx3, voffset);
6682     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
6683     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
6684     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableD, vx2));
6685     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableD, vx3));
6686     vx0 = _mm256_subs_epi8(vx0, voffset);
6687     vx1 = _mm256_subs_epi8(vx1, voffset);
6688     vx2 = _mm256_subs_epi8(vx2, voffset);
6689     vx3 = _mm256_subs_epi8(vx3, voffset);
6690     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
6691     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
6692     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableE, vx2));
6693     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableE, vx3));
6694     vx0 = _mm256_subs_epi8(vx0, voffset);
6695     vx1 = _mm256_subs_epi8(vx1, voffset);
6696     vx2 = _mm256_subs_epi8(vx2, voffset);
6697     vx3 = _mm256_subs_epi8(vx3, voffset);
6698     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
6699     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
6700     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableF, vx2));
6701     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableF, vx3));
6702 
6703     _mm256_storeu_si256((__m256i*) y, vy0);
6704     _mm256_storeu_si256((__m256i*) (y + 32), vy1);
6705     _mm256_storeu_si256((__m256i*) (y + 64), vy2);
6706     _mm256_storeu_si256((__m256i*) (y + 96), vy3);
6707     y += 128;
6708   }
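  // Remainder: translate 16 bytes at a time using the low 128-bit halves of
  // the broadcast tables.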
6709   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6710     __m128i vx = _mm_loadu_si128((const __m128i*) x);
6711     x += 16;
6712 
6713     __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
6714 
6715     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6716     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
6717     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6718     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
6719     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6720     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
6721     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6722     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
6723     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6724     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
6725     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6726     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
6727     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6728     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
6729     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6730     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
6731 
6732     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6733     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
6734     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6735     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
6736     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6737     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
6738     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6739     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
6740     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6741     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
6742     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6743     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
6744     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6745     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
6746 
6747     _mm_storeu_si128((__m128i*) y, vy);
6748     y += 16;
6749   }
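  // Final tail of 1-15 bytes: the lookup still runs on a full 16-byte load,
  // but only the remaining n bytes are stored, in 8-, 4-, 2-, and 1-byte pieces.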
6750   if XNN_UNLIKELY(n != 0) {
6751     __m128i vx = _mm_loadu_si128((const __m128i*) x);
6752 
6753     __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
6754 
6755     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6756     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
6757     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6758     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
6759     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6760     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
6761     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6762     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
6763     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6764     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
6765     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6766     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
6767     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6768     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
6769     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
6770     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
6771 
6772     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6773     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
6774     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6775     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
6776     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6777     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
6778     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6779     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
6780     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6781     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
6782     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6783     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
6784     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
6785     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
6786 
6787     if (n & (8 * sizeof(uint8_t))) {
6788       _mm_storel_epi64((__m128i*) y, vy);
6789       vy = _mm_unpackhi_epi64(vy, vy);
6790       y += 8;
6791     }
6792     if (n & (4 * sizeof(uint8_t))) {
6793       _mm_storeu_si32(y, vy);
6794       vy = _mm_srli_epi64(vy, 32);
6795       y += 4;
6796     }
6797     if (n & (2 * sizeof(uint8_t))) {
6798       *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
6799       vy = _mm_srli_epi32(vy, 16);
6800       y += 2;
6801     }
6802     if (n & (1 * sizeof(uint8_t))) {
6803       *y = (uint8_t) _mm_extract_epi8(vy, 0);
6804     }
6805   }
6806 }
6807