• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/qs8-dwconv/unipass-avx2-mul16-vpunpck.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 
#include <assert.h>
#include <string.h>

#include <immintrin.h>

#include <xnnpack/dwconv.h>
15 
16 
xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck(
18     size_t channels,
19     size_t output_width,
20     const int8_t** input,
21     const void* weights,
22     int8_t* output,
23     size_t input_stride,
24     size_t output_increment,
25     size_t input_offset,
26     const int8_t* zero,
27     const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29   assert(channels != 0);
30   assert(output_width != 0);
31 
32   do {
33     const int8_t* i0 = input[0];
34     assert(i0 != NULL);
35     if XNN_UNPREDICTABLE(i0 != zero) {
36       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
37     }
38     const int8_t* i1 = input[1];
39     assert(i1 != NULL);
40     if XNN_UNPREDICTABLE(i1 != zero) {
41       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
42     }
43     const int8_t* i2 = input[2];
44     assert(i2 != NULL);
45     if XNN_UNPREDICTABLE(i2 != zero) {
46       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
47     }
48     const int8_t* i3 = input[3];
49     assert(i3 != NULL);
50     if XNN_UNPREDICTABLE(i3 != zero) {
51       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
52     }
53     const int8_t* i4 = input[4];
54     assert(i4 != NULL);
55     if XNN_UNPREDICTABLE(i4 != zero) {
56       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
57     }
58     const int8_t* i5 = input[5];
59     assert(i5 != NULL);
60     if XNN_UNPREDICTABLE(i5 != zero) {
61       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
62     }
63     const int8_t* i6 = input[6];
64     assert(i6 != NULL);
65     if XNN_UNPREDICTABLE(i6 != zero) {
66       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
67     }
68     const int8_t* i7 = input[7];
69     assert(i7 != NULL);
70     if XNN_UNPREDICTABLE(i7 != zero) {
71       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
72     }
73     const int8_t* i8 = input[8];
74     assert(i8 != NULL);
75     if XNN_UNPREDICTABLE(i8 != zero) {
76       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
77     }
78     input = (const int8_t**) ((uintptr_t) input + input_stride);
79 
80     size_t c = channels;
81     const void* w = weights;
82     for (; c >= 32; c -= 32) {
83       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
84       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
85       __m256i vaccGHIJKLMN = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 16 * sizeof(int32_t)));
86       __m256i vaccOPQRSTUV = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 24 * sizeof(int32_t)));
87 
88       __m256i vacc012389AB = _mm256_inserti128_si256(vacc01234567, _mm256_castsi256_si128(vacc89ABCDEF), 1);
89       __m256i vacc4567CDEF = _mm256_permute2x128_si256(vacc01234567, vacc89ABCDEF, 0x31);
90       __m256i vaccGHIJOPQR = _mm256_inserti128_si256(vaccGHIJKLMN, _mm256_castsi256_si128(vaccOPQRSTUV), 1);
91       __m256i vaccKLMNSTUV = _mm256_permute2x128_si256(vaccGHIJKLMN, vaccOPQRSTUV, 0x31);
92 
93 
94       const __m256i vi0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i0));
95       const __m256i vk0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
96       const __m256i vi0xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i0 + 16)));
97       const __m256i vk0xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
98       i0 += 32;
99 
100       __m256i vacc0123456789ABCDEF = _mm256_mullo_epi16(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF);
101       __m256i vaccGHIJKLMNOPQRSTUV = _mm256_mullo_epi16(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV);
102 
103 
104       const __m256i vi1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i1));
105       const __m256i vk1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
106       const __m256i vi1xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i1 + 16)));
107       const __m256i vk1xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
108       i1 += 32;
109 
110       vacc0123456789ABCDEF = _mm256_add_epi16(vacc0123456789ABCDEF, _mm256_mullo_epi16(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
111       vaccGHIJKLMNOPQRSTUV = _mm256_add_epi16(vaccGHIJKLMNOPQRSTUV, _mm256_mullo_epi16(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
112 
113       __m256i vsignacc0123456789ABCDEF = _mm256_srai_epi16(vacc0123456789ABCDEF, 15);
114       vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
115       vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
116       __m256i vsignaccGHIJKLMNOPQRSTUV = _mm256_srai_epi16(vaccGHIJKLMNOPQRSTUV, 15);
117       vaccGHIJOPQR = _mm256_add_epi32(vaccGHIJOPQR, _mm256_unpacklo_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
118       vaccKLMNSTUV = _mm256_add_epi32(vaccKLMNSTUV, _mm256_unpackhi_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
119 
120       const __m256i vi2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i2));
121       const __m256i vk2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
122       const __m256i vi2xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i2 + 16)));
123       const __m256i vk2xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
124       i2 += 32;
125 
126       vacc0123456789ABCDEF = _mm256_mullo_epi16(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF);
127       vaccGHIJKLMNOPQRSTUV = _mm256_mullo_epi16(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV);
128 
129 
130       const __m256i vi3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i3));
131       const __m256i vk3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t))));
132       const __m256i vi3xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i3 + 16)));
133       const __m256i vk3xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t))));
134       i3 += 32;
135 
136       vacc0123456789ABCDEF = _mm256_add_epi16(vacc0123456789ABCDEF, _mm256_mullo_epi16(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
137       vaccGHIJKLMNOPQRSTUV = _mm256_add_epi16(vaccGHIJKLMNOPQRSTUV, _mm256_mullo_epi16(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
138 
139       vsignacc0123456789ABCDEF = _mm256_srai_epi16(vacc0123456789ABCDEF, 15);
140       vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
141       vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
142       vsignaccGHIJKLMNOPQRSTUV = _mm256_srai_epi16(vaccGHIJKLMNOPQRSTUV, 15);
143       vaccGHIJOPQR = _mm256_add_epi32(vaccGHIJOPQR, _mm256_unpacklo_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
144       vaccKLMNSTUV = _mm256_add_epi32(vaccKLMNSTUV, _mm256_unpackhi_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
145 
146       const __m256i vi4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i4));
147       const __m256i vk4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t))));
148       const __m256i vi4xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i4 + 16)));
149       const __m256i vk4xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t))));
150       i4 += 32;
151 
152       vacc0123456789ABCDEF = _mm256_mullo_epi16(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF);
153       vaccGHIJKLMNOPQRSTUV = _mm256_mullo_epi16(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV);
154 
155 
156       const __m256i vi5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i5));
157       const __m256i vk5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t))));
158       const __m256i vi5xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i5 + 16)));
159       const __m256i vk5xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t))));
160       i5 += 32;
161 
162       vacc0123456789ABCDEF = _mm256_add_epi16(vacc0123456789ABCDEF, _mm256_mullo_epi16(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
163       vaccGHIJKLMNOPQRSTUV = _mm256_add_epi16(vaccGHIJKLMNOPQRSTUV, _mm256_mullo_epi16(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
164 
165       vsignacc0123456789ABCDEF = _mm256_srai_epi16(vacc0123456789ABCDEF, 15);
166       vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
167       vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
168       vsignaccGHIJKLMNOPQRSTUV = _mm256_srai_epi16(vaccGHIJKLMNOPQRSTUV, 15);
169       vaccGHIJOPQR = _mm256_add_epi32(vaccGHIJOPQR, _mm256_unpacklo_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
170       vaccKLMNSTUV = _mm256_add_epi32(vaccKLMNSTUV, _mm256_unpackhi_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
171 
172       const __m256i vi6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i6));
173       const __m256i vk6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t))));
174       const __m256i vi6xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i6 + 16)));
175       const __m256i vk6xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t))));
176       i6 += 32;
177 
178       vacc0123456789ABCDEF = _mm256_mullo_epi16(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF);
179       vaccGHIJKLMNOPQRSTUV = _mm256_mullo_epi16(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV);
180 
181 
182       const __m256i vi7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i7));
183       const __m256i vk7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t))));
184       const __m256i vi7xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i7 + 16)));
185       const __m256i vk7xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t))));
186       i7 += 32;
187 
188       vacc0123456789ABCDEF = _mm256_add_epi16(vacc0123456789ABCDEF, _mm256_mullo_epi16(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
189       vaccGHIJKLMNOPQRSTUV = _mm256_add_epi16(vaccGHIJKLMNOPQRSTUV, _mm256_mullo_epi16(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
190 
191       vsignacc0123456789ABCDEF = _mm256_srai_epi16(vacc0123456789ABCDEF, 15);
192       vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
193       vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
194       vsignaccGHIJKLMNOPQRSTUV = _mm256_srai_epi16(vaccGHIJKLMNOPQRSTUV, 15);
195       vaccGHIJOPQR = _mm256_add_epi32(vaccGHIJOPQR, _mm256_unpacklo_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
196       vaccKLMNSTUV = _mm256_add_epi32(vaccKLMNSTUV, _mm256_unpackhi_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
197 
198       const __m256i vi8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i8));
199       const __m256i vk8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t))));
200       const __m256i vi8xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i8 + 16)));
201       const __m256i vk8xGHIJKLMNOPQRSTUV = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t))));
202       i8 += 32;
203 
204       vacc0123456789ABCDEF = _mm256_mullo_epi16(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF);
205       vaccGHIJKLMNOPQRSTUV = _mm256_mullo_epi16(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV);
206 
207       vsignacc0123456789ABCDEF = _mm256_srai_epi16(vacc0123456789ABCDEF, 15);
208       vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
209       vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vacc0123456789ABCDEF, vsignacc0123456789ABCDEF));
210       vsignaccGHIJKLMNOPQRSTUV = _mm256_srai_epi16(vaccGHIJKLMNOPQRSTUV, 15);
211       vaccGHIJOPQR = _mm256_add_epi32(vaccGHIJOPQR, _mm256_unpacklo_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
212       vaccKLMNSTUV = _mm256_add_epi32(vaccKLMNSTUV, _mm256_unpackhi_epi16(vaccGHIJKLMNOPQRSTUV, vsignaccGHIJKLMNOPQRSTUV));
213 
214       w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t));
215 
216       vacc01234567 = _mm256_inserti128_si256(vacc012389AB, _mm256_castsi256_si128(vacc4567CDEF), 1);
217       vacc89ABCDEF = _mm256_permute2x128_si256(vacc012389AB, vacc4567CDEF, 0x31);
218       vaccGHIJKLMN = _mm256_inserti128_si256(vaccGHIJOPQR, _mm256_castsi256_si128(vaccKLMNSTUV), 1);
219       vaccOPQRSTUV = _mm256_permute2x128_si256(vaccGHIJOPQR, vaccKLMNSTUV, 0x31);
220 
221       __m256 vfpacc01234567 = _mm256_cvtepi32_ps(vacc01234567);
222       __m256 vfpacc89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
223       __m256 vfpaccGHIJKLMN = _mm256_cvtepi32_ps(vaccGHIJKLMN);
224       __m256 vfpaccOPQRSTUV = _mm256_cvtepi32_ps(vaccOPQRSTUV);
225 
226       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
227       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(float)));
228       const __m256 vscaleGHIJKLMN = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(float)));
229       const __m256 vscaleOPQRSTUV = _mm256_loadu_ps((const float*) ((uintptr_t) w + 24 * sizeof(float)));
230       w = (const void*) ((uintptr_t) w + 32 * sizeof(float));
231       vfpacc01234567 = _mm256_mul_ps(vfpacc01234567, vscale01234567);
232       vfpacc89ABCDEF = _mm256_mul_ps(vfpacc89ABCDEF, vscale89ABCDEF);
233       vfpaccGHIJKLMN = _mm256_mul_ps(vfpaccGHIJKLMN, vscaleGHIJKLMN);
234       vfpaccOPQRSTUV = _mm256_mul_ps(vfpaccOPQRSTUV, vscaleOPQRSTUV);
235 
236       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
237       vfpacc01234567 = _mm256_min_ps(vfpacc01234567, voutput_max_less_zero_point);
238       vfpacc89ABCDEF = _mm256_min_ps(vfpacc89ABCDEF, voutput_max_less_zero_point);
239       vfpaccGHIJKLMN = _mm256_min_ps(vfpaccGHIJKLMN, voutput_max_less_zero_point);
240       vfpaccOPQRSTUV = _mm256_min_ps(vfpaccOPQRSTUV, voutput_max_less_zero_point);
241 
242       vacc01234567 = _mm256_cvtps_epi32(vfpacc01234567);
243       vacc89ABCDEF = _mm256_cvtps_epi32(vfpacc89ABCDEF);
244       vaccGHIJKLMN = _mm256_cvtps_epi32(vfpaccGHIJKLMN);
245       vaccOPQRSTUV = _mm256_cvtps_epi32(vfpaccOPQRSTUV);
246 
247       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
248       const __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
249       const __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
250 
251       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
252       __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV), _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1)), _MM_SHUFFLE(3, 1, 2, 0));
253 
254       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
255       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
256       voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, voutput_min);
257 
258       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
259       _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
260       output += 32;
261     }
262     if XNN_UNLIKELY(c != 0) {
263       const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
264       do {
265         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
266         __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
267 
268         __m256i vacc012389AB = _mm256_inserti128_si256(vacc01234567, _mm256_castsi256_si128(vacc89ABCDEF), 1);
269         __m256i vacc4567CDEF = _mm256_permute2x128_si256(vacc01234567, vacc89ABCDEF, 0x31);
270 
271 
272         const __m256i vi0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i0));
273         const __m256i vk0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) k));
274         i0 += 16;
275 
276         const __m256i vprod0x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF);
277         const __m256i vprod0x0123456789ABCDEFhi = _mm256_srai_epi16(vprod0x0123456789ABCDEFlo, 15);
278 
279         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod0x0123456789ABCDEFlo, vprod0x0123456789ABCDEFhi));
280         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod0x0123456789ABCDEFlo, vprod0x0123456789ABCDEFhi));
281 
282         const __m256i vi1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i1));
283         const __m256i vk1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 32)));
284         i1 += 16;
285 
286         const __m256i vprod1x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
287         const __m256i vprod1x0123456789ABCDEFhi = _mm256_srai_epi16(vprod1x0123456789ABCDEFlo, 15);
288 
289         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod1x0123456789ABCDEFlo, vprod1x0123456789ABCDEFhi));
290         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod1x0123456789ABCDEFlo, vprod1x0123456789ABCDEFhi));
291 
292         const __m256i vi2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i2));
293         const __m256i vk2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 64)));
294         i2 += 16;
295 
296         const __m256i vprod2x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF);
297         const __m256i vprod2x0123456789ABCDEFhi = _mm256_srai_epi16(vprod2x0123456789ABCDEFlo, 15);
298 
299         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod2x0123456789ABCDEFlo, vprod2x0123456789ABCDEFhi));
300         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod2x0123456789ABCDEFlo, vprod2x0123456789ABCDEFhi));
301 
302         const __m256i vi3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i3));
303         const __m256i vk3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 96)));
304         i3 += 16;
305 
306         const __m256i vprod3x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF);
307         const __m256i vprod3x0123456789ABCDEFhi = _mm256_srai_epi16(vprod3x0123456789ABCDEFlo, 15);
308 
309         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod3x0123456789ABCDEFlo, vprod3x0123456789ABCDEFhi));
310         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod3x0123456789ABCDEFlo, vprod3x0123456789ABCDEFhi));
311 
312         const __m256i vi4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i4));
313         const __m256i vk4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 128)));
314         i4 += 16;
315 
316         const __m256i vprod4x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF);
317         const __m256i vprod4x0123456789ABCDEFhi = _mm256_srai_epi16(vprod4x0123456789ABCDEFlo, 15);
318 
319         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod4x0123456789ABCDEFlo, vprod4x0123456789ABCDEFhi));
320         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod4x0123456789ABCDEFlo, vprod4x0123456789ABCDEFhi));
321 
322         const __m256i vi5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i5));
323         const __m256i vk5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 160)));
324         i5 += 16;
325 
326         const __m256i vprod5x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF);
327         const __m256i vprod5x0123456789ABCDEFhi = _mm256_srai_epi16(vprod5x0123456789ABCDEFlo, 15);
328 
329         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod5x0123456789ABCDEFlo, vprod5x0123456789ABCDEFhi));
330         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod5x0123456789ABCDEFlo, vprod5x0123456789ABCDEFhi));
331 
332         const __m256i vi6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i6));
333         const __m256i vk6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 192)));
334         i6 += 16;
335 
336         const __m256i vprod6x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF);
337         const __m256i vprod6x0123456789ABCDEFhi = _mm256_srai_epi16(vprod6x0123456789ABCDEFlo, 15);
338 
339         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod6x0123456789ABCDEFlo, vprod6x0123456789ABCDEFhi));
340         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod6x0123456789ABCDEFlo, vprod6x0123456789ABCDEFhi));
341 
342         const __m256i vi7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i7));
343         const __m256i vk7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 224)));
344         i7 += 16;
345 
346         const __m256i vprod7x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF);
347         const __m256i vprod7x0123456789ABCDEFhi = _mm256_srai_epi16(vprod7x0123456789ABCDEFlo, 15);
348 
349         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod7x0123456789ABCDEFlo, vprod7x0123456789ABCDEFhi));
350         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod7x0123456789ABCDEFlo, vprod7x0123456789ABCDEFhi));
351 
352         const __m256i vi8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i8));
353         const __m256i vk8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (k + 256)));
354         i8 += 16;
355 
356         const __m256i vprod8x0123456789ABCDEFlo =  _mm256_mullo_epi16(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF);
357         const __m256i vprod8x0123456789ABCDEFhi = _mm256_srai_epi16(vprod8x0123456789ABCDEFlo, 15);
358 
359         vacc012389AB = _mm256_add_epi32(vacc012389AB, _mm256_unpacklo_epi16(vprod8x0123456789ABCDEFlo, vprod8x0123456789ABCDEFhi));
360         vacc4567CDEF = _mm256_add_epi32(vacc4567CDEF, _mm256_unpackhi_epi16(vprod8x0123456789ABCDEFlo, vprod8x0123456789ABCDEFhi));
361 
362         vacc01234567 = _mm256_inserti128_si256(vacc012389AB, _mm256_castsi256_si128(vacc4567CDEF), 1);
363         vacc89ABCDEF = _mm256_permute2x128_si256(vacc012389AB, vacc4567CDEF, 0x31);
364 
365         k += 16;
366 
367         __m256 vfpacc01234567 = _mm256_cvtepi32_ps(vacc01234567);
368         __m256 vfpacc89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
369 
370         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t)));
371         const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t) + 8 * sizeof(float)));
372         vfpacc01234567 = _mm256_mul_ps(vfpacc01234567, vscale01234567);
373         vfpacc89ABCDEF = _mm256_mul_ps(vfpacc89ABCDEF, vscale89ABCDEF);
374 
375         const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
376         vfpacc01234567 = _mm256_min_ps(vfpacc01234567, voutput_max_less_zero_point);
377         vfpacc89ABCDEF = _mm256_min_ps(vfpacc89ABCDEF, voutput_max_less_zero_point);
378 
379         vacc01234567 = _mm256_cvtps_epi32(vfpacc01234567);
380         vacc89ABCDEF = _mm256_cvtps_epi32(vfpacc89ABCDEF);
381 
382         w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
383 
384         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
385         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
386         __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
387 
388         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
389 
390         __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
391         vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
392 
393         if XNN_LIKELY(c >= 16) {
394           _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
395           output += 16;
396           c -= 16;
397         } else {
398           if (c & 8) {
399             _mm_storel_epi64((__m128i*) output, vout0123456789ABCDEF);
400             vout0123456789ABCDEF = _mm_unpackhi_epi64(vout0123456789ABCDEF, vout0123456789ABCDEF);
401             output += 8;
402           }
403           if (c & 4) {
404             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456789ABCDEF);
405             vout0123456789ABCDEF = _mm_srli_epi64(vout0123456789ABCDEF, 32);
406             output += 4;
407           }
408           if (c & 2) {
409             *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456789ABCDEF, 0);
410             vout0123456789ABCDEF = _mm_srli_epi32(vout0123456789ABCDEF, 16);
411             output += 2;
412           }
413           if (c & 1) {
414             *output = (int8_t) _mm_extract_epi8(vout0123456789ABCDEF, 0);
415             output += 1;
416           }
417           c = 0;
418         }
419       } while (c != 0);
420     }
421 
422     output = (int8_t*) ((uintptr_t) output + output_increment);
423   } while (--output_width != 0);
424 }
425