1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-dwconv/unipass-avx2-mul16.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <immintrin.h>
13
14 #include <xnnpack/dwconv.h>
15
16
xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16(
18 size_t channels,
19 size_t output_width,
20 const int8_t** input,
21 const void* weights,
22 int8_t* output,
23 size_t input_stride,
24 size_t output_increment,
25 size_t input_offset,
26 const int8_t* zero,
27 const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
28 {
29 assert(channels != 0);
30 assert(output_width != 0);
31
32 do {
33 const int8_t* i0 = input[0];
34 assert(i0 != NULL);
35 if XNN_UNPREDICTABLE(i0 != zero) {
36 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
37 }
38 const int8_t* i1 = input[1];
39 assert(i1 != NULL);
40 if XNN_UNPREDICTABLE(i1 != zero) {
41 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
42 }
43 const int8_t* i2 = input[2];
44 assert(i2 != NULL);
45 if XNN_UNPREDICTABLE(i2 != zero) {
46 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
47 }
48 const int8_t* i3 = input[3];
49 assert(i3 != NULL);
50 if XNN_UNPREDICTABLE(i3 != zero) {
51 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
52 }
53 const int8_t* i4 = input[4];
54 assert(i4 != NULL);
55 if XNN_UNPREDICTABLE(i4 != zero) {
56 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
57 }
58 const int8_t* i5 = input[5];
59 assert(i5 != NULL);
60 if XNN_UNPREDICTABLE(i5 != zero) {
61 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
62 }
63 const int8_t* i6 = input[6];
64 assert(i6 != NULL);
65 if XNN_UNPREDICTABLE(i6 != zero) {
66 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
67 }
68 const int8_t* i7 = input[7];
69 assert(i7 != NULL);
70 if XNN_UNPREDICTABLE(i7 != zero) {
71 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
72 }
73 const int8_t* i8 = input[8];
74 assert(i8 != NULL);
75 if XNN_UNPREDICTABLE(i8 != zero) {
76 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
77 }
78 input = (const int8_t**) ((uintptr_t) input + input_stride);
79
80 size_t c = channels;
81 const void* w = weights;
82 for (; c >= 16; c -= 16) {
83 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
84 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
85
86
87 const __m256i vi0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i0));
88 const __m256i vk0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
89 i0 += 16;
90
91 const __m256i vprod0x0123456789ABCDEF = _mm256_mullo_epi16(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF);
92 const __m128i vprod0x89ABCDEF = _mm256_extracti128_si256(vprod0x0123456789ABCDEF, 1);
93 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod0x0123456789ABCDEF)));
94 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod0x89ABCDEF));
95
96 const __m256i vi1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i1));
97 const __m256i vk1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
98 i1 += 16;
99
100 const __m256i vprod1x0123456789ABCDEF = _mm256_mullo_epi16(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
101 const __m128i vprod1x89ABCDEF = _mm256_extracti128_si256(vprod1x0123456789ABCDEF, 1);
102 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod1x0123456789ABCDEF)));
103 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod1x89ABCDEF));
104
105 const __m256i vi2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i2));
106 const __m256i vk2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
107 i2 += 16;
108
109 const __m256i vprod2x0123456789ABCDEF = _mm256_mullo_epi16(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF);
110 const __m128i vprod2x89ABCDEF = _mm256_extracti128_si256(vprod2x0123456789ABCDEF, 1);
111 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod2x0123456789ABCDEF)));
112 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod2x89ABCDEF));
113
114 const __m256i vi3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i3));
115 const __m256i vk3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
116 i3 += 16;
117
118 const __m256i vprod3x0123456789ABCDEF = _mm256_mullo_epi16(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF);
119 const __m128i vprod3x89ABCDEF = _mm256_extracti128_si256(vprod3x0123456789ABCDEF, 1);
120 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod3x0123456789ABCDEF)));
121 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod3x89ABCDEF));
122
123 const __m256i vi4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i4));
124 const __m256i vk4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
125 i4 += 16;
126
127 const __m256i vprod4x0123456789ABCDEF = _mm256_mullo_epi16(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF);
128 const __m128i vprod4x89ABCDEF = _mm256_extracti128_si256(vprod4x0123456789ABCDEF, 1);
129 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod4x0123456789ABCDEF)));
130 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod4x89ABCDEF));
131
132 const __m256i vi5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i5));
133 const __m256i vk5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
134 i5 += 16;
135
136 const __m256i vprod5x0123456789ABCDEF = _mm256_mullo_epi16(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF);
137 const __m128i vprod5x89ABCDEF = _mm256_extracti128_si256(vprod5x0123456789ABCDEF, 1);
138 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod5x0123456789ABCDEF)));
139 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod5x89ABCDEF));
140
141 const __m256i vi6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i6));
142 const __m256i vk6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
143 i6 += 16;
144
145 const __m256i vprod6x0123456789ABCDEF = _mm256_mullo_epi16(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF);
146 const __m128i vprod6x89ABCDEF = _mm256_extracti128_si256(vprod6x0123456789ABCDEF, 1);
147 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod6x0123456789ABCDEF)));
148 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod6x89ABCDEF));
149
150 const __m256i vi7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i7));
151 const __m256i vk7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
152 i7 += 16;
153
154 const __m256i vprod7x0123456789ABCDEF = _mm256_mullo_epi16(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF);
155 const __m128i vprod7x89ABCDEF = _mm256_extracti128_si256(vprod7x0123456789ABCDEF, 1);
156 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod7x0123456789ABCDEF)));
157 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod7x89ABCDEF));
158
159 const __m256i vi8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i8));
160 const __m256i vk8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
161 i8 += 16;
162
163 const __m256i vprod8x0123456789ABCDEF = _mm256_mullo_epi16(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF);
164 const __m128i vprod8x89ABCDEF = _mm256_extracti128_si256(vprod8x0123456789ABCDEF, 1);
165 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod8x0123456789ABCDEF)));
166 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod8x89ABCDEF));
167
168 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
169
170 const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
171 const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
172
173 const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
174 const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
175
176 const __m256i vprod0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc01234567, vmultiplier), vrounding);
177 const __m256i vprod1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1357, vmultiplier), vrounding);
178 const __m256i vprod8ACE = _mm256_add_epi64(_mm256_mul_epi32(vacc89ABCDEF, vmultiplier), vrounding);
179 const __m256i vprod9BDF = _mm256_add_epi64(_mm256_mul_epi32(vacc9BDF, vmultiplier), vrounding);
180
181 const __m256i vq31prod0246 = _mm256_srli_epi64(vprod0246, 31);
182 const __m256i vq31prod1357 = _mm256_add_epi64(vprod1357, vprod1357);
183 const __m256i vq31prod8ACE = _mm256_srli_epi64(vprod8ACE, 31);
184 const __m256i vq31prod9BDF = _mm256_add_epi64(vprod9BDF, vprod9BDF);
185
186 const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
187 const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
188
189 const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
190 const __m256i vrem01234567 =
191 _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
192 const __m256i vrem89ABCDEF =
193 _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
194
195 const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
196 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
197 vacc01234567 =
198 _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
199 vacc89ABCDEF =
200 _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
201
202 const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
203 __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
204
205 const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
206 const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
207 vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
208
209 __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
210
211 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
212 output += 16;
213 }
214 if XNN_UNLIKELY(c != 0) {
215 {
216 __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
217 __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
218
219
220 const __m256i vi0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i0));
221 const __m256i vk0x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
222
223 const __m256i vprod0x0123456789ABCDEF = _mm256_mullo_epi16(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF);
224 const __m128i vprod0x89ABCDEF = _mm256_extracti128_si256(vprod0x0123456789ABCDEF, 1);
225 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod0x0123456789ABCDEF)));
226 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod0x89ABCDEF));
227
228 const __m256i vi1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i1));
229 const __m256i vk1x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
230
231 const __m256i vprod1x0123456789ABCDEF = _mm256_mullo_epi16(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
232 const __m128i vprod1x89ABCDEF = _mm256_extracti128_si256(vprod1x0123456789ABCDEF, 1);
233 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod1x0123456789ABCDEF)));
234 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod1x89ABCDEF));
235
236 const __m256i vi2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i2));
237 const __m256i vk2x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
238
239 const __m256i vprod2x0123456789ABCDEF = _mm256_mullo_epi16(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF);
240 const __m128i vprod2x89ABCDEF = _mm256_extracti128_si256(vprod2x0123456789ABCDEF, 1);
241 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod2x0123456789ABCDEF)));
242 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod2x89ABCDEF));
243
244 const __m256i vi3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i3));
245 const __m256i vk3x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
246
247 const __m256i vprod3x0123456789ABCDEF = _mm256_mullo_epi16(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF);
248 const __m128i vprod3x89ABCDEF = _mm256_extracti128_si256(vprod3x0123456789ABCDEF, 1);
249 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod3x0123456789ABCDEF)));
250 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod3x89ABCDEF));
251
252 const __m256i vi4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i4));
253 const __m256i vk4x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
254
255 const __m256i vprod4x0123456789ABCDEF = _mm256_mullo_epi16(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF);
256 const __m128i vprod4x89ABCDEF = _mm256_extracti128_si256(vprod4x0123456789ABCDEF, 1);
257 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod4x0123456789ABCDEF)));
258 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod4x89ABCDEF));
259
260 const __m256i vi5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i5));
261 const __m256i vk5x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
262
263 const __m256i vprod5x0123456789ABCDEF = _mm256_mullo_epi16(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF);
264 const __m128i vprod5x89ABCDEF = _mm256_extracti128_si256(vprod5x0123456789ABCDEF, 1);
265 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod5x0123456789ABCDEF)));
266 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod5x89ABCDEF));
267
268 const __m256i vi6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i6));
269 const __m256i vk6x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
270
271 const __m256i vprod6x0123456789ABCDEF = _mm256_mullo_epi16(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF);
272 const __m128i vprod6x89ABCDEF = _mm256_extracti128_si256(vprod6x0123456789ABCDEF, 1);
273 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod6x0123456789ABCDEF)));
274 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod6x89ABCDEF));
275
276 const __m256i vi7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i7));
277 const __m256i vk7x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
278
279 const __m256i vprod7x0123456789ABCDEF = _mm256_mullo_epi16(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF);
280 const __m128i vprod7x89ABCDEF = _mm256_extracti128_si256(vprod7x0123456789ABCDEF, 1);
281 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod7x0123456789ABCDEF)));
282 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod7x89ABCDEF));
283
284 const __m256i vi8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i8));
285 const __m256i vk8x0123456789ABCDEF = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
286
287 const __m256i vprod8x0123456789ABCDEF = _mm256_mullo_epi16(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF);
288 const __m128i vprod8x89ABCDEF = _mm256_extracti128_si256(vprod8x0123456789ABCDEF, 1);
289 vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vprod8x0123456789ABCDEF)));
290 vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod8x89ABCDEF));
291
292
293 const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
294 const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
295
296 const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
297 const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
298
299 const __m256i vprod0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc01234567, vmultiplier), vrounding);
300 const __m256i vprod1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1357, vmultiplier), vrounding);
301 const __m256i vprod8ACE = _mm256_add_epi64(_mm256_mul_epi32(vacc89ABCDEF, vmultiplier), vrounding);
302 const __m256i vprod9BDF = _mm256_add_epi64(_mm256_mul_epi32(vacc9BDF, vmultiplier), vrounding);
303
304 const __m256i vq31prod0246 = _mm256_srli_epi64(vprod0246, 31);
305 const __m256i vq31prod1357 = _mm256_add_epi64(vprod1357, vprod1357);
306 const __m256i vq31prod8ACE = _mm256_srli_epi64(vprod8ACE, 31);
307 const __m256i vq31prod9BDF = _mm256_add_epi64(vprod9BDF, vprod9BDF);
308
309 const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
310 const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
311
312 const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
313 const __m256i vrem01234567 =
314 _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
315 const __m256i vrem89ABCDEF =
316 _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
317
318 const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
319 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
320 vacc01234567 =
321 _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
322 vacc89ABCDEF =
323 _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
324
325 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
326 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
327 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
328
329 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
330 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
331 vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
332 vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
333
334 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
335
336 if (c & 8) {
337 _mm_storel_epi64((__m128i*) output, vout0123456789ABCDEF);
338 vout0123456789ABCDEF = _mm_unpackhi_epi64(vout0123456789ABCDEF, vout0123456789ABCDEF);
339 output += 8;
340 }
341 if (c & 4) {
342 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456789ABCDEF);
343 vout0123456789ABCDEF = _mm_srli_epi64(vout0123456789ABCDEF, 32);
344 output += 4;
345 }
346 if (c & 2) {
347 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456789ABCDEF, 0);
348 vout0123456789ABCDEF = _mm_srli_epi32(vout0123456789ABCDEF, 16);
349 output += 2;
350 }
351 if (c & 1) {
352 *output = (int8_t) _mm_extract_epi8(vout0123456789ABCDEF, 0);
353 output += 1;
354 }
355 }
356 }
357
358 output = (int8_t*) ((uintptr_t) output + output_increment);
359 } while (--output_width != 0);
360 }
361