1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gavgpool/multipass-sse.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
#include <assert.h>
#include <string.h>

#include <emmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
16
17
xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2(size_t rows,size_t channels,const int8_t * input,size_t input_stride,const int8_t * zero,int32_t * buffer,int8_t * output,const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2(
19 size_t rows,
20 size_t channels,
21 const int8_t* input,
22 size_t input_stride,
23 const int8_t* zero,
24 int32_t* buffer,
25 int8_t* output,
26 const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
27 {
28 assert(rows > 7);
29 assert(channels != 0);
30
31 const int8_t* i0 = input;
32 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
33 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
34 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
35 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
36 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
37 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
38 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
39
40 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
41 int32_t* b = buffer;
42 size_t c = channels;
43 for (; c != 0; c = doz(c, 16)) {
44 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
45 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
46 i0 += 16;
47 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
48 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
49 i1 += 16;
50 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
51 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
52 i2 += 16;
53 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
54 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
55 i3 += 16;
56 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
57 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
58 i4 += 16;
59 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
60 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
61 i5 += 16;
62 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
63 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
64 i6 += 16;
65
66 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
67 const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x89ABCDEF));
68 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
69 const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x89ABCDEF));
70 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
71 const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x89ABCDEF));
72 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
73 const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x89ABCDEF));
74 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
75 const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x89ABCDEF));
76 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
77 const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x89ABCDEF));
78 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
79 const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x89ABCDEF));
80
81 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
82 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
83 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
84 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
85
86 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
87 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
88 vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
89 vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
90 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
91 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
92
93 // Add up all accumulators to vacc0x0123456789ABCDEF
94 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
95 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
96
97 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567);
98 const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567));
99 const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567));
100 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF);
101 const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF));
102 const __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF));
103
104 _mm_store_si128((__m128i*) b, vacc0123);
105 _mm_store_si128((__m128i*) (b + 4), vacc4567);
106 _mm_store_si128((__m128i*) (b + 8), vacc89AB);
107 _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
108 b += 16;
109 }
110
111 for (rows -= 7; rows > 7; rows -= 7) {
112 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
113 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
114 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
115 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
116 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
117 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
118 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
119
120 int32_t* b = buffer;
121 size_t c = channels;
122 for (; c != 0; c = doz(c, 16)) {
123 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
124 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
125 i0 += 16;
126 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
127 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
128 i1 += 16;
129 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
130 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
131 i2 += 16;
132 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
133 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
134 i3 += 16;
135 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
136 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
137 i4 += 16;
138 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
139 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
140 i5 += 16;
141 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
142 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
143 i6 += 16;
144
145 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
146 const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x89ABCDEF));
147 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
148 const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x89ABCDEF));
149 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
150 const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x89ABCDEF));
151 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
152 const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x89ABCDEF));
153 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
154 const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x89ABCDEF));
155 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
156 const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x89ABCDEF));
157 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
158 const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x89ABCDEF));
159
160 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
161 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
162 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
163 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
164
165 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
166 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
167 vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
168 vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
169 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
170 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
171
172 // Add up all accumulators to vacc0x0123456789ABCDEF
173 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
174 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
175
176 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567);
177 const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) (b + 0)));
178 const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) (b + 4)));
179 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF);
180 const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
181 const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_load_si128((const __m128i*) (b + 12)));
182
183 _mm_store_si128((__m128i*) b, vacc0123);
184 _mm_store_si128((__m128i*) (b + 4), vacc4567);
185 _mm_store_si128((__m128i*) (b + 8), vacc89AB);
186 _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
187 b += 16;
188 }
189 }
190
191 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
192 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
193 if XNN_UNPREDICTABLE(rows < 2) {
194 i1 = zero;
195 }
196 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
197 if XNN_UNPREDICTABLE(rows <= 2) {
198 i2 = zero;
199 }
200 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
201 if XNN_UNPREDICTABLE(rows < 4) {
202 i3 = zero;
203 }
204 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
205 if XNN_UNPREDICTABLE(rows <= 4) {
206 i4 = zero;
207 }
208 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
209 if XNN_UNPREDICTABLE(rows < 6) {
210 i5 = zero;
211 }
212 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
213 if XNN_UNPREDICTABLE(rows <= 6) {
214 i6 = zero;
215 }
216
217 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
218 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
219 const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
220 while (channels >= 16) {
221 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
222 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
223 i0 += 16;
224 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
225 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
226 i1 += 16;
227 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
228 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
229 i2 += 16;
230 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
231 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
232 i3 += 16;
233 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
234 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
235 i4 += 16;
236 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
237 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
238 i5 += 16;
239 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
240 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
241 i6 += 16;
242
243 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
244 const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x89ABCDEF));
245 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
246 const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x89ABCDEF));
247 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
248 const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x89ABCDEF));
249 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
250 const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x89ABCDEF));
251 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
252 const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x89ABCDEF));
253 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
254 const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x89ABCDEF));
255 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
256 const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x89ABCDEF));
257
258 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
259 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
260 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
261 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
262
263 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
264 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
265 vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
266 vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
267 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
268 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
269
270 // Add up all accumulators to vacc0x0123456789ABCDEF
271 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
272 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
273
274 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567);
275 const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
276 const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
277 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF);
278 const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
279 const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 12)));
280 buffer += 16;
281
282 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123);
283 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567);
284 const __m128i vsgnacc89AB = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB);
285 const __m128i vsgnaccCDEF = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF);
286
287 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123);
288 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567);
289 const __m128i vabsacc89AB = _mm_sub_epi32(_mm_xor_si128(vacc89AB, vsgnacc89AB), vsgnacc89AB);
290 const __m128i vabsaccCDEF = _mm_sub_epi32(_mm_xor_si128(vaccCDEF, vsgnaccCDEF), vsgnaccCDEF);
291
292 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
293 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));
294 const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1));
295 const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1));
296
297 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
298 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
299 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
300 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
301 const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier);
302 const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier);
303 const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier);
304 const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier);
305
306 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
307 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
308 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
309 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);
310 const __m128i vabsout8A = _mm_srl_epi64(_mm_add_epi64(vabsprod8A, vrounding), vshift);
311 const __m128i vabsout9B = _mm_srl_epi64(_mm_add_epi64(vabsprod9B, vrounding), vshift);
312 const __m128i vabsoutCE = _mm_srl_epi64(_mm_add_epi64(vabsprodCE, vrounding), vshift);
313 const __m128i vabsoutDF = _mm_srl_epi64(_mm_add_epi64(vabsprodDF, vrounding), vshift);
314
315 const __m128i vabsout0213 = _mm_castps_si128(
316 _mm_shuffle_ps(_mm_castsi128_ps(vabsout02), _mm_castsi128_ps(vabsout13), _MM_SHUFFLE(2, 0, 2, 0)));
317 const __m128i vabsout4657 = _mm_castps_si128(
318 _mm_shuffle_ps(_mm_castsi128_ps(vabsout46), _mm_castsi128_ps(vabsout57), _MM_SHUFFLE(2, 0, 2, 0)));
319 const __m128i vabsout8A9B = _mm_castps_si128(
320 _mm_shuffle_ps(_mm_castsi128_ps(vabsout8A), _mm_castsi128_ps(vabsout9B), _MM_SHUFFLE(2, 0, 2, 0)));
321 const __m128i vabsoutCEDF = _mm_castps_si128(
322 _mm_shuffle_ps(_mm_castsi128_ps(vabsoutCE), _mm_castsi128_ps(vabsoutDF), _MM_SHUFFLE(2, 0, 2, 0)));
323
324 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0));
325 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0));
326 const __m128i vabsout89AB = _mm_shuffle_epi32(vabsout8A9B, _MM_SHUFFLE(3, 1, 2, 0));
327 const __m128i vabsoutCDEF = _mm_shuffle_epi32(vabsoutCEDF, _MM_SHUFFLE(3, 1, 2, 0));
328
329 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123);
330 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567);
331 const __m128i vout89AB = _mm_sub_epi32(_mm_xor_si128(vabsout89AB, vsgnacc89AB), vsgnacc89AB);
332 const __m128i voutCDEF = _mm_sub_epi32(_mm_xor_si128(vabsoutCDEF, vsgnaccCDEF), vsgnaccCDEF);
333
334 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
335 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);
336 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vout89AB, voutCDEF), voutput_zero_point);
337
338 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
339 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
340 vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
341 vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
342
343 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
344
345 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
346 output += 16;
347
348 channels -= 16;
349 }
350 if XNN_UNLIKELY(channels != 0) {
351 do {
352 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
353 i0 += 8;
354 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
355 i1 += 8;
356 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
357 i2 += 8;
358 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
359 i3 += 8;
360 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
361 i4 += 8;
362 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
363 i5 += 8;
364 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
365 i6 += 8;
366
367 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
368 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
369 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
370 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
371 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
372 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
373 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
374
375 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
376 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
377
378 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
379 vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
380 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
381
382 // Add up all accumulators to vacc0x01234567
383 vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
384
385 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567);
386 const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) buffer));
387 const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
388 buffer += 8;
389
390 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123);
391 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567);
392
393 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123);
394 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567);
395
396 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
397 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));
398
399 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
400 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
401 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
402 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
403
404 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
405 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
406 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
407 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);
408
409 const __m128i vabsout0213 = _mm_castps_si128(
410 _mm_shuffle_ps(_mm_castsi128_ps(vabsout02), _mm_castsi128_ps(vabsout13), _MM_SHUFFLE(2, 0, 2, 0)));
411 const __m128i vabsout4657 = _mm_castps_si128(
412 _mm_shuffle_ps(_mm_castsi128_ps(vabsout46), _mm_castsi128_ps(vabsout57), _MM_SHUFFLE(2, 0, 2, 0)));
413
414 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0));
415 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0));
416
417 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123);
418 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567);
419
420 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
421 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);
422
423 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
424 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
425 vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
426
427 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
428
429 if XNN_LIKELY(channels >= 8) {
430 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
431 output += 8;
432 channels -= 8;
433 } else {
434 if (channels & 4) {
435 *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
436 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
437 output += 4;
438 }
439 if (channels & 2) {
440 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
441 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
442 output += 2;
443 }
444 if (channels & 1) {
445 *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
446 output += 1;
447 }
448 channels = 0;
449 }
450 } while (channels != 0);
451 }
452 }
453