// Auto-generated file. Do not edit!
//   Template: src/x32-transpose/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

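// Transposes a block_height x block_width matrix of 8-bit elements from
// `input` (rows `input_stride` bytes apart) to `output` (rows
// `output_stride` bytes apart), working in 16x16 tiles with SSE2 unpack
// instructions.
//
// Illustrative call (hypothetical buffers and sizes; strides are in bytes):
//   uint8_t in[24 * 40];
//   uint8_t out[40 * 24];
//   xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2(
//       in, out, /*input_stride=*/40, /*output_stride=*/24,
//       /*block_width=*/40, /*block_height=*/24);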
void xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height)
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

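  // The matrix is processed in 16x16-element tiles. Once a 16-column strip
  // of the input is exhausted, input_reset rewinds i0 to the top row of the
  // next strip and output_reset advances o to the 16 output rows that
  // correspond to it.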
  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

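  // Each iteration of this loop transposes one strip of up to 16 input
  // columns into up to 16 output rows.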
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
    for (; bh >= 16; bh -= 16) {
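      // Load a full 16x16 tile, one 16-byte row per vector register.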
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

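      // Transpose stage 1: interleave the bytes of adjacent row pairs
      // (rows 2k and 2k+1).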
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

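      // Transpose stage 2: interleave 16-bit units, merging groups of four rows.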
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

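      // Transpose stage 3: interleave 32-bit units, merging groups of eight rows.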
      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

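      // Transpose stage 4: interleave 64-bit units; v0_j now holds column j
      // of the input tile.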
      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

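      // Store the transposed columns as output rows, from row `rem` back to
      // row 0. When block_width < 16 the pointer bump is skipped for the
      // missing columns, so their stores all land on the same address and
      // are overwritten by the store of the last valid column.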
      o = (uint8_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_15);
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_14);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_13);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_12);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_11);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_10);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_9);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_8);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_7);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
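    // Transpose the remaining bh (< 16) rows. Row pointers beyond the valid
    // range are clamped to the previous row so that every load stays in
    // bounds; the duplicated rows never reach the output.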
    if (bh != 0) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
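      // Row 15 cannot exist here (bh < 16), so its register is left
      // undefined; none of its lanes are ever stored.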
      const __m128i v4_15 = _mm_undefined_si128();

      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

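      // Store the bh valid bytes of each transposed column in chunks of
      // 8, 4, 2, and 1 bytes, shifting consumed bytes out of the registers
      // after each pass.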
      if (bh & 8) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 8;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
        v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
        v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
        v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
        v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
        v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
        v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
        v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
        v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
      }

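      // Next 4 bytes of each column; shift them out of the low 64-bit lanes
      // afterwards.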
      if (bh & 4) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *((int*) o) = _mm_cvtsi128_si32(v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_0);
        o += 4;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
        v0_8 = _mm_srli_epi64(v0_8, 32);
        v0_9 = _mm_srli_epi64(v0_9, 32);
        v0_10 = _mm_srli_epi64(v0_10, 32);
        v0_11 = _mm_srli_epi64(v0_11, 32);
        v0_12 = _mm_srli_epi64(v0_12, 32);
        v0_13 = _mm_srli_epi64(v0_13, 32);
        v0_14 = _mm_srli_epi64(v0_14, 32);
        v0_15 = _mm_srli_epi64(v0_15, 32);
      }
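      // Next 2 bytes of each column.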
      if (bh & 2) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_0);
        o += 2;
        v0_0 = _mm_srli_epi32(v0_0, 16);
        v0_1 = _mm_srli_epi32(v0_1, 16);
        v0_2 = _mm_srli_epi32(v0_2, 16);
        v0_3 = _mm_srli_epi32(v0_3, 16);
        v0_4 = _mm_srli_epi32(v0_4, 16);
        v0_5 = _mm_srli_epi32(v0_5, 16);
        v0_6 = _mm_srli_epi32(v0_6, 16);
        v0_7 = _mm_srli_epi32(v0_7, 16);
        v0_8 = _mm_srli_epi32(v0_8, 16);
        v0_9 = _mm_srli_epi32(v0_9, 16);
        v0_10 = _mm_srli_epi32(v0_10, 16);
        v0_11 = _mm_srli_epi32(v0_11, 16);
        v0_12 = _mm_srli_epi32(v0_12, 16);
        v0_13 = _mm_srli_epi32(v0_13, 16);
        v0_14 = _mm_srli_epi32(v0_14, 16);
        v0_15 = _mm_srli_epi32(v0_15, 16);
      }
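      // Final remaining byte of each column.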
      if (bh & 1) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *o = (uint8_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
      }
    }

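    // Move to the next 16-column strip of the input (and the matching 16
    // output rows). doz() is a saturating difference, so block_width bottoms
    // out at zero and terminates the loop.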
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}