// Auto-generated file. Do not edit!
//   Template: src/x32-transpose/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

void xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height)
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

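  // Transposes a block_height x block_width matrix of uint8_t in 16x16
  // tiles: 16 input rows are loaded, transposed in registers via unpack
  // stages, and written out as 16 output rows (the former columns).
  // input_reset/output_reset rewind the pointers to the start of the next
  // 16-column strip after each full pass down the block.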
  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;

  const uint8_t* i0 = input;
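  // o starts biased down by tile_hbytes so that adding oN_offset
  // (oN_stride + tile_hbytes) at the top of each tile both advances the
  // byte position within the output rows and lands on the last written row;
  // minus_output_stride then steps backward toward the first row.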
  uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

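  // Outer loop: one iteration per strip of up to 16 input columns.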
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
    for (; bh >= 16; bh -= 16) {
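      // Load a full 16x16 tile: 16 consecutive input rows of 16 bytes each.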
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

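      // Transpose stage 1: interleave bytes of adjacent rows.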
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

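      // Transpose stage 2: interleave 16-bit elements of rows two apart.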
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

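      // Transpose stage 3: interleave 32-bit elements of rows four apart.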
      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

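      // Transpose stage 4: interleave 64-bit halves of rows eight apart.
      // After this stage v0_k holds input column k, i.e. output row k.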
      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

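      // Store output rows from last (v0_15) to first (v0_0), stepping o
      // backward by output_stride. The conditional moves clamp o at row
      // rem when block_width < 16: out-of-range columns are stored there
      // repeatedly and overwritten, and once k falls below block_width each
      // column k lands on its own output row.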
      o = (uint8_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_15);
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_14);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_13);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_12);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_11);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_10);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_9);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_8);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_7);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
    if (bh != 0) {
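      // Remainder tile: bh (< 16) rows left in this strip. Row pointers past
      // the last valid row are clamped to the previous row, so the surplus
      // vector lanes hold duplicate data that is never stored.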
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
      const __m128i v4_15 = _mm_undefined_si128();

      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

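      // Store the remaining bh rows in power-of-two chunks: 8, 4, 2, then 1
      // byte(s) per column, shifting the already-written bytes out of each
      // v0_k between chunks.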
      if (bh & 8) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 8;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
        v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
        v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
        v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
        v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
        v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
        v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
        v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
        v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
      }

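      // Same store pattern for 4 remaining rows.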
      if (bh & 4) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *((int*) o) = _mm_cvtsi128_si32(v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_0);
        o += 4;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
        v0_8 = _mm_srli_epi64(v0_8, 32);
        v0_9 = _mm_srli_epi64(v0_9, 32);
        v0_10 = _mm_srli_epi64(v0_10, 32);
        v0_11 = _mm_srli_epi64(v0_11, 32);
        v0_12 = _mm_srli_epi64(v0_12, 32);
        v0_13 = _mm_srli_epi64(v0_13, 32);
        v0_14 = _mm_srli_epi64(v0_14, 32);
        v0_15 = _mm_srli_epi64(v0_15, 32);
      }
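      // 2 remaining rows.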
      if (bh & 2) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_0);
        o += 2;
        v0_0 = _mm_srli_epi32(v0_0, 16);
        v0_1 = _mm_srli_epi32(v0_1, 16);
        v0_2 = _mm_srli_epi32(v0_2, 16);
        v0_3 = _mm_srli_epi32(v0_3, 16);
        v0_4 = _mm_srli_epi32(v0_4, 16);
        v0_5 = _mm_srli_epi32(v0_5, 16);
        v0_6 = _mm_srli_epi32(v0_6, 16);
        v0_7 = _mm_srli_epi32(v0_7, 16);
        v0_8 = _mm_srli_epi32(v0_8, 16);
        v0_9 = _mm_srli_epi32(v0_9, 16);
        v0_10 = _mm_srli_epi32(v0_10, 16);
        v0_11 = _mm_srli_epi32(v0_11, 16);
        v0_12 = _mm_srli_epi32(v0_12, 16);
        v0_13 = _mm_srli_epi32(v0_13, 16);
        v0_14 = _mm_srli_epi32(v0_14, 16);
        v0_15 = _mm_srli_epi32(v0_15, 16);
      }
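      // Last remaining row.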
      if (bh & 1) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *o = (uint8_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
      }
    }

    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
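
// Illustrative usage sketch (not part of the generated file; the matrix
// shape and array names below are assumptions for the example). Strides are
// in bytes and must satisfy the asserts above: each input row holds
// block_width elements, each output row holds block_height elements.
//
//   uint8_t src[20][24];  // 20 rows x 24 columns, row stride 24 bytes
//   uint8_t dst[24][20];  // transposed: 24 rows x 20 columns
//   xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2(
//       &src[0][0], &dst[0][0],
//       /*input_stride=*/24, /*output_stride=*/20,
//       /*block_width=*/24, /*block_height=*/20);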