• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/x32-transpose/neon-zip.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <arm_neon.h>
11 
12 #include <assert.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/transpose.h>
17 
// Transposes a block of bytes with NEON, working on 16x16 tiles: 16 input rows
// of 16 bytes are loaded, interleaved through four vzipq_u8 stages, and written
// out as 16 output rows of 16 bytes.
//
//   input         - first byte of the source block
//   output        - first byte of the destination block
//   input_stride  - byte distance between consecutive input rows
//   output_stride - byte distance between consecutive output rows
//   block_width   - number of input columns (= number of output rows)
//   block_height  - number of input rows (= number of bytes per output row)
//
// The outer do/while consumes up to 16 input columns per pass. Within a pass,
// the for loop handles groups of 16 input rows and the `bh != 0` branch handles
// the remaining 1..15 rows.
//
// NOTE(review): every vld1q_u8 reads a full 16 bytes even when fewer than 16
// columns remain; presumably callers guarantee that over-read is safe - confirm.
// NOTE(review): XNN_UNPREDICTABLE, round_down_po2, min and doz are defined
// outside this file; the comments below assume round_down_po2(x, n) rounds x
// down to a multiple of n and doz(a, b) is a - b clamped at zero - confirm.
xnn_x8_transpose_ukernel__16x16_reuse_mov_zip_neon(const uint8_t * input,uint8_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18 void xnn_x8_transpose_ukernel__16x16_reuse_mov_zip_neon(
19     const uint8_t* input,
20     uint8_t* output,
21     size_t input_stride,
22     size_t output_stride,
23     size_t block_width,
24     size_t block_height)
25 {
26   assert(output_stride >= block_height * sizeof(uint8_t));
27   assert(input_stride >= block_width * sizeof(uint8_t));
28 
29   const size_t tile_height = 16;
30   const size_t tile_width = 16;
31   const size_t tile_hbytes = tile_height * sizeof(uint8_t);
32   const size_t tile_wbytes = tile_width * sizeof(uint8_t);
     // The main loop advances i0 by 16 rows per iteration and the tail does not
     // advance i0 at all, so input_reset undoes those rows and steps 16 bytes
     // to the right, i.e. to the top of the next column of tiles.
33   const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
     // At the end of a pass `o` sits at output row 0 of the current tile column,
     // advanced by the bytes emitted in 16/8/4/2-byte steps (the final 1-byte
     // step does not advance it). output_reset rewinds that, moves down 16
     // output rows, and re-applies the -tile_hbytes bias the main loop expects.
34   const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;
35 
36   const uint8_t* i0 = input;
     // `o` starts one tile (16 bytes) before `output`; each main-loop iteration
     // adds oN_offset (= oN_stride + tile_hbytes) before its first store.
37   uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
     // Unsigned negation: adding this to a uintptr_t wraps around and therefore
     // moves the pointer back by one output row.
38   const size_t minus_output_stride = -output_stride;
39 
40   do {
       // rem = index of the last output row written in this pass (0..15).
41     const size_t rem = min(block_width - 1, 15);
42     const size_t oN_stride = rem * output_stride;
43     const size_t oN_offset = oN_stride + tile_hbytes;
44     size_t bh = block_height;
45     for (; bh >= 16; bh -= 16) {
         // Load 16 consecutive input rows, 16 bytes each.
46       const uint8x16_t v4_0 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
47       const uint8x16_t v4_1 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
48       const uint8x16_t v4_2 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
49       const uint8x16_t v4_3 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
50       const uint8x16_t v4_4 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
51       const uint8x16_t v4_5 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
52       const uint8x16_t v4_6 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
53       const uint8x16_t v4_7 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
54       const uint8x16_t v4_8 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
55       const uint8x16_t v4_9 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
56       const uint8x16_t v4_10 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
57       const uint8x16_t v4_11 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
58       const uint8x16_t v4_12 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
59       const uint8x16_t v4_13 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
60       const uint8x16_t v4_14 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
61       const uint8x16_t v4_15 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
62 
         // Zip stage 1 of 4: interleave row r with row r + 8.
63       const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
64       const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
65       const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
66       const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
67       const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
68       const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
69       const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
70       const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);
71 
         // Zip stages 2-4: after the last stage v0_k.val[j] holds the 16 bytes
         // destined for output row 2*k + j of this tile.
72       const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
73       const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
74       const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
75       const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
76       const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
77       const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
78       const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
79       const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
80       const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
81       const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
82       const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
83       const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
84       const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
85       const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
86       const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
87       const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
88       const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
89       const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
90       const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
91       const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
92       const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
93       const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
94       const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
95       const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);
96 
         // Store from the highest output row down to row 0. `o` first jumps to
         // row `rem` (and 16 bytes forward); it only steps back a row while the
         // row being stored is a valid one. Vectors for rows >= block_width are
         // therefore written to row `rem` and overwritten by the later, valid
         // store to that same row.
97       o = (uint8_t*) ((uintptr_t) o + oN_offset);
98       vst1q_u8(o, v0_7.val[1]);
99       uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
100       if XNN_UNPREDICTABLE(block_width > 15) {
101         o = oN;
102       }
103       vst1q_u8(o, v0_7.val[0]);
104       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
105       if XNN_UNPREDICTABLE(block_width >= 15) {
106         o = oN;
107       }
108       vst1q_u8(o, v0_6.val[1]);
109       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
110       if XNN_UNPREDICTABLE(block_width > 13) {
111         o = oN;
112       }
113       vst1q_u8(o, v0_6.val[0]);
114       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
115       if XNN_UNPREDICTABLE(block_width >= 13) {
116         o = oN;
117       }
118       vst1q_u8(o, v0_5.val[1]);
119       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
120       if XNN_UNPREDICTABLE(block_width > 11) {
121         o = oN;
122       }
123       vst1q_u8(o, v0_5.val[0]);
124       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
125       if XNN_UNPREDICTABLE(block_width >= 11) {
126         o = oN;
127       }
128       vst1q_u8(o, v0_4.val[1]);
129       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
130       if XNN_UNPREDICTABLE(block_width > 9) {
131         o = oN;
132       }
133       vst1q_u8(o, v0_4.val[0]);
134       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
135       if XNN_UNPREDICTABLE(block_width >= 9) {
136         o = oN;
137       }
138       vst1q_u8(o, v0_3.val[1]);
139       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
140       if XNN_UNPREDICTABLE(block_width > 7) {
141         o = oN;
142       }
143       vst1q_u8(o, v0_3.val[0]);
144       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
145       if XNN_UNPREDICTABLE(block_width >= 7) {
146         o = oN;
147       }
148       vst1q_u8(o, v0_2.val[1]);
149       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
150       if XNN_UNPREDICTABLE(block_width > 5) {
151         o = oN;
152       }
153       vst1q_u8(o, v0_2.val[0]);
154       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
155       if XNN_UNPREDICTABLE(block_width >= 5) {
156         o = oN;
157       }
158       vst1q_u8(o, v0_1.val[1]);
159       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
160       if XNN_UNPREDICTABLE(block_width > 3) {
161         o = oN;
162       }
163       vst1q_u8(o, v0_1.val[0]);
164       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
165       if XNN_UNPREDICTABLE(block_width >= 3) {
166         o = oN;
167       }
168       vst1q_u8(o, v0_0.val[1]);
169       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
170       if XNN_UNPREDICTABLE(block_width > 1) {
171         o = oN;
172       }
173       vst1q_u8(o, v0_0.val[0]);
174     }
       // Remove the -tile_hbytes bias: `o` now points at the first byte of
       // output row 0 that has not been written yet.
175     o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
176 
       // Tail: 1..15 input rows remain.
177     if (bh != 0) {
         // Row pointers i1..i14 are clamped to the previous row once they would
         // go past the last remaining row, so those loads re-read an existing
         // row instead of touching rows outside the block. i0 is not advanced.
178       const uint8x16_t v4_0 = vld1q_u8(i0);
179       const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
180       if XNN_UNPREDICTABLE(bh < 2) {
181         i1 = i0;
182       }
183       const uint8x16_t v4_1 = vld1q_u8(i1);
184       const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
185       if XNN_UNPREDICTABLE(bh <= 2) {
186         i2 = i1;
187       }
188       const uint8x16_t v4_2 = vld1q_u8(i2);
189       const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
190       if XNN_UNPREDICTABLE(bh < 4) {
191         i3 = i2;
192       }
193       const uint8x16_t v4_3 = vld1q_u8(i3);
194       const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
195       if XNN_UNPREDICTABLE(bh <= 4) {
196         i4 = i3;
197       }
198       const uint8x16_t v4_4 = vld1q_u8(i4);
199       const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
200       if XNN_UNPREDICTABLE(bh < 6) {
201         i5 = i4;
202       }
203       const uint8x16_t v4_5 = vld1q_u8(i5);
204       const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
205       if XNN_UNPREDICTABLE(bh <= 6) {
206         i6 = i5;
207       }
208       const uint8x16_t v4_6 = vld1q_u8(i6);
209       const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
210       if XNN_UNPREDICTABLE(bh < 8) {
211         i7 = i6;
212       }
213       const uint8x16_t v4_7 = vld1q_u8(i7);
214       const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
215       if XNN_UNPREDICTABLE(bh <= 8) {
216         i8 = i7;
217       }
218       const uint8x16_t v4_8 = vld1q_u8(i8);
219       const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
220       if XNN_UNPREDICTABLE(bh < 10) {
221         i9 = i8;
222       }
223       const uint8x16_t v4_9 = vld1q_u8(i9);
224       const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
225       if XNN_UNPREDICTABLE(bh <= 10) {
226         i10 = i9;
227       }
228       const uint8x16_t v4_10 = vld1q_u8(i10);
229       const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
230       if XNN_UNPREDICTABLE(bh < 12) {
231         i11 = i10;
232       }
233       const uint8x16_t v4_11 = vld1q_u8(i11);
234       const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
235       if XNN_UNPREDICTABLE(bh <= 12) {
236         i12 = i11;
237       }
238       const uint8x16_t v4_12 = vld1q_u8(i12);
239       const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
240       if XNN_UNPREDICTABLE(bh < 14) {
241         i13 = i12;
242       }
243       const uint8x16_t v4_13 = vld1q_u8(i13);
244       const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
245       if XNN_UNPREDICTABLE(bh <= 14) {
246         i14 = i13;
247       }
248       const uint8x16_t v4_14 = vld1q_u8(i14);
         // bh is at most 15 here, so a 16th row never exists; use zeros.
249       const uint8x16_t v4_15 = vmovq_n_u8(0);
250 
         // Same four zip stages as in the main loop.
251       const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
252       const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
253       const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
254       const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
255       const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
256       const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
257       const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
258       const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);
259 
260       const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
261       const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
262       const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
263       const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
264       const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
265       const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
266       const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
267       const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
268       const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
269       const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
270       const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
271       const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
272       const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
273       const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
274       const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
275       const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
276       const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
277       const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
278       const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
279       const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
280       const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
281       const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
282       const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
283       const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);
284 
         // vN_low is the working 8-byte vector for output row N. It starts as
         // the low half; after each partial store below it is replaced or
         // rotated so the next unwritten bytes sit at lane 0.
285       uint8x8_t v0_low = vget_low_u8(v0_0.val[0]);
286       uint8x8_t v1_low = vget_low_u8(v0_0.val[1]);
287       uint8x8_t v2_low = vget_low_u8(v0_1.val[0]);
288       uint8x8_t v3_low = vget_low_u8(v0_1.val[1]);
289       uint8x8_t v4_low = vget_low_u8(v0_2.val[0]);
290       uint8x8_t v5_low = vget_low_u8(v0_2.val[1]);
291       uint8x8_t v6_low = vget_low_u8(v0_3.val[0]);
292       uint8x8_t v7_low = vget_low_u8(v0_3.val[1]);
293       uint8x8_t v8_low = vget_low_u8(v0_4.val[0]);
294       uint8x8_t v9_low = vget_low_u8(v0_4.val[1]);
295       uint8x8_t v10_low = vget_low_u8(v0_5.val[0]);
296       uint8x8_t v11_low = vget_low_u8(v0_5.val[1]);
297       uint8x8_t v12_low = vget_low_u8(v0_6.val[0]);
298       uint8x8_t v13_low = vget_low_u8(v0_6.val[1]);
299       uint8x8_t v14_low = vget_low_u8(v0_7.val[0]);
300       uint8x8_t v15_low = vget_low_u8(v0_7.val[1]);
301 
         // The bh bytes per output row are emitted in pieces of 8, 4, 2 and 1
         // bytes according to the set bits of bh. Each piece uses the same
         // row-walk as the main loop: jump to row `rem`, store downwards to
         // row 0, then advance `o` by the piece size.
302       if (bh & 8) {
303         o = (uint8_t*) ((uintptr_t) o + oN_stride);
304         vst1_u8(o, v15_low);
305         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
306         if XNN_UNPREDICTABLE(block_width > 15) {
307           o = oN;
308         }
309         vst1_u8(o, v14_low);
310         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
311         if XNN_UNPREDICTABLE(block_width >= 15) {
312           o = oN;
313         }
314         vst1_u8(o, v13_low);
315         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
316         if XNN_UNPREDICTABLE(block_width > 13) {
317           o = oN;
318         }
319         vst1_u8(o, v12_low);
320         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
321         if XNN_UNPREDICTABLE(block_width >= 13) {
322           o = oN;
323         }
324         vst1_u8(o, v11_low);
325         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
326         if XNN_UNPREDICTABLE(block_width > 11) {
327           o = oN;
328         }
329         vst1_u8(o, v10_low);
330         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
331         if XNN_UNPREDICTABLE(block_width >= 11) {
332           o = oN;
333         }
334         vst1_u8(o, v9_low);
335         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
336         if XNN_UNPREDICTABLE(block_width > 9) {
337           o = oN;
338         }
339         vst1_u8(o, v8_low);
340         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
341         if XNN_UNPREDICTABLE(block_width >= 9) {
342           o = oN;
343         }
344         vst1_u8(o, v7_low);
345         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
346         if XNN_UNPREDICTABLE(block_width > 7) {
347           o = oN;
348         }
349         vst1_u8(o, v6_low);
350         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
351         if XNN_UNPREDICTABLE(block_width >= 7) {
352           o = oN;
353         }
354         vst1_u8(o, v5_low);
355         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
356         if XNN_UNPREDICTABLE(block_width > 5) {
357           o = oN;
358         }
359         vst1_u8(o, v4_low);
360         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
361         if XNN_UNPREDICTABLE(block_width >= 5) {
362           o = oN;
363         }
364         vst1_u8(o, v3_low);
365         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
366         if XNN_UNPREDICTABLE(block_width > 3) {
367           o = oN;
368         }
369         vst1_u8(o, v2_low);
370         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
371         if XNN_UNPREDICTABLE(block_width >= 3) {
372           o = oN;
373         }
374         vst1_u8(o, v1_low);
375         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
376         if XNN_UNPREDICTABLE(block_width > 1) {
377           o = oN;
378         }
379         vst1_u8(o, v0_low); o += 8;
           // The low 8 bytes of every row are written; continue with the high
           // halves (the variables keep their *_low names).
380         v0_low = vget_high_u8(v0_0.val[0]);
381         v1_low = vget_high_u8(v0_0.val[1]);
382         v2_low = vget_high_u8(v0_1.val[0]);
383         v3_low = vget_high_u8(v0_1.val[1]);
384         v4_low = vget_high_u8(v0_2.val[0]);
385         v5_low = vget_high_u8(v0_2.val[1]);
386         v6_low = vget_high_u8(v0_3.val[0]);
387         v7_low = vget_high_u8(v0_3.val[1]);
388         v8_low = vget_high_u8(v0_4.val[0]);
389         v9_low = vget_high_u8(v0_4.val[1]);
390         v10_low = vget_high_u8(v0_5.val[0]);
391         v11_low = vget_high_u8(v0_5.val[1]);
392         v12_low = vget_high_u8(v0_6.val[0]);
393         v13_low = vget_high_u8(v0_6.val[1]);
394         v14_low = vget_high_u8(v0_7.val[0]);
395         v15_low = vget_high_u8(v0_7.val[1]);
396       }
397 
         // 4-byte piece: lane 0 of each vector viewed as one 32-bit element.
398       if (bh & 4) {
399         o = (uint8_t*) ((uintptr_t) o + oN_stride);
400         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v15_low), 0);
401         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
402         if XNN_UNPREDICTABLE(block_width > 15) {
403           o = oN;
404         }
405         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v14_low), 0);
406         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
407         if XNN_UNPREDICTABLE(block_width >= 15) {
408           o = oN;
409         }
410         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v13_low), 0);
411         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
412         if XNN_UNPREDICTABLE(block_width > 13) {
413           o = oN;
414         }
415         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v12_low), 0);
416         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
417         if XNN_UNPREDICTABLE(block_width >= 13) {
418           o = oN;
419         }
420         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v11_low), 0);
421         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
422         if XNN_UNPREDICTABLE(block_width > 11) {
423           o = oN;
424         }
425         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v10_low), 0);
426         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
427         if XNN_UNPREDICTABLE(block_width >= 11) {
428           o = oN;
429         }
430         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v9_low), 0);
431         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
432         if XNN_UNPREDICTABLE(block_width > 9) {
433           o = oN;
434         }
435         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v8_low), 0);
436         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
437         if XNN_UNPREDICTABLE(block_width >= 9) {
438           o = oN;
439         }
440         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v7_low), 0);
441         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
442         if XNN_UNPREDICTABLE(block_width > 7) {
443           o = oN;
444         }
445         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v6_low), 0);
446         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
447         if XNN_UNPREDICTABLE(block_width >= 7) {
448           o = oN;
449         }
450         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v5_low), 0);
451         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
452         if XNN_UNPREDICTABLE(block_width > 5) {
453           o = oN;
454         }
455         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v4_low), 0);
456         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
457         if XNN_UNPREDICTABLE(block_width >= 5) {
458           o = oN;
459         }
460         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v3_low), 0);
461         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
462         if XNN_UNPREDICTABLE(block_width > 3) {
463           o = oN;
464         }
465         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v2_low), 0);
466         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
467         if XNN_UNPREDICTABLE(block_width >= 3) {
468           o = oN;
469         }
470         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v1_low), 0);
471         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
472         if XNN_UNPREDICTABLE(block_width > 1) {
473           o = oN;
474         }
475         vst1_lane_u32((void*) o, vreinterpret_u32_u8(v0_low), 0); o += 4;
           // Rotate each vector by 4 bytes so the next unwritten bytes move to
           // lane 0.
476         v0_low = vext_u8(v0_low, v0_low, 4);
477         v1_low = vext_u8(v1_low, v1_low, 4);
478         v2_low = vext_u8(v2_low, v2_low, 4);
479         v3_low = vext_u8(v3_low, v3_low, 4);
480         v4_low = vext_u8(v4_low, v4_low, 4);
481         v5_low = vext_u8(v5_low, v5_low, 4);
482         v6_low = vext_u8(v6_low, v6_low, 4);
483         v7_low = vext_u8(v7_low, v7_low, 4);
484         v8_low = vext_u8(v8_low, v8_low, 4);
485         v9_low = vext_u8(v9_low, v9_low, 4);
486         v10_low = vext_u8(v10_low, v10_low, 4);
487         v11_low = vext_u8(v11_low, v11_low, 4);
488         v12_low = vext_u8(v12_low, v12_low, 4);
489         v13_low = vext_u8(v13_low, v13_low, 4);
490         v14_low = vext_u8(v14_low, v14_low, 4);
491         v15_low = vext_u8(v15_low, v15_low, 4);
492       }
         // 2-byte piece: lane 0 of each vector viewed as one 16-bit element.
493       if (bh & 2) {
494         o = (uint8_t*) ((uintptr_t) o + oN_stride);
495         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v15_low), 0);
496         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
497         if XNN_UNPREDICTABLE(block_width > 15) {
498           o = oN;
499         }
500         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v14_low), 0);
501         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
502         if XNN_UNPREDICTABLE(block_width >= 15) {
503           o = oN;
504         }
505         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v13_low), 0);
506         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
507         if XNN_UNPREDICTABLE(block_width > 13) {
508           o = oN;
509         }
510         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v12_low), 0);
511         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
512         if XNN_UNPREDICTABLE(block_width >= 13) {
513           o = oN;
514         }
515         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v11_low), 0);
516         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
517         if XNN_UNPREDICTABLE(block_width > 11) {
518           o = oN;
519         }
520         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v10_low), 0);
521         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
522         if XNN_UNPREDICTABLE(block_width >= 11) {
523           o = oN;
524         }
525         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v9_low), 0);
526         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
527         if XNN_UNPREDICTABLE(block_width > 9) {
528           o = oN;
529         }
530         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v8_low), 0);
531         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
532         if XNN_UNPREDICTABLE(block_width >= 9) {
533           o = oN;
534         }
535         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v7_low), 0);
536         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
537         if XNN_UNPREDICTABLE(block_width > 7) {
538           o = oN;
539         }
540         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v6_low), 0);
541         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
542         if XNN_UNPREDICTABLE(block_width >= 7) {
543           o = oN;
544         }
545         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v5_low), 0);
546         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
547         if XNN_UNPREDICTABLE(block_width > 5) {
548           o = oN;
549         }
550         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v4_low), 0);
551         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
552         if XNN_UNPREDICTABLE(block_width >= 5) {
553           o = oN;
554         }
555         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v3_low), 0);
556         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
557         if XNN_UNPREDICTABLE(block_width > 3) {
558           o = oN;
559         }
560         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v2_low), 0);
561         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
562         if XNN_UNPREDICTABLE(block_width >= 3) {
563           o = oN;
564         }
565         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v1_low), 0);
566         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
567         if XNN_UNPREDICTABLE(block_width > 1) {
568           o = oN;
569         }
570         vst1_lane_u16((void*) o, vreinterpret_u16_u8(v0_low), 0); o += 2;
           // Rotate each vector by 2 bytes for the final 1-byte piece.
571         v0_low = vext_u8(v0_low, v0_low, 2);
572         v1_low = vext_u8(v1_low, v1_low, 2);
573         v2_low = vext_u8(v2_low, v2_low, 2);
574         v3_low = vext_u8(v3_low, v3_low, 2);
575         v4_low = vext_u8(v4_low, v4_low, 2);
576         v5_low = vext_u8(v5_low, v5_low, 2);
577         v6_low = vext_u8(v6_low, v6_low, 2);
578         v7_low = vext_u8(v7_low, v7_low, 2);
579         v8_low = vext_u8(v8_low, v8_low, 2);
580         v9_low = vext_u8(v9_low, v9_low, 2);
581         v10_low = vext_u8(v10_low, v10_low, 2);
582         v11_low = vext_u8(v11_low, v11_low, 2);
583         v12_low = vext_u8(v12_low, v12_low, 2);
584         v13_low = vext_u8(v13_low, v13_low, 2);
585         v14_low = vext_u8(v14_low, v14_low, 2);
586         v15_low = vext_u8(v15_low, v15_low, 2);
587       }
         // 1-byte piece. `o` is not advanced afterwards, which is why
         // output_reset subtracts round_down_po2(block_height, 2) rather than
         // block_height.
588       if (bh & 1) {
589         o = (uint8_t*) ((uintptr_t) o + oN_stride);
590         vst1_lane_u8(o, v15_low, 0);
591         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
592         if XNN_UNPREDICTABLE(block_width > 15) {
593           o = oN;
594         }
595         vst1_lane_u8(o, v14_low, 0);
596         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
597         if XNN_UNPREDICTABLE(block_width >= 15) {
598           o = oN;
599         }
600         vst1_lane_u8(o, v13_low, 0);
601         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
602         if XNN_UNPREDICTABLE(block_width > 13) {
603           o = oN;
604         }
605         vst1_lane_u8(o, v12_low, 0);
606         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
607         if XNN_UNPREDICTABLE(block_width >= 13) {
608           o = oN;
609         }
610         vst1_lane_u8(o, v11_low, 0);
611         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
612         if XNN_UNPREDICTABLE(block_width > 11) {
613           o = oN;
614         }
615         vst1_lane_u8(o, v10_low, 0);
616         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
617         if XNN_UNPREDICTABLE(block_width >= 11) {
618           o = oN;
619         }
620         vst1_lane_u8(o, v9_low, 0);
621         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
622         if XNN_UNPREDICTABLE(block_width > 9) {
623           o = oN;
624         }
625         vst1_lane_u8(o, v8_low, 0);
626         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
627         if XNN_UNPREDICTABLE(block_width >= 9) {
628           o = oN;
629         }
630         vst1_lane_u8(o, v7_low, 0);
631         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
632         if XNN_UNPREDICTABLE(block_width > 7) {
633           o = oN;
634         }
635         vst1_lane_u8(o, v6_low, 0);
636         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
637         if XNN_UNPREDICTABLE(block_width >= 7) {
638           o = oN;
639         }
640         vst1_lane_u8(o, v5_low, 0);
641         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
642         if XNN_UNPREDICTABLE(block_width > 5) {
643           o = oN;
644         }
645         vst1_lane_u8(o, v4_low, 0);
646         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
647         if XNN_UNPREDICTABLE(block_width >= 5) {
648           o = oN;
649         }
650         vst1_lane_u8(o, v3_low, 0);
651         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
652         if XNN_UNPREDICTABLE(block_width > 3) {
653           o = oN;
654         }
655         vst1_lane_u8(o, v2_low, 0);
656         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
657         if XNN_UNPREDICTABLE(block_width >= 3) {
658           o = oN;
659         }
660         vst1_lane_u8(o, v1_low, 0);
661         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
662         if XNN_UNPREDICTABLE(block_width > 1) {
663           o = oN;
664         }
665         vst1_lane_u8(o, v0_low, 0);
666       }
667     }
668 
       // Move to the next 16 input columns / 16 output rows and reduce the
       // remaining width by one tile.
669     i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
670     o = (uint8_t*) ((uintptr_t) o + output_reset);
671     block_width = doz(block_width, tile_width);
672   } while (block_width != 0);
673 }
674