// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5$import math
6$assert IN_PTRS in ["MULTI", "REUSE"]
7$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
8$assert SIZE in [8, 16, 32]
9$TILE_SIZE = int(128/SIZE)
10$NUM_ITERS = int(math.log2(TILE_SIZE))
11
12#include <arm_neon.h>
13
14#include <assert.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/math.h>
18#include <xnnpack/transpose.h>
19
20void xnn_x${SIZE}_transpose_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
21    const uint${SIZE}_t* input,
22    uint${SIZE}_t* output,
23    size_t input_stride,
24    size_t output_stride,
25    size_t block_width,
26    size_t block_height)
27{
28  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
29  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
30
31  const size_t tile_height = ${TILE_SIZE};
32  const size_t tile_width = ${TILE_SIZE};
33  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
34  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
35  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
36  $if IN_PTRS == "MULTI":
37    const size_t input_offset = tile_height * input_stride;
38  $if OUT_PTRS in ["MOV", "DEC"]:
39    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
40  $else:
41    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
42
43  $if IN_PTRS == "MULTI":
44    const uint${SIZE}_t* i0 = input;
45    $for N in range(1, TILE_SIZE):
46      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
47  $else:
48    const uint${SIZE}_t* i0 = input;
49  $if OUT_PTRS == "MULTI":
50    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
51    $for N in range(1, TILE_SIZE):
52      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
53  $elif OUT_PTRS == "SWITCH":
54    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
55  $else:
56    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
57  $if OUT_PTRS != "MULTI":
58    const size_t minus_output_stride = -output_stride;
59
60  do {
61    $if OUT_PTRS == "MULTI":
62      if XNN_UNPREDICTABLE(block_width < 2) {
63        o1 = o0;
64      }
65      $for N in range(2, TILE_SIZE, 2):
66        if XNN_UNPREDICTABLE(block_width <= ${N}) {
67          o${N} = o0;
68        }
69        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
70          o${N+1} = o0;
71        }
72    $elif OUT_PTRS in ["MOV", "DEC"]:
73      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
74      const size_t oN_stride = rem * output_stride;
75      const size_t oN_offset = oN_stride + tile_hbytes;
76    $else:
77      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
78      const size_t oN_stride = rem * output_stride;
79    size_t bh = block_height;
80    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
81      $for N in range(TILE_SIZE):
82        $if IN_PTRS == "REUSE":
83          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
84        $else:
85          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
86
87      $for N in range(TILE_SIZE >> 1):
88        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
89
90      $for M in range(1, NUM_ITERS):
91        $for N in range(TILE_SIZE >> 1):
92          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
93
94      $if OUT_PTRS == "SWITCH":
95        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
96        switch (rem) {
97          $for N in reversed(range(2, TILE_SIZE)):
98            case ${N}:
99              vst1q_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
100          case 1:
101            vst1q_u${SIZE}(oN, v0_0.val[1]);
102          case 0:
103            vst1q_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
104            break;
105          default:
106            XNN_UNREACHABLE;
107        }
108      $elif OUT_PTRS in ["MOV", "DEC"]:
109        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
110        vst1q_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
111        $if OUT_PTRS == "MOV":
112          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
113        $for N in reversed(range(2, TILE_SIZE, 2)):
114          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
115            $if OUT_PTRS == "MOV":
116              o = oN;
117            $else:
118              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
119          }
120          vst1q_u${SIZE}(o, v0_${N>>1}.val[0]);
121          $if OUT_PTRS == "MOV":
122            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
123          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
124            $if OUT_PTRS == "MOV":
125              o = oN;
126            $else:
127              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
128          }
129          vst1q_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
130          $if OUT_PTRS == "MOV":
131            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
132        if XNN_UNPREDICTABLE(block_width > 1) {
133          $if OUT_PTRS == "MOV":
134            o = oN;
135          $else:
136            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
137        }
138        vst1q_u${SIZE}(o, v0_0.val[0]);
139      $else:
140        $for N in reversed(range(TILE_SIZE)):
141          vst1q_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
142    }
143    $if OUT_PTRS in ["MOV", "DEC"]:
144      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
145
146    if (bh != 0) {
147      $if IN_PTRS == "REUSE":
148        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1q_u${SIZE}(i0);
149        $for N in range(1, TILE_SIZE - 1, 2):
150          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
151          if XNN_UNPREDICTABLE(bh < ${N+1}) {
152            i${N} = i${N-1};
153          }
154          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N});
155          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
156          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
157            i${N+1} = i${N};
158          }
159          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1q_u${SIZE}(i${N+1});
160      $else:
161        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1q_u${SIZE}(i0);
162        $for N in range(1, TILE_SIZE - 1, 2):
163          if XNN_UNPREDICTABLE(bh < ${N+1}) {
164            i${N} = i0;
165          }
166          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N});
167          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
168            i${N+1} = i0;
169          }
170          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1q_u${SIZE}(i${N+1});
171      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmovq_n_u${SIZE}(0);
172
173      $for N in range(TILE_SIZE >> 1):
174          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
175
176      $for M in range(1, NUM_ITERS):
177        $for N in range(TILE_SIZE >> 1):
178          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
179
180      $for N in range(TILE_SIZE):
181        uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);
182
183      if (bh & ${TILE_SIZE>>1}) {
184        $if OUT_PTRS == "SWITCH":
185          uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
186          switch (rem) {
187            $for N in reversed(range(2, TILE_SIZE)):
188              case ${N}:
189                vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
190            case 1:
191              vst1_u${SIZE}(oN, v1_low);
192            case 0:
193              $if NUM_ITERS > 1:
194                vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
195              $else:
196                vst1_u${SIZE}(o, v0_low);
197              break;
198            default:
199              XNN_UNREACHABLE;
200          }
201        $elif OUT_PTRS in ["MOV", "DEC"]:
202          o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
203          vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
204          $if OUT_PTRS == "MOV":
205            uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
206          $for N in reversed(range(2, TILE_SIZE, 2)):
207            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
208              $if OUT_PTRS == "MOV":
209                o = oN;
210              $else:
211                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
212            }
213            vst1_u${SIZE}(o, v${N}_low);
214            $if OUT_PTRS == "MOV":
215              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
216            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
217              $if OUT_PTRS == "MOV":
218                o = oN;
219              $else:
220                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
221            }
222            vst1_u${SIZE}(o, v${N-1}_low);
223            $if OUT_PTRS == "MOV":
224              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
225          if XNN_UNPREDICTABLE(block_width > 1) {
226            $if OUT_PTRS == "MOV":
227              o = oN;
228            $else:
229              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
230          }
231          $if NUM_ITERS > 1:
232            vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
233          $else:
234            vst1_u${SIZE}(o, v0_low);
235        $else:
236          $for N in reversed(range(TILE_SIZE)):
237            $if NUM_ITERS>1:
238              vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
239            $else:
240              vst1_u${SIZE}(o${N}, v${N}_low);
241        $if NUM_ITERS > 1:
242          $for N in range(TILE_SIZE):
243            v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
244      }
245
246      $if NUM_ITERS>1:
247        if (bh & ${TILE_SIZE>>2}) {
248          $if OUT_PTRS == "SWITCH":
249            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
250            switch (rem) {
251              $for N in reversed(range(2, TILE_SIZE)):
252                case ${N}:
253                  $if SIZE == 32:
254                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
255                  $else:
256                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
257              case 1:
258                $if SIZE == 32:
259                  vst1_lane_u32(oN, v1_low, 0);
260                $else:
261                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
262              case 0:
263                $if SIZE == 32:
264                  vst1_lane_u32(o, v0_low, 0);
265                $else:
266                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>2};
267                break;
268              default:
269                XNN_UNREACHABLE;
270            }
271          $elif OUT_PTRS in ["MOV", "DEC"]:
272            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
273            $if SIZE == 32:
274              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
275            $else:
276              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
277            $if OUT_PTRS == "MOV":
278              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
279            $for N in reversed(range(2, TILE_SIZE, 2)):
280              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
281                $if OUT_PTRS == "MOV":
282                  o = oN;
283                $else:
284                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
285              }
286              $if SIZE == 32:
287                vst1_lane_u32(o, v${N}_low, 0);
288              $else:
289                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
290              $if OUT_PTRS == "MOV":
291                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
292              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
293                $if OUT_PTRS == "MOV":
294                  o = oN;
295                $else:
296                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
297              }
298              $if SIZE == 32:
299                vst1_lane_u32(o, v${N-1}_low, 0);
300              $else:
301                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
302              $if OUT_PTRS == "MOV":
303                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
304            if XNN_UNPREDICTABLE(block_width > 1) {
305              $if OUT_PTRS == "MOV":
306                o = oN;
307              $else:
308                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
309            }
310            $if SIZE == 32:
311              vst1_lane_u32(o, v0_low, 0);
312            $else:
313              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>2};
314          $else:
315            $for N in reversed(range(TILE_SIZE)):
316              $if SIZE == 32:
317                vst1_lane_u32(o${N}, v${N}_low, 0);
318              $else:
319                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>2};
320          $if NUM_ITERS > 2:
321            $for N in range(TILE_SIZE):
322              $if SIZE == 16:
323                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
324              $else:
325                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
326        }
327      $if NUM_ITERS>2:
328        if (bh & ${TILE_SIZE>>3}) {
329          $if OUT_PTRS == "SWITCH":
330            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
331            switch (rem) {
332              $for N in reversed(range(2, TILE_SIZE)):
333                case ${N}:
334                  $if SIZE == 16:
335                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
336                  $else:
337                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
338              case 1:
339                $if SIZE == 16:
340                  vst1_lane_u16(oN, v1_low, 0);
341                $else:
342                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
343              case 0:
344                $if SIZE == 16:
345                  vst1_lane_u16(o, v0_low, 0);
346                $else:
347                  $if NUM_ITERS>3:
348                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>3};
349                  $else:
350                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
351                break;
352              default:
353                XNN_UNREACHABLE;
354            }
355          $elif OUT_PTRS in ["MOV", "DEC"]:
356            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
357            $if SIZE == 16:
358              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
359            $else:
360              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
361            $if OUT_PTRS == "MOV":
362              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
363            $for N in reversed(range(2, TILE_SIZE, 2)):
364              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
365                $if OUT_PTRS == "MOV":
366                  o = oN;
367                $else:
368                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
369              }
370              $if SIZE == 16:
371                vst1_lane_u16(o, v${N}_low, 0);
372              $else:
373                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
374              $if OUT_PTRS == "MOV":
375                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
376              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
377                $if OUT_PTRS == "MOV":
378                  o = oN;
379                $else:
380                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
381              }
382              $if SIZE == 16:
383                vst1_lane_u16(o, v${N-1}_low, 0);
384              $else:
385                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
386              $if OUT_PTRS == "MOV":
387                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
388            if XNN_UNPREDICTABLE(block_width > 1) {
389              $if OUT_PTRS == "MOV":
390                o = oN;
391              $else:
392                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
393            }
394            $if SIZE == 16:
395              vst1_lane_u16(o, v0_low, 0);
396            $else:
397              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>3};
398          $else:
399            $for N in reversed(range(TILE_SIZE)):
400              $if SIZE == 16:
401                vst1_lane_u16(o${N}, v${N}_low, 0);
402              $else:
403                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>3};
404          $if NUM_ITERS>3:
405            $for N in range(TILE_SIZE):
406              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
407        }
408      $if SIZE == 8:
409        if (bh & 1) {
410          $if OUT_PTRS == "SWITCH":
411            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
412            switch (rem) {
413              $for N in reversed(range(2, TILE_SIZE)):
414                case ${N}:
415                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
416              case 1:
417                vst1_lane_u8(oN, v1_low, 0);
418              case 0:
419                vst1_lane_u8(o, v0_low, 0);
420                break;
421              default:
422                XNN_UNREACHABLE;
423            }
424          $elif OUT_PTRS in ["MOV", "DEC"]:
425            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
426            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
427            $if OUT_PTRS == "MOV":
428              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
429            $for N in reversed(range(2, TILE_SIZE, 2)):
430              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
431                $if OUT_PTRS == "MOV":
432                  o = oN;
433                $else:
434                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
435              }
436              vst1_lane_u8(o, v${N}_low, 0);
437              $if OUT_PTRS == "MOV":
438                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
439              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
440                $if OUT_PTRS == "MOV":
441                  o = oN;
442                $else:
443                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
444              }
445              vst1_lane_u8(o, v${N-1}_low, 0);
446              $if OUT_PTRS == "MOV":
447                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
448            if XNN_UNPREDICTABLE(block_width > 1) {
449              $if OUT_PTRS == "MOV":
450                o = oN;
451              $else:
452                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
453            }
454            vst1_lane_u8(o, v0_low, 0);
455          $else:
456            $for N in reversed(range(TILE_SIZE)):
457              vst1_lane_u8(o${N}, v${N}_low, 0);
458        }
459    }
460
461    $if IN_PTRS == "MULTI":
462      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
463      $for N in range(1, TILE_SIZE):
464        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
465    $else:
466      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
467    $if OUT_PTRS == "MULTI":
468      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
469      $for N in range(1, TILE_SIZE):
470        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
471    $else:
472      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
473    block_width = doz(block_width, tile_width);
474  } while (block_width != 0);
475}
476