// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
$import math
$assert IN_PTRS in ["MULTI", "REUSE"]
$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
$assert SIZE in [8, 16, 32]
$TILE_SIZE = int(128/SIZE)
$NUM_ITERS = int(math.log2(TILE_SIZE))

#include <arm_neon.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

// Transposes a block of uint${SIZE}_t elements with NEON zip (interleave)
// instructions, working in ${TILE_SIZE}x${TILE_SIZE} tiles.
//
//   input         - first element of the source block.
//   output        - first element of the destination block.
//   input_stride  - distance in bytes between consecutive source rows.
//   output_stride - distance in bytes between consecutive destination rows.
//   block_width   - elements per source row; also the number of destination
//                   rows written.
//   block_height  - number of source rows; also the number of elements
//                   written per destination row.
//
// The outer do-while loop uses block_width before testing it, so the kernel
// assumes block_width is at least 1.
// Source rows are always loaded as whole 128-bit vectors, so ${TILE_SIZE} elements are
// read from a row even when fewer columns remain.
// NOTE(review): presumably callers guarantee that this over-read is safe -
// confirm against the call sites.
void xnn_x${SIZE}_transpose_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
    const uint${SIZE}_t* input,
    uint${SIZE}_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height)
{
  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));

  const size_t tile_height = ${TILE_SIZE};
  const size_t tile_width = ${TILE_SIZE};
  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
  // Added to the input pointer after each column of tiles: undoes the rows
  // consumed by the full-tile loop and moves one tile to the right. The
  // subtraction is done in size_t, so it relies on unsigned wrap-around.
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  $if IN_PTRS == "MULTI":
    // Every row pointer jumps a whole tile of rows per full-tile iteration.
    const size_t input_offset = tile_height * input_stride;
  // Added to the output pointer(s) after each column of tiles: moves down
  // tile_width destination rows and undoes the in-row advance made by the
  // stores.
  $if OUT_PTRS in ["MOV", "DEC"]:
    // The extra tile_hbytes re-establishes the one-tile bias that o carries at
    // the top of the outer loop in these variants.
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
  $else:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);

  $if IN_PTRS == "MULTI":
    // One pointer per source row of the tile.
    const uint${SIZE}_t* i0 = input;
    $for N in range(1, TILE_SIZE):
      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
  $else:
    // A single source pointer that is stepped from row to row.
    const uint${SIZE}_t* i0 = input;
  $if OUT_PTRS == "MULTI":
    // One pointer per destination row of the tile.
    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
    $for N in range(1, TILE_SIZE):
      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
  $elif OUT_PTRS == "SWITCH":
    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
  $else:
    // Starts one tile before output; the full-tile loop adds tile_hbytes back
    // (as part of oN_offset) before its first store.
    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
  $if OUT_PTRS != "MULTI":
    // Unsigned negation: adding this value steps back one destination row.
    const size_t minus_output_stride = -output_stride;

  do {
    $if OUT_PTRS == "MULTI":
      // Destination rows at or beyond block_width are aliased to o0. Stores are
      // issued from the highest row down, so row 0 is written last and ends up
      // holding its own data.
      if XNN_UNPREDICTABLE(block_width < 2) {
        o1 = o0;
      }
      $for N in range(2, TILE_SIZE, 2):
        if XNN_UNPREDICTABLE(block_width <= ${N}) {
          o${N} = o0;
        }
        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
          o${N+1} = o0;
        }
    $elif OUT_PTRS in ["MOV", "DEC"]:
      // rem is the index of the last destination row that exists in this tile.
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
      const size_t oN_offset = oN_stride + tile_hbytes;
    $else:
      // rem is the index of the last destination row that exists in this tile.
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
    // Full tiles: a whole tile of source rows is consumed per iteration.
    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
      $for N in range(TILE_SIZE):
        $if IN_PTRS == "REUSE":
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
        $else:
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);

      // First interleave round: row N is zipped with row N + TILE_SIZE/2.
      $for N in range(TILE_SIZE >> 1):
        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      // Remaining interleave rounds. The resulting v0_* vectors are what gets
      // stored to the destination rows below.
      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      $if OUT_PTRS == "SWITCH":
        // Start at the last existing destination row and fall through the
        // cases, stepping back one row per store, so only rows 0..rem are
        // written. The missing breaks between cases are intentional.
        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          $for N in reversed(range(2, TILE_SIZE)):
            case ${N}:
              vst1q_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1q_u${SIZE}(oN, v0_0.val[1]);
          case 0:
            vst1q_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
            break;
          default:
            XNN_UNREACHABLE;
        }
      $elif OUT_PTRS in ["MOV", "DEC"]:
        // Start at the last existing destination row and walk back to row 0.
        // The pointer steps back only when the vector just stored belongs to a
        // row index below block_width, so surplus vectors land on an existing
        // row and are then overwritten by that row's own data.
        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
        vst1q_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
        $if OUT_PTRS == "MOV":
          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        $for N in reversed(range(2, TILE_SIZE, 2)):
          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1q_u${SIZE}(o, v0_${N>>1}.val[0]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1q_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          $if OUT_PTRS == "MOV":
            o = oN;
          $else:
            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        }
        vst1q_u${SIZE}(o, v0_0.val[0]);
      $else:
        // Highest row first, so that pointers aliased to o0 leave row 0's own
        // data in place after the final store.
        $for N in reversed(range(TILE_SIZE)):
          vst1q_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
    }
    $if OUT_PTRS in ["MOV", "DEC"]:
      // Remove the one-tile bias; the remainder stores below add only oN_stride.
      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);

    // Remainder: fewer source rows than a full tile are left.
    if (bh != 0) {
      $if IN_PTRS == "REUSE":
        // Row pointers at or beyond bh repeat the previous row's address
        // instead of advancing.
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1q_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i${N-1};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N});
          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i${N};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1q_u${SIZE}(i${N+1});
      $else:
        // Row pointers at or beyond bh are redirected to i0.
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1q_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1q_u${SIZE}(i${N});
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1q_u${SIZE}(i${N+1});
      // The full-tile loop exited, so bh is below the tile height and the last
      // tile row is never a real row; zeros stand in for it.
      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmovq_n_u${SIZE}(0);

      $for N in range(TILE_SIZE >> 1):
        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzipq_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      // Lower 64 bits of each transposed row. The bh leftover elements of every
      // destination row are written in power-of-two chunks, one chunk size per
      // bit of bh, consuming these vectors from lane 0 upwards.
      $for N in range(TILE_SIZE):
        uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);

      // 64-bit chunk (half a tile row) per destination row.
      if (bh & ${TILE_SIZE>>1}) {
        $if OUT_PTRS == "SWITCH":
          uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
          switch (rem) {
            $for N in reversed(range(2, TILE_SIZE)):
              case ${N}:
                vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
            case 1:
              vst1_u${SIZE}(oN, v1_low);
            case 0:
              $if NUM_ITERS > 1:
                vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
              $else:
                vst1_u${SIZE}(o, v0_low);
              break;
            default:
              XNN_UNREACHABLE;
          }
        $elif OUT_PTRS in ["MOV", "DEC"]:
          // Same backwards row walk as in the full-tile loop, with 64-bit stores.
          o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
          vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
          $if OUT_PTRS == "MOV":
            uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          $for N in reversed(range(2, TILE_SIZE, 2)):
            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            vst1_u${SIZE}(o, v${N}_low);
            $if OUT_PTRS == "MOV":
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            vst1_u${SIZE}(o, v${N-1}_low);
            $if OUT_PTRS == "MOV":
              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width > 1) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          $if NUM_ITERS > 1:
            vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
          $else:
            vst1_u${SIZE}(o, v0_low);
        $else:
          $for N in reversed(range(TILE_SIZE)):
            $if NUM_ITERS>1:
              vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
            $else:
              vst1_u${SIZE}(o${N}, v${N}_low);
        $if NUM_ITERS > 1:
          // The lower halves were consumed; continue with the upper halves.
          $for N in range(TILE_SIZE):
            v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
      }

      $if NUM_ITERS>1:
        // 32-bit chunk per destination row, taken from lane 0.
        if (bh & ${TILE_SIZE>>2}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 32:
                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 32:
                  vst1_lane_u32(oN, v1_low, 0);
                $else:
                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 32:
                  vst1_lane_u32(o, v0_low, 0);
                $else:
                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>2};
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 32:
              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 32:
              vst1_lane_u32(o, v0_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>2};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 32:
                vst1_lane_u32(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>2};
          $if NUM_ITERS > 2:
            // Rotate each vector by 32 bits so the next chunk sits in lane 0.
            $for N in range(TILE_SIZE):
              $if SIZE == 16:
                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
              $else:
                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
        }
      $if NUM_ITERS>2:
        // 16-bit chunk per destination row, taken from lane 0.
        if (bh & ${TILE_SIZE>>3}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 16:
                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 16:
                  vst1_lane_u16(oN, v1_low, 0);
                $else:
                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 16:
                  vst1_lane_u16(o, v0_low, 0);
                $else:
                  $if NUM_ITERS>3:
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>3};
                  $else:
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 16:
              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 16:
              vst1_lane_u16(o, v0_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>3};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 16:
                vst1_lane_u16(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>3};
          $if NUM_ITERS>3:
            // Rotate each vector by 16 bits so the next chunk sits in lane 0.
            $for N in range(TILE_SIZE):
              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
        }
      $if SIZE == 8:
        // Final single byte per destination row; no pointer advance follows it.
        if (bh & 1) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                vst1_lane_u8(oN, v1_low, 0);
              case 0:
                vst1_lane_u8(o, v0_low, 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N-1}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            vst1_lane_u8(o, v0_low, 0);
          $else:
            $for N in reversed(range(TILE_SIZE)):
              vst1_lane_u8(o${N}, v${N}_low, 0);
        }
    }

    // Move the input and output pointers on to the next column of tiles.
    $if IN_PTRS == "MULTI":
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
      $for N in range(1, TILE_SIZE):
        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
    $else:
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
    $if OUT_PTRS == "MULTI":
      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
      $for N in range(1, TILE_SIZE):
        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
    $else:
      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
    // doz() is not defined in this file; presumably a saturating subtraction
    // ("difference or zero") so the loop ends at 0 - TODO confirm.
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}