1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 9 #include <arm_neon.h> 10 11 #include <xnnpack/zip.h> 12 13 xnn_x8_zip_x4_ukernel__neon(size_t n,const uint8_t * input,uint8_t * output)14void xnn_x8_zip_x4_ukernel__neon( 15 size_t n, 16 const uint8_t* input, 17 uint8_t* output) 18 { 19 const uint8_t* x = input; 20 const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); 21 const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); 22 const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); 23 uint8_t* o = output; 24 25 if (n >= 8) { 26 do { 27 uint8x8x4_t vxyzw; 28 vxyzw.val[0] = vld1_u8(x); x += 8; 29 vxyzw.val[1] = vld1_u8(y); y += 8; 30 vxyzw.val[2] = vld1_u8(z); z += 8; 31 vxyzw.val[3] = vld1_u8(w); w += 8; 32 vst4_u8(o, vxyzw); o += 32; 33 n -= 8; 34 } while (n >= 8); 35 if (n != 0) { 36 const size_t address_increment = n - 8; 37 uint8x8x4_t vxyzw; 38 vxyzw.val[0] = vld1_u8(x + address_increment); 39 vxyzw.val[1] = vld1_u8(y + address_increment); 40 vxyzw.val[2] = vld1_u8(z + address_increment); 41 vxyzw.val[3] = vld1_u8(w + address_increment); 42 vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw); 43 } 44 } else { 45 do { 46 const uint8_t vx = *x++; 47 const uint8_t vy = *y++; 48 const uint8_t vz = *z++; 49 const uint8_t vw = *w++; 50 o[0] = vx; 51 o[1] = vy; 52 o[2] = vz; 53 o[3] = vw; 54 o += 4; 55 } while (--n != 0); 56 } 57 } 58