1 // Copyright 2019 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #include <assert.h> 7 8 #include <arm_neon.h> 9 10 #include <xnnpack/zip.h> 11 12 xnn_x32_zip_x2_ukernel__neon(size_t n,const uint32_t * input,uint32_t * output)13void xnn_x32_zip_x2_ukernel__neon( 14 size_t n, 15 const uint32_t* input, 16 uint32_t* output) 17 { 18 assert(n != 0); 19 assert(n % 4 == 0); 20 21 const uint32_t* x = input; 22 const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); 23 uint32_t* o = output; 24 25 while (n >= 16) { 26 uint32x4x2_t vxy; 27 vxy.val[0] = vld1q_u32(x); x += 4; 28 vxy.val[1] = vld1q_u32(y); y += 4; 29 vst2q_u32(o, vxy); o += 8; 30 n -= 16; 31 } 32 if XNN_UNLIKELY(n != 0) { 33 if (n & 8) { 34 uint32x2x2_t vxy; 35 vxy.val[0] = vld1_u32(x); x += 2; 36 vxy.val[1] = vld1_u32(y); y += 2; 37 vst2_u32(o, vxy); o += 4; 38 } 39 if (n & 4) { 40 uint32x2_t vxy = vld1_dup_u32(x); 41 vxy = vld1_lane_u32(y, vxy, 1); 42 vst1_u32(o, vxy); 43 } 44 } 45 } 46