// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <psimd.h>
9 
10 #include <xnnpack/packx.h>
11 
12 
xnn_x32_packx_ukernel_4x__psimd(size_t m,size_t k,const uint32_t * restrict x,size_t x_stride,uint32_t * restrict y)13 void xnn_x32_packx_ukernel_4x__psimd(
14     size_t m,
15     size_t k,
16     const uint32_t* restrict x,
17     size_t x_stride,
18     uint32_t* restrict y)
19 {
20   assert(m != 0);
21   assert(k != 0);
22 
23   const uint32_t* x0 = x;
24   const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride);
25   if (m < 2) {
26     x1 = x0;
27   }
28   const uint32_t* x2 = (const uint32_t*) ((uintptr_t) x1 + x_stride);
29   if (m <= 2) {
30     x2 = x1;
31   }
32   const uint32_t* x3 = (const uint32_t*) ((uintptr_t) x2 + x_stride);
33   if (m != 4) {
34     x3 = x2;
35   }
36 
37   for (; k >= 4; k -= 4) {
38     const psimd_u32 vx0 = psimd_load_u32(x0);
39     x0 += 4;
40     const psimd_u32 vx1 = psimd_load_u32(x1);
41     x1 += 4;
42     const psimd_u32 vx2 = psimd_load_u32(x2);
43     x2 += 4;
44     const psimd_u32 vx3 = psimd_load_u32(x3);
45     x3 += 4;
46 
47     const psimd_u32 vt0 = psimd_interleave_lo_u32(vx0, vx1);
48     const psimd_u32 vt1 = psimd_interleave_hi_u32(vx0, vx1);
49     const psimd_u32 vt2 = psimd_interleave_lo_u32(vx2, vx3);
50     const psimd_u32 vt3 = psimd_interleave_hi_u32(vx2, vx3);
51 
52     const psimd_u32 vy0 = psimd_concat_lo_u32(vt0, vt2);
53     psimd_store_u32(y, vy0);
54 
55     const psimd_u32 vy1 = psimd_concat_hi_u32(vt0, vt2);
56     psimd_store_u32(y + 4, vy1);
57 
58     const psimd_u32 vy2 = psimd_concat_lo_u32(vt1, vt3);
59     psimd_store_u32(y + 8, vy2);
60 
61     const psimd_u32 vy3 = psimd_concat_hi_u32(vt1, vt3);
62     psimd_store_u32(y + 12, vy3);
63 
64     y += 16;
65   }
66   if XNN_UNLIKELY(k != 0) {
67     do {
68       const psimd_u32 vx0 = psimd_load1_u32(x0);
69       x0 += 1;
70       const psimd_u32 vx1 = psimd_load1_u32(x1);
71       x1 += 1;
72       const psimd_u32 vx2 = psimd_load1_u32(x2);
73       x2 += 1;
74       const psimd_u32 vx3 = psimd_load1_u32(x3);
75       x3 += 1;
76       const psimd_u32 vx01 = psimd_interleave_lo_u32(vx0, vx1);
77       const psimd_u32 vx23 = psimd_interleave_lo_u32(vx2, vx3);
78       const psimd_u32 vy = psimd_concat_lo_u32(vx01, vx23);
79       psimd_store_u32(y, vy);
80       y += 4;
81     } while (--k != 0);
82   }
83 }
84