1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <psimd.h>
9
10 #include <xnnpack/packx.h>
11
12
xnn_x32_packx_ukernel_4x__psimd(size_t m,size_t k,const uint32_t * restrict x,size_t x_stride,uint32_t * restrict y)13 void xnn_x32_packx_ukernel_4x__psimd(
14 size_t m,
15 size_t k,
16 const uint32_t* restrict x,
17 size_t x_stride,
18 uint32_t* restrict y)
19 {
20 assert(m != 0);
21 assert(k != 0);
22
23 const uint32_t* x0 = x;
24 const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride);
25 if (m < 2) {
26 x1 = x0;
27 }
28 const uint32_t* x2 = (const uint32_t*) ((uintptr_t) x1 + x_stride);
29 if (m <= 2) {
30 x2 = x1;
31 }
32 const uint32_t* x3 = (const uint32_t*) ((uintptr_t) x2 + x_stride);
33 if (m != 4) {
34 x3 = x2;
35 }
36
37 for (; k >= 4; k -= 4) {
38 const psimd_u32 vx0 = psimd_load_u32(x0);
39 x0 += 4;
40 const psimd_u32 vx1 = psimd_load_u32(x1);
41 x1 += 4;
42 const psimd_u32 vx2 = psimd_load_u32(x2);
43 x2 += 4;
44 const psimd_u32 vx3 = psimd_load_u32(x3);
45 x3 += 4;
46
47 const psimd_u32 vt0 = psimd_interleave_lo_u32(vx0, vx1);
48 const psimd_u32 vt1 = psimd_interleave_hi_u32(vx0, vx1);
49 const psimd_u32 vt2 = psimd_interleave_lo_u32(vx2, vx3);
50 const psimd_u32 vt3 = psimd_interleave_hi_u32(vx2, vx3);
51
52 const psimd_u32 vy0 = psimd_concat_lo_u32(vt0, vt2);
53 psimd_store_u32(y, vy0);
54
55 const psimd_u32 vy1 = psimd_concat_hi_u32(vt0, vt2);
56 psimd_store_u32(y + 4, vy1);
57
58 const psimd_u32 vy2 = psimd_concat_lo_u32(vt1, vt3);
59 psimd_store_u32(y + 8, vy2);
60
61 const psimd_u32 vy3 = psimd_concat_hi_u32(vt1, vt3);
62 psimd_store_u32(y + 12, vy3);
63
64 y += 16;
65 }
66 if XNN_UNLIKELY(k != 0) {
67 do {
68 const psimd_u32 vx0 = psimd_load1_u32(x0);
69 x0 += 1;
70 const psimd_u32 vx1 = psimd_load1_u32(x1);
71 x1 += 1;
72 const psimd_u32 vx2 = psimd_load1_u32(x2);
73 x2 += 1;
74 const psimd_u32 vx3 = psimd_load1_u32(x3);
75 x3 += 1;
76 const psimd_u32 vx01 = psimd_interleave_lo_u32(vx0, vx1);
77 const psimd_u32 vx23 = psimd_interleave_lo_u32(vx2, vx3);
78 const psimd_u32 vy = psimd_concat_lo_u32(vx01, vx23);
79 psimd_store_u32(y, vy);
80 y += 4;
81 } while (--k != 0);
82 }
83 }
84