1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <wasm_simd128.h>
9
10 #include <xnnpack/packx.h>
11
12
xnn_x32_packx_ukernel_4x__wasmsimd(size_t m,size_t k,const uint32_t * restrict x_ptr,size_t x_stride,uint32_t * restrict y_ptr)13 void xnn_x32_packx_ukernel_4x__wasmsimd(
14 size_t m,
15 size_t k,
16 const uint32_t* restrict x_ptr,
17 size_t x_stride,
18 uint32_t* restrict y_ptr)
19 {
20 assert(m != 0);
21 assert(k != 0);
22
23 const float* x0 = (const float*) x_ptr;
24 const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
25 if (m < 2) {
26 x1 = x0;
27 }
28 const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
29 if (m <= 2) {
30 x2 = x1;
31 }
32 const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
33 if (m != 4) {
34 x3 = x2;
35 }
36 float* y = (float*) y_ptr;
37
38 for (; k >= 4; k -= 4) {
39 const v128_t vx0 = wasm_v128_load(x0);
40 x0 += 4;
41 const v128_t vx1 = wasm_v128_load(x1);
42 x1 += 4;
43 const v128_t vx2 = wasm_v128_load(x2);
44 x2 += 4;
45 const v128_t vx3 = wasm_v128_load(x3);
46 x3 += 4;
47
48 const v128_t vt0 = wasm_v32x4_shuffle(vx0, vx1, 0, 4, 1, 5);
49 const v128_t vt1 = wasm_v32x4_shuffle(vx0, vx1, 2, 6, 3, 7);
50 const v128_t vt2 = wasm_v32x4_shuffle(vx2, vx3, 0, 4, 1, 5);
51 const v128_t vt3 = wasm_v32x4_shuffle(vx2, vx3, 2, 6, 3, 7);
52
53 const v128_t vy0 = wasm_v32x4_shuffle(vt0, vt2, 0, 1, 4, 5);
54 wasm_v128_store(y, vy0);
55
56 const v128_t vy1 = wasm_v32x4_shuffle(vt0, vt2, 2, 3, 6, 7);
57 wasm_v128_store(y + 4, vy1);
58
59 const v128_t vy2 = wasm_v32x4_shuffle(vt1, vt3, 0, 1, 4, 5);
60 wasm_v128_store(y + 8, vy2);
61
62 const v128_t vy3 = wasm_v32x4_shuffle(vt1, vt3, 2, 3, 6, 7);
63 wasm_v128_store(y + 12, vy3);
64
65 y += 16;
66 }
67 if XNN_UNLIKELY(k != 0) {
68 do {
69 const float vx0 = *x0++;
70 const float vx1 = *x1++;
71 const float vx2 = *x2++;
72 const float vx3 = *x3++;
73 y[0] = vx0;
74 y[1] = vx1;
75 y[2] = vx2;
76 y[3] = vx3;
77 y += 4;
78 } while (--k != 0);
79 }
80 }
81