1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <xmmintrin.h>
9
10 #include <xnnpack/packx.h>
11
12
xnn_x32_packx_ukernel_4x__sse(size_t m,size_t k,const uint32_t * restrict x,size_t x_stride,uint32_t * restrict y)13 void xnn_x32_packx_ukernel_4x__sse(
14 size_t m,
15 size_t k,
16 const uint32_t* restrict x,
17 size_t x_stride,
18 uint32_t* restrict y)
19 {
20 assert(m != 0);
21 assert(k != 0);
22
23 const float* x0 = (const float*) x;
24 const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
25 if (m < 2) {
26 x1 = x0;
27 }
28 const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
29 if (m <= 2) {
30 x2 = x1;
31 }
32 const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
33 if (m != 4) {
34 x3 = x2;
35 }
36
37 float*restrict y_f32 = (float*) y;
38
39 for (; k >= 4; k -= 4) {
40 const __m128 vx0 = _mm_loadu_ps(x0);
41 x0 += 4;
42 const __m128 vx1 = _mm_loadu_ps(x1);
43 x1 += 4;
44 const __m128 vx2 = _mm_loadu_ps(x2);
45 x2 += 4;
46 const __m128 vx3 = _mm_loadu_ps(x3);
47 x3 += 4;
48
49 const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
50 const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
51 const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
52 const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
53
54 const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
55 _mm_store_ps(y_f32, vy0);
56
57 const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
58 _mm_store_ps(y_f32 + 4, vy1);
59
60 const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
61 _mm_store_ps(y_f32 + 8, vy2);
62
63 const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
64 _mm_store_ps(y_f32 + 12, vy3);
65
66 y_f32 += 16;
67 }
68 if XNN_UNLIKELY(k != 0) {
69 do {
70 const __m128 vx0 = _mm_load_ss(x0);
71 x0 += 1;
72 const __m128 vx1 = _mm_load_ss(x1);
73 x1 += 1;
74 const __m128 vx2 = _mm_load_ss(x2);
75 x2 += 1;
76 const __m128 vx3 = _mm_load_ss(x3);
77 x3 += 1;
78
79 const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
80 const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
81 const __m128 vy = _mm_movelh_ps(vx01, vx23);
82
83 _mm_store_ps(y_f32, vy);
84 y_f32 += 4;
85 } while (--k != 0);
86 }
87 }
88