• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <xmmintrin.h>
9 
10 #include <xnnpack/packx.h>
11 
12 
xnn_x32_packx_ukernel_4x__sse(size_t m,size_t k,const uint32_t * restrict x,size_t x_stride,uint32_t * restrict y)13 void xnn_x32_packx_ukernel_4x__sse(
14     size_t m,
15     size_t k,
16     const uint32_t* restrict x,
17     size_t x_stride,
18     uint32_t* restrict y)
19 {
20   assert(m != 0);
21   assert(k != 0);
22 
23   const float* x0 = (const float*) x;
24   const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
25   if (m < 2) {
26     x1 = x0;
27   }
28   const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
29   if (m <= 2) {
30     x2 = x1;
31   }
32   const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
33   if (m != 4) {
34     x3 = x2;
35   }
36 
37   float*restrict y_f32 = (float*) y;
38 
39   for (; k >= 4; k -= 4) {
40     const __m128 vx0 = _mm_loadu_ps(x0);
41     x0 += 4;
42     const __m128 vx1 = _mm_loadu_ps(x1);
43     x1 += 4;
44     const __m128 vx2 = _mm_loadu_ps(x2);
45     x2 += 4;
46     const __m128 vx3 = _mm_loadu_ps(x3);
47     x3 += 4;
48 
49     const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
50     const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
51     const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
52     const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
53 
54     const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
55     _mm_store_ps(y_f32, vy0);
56 
57     const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
58     _mm_store_ps(y_f32 + 4, vy1);
59 
60     const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
61     _mm_store_ps(y_f32 + 8, vy2);
62 
63     const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
64     _mm_store_ps(y_f32 + 12, vy3);
65 
66     y_f32 += 16;
67   }
68   if XNN_UNLIKELY(k != 0) {
69     do {
70       const __m128 vx0 = _mm_load_ss(x0);
71       x0 += 1;
72       const __m128 vx1 = _mm_load_ss(x1);
73       x1 += 1;
74       const __m128 vx2 = _mm_load_ss(x2);
75       x2 += 1;
76       const __m128 vx3 = _mm_load_ss(x3);
77       x3 += 1;
78 
79       const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
80       const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
81       const __m128 vy = _mm_movelh_ps(vx01, vx23);
82 
83       _mm_store_ps(y_f32, vy);
84       y_f32 += 4;
85     } while (--k != 0);
86   }
87 }
88