/*
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H

//detect sse2 support under MSVC
#if defined ( _M_IX86_FP )
#  if _M_IX86_FP == 2
#    if !defined(__SSE2__)
#      define __SSE2__
#    endif
#  endif
#endif

/*
  SSE2 64-bit support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )

#include <emmintrin.h>
#include <assert.h>  /* for the index check in mm256_extractf128_pd */
#include <stdint.h>  /* for uintptr_t in VALIGNED */

typedef struct {
  __m128d d128[2];
} m256d;
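/* m256d emulates a 256-bit double vector (an AVX-style __m256d) with two SSE2
   __m128d halves: element i of the emulated vector is lane i % 2 of
   d128[i / 2], so a load of mem[0..3] puts mem[0..1] in d128[0] and
   mem[2..3] in d128[1]. */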

typedef m256d v4sf;

#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf   v;
  double f[SIMD_SZ];
} v4sf_union;


#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happen with an unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif

FORCE_INLINE m256d mm256_setzero_pd(void)
{
  m256d ret;
  ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
  return ret;
}

FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
  return ret;
}

FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
  return ret;
}

FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
  m256d ret;
  ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
  ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
  return ret;
}

FORCE_INLINE m256d mm256_set1_pd(double a)
{
  m256d ret;
  ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
  return ret;
}

FORCE_INLINE m256d mm256_load_pd(double const * mem_addr)
{
  m256d res;
  res.d128[0] = _mm_load_pd((const double *)mem_addr);
  res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
  return res;
}

FORCE_INLINE m256d mm256_loadu_pd(double const * mem_addr)
{
  m256d res;
  res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
  res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
  return res;
}


#  define VARCH "SSE2"
#  define VREQUIRES_ALIGN 1
#  define VZERO() mm256_setzero_pd()
#  define VMUL(a,b) mm256_mul_pd(a,b)
#  define VADD(a,b) mm256_add_pd(a,b)
#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
#  define VSUB(a,b) mm256_sub_pd(a,b)
#  define LD_PS1(p) mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr) mm256_load_pd(ptr)
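
/* Illustrative sketch, not part of the original header: a minimal use of the
   wrapper macros above, compiled only if the caller defines the hypothetical
   PF_SSE2_DBL_EXAMPLES guard. It accumulates x[i]*y[i] over one 32-byte
   aligned block of SIMD_SZ doubles and reads the lanes back via v4sf_union. */
#ifdef PF_SSE2_DBL_EXAMPLES
static double pf_sse2_dbl_example_dot4(const double *x, const double *y)
{
  v4sf_union u;
  v4sf acc = VZERO();                                   /* [0, 0, 0, 0]      */
  acc = VMADD(VLOAD_ALIGNED(x), VLOAD_ALIGNED(y), acc); /* acc[i] = x[i]*y[i] */
  u.v = acc;
  return u.f[0] + u.f[1] + u.f[2] + u.f[3];             /* scalar horizontal sum */
}
#endif /* PF_SSE2_DBL_EXAMPLES */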


FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
  return a.d128[0];
}

FORCE_INLINE __m128d mm256_extractf128_pd(m256d a, const int imm8)
{
  assert(imm8 >= 0 && imm8 <= 1);
  return a.d128[imm8];
}

FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
  m256d res;
  res.d128[0] = a.d128[0];
  res.d128[1] = b;
  return res;
}

FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
  m256d res;
  res.d128[0] = a;
  /* the upper half is left unset, matching _mm256_castpd128_pd256,
     whose upper 128 bits are undefined */
  return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(a.d128[0], b.d128[0], 0);
  res.d128[1] = _mm_shuffle_pd(a.d128[1], b.d128[1], 0);
  return res;
}

FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(a.d128[0], b.d128[0], 3);
  res.d128[1] = _mm_shuffle_pd(a.d128[1], b.d128[1], 3);
  return res;
}

FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = a.d128[0];
  res.d128[1] = b.d128[0];
  return res;
}


FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
  m256d res;
  res.d128[0] = a.d128[1];
  res.d128[1] = b.d128[1];
  return res;
}

FORCE_INLINE m256d mm256_reverse(m256d x)
{
  m256d res;
  res.d128[0] = _mm_shuffle_pd(x.d128[1], x.d128[1], 1);
  res.d128[1] = _mm_shuffle_pd(x.d128[0], x.d128[0], 1);
  return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {                           \
    __m128d low1__  = mm256_castpd256_pd128(in1);                       \
    __m128d low2__  = mm256_castpd256_pd128(in2);                       \
    __m128d high1__ = mm256_extractf128_pd(in1, 1);                     \
    __m128d high2__ = mm256_extractf128_pd(in2, 1);                     \
    m256d tmp__ = mm256_insertf128_pd_1(                                \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),       \
        _mm_shuffle_pd(low1__, low2__, 3));                             \
    out2 = mm256_insertf128_pd_1(                                       \
        mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),     \
        _mm_shuffle_pd(high1__, high2__, 3));                           \
    out1 = tmp__;                                                       \
  }
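
/* Illustrative sketch, not in the original source: with SIMD_SZ == 4 this is
   what the pseudo code above means on concrete lanes, guarded by the
   hypothetical PF_SSE2_DBL_EXAMPLES macro. */
#ifdef PF_SSE2_DBL_EXAMPLES
static void pf_sse2_dbl_example_interleave2(void)
{
  v4sf_union a, b, o1, o2;
  v4sf out1, out2;
  a.f[0] = 0.0;  a.f[1] = 1.0;  a.f[2] = 2.0;  a.f[3] = 3.0;
  b.f[0] = 10.0; b.f[1] = 11.0; b.f[2] = 12.0; b.f[3] = 13.0;
  INTERLEAVE2(a.v, b.v, out1, out2);
  o1.v = out1;  /* o1.f == { 0, 10, 1, 11 } */
  o2.v = out2;  /* o2.f == { 2, 12, 3, 13 } */
  (void)o1; (void)o2;
}
#endif /* PF_SSE2_DBL_EXAMPLES */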

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                         \
    __m128d low1__  = mm256_castpd256_pd128(in1);                       \
    __m128d low2__  = mm256_castpd256_pd128(in2);                       \
    __m128d high1__ = mm256_extractf128_pd(in1, 1);                     \
    __m128d high2__ = mm256_extractf128_pd(in2, 1);                     \
    m256d tmp__ = mm256_insertf128_pd_1(                                \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),      \
        _mm_shuffle_pd(low2__, high2__, 0));                            \
    out2 = mm256_insertf128_pd_1(                                       \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),      \
        _mm_shuffle_pd(low2__, high2__, 3));                            \
    out1 = tmp__;                                                       \
  }
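
/* Illustrative sketch, not in the original source: UNINTERLEAVE2 undoes
   INTERLEAVE2, again under the hypothetical PF_SSE2_DBL_EXAMPLES guard. */
#ifdef PF_SSE2_DBL_EXAMPLES
static void pf_sse2_dbl_example_uninterleave2(void)
{
  v4sf_union in1, in2, o1, o2;
  v4sf out1, out2;
  in1.f[0] = 0.0; in1.f[1] = 10.0; in1.f[2] = 1.0; in1.f[3] = 11.0;
  in2.f[0] = 2.0; in2.f[1] = 12.0; in2.f[2] = 3.0; in2.f[3] = 13.0;
  UNINTERLEAVE2(in1.v, in2.v, out1, out2);
  o1.v = out1;  /* o1.f == { 0, 1, 2, 3 }     (all even lanes) */
  o2.v = out2;  /* o2.f == { 10, 11, 12, 13 } (all odd lanes)  */
  (void)o1; (void)o2;
}
#endif /* PF_SSE2_DBL_EXAMPLES */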
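
/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code (4x4 transpose, in place):
   row0 = [ row0[0], row1[0], row2[0], row3[0] ]
   row1 = [ row0[1], row1[1], row2[1], row3[1] ]
   row2 = [ row0[2], row1[2], row2[2], row3[2] ]
   row3 = [ row0[3], row1[3], row2[3], row3[3] ]
   (the right-hand sides refer to the rows' original values)
*/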
#  define VTRANSPOSE4(row0, row1, row2, row3) {                         \
    m256d tmp3, tmp2, tmp1, tmp0;                                       \
                                                                        \
    tmp0 = mm256_shuffle_pd_00((row0), (row1));                         \
    tmp2 = mm256_shuffle_pd_11((row0), (row1));                         \
    tmp1 = mm256_shuffle_pd_00((row2), (row3));                         \
    tmp3 = mm256_shuffle_pd_11((row2), (row3));                         \
                                                                        \
    (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1);                    \
    (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3);                    \
    (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1);                    \
    (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3);                    \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b) \
    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))

/* reverse/flip all doubles: VREV_S(a) = [ a[3], a[2], a[1], a[0] ] */
#  define VREV_S(a) mm256_reverse(a)

/* reverse/flip complex doubles (swap the two complex pairs):
   VREV_C(a) = [ a[2], a[3], a[0], a[1] ] */
#  define VREV_C(a) mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))

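/* VALIGNED(ptr) is true when ptr is 32-byte aligned, i.e. aligned to one full v4sf of four doubles */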
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif
#endif