/*
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SSE2_DBL_H
#define PF_SSE2_DBL_H

// detect SSE2 support under MSVC
#if defined ( _M_IX86_FP )
#  if _M_IX86_FP == 2
#    if !defined(__SSE2__)
#      define __SSE2__
#    endif
#  endif
#endif

/*
  SSE2 64-bit support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ))
#pragma message (__FILE__ ": SSE2 double macros are defined" )

#include <assert.h>   /* assert() is used by mm256_extractf128_pd below */
#include <stdint.h>   /* uintptr_t is used by the VALIGNED macro below */
#include <emmintrin.h>

/* four doubles, emulated with a pair of SSE2 __m128d registers */
typedef struct {
    __m128d d128[2];
} m256d;

typedef m256d v4sf;

#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
} v4sf_union;


#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif

FORCE_INLINE m256d mm256_setzero_pd(void)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
    return ret;
}

FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
{
    m256d ret;
    ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
    ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
    return ret;
}

FORCE_INLINE m256d mm256_set1_pd(double a)
{
    m256d ret;
    ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
    return ret;
}

FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_load_pd((const double *)mem_addr);
    res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
    return res;
}
FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
{
    m256d res;
    res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
    res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
    return res;
}

#  define VARCH "SSE2"
#  define VREQUIRES_ALIGN 1
#  define VZERO() mm256_setzero_pd()
#  define VMUL(a,b) mm256_mul_pd(a,b)
#  define VADD(a,b) mm256_add_pd(a,b)
#  define VMADD(a,b,c) mm256_add_pd(mm256_mul_pd(a,b), c)
#  define VSUB(a,b) mm256_sub_pd(a,b)
#  define LD_PS1(p) mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr)  mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)    mm256_load_pd(ptr)

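/* Illustrative only, not part of the original header: a minimal sketch of how the
   wrapper macros above compose, guarded by the hypothetical macro
   PF_SSE2_DBL_EXAMPLES so it is never compiled unless explicitly requested. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE v4sf pf_example_axpy(const double *x, const double *y, double s)
{
    /* load four doubles from each (possibly unaligned) pointer and
       compute x*s + y element-wise */
    v4sf vx = VLOAD_UNALIGNED(x);
    v4sf vy = VLOAD_UNALIGNED(y);
    return VMADD(vx, LD_PS1(s), vy);
}
#endif
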
FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
{
    return a.d128[0];
}

FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.d128[imm8];
}
FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
{
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b;
    return res;
}
FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
{
    m256d res;
    res.d128[0] = a;
    return res;
}

/* per 128-bit lane, take element 0 of a and element 0 of b:
   result = [ a[0], b[0], a[2], b[2] ] */
FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
    return res;
}

/* per 128-bit lane, take element 1 of a and element 1 of b:
   result = [ a[1], b[1], a[3], b[3] ] */
FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
    res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
    return res;
}

/* result = [ a[0], a[1], b[0], b[1] ]  (like _mm256_permute2f128_pd(a, b, 0x20)) */
FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
    m256d res;
    res.d128[0] = a.d128[0];
    res.d128[1] = b.d128[0];
    return res;
}


/* result = [ a[2], a[3], b[2], b[3] ]  (like _mm256_permute2f128_pd(a, b, 0x31)) */
FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
{
    m256d res;
    res.d128[0] = a.d128[1];
    res.d128[1] = b.d128[1];
    return res;
}

/* result = [ x[3], x[2], x[1], x[0] ] */
FORCE_INLINE m256d mm256_reverse(m256d x)
{
    m256d res;
    res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
    res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
    return res;
}

/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = mm256_castpd256_pd128(in1);						\
	__m128d low2__ = mm256_castpd256_pd128(in2);						\
	__m128d high1__ = mm256_extractf128_pd(in1, 1);						\
	__m128d high2__ = mm256_extractf128_pd(in2, 1);						\
	m256d tmp__ = mm256_insertf128_pd_1(								\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
		_mm_shuffle_pd(low1__, low2__, 3));								\
	out2 = mm256_insertf128_pd_1(										\
		mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),		\
		_mm_shuffle_pd(high1__, high2__, 3));							\
	out1 = tmp__;														\
}
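
/* Illustrative only, not part of the original header: a concrete check of the
   INTERLEAVE2 pseudo code above, guarded by the hypothetical macro
   PF_SSE2_DBL_EXAMPLES. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE void pf_example_interleave2(void)
{
    v4sf_union in1, in2, out1, out2;
    int k;
    for (k = 0; k < SIMD_SZ; ++k) { in1.f[k] = (double)k; in2.f[k] = (double)(k + 4); }
    /* in1 = [0,1,2,3], in2 = [4,5,6,7] */
    INTERLEAVE2(in1.v, in2.v, out1.v, out2.v);
    /* expected: out1 = [0,4,1,5], out2 = [2,6,3,7] */
    assert(out1.f[0] == 0 && out1.f[1] == 4 && out1.f[2] == 1 && out1.f[3] == 5);
    assert(out2.f[0] == 2 && out2.f[1] == 6 && out2.f[2] == 3 && out2.f[3] == 7);
}
#endif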

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = mm256_castpd256_pd128(in1);						\
	__m128d low2__ = mm256_castpd256_pd128(in2);						\
	__m128d high1__ = mm256_extractf128_pd(in1, 1);						\
	__m128d high2__ = mm256_extractf128_pd(in2, 1);						\
	m256d tmp__ = mm256_insertf128_pd_1(								\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
		_mm_shuffle_pd(low2__, high2__, 0));							\
	out2 = mm256_insertf128_pd_1(										\
		mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
		_mm_shuffle_pd(low2__, high2__, 3));							\
	out1 = tmp__;														\
}
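
/* Illustrative only, not part of the original header: a concrete check of the
   UNINTERLEAVE2 pseudo code above, guarded by the hypothetical macro
   PF_SSE2_DBL_EXAMPLES. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE void pf_example_uninterleave2(void)
{
    v4sf_union in1, in2, out1, out2;
    int k;
    for (k = 0; k < SIMD_SZ; ++k) { in1.f[k] = (double)k; in2.f[k] = (double)(k + 4); }
    /* in1 = [0,1,2,3], in2 = [4,5,6,7] */
    UNINTERLEAVE2(in1.v, in2.v, out1.v, out2.v);
    /* expected: out1 = [0,2,4,6], out2 = [1,3,5,7] */
    assert(out1.f[0] == 0 && out1.f[1] == 2 && out1.f[2] == 4 && out1.f[3] == 6);
    assert(out2.f[0] == 1 && out2.f[1] == 3 && out2.f[2] == 5 && out2.f[3] == 7);
}
#endif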

/* VTRANSPOSE4(row0, row1, row2, row3): transpose the 4x4 matrix whose rows are
   row0..row3, using the shuffle/permute helpers above. */
#  define VTRANSPOSE4(row0, row1, row2, row3) {							\
        m256d tmp3, tmp2, tmp1, tmp0;									\
																		\
        tmp0 = mm256_shuffle_pd_00((row0),(row1));						\
        tmp2 = mm256_shuffle_pd_11((row0),(row1));						\
        tmp1 = mm256_shuffle_pd_00((row2),(row3));						\
        tmp3 = mm256_shuffle_pd_11((row2),(row3));						\
																		\
        (row0) = mm256_permute2f128_pd_0x20(tmp0, tmp1);				\
        (row1) = mm256_permute2f128_pd_0x20(tmp2, tmp3);				\
        (row2) = mm256_permute2f128_pd_0x31(tmp0, tmp1);				\
        (row3) = mm256_permute2f128_pd_0x31(tmp2, tmp3);				\
    }
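
/* Illustrative only, not part of the original header: a concrete check of
   VTRANSPOSE4 on a 4x4 matrix, guarded by the hypothetical macro
   PF_SSE2_DBL_EXAMPLES. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE void pf_example_vtranspose4(void)
{
    v4sf_union r0, r1, r2, r3;
    int k;
    for (k = 0; k < SIMD_SZ; ++k) {
        r0.f[k] = (double)k;        /* [ 0,  1,  2,  3] */
        r1.f[k] = (double)(k + 4);  /* [ 4,  5,  6,  7] */
        r2.f[k] = (double)(k + 8);  /* [ 8,  9, 10, 11] */
        r3.f[k] = (double)(k + 12); /* [12, 13, 14, 15] */
    }
    VTRANSPOSE4(r0.v, r1.v, r2.v, r3.v);
    /* columns become rows: r0 = [0,4,8,12], r1 = [1,5,9,13], ... */
    assert(r0.f[0] == 0 && r0.f[1] == 4 && r0.f[2] == 8 && r0.f[3] == 12);
    assert(r3.f[0] == 3 && r3.f[1] == 7 && r3.f[2] == 11 && r3.f[3] == 15);
}
#endif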

/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b)	\
   mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_castpd256_pd128(b)), mm256_extractf128_pd(a, 1))
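
/* Illustrative only, not part of the original header: a concrete check of the
   VSWAPHL pseudo code above, guarded by the hypothetical macro
   PF_SSE2_DBL_EXAMPLES. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE void pf_example_vswaphl(void)
{
    v4sf_union a, b, c;
    int k;
    for (k = 0; k < SIMD_SZ; ++k) { a.f[k] = (double)k; b.f[k] = (double)(k + 4); }
    c.v = VSWAPHL(a.v, b.v);
    /* expected: c = [ b[0], b[1], a[2], a[3] ] = [4, 5, 2, 3] */
    assert(c.f[0] == 4 && c.f[1] == 5 && c.f[2] == 2 && c.f[3] == 3);
}
#endif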

/* reverse/flip the order of all 4 doubles */
#  define VREV_S(a)   mm256_reverse(a)

/* swap the two complex (real, imaginary) double pairs */
#  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
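
/* Illustrative only, not part of the original header: concrete values for
   VREV_S and VREV_C, guarded by the hypothetical macro PF_SSE2_DBL_EXAMPLES. */
#ifdef PF_SSE2_DBL_EXAMPLES
FORCE_INLINE void pf_example_vrev(void)
{
    v4sf_union a, s, c;
    int k;
    for (k = 0; k < SIMD_SZ; ++k) a.f[k] = (double)k;   /* a = [0,1,2,3] */
    s.v = VREV_S(a.v);
    c.v = VREV_C(a.v);
    /* VREV_S reverses all four doubles, VREV_C swaps the two (re,im) pairs */
    assert(s.f[0] == 3 && s.f[1] == 2 && s.f[2] == 1 && s.f[3] == 0);
    assert(c.f[0] == 2 && c.f[1] == 3 && c.f[2] == 0 && c.f[3] == 1);
}
#endif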

/* check 32-byte (full vector) alignment */
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif
#endif /* PF_SSE2_DBL_H */