/*
   Copyright (c) 2020  Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_DBL_H
#define PF_NEON_DBL_H

/*
  NEON 64-bit (double precision) support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))

#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )

#include "pf_neon_double_from_avx.h"
typedef __m256d v4sf;

/* 4 doubles per SIMD vector */
#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
} v4sf_union;

#  define VARCH "NEON"
#  define VREQUIRES_ALIGN 1
#  define VZERO() _mm256_setzero_pd()
#  define VMUL(a,b) _mm256_mul_pd(a,b)
#  define VADD(a,b) _mm256_add_pd(a,b)
#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
#  define VSUB(a,b) _mm256_sub_pd(a,b)
#  define LD_PS1(p) _mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)

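/* Illustrative sketch (not part of the original header), showing how the
   basic wrappers above compose: LD_PS1 broadcasts a scalar, VMADD computes
   a*b + c, and v4sf_union exposes the four double lanes. The opt-in guard
   PF_NEON_DBL_EXAMPLES and the function name pf_example_basic_ops are
   hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_basic_ops(void)
{
  v4sf a = LD_PS1(2.0);            /* [2, 2, 2, 2] */
  v4sf b = LD_PS1(3.0);            /* [3, 3, 3, 3] */
  v4sf_union r;
  int i;
  r.v = VMADD(a, b, VZERO());      /* 2*3 + 0 = 6 in every lane */
  for (i = 0; i < SIMD_SZ; ++i)
    if (r.f[i] != 6.0) return 0;
  return 1;
}
#endif /* PF_NEON_DBL_EXAMPLES */
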
/* insert the 128-bit vector b into the upper half of a */
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b;
    return res;
}

/* [ a[0], b[0] ] */
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
    float64x1_t al = vget_low_f64(a);
    float64x1_t bl = vget_low_f64(b);
    return vcombine_f64(al, bl);
}

/* [ a[1], b[1] ] */
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
    float64x1_t ah = vget_high_f64(a);
    float64x1_t bh = vget_high_f64(b);
    return vcombine_f64(ah, bh);
}

/* [ a[0], b[0], a[2], b[2] ] */
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

/* [ a[1], b[1], a[3], b[3] ] */
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
    res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
    return res;
}

/* [ a[0], a[1], b[0], b[1] ] (lower 128-bit halves of a and b) */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b.vect_f64[0];
    return res;
}

/* [ a[2], a[3], b[2], b[3] ] (upper 128-bit halves of a and b) */
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[1];
    res.vect_f64[1] = b.vect_f64[1];
    return res;
}

/* [ x[3], x[2], x[1], x[0] ] */
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
    __m256d res;
    float64x2_t low = x.vect_f64[0];
    float64x2_t high = x.vect_f64[1];
    float64x1_t a = vget_low_f64(low);
    float64x1_t b = vget_high_f64(low);
    float64x1_t c = vget_low_f64(high);
    float64x1_t d = vget_high_f64(high);
    res.vect_f64[0] =  vcombine_f64(d, c);
    res.vect_f64[1] =  vcombine_f64(b, a);
    return res;
}

/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {                           \
	__m128d low1__ = _mm256_castpd256_pd128(in1);                       \
	__m128d low2__ = _mm256_castpd256_pd128(in2);                       \
	__m128d high1__ = _mm256_extractf128_pd(in1, 1);                    \
	__m128d high2__ = _mm256_extractf128_pd(in2, 1);                    \
	__m256d tmp__ = _mm256_insertf128_pd_1(                             \
		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)),      \
		_mm_shuffle_pd_11(low1__, low2__));                             \
	out2 = _mm256_insertf128_pd_1(                                      \
		_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)),    \
		_mm_shuffle_pd_11(high1__, high2__));                           \
	out1 = tmp__;                                                       \
}
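
/* Illustrative sketch (not part of the original header): with in1 = [0,1,2,3]
   and in2 = [4,5,6,7], INTERLEAVE2 produces out1 = [0,4,1,5] and
   out2 = [2,6,3,7]. The opt-in guard PF_NEON_DBL_EXAMPLES and the function
   name pf_example_interleave2 are hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_interleave2(void)
{
  v4sf_union x, y, o1, o2;
  int i;
  for (i = 0; i < SIMD_SZ; ++i) {
    x.f[i] = (double)i;        /* [0, 1, 2, 3] */
    y.f[i] = (double)(i + 4);  /* [4, 5, 6, 7] */
  }
  INTERLEAVE2(x.v, y.v, o1.v, o2.v);
  return o1.f[0] == 0.0 && o1.f[1] == 4.0 && o1.f[2] == 1.0 && o1.f[3] == 5.0
      && o2.f[0] == 2.0 && o2.f[1] == 6.0 && o2.f[2] == 3.0 && o2.f[3] == 7.0;
}
#endif /* PF_NEON_DBL_EXAMPLES */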

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                         \
	__m128d low1__ = _mm256_castpd256_pd128(in1);                       \
	__m128d low2__ = _mm256_castpd256_pd128(in2);                       \
	__m128d high1__ = _mm256_extractf128_pd(in1, 1);                    \
	__m128d high2__ = _mm256_extractf128_pd(in2, 1);                    \
	__m256d tmp__ = _mm256_insertf128_pd_1(                             \
		_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)),     \
		_mm_shuffle_pd_00(low2__, high2__));                            \
	out2 = _mm256_insertf128_pd_1(                                      \
		_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)),     \
		_mm_shuffle_pd_11(low2__, high2__));                            \
	out1 = tmp__;                                                       \
}
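
/* Illustrative sketch (not part of the original header): UNINTERLEAVE2 is the
   inverse of INTERLEAVE2, splitting even and odd lanes. With in1 = [0,1,2,3]
   and in2 = [4,5,6,7] it produces out1 = [0,2,4,6] and out2 = [1,3,5,7].
   The opt-in guard PF_NEON_DBL_EXAMPLES and the function name
   pf_example_uninterleave2 are hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_uninterleave2(void)
{
  v4sf_union x, y, o1, o2;
  int i;
  for (i = 0; i < SIMD_SZ; ++i) {
    x.f[i] = (double)i;        /* [0, 1, 2, 3] */
    y.f[i] = (double)(i + 4);  /* [4, 5, 6, 7] */
  }
  UNINTERLEAVE2(x.v, y.v, o1.v, o2.v);
  return o1.f[0] == 0.0 && o1.f[1] == 2.0 && o1.f[2] == 4.0 && o1.f[3] == 6.0
      && o2.f[0] == 1.0 && o2.f[1] == 3.0 && o2.f[2] == 5.0 && o2.f[3] == 7.0;
}
#endif /* PF_NEON_DBL_EXAMPLES */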

/* transpose the 4x4 matrix whose rows are row0..row3 (in place) */
#  define VTRANSPOSE4(row0, row1, row2, row3) {                         \
        __m256d tmp3, tmp2, tmp1, tmp0;                                 \
                                                                        \
        tmp0 = _mm256_shuffle_pd_00((row0),(row1));                     \
        tmp2 = _mm256_shuffle_pd_11((row0),(row1));                     \
        tmp1 = _mm256_shuffle_pd_00((row2),(row3));                     \
        tmp3 = _mm256_shuffle_pd_11((row2),(row3));                     \
                                                                        \
        (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1);               \
        (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3);               \
        (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1);               \
        (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3);               \
    }
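
/* Illustrative sketch (not part of the original header): VTRANSPOSE4 treats
   row0..row3 as a 4x4 matrix of doubles and transposes it, so element
   [r][c] ends up at [c][r]. The opt-in guard PF_NEON_DBL_EXAMPLES and the
   function name pf_example_vtranspose4 are hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_vtranspose4(void)
{
  v4sf_union row[4];
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      row[r].f[c] = (double)(4 * r + c);   /* row r = [4r, 4r+1, 4r+2, 4r+3] */
  VTRANSPOSE4(row[0].v, row[1].v, row[2].v, row[3].v);
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      if (row[r].f[c] != (double)(4 * c + r)) return 0;  /* transposed index */
  return 1;
}
#endif /* PF_NEON_DBL_EXAMPLES */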

/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b)  \
   _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
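
/* Illustrative sketch (not part of the original header): VSWAPHL keeps the
   low half of b and the high half of a, so with a = [0,1,2,3] and
   b = [4,5,6,7] it yields [4,5,2,3]. The opt-in guard PF_NEON_DBL_EXAMPLES
   and the function name pf_example_vswaphl are hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_vswaphl(void)
{
  v4sf_union a, b, r;
  int i;
  for (i = 0; i < SIMD_SZ; ++i) {
    a.f[i] = (double)i;        /* [0, 1, 2, 3] */
    b.f[i] = (double)(i + 4);  /* [4, 5, 6, 7] */
  }
  r.v = VSWAPHL(a.v, b.v);
  return r.f[0] == 4.0 && r.f[1] == 5.0 && r.f[2] == 2.0 && r.f[3] == 3.0;
}
#endif /* PF_NEON_DBL_EXAMPLES */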

/* reverse/flip all doubles */
#  define VREV_S(a)   _mm256_reverse(a)

/* reverse/flip complex doubles (swap the two 128-bit halves) */
#  define VREV_C(a)    _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
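
/* Illustrative sketch (not part of the original header): with a = [0,1,2,3],
   VREV_S reverses every lane to [3,2,1,0], while VREV_C swaps the two
   (re, im) pairs to [2,3,0,1]. The opt-in guard PF_NEON_DBL_EXAMPLES and the
   function name pf_example_vrev are hypothetical. */
#ifdef PF_NEON_DBL_EXAMPLES
static int pf_example_vrev(void)
{
  v4sf_union a, s, c;
  int i;
  for (i = 0; i < SIMD_SZ; ++i) a.f[i] = (double)i;  /* [0, 1, 2, 3] */
  s.v = VREV_S(a.v);
  c.v = VREV_C(a.v);
  return s.f[0] == 3.0 && s.f[1] == 2.0 && s.f[2] == 1.0 && s.f[3] == 0.0
      && c.f[0] == 2.0 && c.f[1] == 3.0 && c.f[2] == 0.0 && c.f[3] == 1.0;
}
#endif /* PF_NEON_DBL_EXAMPLES */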

/* true if ptr is aligned to the 32-byte vector size */
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif /* !PFFFT_SIMD_DISABLE && PFFFT_ENABLE_NEON && 64-bit ARM */

#endif /* PF_NEON_DBL_H */